From 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 Mon Sep 17 00:00:00 2001 From: Yunhong Jiang Date: Tue, 4 Aug 2015 12:17:53 -0700 Subject: Add the rt linux 4.1.3-rt3 as base Import the rt linux 4.1.3-rt3 as OPNFV kvm base. It's from git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git linux-4.1.y-rt and the base is: commit 0917f823c59692d751951bf5ea699a2d1e2f26a2 Author: Sebastian Andrzej Siewior Date: Sat Jul 25 12:13:34 2015 +0200 Prepare v4.1.3-rt3 Signed-off-by: Sebastian Andrzej Siewior We lose all the git history this way and it's not good. We should apply another opnfv project repo in future. Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423 Signed-off-by: Yunhong Jiang --- kernel/drivers/gpu/drm/amd/amdkfd/Kconfig | 9 + kernel/drivers/gpu/drm/amd/amdkfd/Makefile | 16 + kernel/drivers/gpu/drm/amd/amdkfd/cik_regs.h | 234 ++++ kernel/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 643 ++++++++++ kernel/drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 294 +++++ kernel/drivers/gpu/drm/amd/amdkfd/kfd_device.c | 522 ++++++++ .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1217 +++++++++++++++++++ .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 180 +++ .../drm/amd/amdkfd/kfd_device_queue_manager_cik.c | 135 +++ .../drm/amd/amdkfd/kfd_device_queue_manager_vi.c | 64 + kernel/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 249 ++++ .../drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 355 ++++++ .../drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 340 ++++++ .../drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | 101 ++ .../gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c | 44 + .../gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c | 56 + kernel/drivers/gpu/drm/amd/amdkfd/kfd_module.c | 138 +++ .../drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 37 + .../drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 91 ++ .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 451 +++++++ .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 33 + .../gpu/drm/amd/amdkfd/kfd_packet_manager.c | 557 +++++++++ kernel/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c | 96 ++ .../drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h | 405 +++++++ .../drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h | 107 ++ kernel/drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 645 ++++++++++ kernel/drivers/gpu/drm/amd/amdkfd/kfd_process.c | 433 +++++++ .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 359 ++++++ kernel/drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 85 ++ kernel/drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 1254 ++++++++++++++++++++ kernel/drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 168 +++ 31 files changed, 9318 insertions(+) create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/Kconfig create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/Makefile create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/cik_regs.h create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_crat.h create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_device.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_module.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_priv.h create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_process.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_queue.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_topology.c create mode 100644 kernel/drivers/gpu/drm/amd/amdkfd/kfd_topology.h (limited to 'kernel/drivers/gpu/drm/amd/amdkfd') diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/Kconfig b/kernel/drivers/gpu/drm/amd/amdkfd/Kconfig new file mode 100644 index 000000000..8dfac37ff --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/Kconfig @@ -0,0 +1,9 @@ +# +# Heterogenous system architecture configuration +# + +config HSA_AMD + tristate "HSA kernel driver for AMD GPU devices" + depends on DRM_RADEON && AMD_IOMMU_V2 && X86_64 + help + Enable this if you want to use HSA features on AMD GPU devices. diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/Makefile b/kernel/drivers/gpu/drm/amd/amdkfd/Makefile new file mode 100644 index 000000000..0f4960148 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/Makefile @@ -0,0 +1,16 @@ +# +# Makefile for Heterogenous System Architecture support for AMD GPU devices +# + +ccflags-y := -Iinclude/drm -Idrivers/gpu/drm/amd/include/ + +amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ + kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ + kfd_process.o kfd_queue.o kfd_mqd_manager.o \ + kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ + kfd_kernel_queue.o kfd_kernel_queue_cik.o \ + kfd_kernel_queue_vi.o kfd_packet_manager.o \ + kfd_process_queue_manager.o kfd_device_queue_manager.o \ + kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \ + +obj-$(CONFIG_HSA_AMD) += amdkfd.o diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/kernel/drivers/gpu/drm/amd/amdkfd/cik_regs.h new file mode 100644 index 000000000..01ff332fa --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/cik_regs.h @@ -0,0 +1,234 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef CIK_REGS_H +#define CIK_REGS_H + +#define IH_VMID_0_LUT 0x3D40u + +#define BIF_DOORBELL_CNTL 0x530Cu + +#define SRBM_GFX_CNTL 0xE44 +#define PIPEID(x) ((x) << 0) +#define MEID(x) ((x) << 2) +#define VMID(x) ((x) << 4) +#define QUEUEID(x) ((x) << 8) + +#define SQ_CONFIG 0x8C00 + +#define SH_MEM_BASES 0x8C28 +/* if PTR32, these are the bases for scratch and lds */ +#define PRIVATE_BASE(x) ((x) << 0) /* scratch */ +#define SHARED_BASE(x) ((x) << 16) /* LDS */ +#define SH_MEM_APE1_BASE 0x8C2C +/* if PTR32, this is the base location of GPUVM */ +#define SH_MEM_APE1_LIMIT 0x8C30 +/* if PTR32, this is the upper limit of GPUVM */ +#define SH_MEM_CONFIG 0x8C34 +#define PTR32 (1 << 0) +#define PRIVATE_ATC (1 << 1) +#define ALIGNMENT_MODE(x) ((x) << 2) +#define SH_MEM_ALIGNMENT_MODE_DWORD 0 +#define SH_MEM_ALIGNMENT_MODE_DWORD_STRICT 1 +#define SH_MEM_ALIGNMENT_MODE_STRICT 2 +#define SH_MEM_ALIGNMENT_MODE_UNALIGNED 3 +#define DEFAULT_MTYPE(x) ((x) << 4) +#define APE1_MTYPE(x) ((x) << 7) + +/* valid for both DEFAULT_MTYPE and APE1_MTYPE */ +#define MTYPE_CACHED 0 +#define MTYPE_NONCACHED 3 + + +#define SH_STATIC_MEM_CONFIG 0x9604u + +#define TC_CFG_L1_LOAD_POLICY0 0xAC68 +#define TC_CFG_L1_LOAD_POLICY1 0xAC6C +#define TC_CFG_L1_STORE_POLICY 0xAC70 +#define TC_CFG_L2_LOAD_POLICY0 0xAC74 +#define TC_CFG_L2_LOAD_POLICY1 0xAC78 +#define TC_CFG_L2_STORE_POLICY0 0xAC7C +#define TC_CFG_L2_STORE_POLICY1 0xAC80 +#define TC_CFG_L2_ATOMIC_POLICY 0xAC84 +#define TC_CFG_L1_VOLATILE 0xAC88 +#define TC_CFG_L2_VOLATILE 0xAC8C + +#define CP_PQ_WPTR_POLL_CNTL 0xC20C +#define WPTR_POLL_EN (1 << 31) + +#define CPC_INT_CNTL 0xC2D0 +#define CP_ME1_PIPE0_INT_CNTL 0xC214 +#define CP_ME1_PIPE1_INT_CNTL 0xC218 +#define CP_ME1_PIPE2_INT_CNTL 0xC21C +#define CP_ME1_PIPE3_INT_CNTL 0xC220 +#define CP_ME2_PIPE0_INT_CNTL 0xC224 +#define CP_ME2_PIPE1_INT_CNTL 0xC228 +#define CP_ME2_PIPE2_INT_CNTL 0xC22C +#define CP_ME2_PIPE3_INT_CNTL 0xC230 +#define DEQUEUE_REQUEST_INT_ENABLE (1 << 13) +#define WRM_POLL_TIMEOUT_INT_ENABLE (1 << 17) +#define PRIV_REG_INT_ENABLE (1 << 23) +#define TIME_STAMP_INT_ENABLE (1 << 26) +#define GENERIC2_INT_ENABLE (1 << 29) +#define GENERIC1_INT_ENABLE (1 << 30) +#define GENERIC0_INT_ENABLE (1 << 31) +#define CP_ME1_PIPE0_INT_STATUS 0xC214 +#define CP_ME1_PIPE1_INT_STATUS 0xC218 +#define CP_ME1_PIPE2_INT_STATUS 0xC21C +#define CP_ME1_PIPE3_INT_STATUS 0xC220 +#define CP_ME2_PIPE0_INT_STATUS 0xC224 +#define CP_ME2_PIPE1_INT_STATUS 0xC228 +#define CP_ME2_PIPE2_INT_STATUS 0xC22C +#define CP_ME2_PIPE3_INT_STATUS 0xC230 +#define DEQUEUE_REQUEST_INT_STATUS (1 << 13) +#define WRM_POLL_TIMEOUT_INT_STATUS (1 << 17) +#define PRIV_REG_INT_STATUS (1 << 23) +#define TIME_STAMP_INT_STATUS (1 << 26) +#define GENERIC2_INT_STATUS (1 << 29) +#define GENERIC1_INT_STATUS (1 << 30) +#define GENERIC0_INT_STATUS (1 << 31) + +#define CP_HPD_EOP_BASE_ADDR 0xC904 +#define CP_HPD_EOP_BASE_ADDR_HI 0xC908 +#define CP_HPD_EOP_VMID 0xC90C +#define CP_HPD_EOP_CONTROL 0xC910 +#define EOP_SIZE(x) ((x) << 0) +#define EOP_SIZE_MASK (0x3f << 0) +#define CP_MQD_BASE_ADDR 0xC914 +#define CP_MQD_BASE_ADDR_HI 0xC918 +#define CP_HQD_ACTIVE 0xC91C +#define CP_HQD_VMID 0xC920 + +#define CP_HQD_PERSISTENT_STATE 0xC924u +#define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) +#define PRELOAD_REQ (1 << 0) + +#define CP_HQD_PIPE_PRIORITY 0xC928u +#define CP_HQD_QUEUE_PRIORITY 0xC92Cu +#define CP_HQD_QUANTUM 0xC930u +#define QUANTUM_EN 1U +#define QUANTUM_SCALE_1MS (1U << 4) +#define QUANTUM_DURATION(x) ((x) << 8) + +#define CP_HQD_PQ_BASE 0xC934 +#define CP_HQD_PQ_BASE_HI 0xC938 +#define CP_HQD_PQ_RPTR 0xC93C +#define CP_HQD_PQ_RPTR_REPORT_ADDR 0xC940 +#define CP_HQD_PQ_RPTR_REPORT_ADDR_HI 0xC944 +#define CP_HQD_PQ_WPTR_POLL_ADDR 0xC948 +#define CP_HQD_PQ_WPTR_POLL_ADDR_HI 0xC94C +#define CP_HQD_PQ_DOORBELL_CONTROL 0xC950 +#define DOORBELL_OFFSET(x) ((x) << 2) +#define DOORBELL_OFFSET_MASK (0x1fffff << 2) +#define DOORBELL_SOURCE (1 << 28) +#define DOORBELL_SCHD_HIT (1 << 29) +#define DOORBELL_EN (1 << 30) +#define DOORBELL_HIT (1 << 31) +#define CP_HQD_PQ_WPTR 0xC954 +#define CP_HQD_PQ_CONTROL 0xC958 +#define QUEUE_SIZE(x) ((x) << 0) +#define QUEUE_SIZE_MASK (0x3f << 0) +#define RPTR_BLOCK_SIZE(x) ((x) << 8) +#define RPTR_BLOCK_SIZE_MASK (0x3f << 8) +#define MIN_AVAIL_SIZE(x) ((x) << 20) +#define PQ_ATC_EN (1 << 23) +#define PQ_VOLATILE (1 << 26) +#define NO_UPDATE_RPTR (1 << 27) +#define UNORD_DISPATCH (1 << 28) +#define ROQ_PQ_IB_FLIP (1 << 29) +#define PRIV_STATE (1 << 30) +#define KMD_QUEUE (1 << 31) + +#define DEFAULT_RPTR_BLOCK_SIZE RPTR_BLOCK_SIZE(5) +#define DEFAULT_MIN_AVAIL_SIZE MIN_AVAIL_SIZE(3) + +#define CP_HQD_IB_BASE_ADDR 0xC95Cu +#define CP_HQD_IB_BASE_ADDR_HI 0xC960u +#define CP_HQD_IB_RPTR 0xC964u +#define CP_HQD_IB_CONTROL 0xC968u +#define IB_ATC_EN (1U << 23) +#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20) + +#define AQL_ENABLE 1 + +#define CP_HQD_DEQUEUE_REQUEST 0xC974 +#define DEQUEUE_REQUEST_DRAIN 1 +#define DEQUEUE_REQUEST_RESET 2 +#define DEQUEUE_INT (1U << 8) + +#define CP_HQD_SEMA_CMD 0xC97Cu +#define CP_HQD_MSG_TYPE 0xC980u +#define CP_HQD_ATOMIC0_PREOP_LO 0xC984u +#define CP_HQD_ATOMIC0_PREOP_HI 0xC988u +#define CP_HQD_ATOMIC1_PREOP_LO 0xC98Cu +#define CP_HQD_ATOMIC1_PREOP_HI 0xC990u +#define CP_HQD_HQ_SCHEDULER0 0xC994u +#define CP_HQD_HQ_SCHEDULER1 0xC998u + + +#define CP_MQD_CONTROL 0xC99C +#define MQD_VMID(x) ((x) << 0) +#define MQD_VMID_MASK (0xf << 0) +#define MQD_CONTROL_PRIV_STATE_EN (1U << 8) + +#define SDMA_RB_VMID(x) (x << 24) +#define SDMA_RB_ENABLE (1 << 0) +#define SDMA_RB_SIZE(x) ((x) << 1) /* log2 */ +#define SDMA_RPTR_WRITEBACK_ENABLE (1 << 12) +#define SDMA_RPTR_WRITEBACK_TIMER(x) ((x) << 16) /* log2 */ +#define SDMA_OFFSET(x) (x << 0) +#define SDMA_DB_ENABLE (1 << 28) +#define SDMA_ATC (1 << 0) +#define SDMA_VA_PTR32 (1 << 4) +#define SDMA_VA_SHARED_BASE(x) (x << 8) + +#define GRBM_GFX_INDEX 0x30800 +#define INSTANCE_INDEX(x) ((x) << 0) +#define SH_INDEX(x) ((x) << 8) +#define SE_INDEX(x) ((x) << 16) +#define SH_BROADCAST_WRITES (1 << 29) +#define INSTANCE_BROADCAST_WRITES (1 << 30) +#define SE_BROADCAST_WRITES (1 << 31) + +#define SQC_CACHES 0x30d20 +#define SQC_POLICY 0x8C38u +#define SQC_VOLATILE 0x8C3Cu + +#define CP_PERFMON_CNTL 0x36020 + +#define ATC_VMID0_PASID_MAPPING 0x339Cu +#define ATC_VMID_PASID_MAPPING_UPDATE_STATUS 0x3398u +#define ATC_VMID_PASID_MAPPING_VALID (1U << 31) + +#define ATC_VM_APERTURE0_CNTL 0x3310u +#define ATS_ACCESS_MODE_NEVER 0 +#define ATS_ACCESS_MODE_ALWAYS 1 + +#define ATC_VM_APERTURE0_CNTL2 0x3318u +#define ATC_VM_APERTURE0_HIGH_ADDR 0x3308u +#define ATC_VM_APERTURE0_LOW_ADDR 0x3300u +#define ATC_VM_APERTURE1_CNTL 0x3314u +#define ATC_VM_APERTURE1_CNTL2 0x331Cu +#define ATC_VM_APERTURE1_HIGH_ADDR 0x330Cu +#define ATC_VM_APERTURE1_LOW_ADDR 0x3304u + +#endif diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c new file mode 100644 index 000000000..19a4fba46 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -0,0 +1,643 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" + +static long kfd_ioctl(struct file *, unsigned int, unsigned long); +static int kfd_open(struct inode *, struct file *); +static int kfd_mmap(struct file *, struct vm_area_struct *); + +static const char kfd_dev_name[] = "kfd"; + +static const struct file_operations kfd_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = kfd_ioctl, + .compat_ioctl = kfd_ioctl, + .open = kfd_open, + .mmap = kfd_mmap, +}; + +static int kfd_char_dev_major = -1; +static struct class *kfd_class; +struct device *kfd_device; + +int kfd_chardev_init(void) +{ + int err = 0; + + kfd_char_dev_major = register_chrdev(0, kfd_dev_name, &kfd_fops); + err = kfd_char_dev_major; + if (err < 0) + goto err_register_chrdev; + + kfd_class = class_create(THIS_MODULE, kfd_dev_name); + err = PTR_ERR(kfd_class); + if (IS_ERR(kfd_class)) + goto err_class_create; + + kfd_device = device_create(kfd_class, NULL, + MKDEV(kfd_char_dev_major, 0), + NULL, kfd_dev_name); + err = PTR_ERR(kfd_device); + if (IS_ERR(kfd_device)) + goto err_device_create; + + return 0; + +err_device_create: + class_destroy(kfd_class); +err_class_create: + unregister_chrdev(kfd_char_dev_major, kfd_dev_name); +err_register_chrdev: + return err; +} + +void kfd_chardev_exit(void) +{ + device_destroy(kfd_class, MKDEV(kfd_char_dev_major, 0)); + class_destroy(kfd_class); + unregister_chrdev(kfd_char_dev_major, kfd_dev_name); +} + +struct device *kfd_chardev(void) +{ + return kfd_device; +} + + +static int kfd_open(struct inode *inode, struct file *filep) +{ + struct kfd_process *process; + bool is_32bit_user_mode; + + if (iminor(inode) != 0) + return -ENODEV; + + is_32bit_user_mode = is_compat_task(); + + if (is_32bit_user_mode == true) { + dev_warn(kfd_device, + "Process %d (32-bit) failed to open /dev/kfd\n" + "32-bit processes are not supported by amdkfd\n", + current->pid); + return -EPERM; + } + + process = kfd_create_process(current); + if (IS_ERR(process)) + return PTR_ERR(process); + + dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n", + process->pasid, process->is_32bit_user_mode); + + return 0; +} + +static int kfd_ioctl_get_version(struct file *filep, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_get_version_args *args = data; + int err = 0; + + args->major_version = KFD_IOCTL_MAJOR_VERSION; + args->minor_version = KFD_IOCTL_MINOR_VERSION; + + return err; +} + +static int set_queue_properties_from_user(struct queue_properties *q_properties, + struct kfd_ioctl_create_queue_args *args) +{ + if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) { + pr_err("kfd: queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); + return -EINVAL; + } + + if (args->queue_priority > KFD_MAX_QUEUE_PRIORITY) { + pr_err("kfd: queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); + return -EINVAL; + } + + if ((args->ring_base_address) && + (!access_ok(VERIFY_WRITE, + (const void __user *) args->ring_base_address, + sizeof(uint64_t)))) { + pr_err("kfd: can't access ring base address\n"); + return -EFAULT; + } + + if (!is_power_of_2(args->ring_size) && (args->ring_size != 0)) { + pr_err("kfd: ring size must be a power of 2 or 0\n"); + return -EINVAL; + } + + if (!access_ok(VERIFY_WRITE, + (const void __user *) args->read_pointer_address, + sizeof(uint32_t))) { + pr_err("kfd: can't access read pointer\n"); + return -EFAULT; + } + + if (!access_ok(VERIFY_WRITE, + (const void __user *) args->write_pointer_address, + sizeof(uint32_t))) { + pr_err("kfd: can't access write pointer\n"); + return -EFAULT; + } + + if (args->eop_buffer_address && + !access_ok(VERIFY_WRITE, + (const void __user *) args->eop_buffer_address, + sizeof(uint32_t))) { + pr_debug("kfd: can't access eop buffer"); + return -EFAULT; + } + + if (args->ctx_save_restore_address && + !access_ok(VERIFY_WRITE, + (const void __user *) args->ctx_save_restore_address, + sizeof(uint32_t))) { + pr_debug("kfd: can't access ctx save restore buffer"); + return -EFAULT; + } + + q_properties->is_interop = false; + q_properties->queue_percent = args->queue_percentage; + q_properties->priority = args->queue_priority; + q_properties->queue_address = args->ring_base_address; + q_properties->queue_size = args->ring_size; + q_properties->read_ptr = (uint32_t *) args->read_pointer_address; + q_properties->write_ptr = (uint32_t *) args->write_pointer_address; + q_properties->eop_ring_buffer_address = args->eop_buffer_address; + q_properties->eop_ring_buffer_size = args->eop_buffer_size; + q_properties->ctx_save_restore_area_address = + args->ctx_save_restore_address; + q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size; + if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE || + args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) + q_properties->type = KFD_QUEUE_TYPE_COMPUTE; + else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA) + q_properties->type = KFD_QUEUE_TYPE_SDMA; + else + return -ENOTSUPP; + + if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) + q_properties->format = KFD_QUEUE_FORMAT_AQL; + else + q_properties->format = KFD_QUEUE_FORMAT_PM4; + + pr_debug("Queue Percentage (%d, %d)\n", + q_properties->queue_percent, args->queue_percentage); + + pr_debug("Queue Priority (%d, %d)\n", + q_properties->priority, args->queue_priority); + + pr_debug("Queue Address (0x%llX, 0x%llX)\n", + q_properties->queue_address, args->ring_base_address); + + pr_debug("Queue Size (0x%llX, %u)\n", + q_properties->queue_size, args->ring_size); + + pr_debug("Queue r/w Pointers (0x%llX, 0x%llX)\n", + (uint64_t) q_properties->read_ptr, + (uint64_t) q_properties->write_ptr); + + pr_debug("Queue Format (%d)\n", q_properties->format); + + pr_debug("Queue EOP (0x%llX)\n", q_properties->eop_ring_buffer_address); + + pr_debug("Queue CTX save arex (0x%llX)\n", + q_properties->ctx_save_restore_area_address); + + return 0; +} + +static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + void *data) +{ + struct kfd_ioctl_create_queue_args *args = data; + struct kfd_dev *dev; + int err = 0; + unsigned int queue_id; + struct kfd_process_device *pdd; + struct queue_properties q_properties; + + memset(&q_properties, 0, sizeof(struct queue_properties)); + + pr_debug("kfd: creating queue ioctl\n"); + + err = set_queue_properties_from_user(&q_properties, args); + if (err) + return err; + + pr_debug("kfd: looking for gpu id 0x%x\n", args->gpu_id); + dev = kfd_device_by_id(args->gpu_id); + if (dev == NULL) { + pr_debug("kfd: gpu id 0x%x was not found\n", args->gpu_id); + return -EINVAL; + } + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = -ESRCH; + goto err_bind_process; + } + + pr_debug("kfd: creating queue for PASID %d on GPU 0x%x\n", + p->pasid, + dev->id); + + err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, + 0, q_properties.type, &queue_id); + if (err != 0) + goto err_create_queue; + + args->queue_id = queue_id; + + /* Return gpu_id as doorbell offset for mmap usage */ + args->doorbell_offset = args->gpu_id << PAGE_SHIFT; + + mutex_unlock(&p->mutex); + + pr_debug("kfd: queue id %d was created successfully\n", args->queue_id); + + pr_debug("ring buffer address == 0x%016llX\n", + args->ring_base_address); + + pr_debug("read ptr address == 0x%016llX\n", + args->read_pointer_address); + + pr_debug("write ptr address == 0x%016llX\n", + args->write_pointer_address); + + return 0; + +err_create_queue: +err_bind_process: + mutex_unlock(&p->mutex); + return err; +} + +static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p, + void *data) +{ + int retval; + struct kfd_ioctl_destroy_queue_args *args = data; + + pr_debug("kfd: destroying queue id %d for PASID %d\n", + args->queue_id, + p->pasid); + + mutex_lock(&p->mutex); + + retval = pqm_destroy_queue(&p->pqm, args->queue_id); + + mutex_unlock(&p->mutex); + return retval; +} + +static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, + void *data) +{ + int retval; + struct kfd_ioctl_update_queue_args *args = data; + struct queue_properties properties; + + if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) { + pr_err("kfd: queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n"); + return -EINVAL; + } + + if (args->queue_priority > KFD_MAX_QUEUE_PRIORITY) { + pr_err("kfd: queue priority must be between 0 to KFD_MAX_QUEUE_PRIORITY\n"); + return -EINVAL; + } + + if ((args->ring_base_address) && + (!access_ok(VERIFY_WRITE, + (const void __user *) args->ring_base_address, + sizeof(uint64_t)))) { + pr_err("kfd: can't access ring base address\n"); + return -EFAULT; + } + + if (!is_power_of_2(args->ring_size) && (args->ring_size != 0)) { + pr_err("kfd: ring size must be a power of 2 or 0\n"); + return -EINVAL; + } + + properties.queue_address = args->ring_base_address; + properties.queue_size = args->ring_size; + properties.queue_percent = args->queue_percentage; + properties.priority = args->queue_priority; + + pr_debug("kfd: updating queue id %d for PASID %d\n", + args->queue_id, p->pasid); + + mutex_lock(&p->mutex); + + retval = pqm_update_queue(&p->pqm, args->queue_id, &properties); + + mutex_unlock(&p->mutex); + + return retval; +} + +static int kfd_ioctl_set_memory_policy(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_set_memory_policy_args *args = data; + struct kfd_dev *dev; + int err = 0; + struct kfd_process_device *pdd; + enum cache_policy default_policy, alternate_policy; + + if (args->default_policy != KFD_IOC_CACHE_POLICY_COHERENT + && args->default_policy != KFD_IOC_CACHE_POLICY_NONCOHERENT) { + return -EINVAL; + } + + if (args->alternate_policy != KFD_IOC_CACHE_POLICY_COHERENT + && args->alternate_policy != KFD_IOC_CACHE_POLICY_NONCOHERENT) { + return -EINVAL; + } + + dev = kfd_device_by_id(args->gpu_id); + if (dev == NULL) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = -ESRCH; + goto out; + } + + default_policy = (args->default_policy == KFD_IOC_CACHE_POLICY_COHERENT) + ? cache_policy_coherent : cache_policy_noncoherent; + + alternate_policy = + (args->alternate_policy == KFD_IOC_CACHE_POLICY_COHERENT) + ? cache_policy_coherent : cache_policy_noncoherent; + + if (!dev->dqm->ops.set_cache_memory_policy(dev->dqm, + &pdd->qpd, + default_policy, + alternate_policy, + (void __user *)args->alternate_aperture_base, + args->alternate_aperture_size)) + err = -EINVAL; + +out: + mutex_unlock(&p->mutex); + + return err; +} + +static int kfd_ioctl_get_clock_counters(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_get_clock_counters_args *args = data; + struct kfd_dev *dev; + struct timespec64 time; + + dev = kfd_device_by_id(args->gpu_id); + if (dev == NULL) + return -EINVAL; + + /* Reading GPU clock counter from KGD */ + args->gpu_clock_counter = + dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); + + /* No access to rdtsc. Using raw monotonic time */ + getrawmonotonic64(&time); + args->cpu_clock_counter = (uint64_t)timespec64_to_ns(&time); + + get_monotonic_boottime64(&time); + args->system_clock_counter = (uint64_t)timespec64_to_ns(&time); + + /* Since the counter is in nano-seconds we use 1GHz frequency */ + args->system_clock_freq = 1000000000; + + return 0; +} + + +static int kfd_ioctl_get_process_apertures(struct file *filp, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_get_process_apertures_args *args = data; + struct kfd_process_device_apertures *pAperture; + struct kfd_process_device *pdd; + + dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); + + args->num_of_nodes = 0; + + mutex_lock(&p->mutex); + + /*if the process-device list isn't empty*/ + if (kfd_has_process_device_data(p)) { + /* Run over all pdd of the process */ + pdd = kfd_get_first_process_device_data(p); + do { + pAperture = + &args->process_apertures[args->num_of_nodes]; + pAperture->gpu_id = pdd->dev->id; + pAperture->lds_base = pdd->lds_base; + pAperture->lds_limit = pdd->lds_limit; + pAperture->gpuvm_base = pdd->gpuvm_base; + pAperture->gpuvm_limit = pdd->gpuvm_limit; + pAperture->scratch_base = pdd->scratch_base; + pAperture->scratch_limit = pdd->scratch_limit; + + dev_dbg(kfd_device, + "node id %u\n", args->num_of_nodes); + dev_dbg(kfd_device, + "gpu id %u\n", pdd->dev->id); + dev_dbg(kfd_device, + "lds_base %llX\n", pdd->lds_base); + dev_dbg(kfd_device, + "lds_limit %llX\n", pdd->lds_limit); + dev_dbg(kfd_device, + "gpuvm_base %llX\n", pdd->gpuvm_base); + dev_dbg(kfd_device, + "gpuvm_limit %llX\n", pdd->gpuvm_limit); + dev_dbg(kfd_device, + "scratch_base %llX\n", pdd->scratch_base); + dev_dbg(kfd_device, + "scratch_limit %llX\n", pdd->scratch_limit); + + args->num_of_nodes++; + } while ((pdd = kfd_get_next_process_device_data(p, pdd)) != NULL && + (args->num_of_nodes < NUM_OF_SUPPORTED_GPUS)); + } + + mutex_unlock(&p->mutex); + + return 0; +} + +#define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ + [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, .cmd_drv = 0, .name = #ioctl} + +/** Ioctl table */ +static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_VERSION, + kfd_ioctl_get_version, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_CREATE_QUEUE, + kfd_ioctl_create_queue, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DESTROY_QUEUE, + kfd_ioctl_destroy_queue, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_MEMORY_POLICY, + kfd_ioctl_set_memory_policy, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_CLOCK_COUNTERS, + kfd_ioctl_get_clock_counters, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES, + kfd_ioctl_get_process_apertures, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_UPDATE_QUEUE, + kfd_ioctl_update_queue, 0), +}; + +#define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) + +static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct kfd_process *process; + amdkfd_ioctl_t *func; + const struct amdkfd_ioctl_desc *ioctl = NULL; + unsigned int nr = _IOC_NR(cmd); + char stack_kdata[128]; + char *kdata = NULL; + unsigned int usize, asize; + int retcode = -EINVAL; + + if (nr >= AMDKFD_CORE_IOCTL_COUNT) + goto err_i1; + + if ((nr >= AMDKFD_COMMAND_START) && (nr < AMDKFD_COMMAND_END)) { + u32 amdkfd_size; + + ioctl = &amdkfd_ioctls[nr]; + + amdkfd_size = _IOC_SIZE(ioctl->cmd); + usize = asize = _IOC_SIZE(cmd); + if (amdkfd_size > asize) + asize = amdkfd_size; + + cmd = ioctl->cmd; + } else + goto err_i1; + + dev_dbg(kfd_device, "ioctl cmd 0x%x (#%d), arg 0x%lx\n", cmd, nr, arg); + + process = kfd_get_process(current); + if (IS_ERR(process)) { + dev_dbg(kfd_device, "no process\n"); + goto err_i1; + } + + /* Do not trust userspace, use our own definition */ + func = ioctl->func; + + if (unlikely(!func)) { + dev_dbg(kfd_device, "no function\n"); + retcode = -EINVAL; + goto err_i1; + } + + if (cmd & (IOC_IN | IOC_OUT)) { + if (asize <= sizeof(stack_kdata)) { + kdata = stack_kdata; + } else { + kdata = kmalloc(asize, GFP_KERNEL); + if (!kdata) { + retcode = -ENOMEM; + goto err_i1; + } + } + if (asize > usize) + memset(kdata + usize, 0, asize - usize); + } + + if (cmd & IOC_IN) { + if (copy_from_user(kdata, (void __user *)arg, usize) != 0) { + retcode = -EFAULT; + goto err_i1; + } + } else if (cmd & IOC_OUT) { + memset(kdata, 0, usize); + } + + retcode = func(filep, process, kdata); + + if (cmd & IOC_OUT) + if (copy_to_user((void __user *)arg, kdata, usize) != 0) + retcode = -EFAULT; + +err_i1: + if (!ioctl) + dev_dbg(kfd_device, "invalid ioctl: pid=%d, cmd=0x%02x, nr=0x%02x\n", + task_pid_nr(current), cmd, nr); + + if (kdata != stack_kdata) + kfree(kdata); + + if (retcode) + dev_dbg(kfd_device, "ret = %d\n", retcode); + + return retcode; +} + +static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct kfd_process *process; + + process = kfd_get_process(current); + if (IS_ERR(process)) + return PTR_ERR(process); + + return kfd_doorbell_mmap(process, vma); +} diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_crat.h new file mode 100644 index 000000000..a374fa3d3 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_crat.h @@ -0,0 +1,294 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef KFD_CRAT_H_INCLUDED +#define KFD_CRAT_H_INCLUDED + +#include + +#pragma pack(1) + +/* + * 4CC signature values for the CRAT and CDIT ACPI tables + */ + +#define CRAT_SIGNATURE "CRAT" +#define CDIT_SIGNATURE "CDIT" + +/* + * Component Resource Association Table (CRAT) + */ + +#define CRAT_OEMID_LENGTH 6 +#define CRAT_OEMTABLEID_LENGTH 8 +#define CRAT_RESERVED_LENGTH 6 + +#define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) + +struct crat_header { + uint32_t signature; + uint32_t length; + uint8_t revision; + uint8_t checksum; + uint8_t oem_id[CRAT_OEMID_LENGTH]; + uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; + uint32_t oem_revision; + uint32_t creator_id; + uint32_t creator_revision; + uint32_t total_entries; + uint16_t num_domains; + uint8_t reserved[CRAT_RESERVED_LENGTH]; +}; + +/* + * The header structure is immediately followed by total_entries of the + * data definitions + */ + +/* + * The currently defined subtype entries in the CRAT + */ +#define CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY 0 +#define CRAT_SUBTYPE_MEMORY_AFFINITY 1 +#define CRAT_SUBTYPE_CACHE_AFFINITY 2 +#define CRAT_SUBTYPE_TLB_AFFINITY 3 +#define CRAT_SUBTYPE_CCOMPUTE_AFFINITY 4 +#define CRAT_SUBTYPE_IOLINK_AFFINITY 5 +#define CRAT_SUBTYPE_MAX 6 + +#define CRAT_SIBLINGMAP_SIZE 32 + +/* + * ComputeUnit Affinity structure and definitions + */ +#define CRAT_CU_FLAGS_ENABLED 0x00000001 +#define CRAT_CU_FLAGS_HOT_PLUGGABLE 0x00000002 +#define CRAT_CU_FLAGS_CPU_PRESENT 0x00000004 +#define CRAT_CU_FLAGS_GPU_PRESENT 0x00000008 +#define CRAT_CU_FLAGS_IOMMU_PRESENT 0x00000010 +#define CRAT_CU_FLAGS_RESERVED 0xffffffe0 + +#define CRAT_COMPUTEUNIT_RESERVED_LENGTH 4 + +struct crat_subtype_computeunit { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t proximity_domain; + uint32_t processor_id_low; + uint16_t num_cpu_cores; + uint16_t num_simd_cores; + uint16_t max_waves_simd; + uint16_t io_count; + uint16_t hsa_capability; + uint16_t lds_size_in_kb; + uint8_t wave_front_size; + uint8_t num_banks; + uint16_t micro_engine_id; + uint8_t num_arrays; + uint8_t num_cu_per_array; + uint8_t num_simd_per_cu; + uint8_t max_slots_scatch_cu; + uint8_t reserved2[CRAT_COMPUTEUNIT_RESERVED_LENGTH]; +}; + +/* + * HSA Memory Affinity structure and definitions + */ +#define CRAT_MEM_FLAGS_ENABLED 0x00000001 +#define CRAT_MEM_FLAGS_HOT_PLUGGABLE 0x00000002 +#define CRAT_MEM_FLAGS_NON_VOLATILE 0x00000004 +#define CRAT_MEM_FLAGS_RESERVED 0xfffffff8 + +#define CRAT_MEMORY_RESERVED_LENGTH 8 + +struct crat_subtype_memory { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t promixity_domain; + uint32_t base_addr_low; + uint32_t base_addr_high; + uint32_t length_low; + uint32_t length_high; + uint32_t width; + uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH]; +}; + +/* + * HSA Cache Affinity structure and definitions + */ +#define CRAT_CACHE_FLAGS_ENABLED 0x00000001 +#define CRAT_CACHE_FLAGS_DATA_CACHE 0x00000002 +#define CRAT_CACHE_FLAGS_INST_CACHE 0x00000004 +#define CRAT_CACHE_FLAGS_CPU_CACHE 0x00000008 +#define CRAT_CACHE_FLAGS_SIMD_CACHE 0x00000010 +#define CRAT_CACHE_FLAGS_RESERVED 0xffffffe0 + +#define CRAT_CACHE_RESERVED_LENGTH 8 + +struct crat_subtype_cache { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t processor_id_low; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + uint32_t cache_size; + uint8_t cache_level; + uint8_t lines_per_tag; + uint16_t cache_line_size; + uint8_t associativity; + uint8_t cache_properties; + uint16_t cache_latency; + uint8_t reserved2[CRAT_CACHE_RESERVED_LENGTH]; +}; + +/* + * HSA TLB Affinity structure and definitions + */ +#define CRAT_TLB_FLAGS_ENABLED 0x00000001 +#define CRAT_TLB_FLAGS_DATA_TLB 0x00000002 +#define CRAT_TLB_FLAGS_INST_TLB 0x00000004 +#define CRAT_TLB_FLAGS_CPU_TLB 0x00000008 +#define CRAT_TLB_FLAGS_SIMD_TLB 0x00000010 +#define CRAT_TLB_FLAGS_RESERVED 0xffffffe0 + +#define CRAT_TLB_RESERVED_LENGTH 4 + +struct crat_subtype_tlb { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t processor_id_low; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + uint32_t tlb_level; + uint8_t data_tlb_associativity_2mb; + uint8_t data_tlb_size_2mb; + uint8_t instruction_tlb_associativity_2mb; + uint8_t instruction_tlb_size_2mb; + uint8_t data_tlb_associativity_4k; + uint8_t data_tlb_size_4k; + uint8_t instruction_tlb_associativity_4k; + uint8_t instruction_tlb_size_4k; + uint8_t data_tlb_associativity_1gb; + uint8_t data_tlb_size_1gb; + uint8_t instruction_tlb_associativity_1gb; + uint8_t instruction_tlb_size_1gb; + uint8_t reserved2[CRAT_TLB_RESERVED_LENGTH]; +}; + +/* + * HSA CCompute/APU Affinity structure and definitions + */ +#define CRAT_CCOMPUTE_FLAGS_ENABLED 0x00000001 +#define CRAT_CCOMPUTE_FLAGS_RESERVED 0xfffffffe + +#define CRAT_CCOMPUTE_RESERVED_LENGTH 16 + +struct crat_subtype_ccompute { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t processor_id_low; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + uint32_t apu_size; + uint8_t reserved2[CRAT_CCOMPUTE_RESERVED_LENGTH]; +}; + +/* + * HSA IO Link Affinity structure and definitions + */ +#define CRAT_IOLINK_FLAGS_ENABLED 0x00000001 +#define CRAT_IOLINK_FLAGS_COHERENCY 0x00000002 +#define CRAT_IOLINK_FLAGS_RESERVED 0xfffffffc + +/* + * IO interface types + */ +#define CRAT_IOLINK_TYPE_UNDEFINED 0 +#define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1 +#define CRAT_IOLINK_TYPE_PCIEXPRESS 2 +#define CRAT_IOLINK_TYPE_OTHER 3 +#define CRAT_IOLINK_TYPE_MAX 255 + +#define CRAT_IOLINK_RESERVED_LENGTH 24 + +struct crat_subtype_iolink { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; + uint32_t proximity_domain_from; + uint32_t proximity_domain_to; + uint8_t io_interface_type; + uint8_t version_major; + uint16_t version_minor; + uint32_t minimum_latency; + uint32_t maximum_latency; + uint32_t minimum_bandwidth_mbs; + uint32_t maximum_bandwidth_mbs; + uint32_t recommended_transfer_size; + uint8_t reserved2[CRAT_IOLINK_RESERVED_LENGTH]; +}; + +/* + * HSA generic sub-type header + */ + +#define CRAT_SUBTYPE_FLAGS_ENABLED 0x00000001 + +struct crat_subtype_generic { + uint8_t type; + uint8_t length; + uint16_t reserved; + uint32_t flags; +}; + +/* + * Component Locality Distance Information Table (CDIT) + */ +#define CDIT_OEMID_LENGTH 6 +#define CDIT_OEMTABLEID_LENGTH 8 + +struct cdit_header { + uint32_t signature; + uint32_t length; + uint8_t revision; + uint8_t checksum; + uint8_t oem_id[CDIT_OEMID_LENGTH]; + uint8_t oem_table_id[CDIT_OEMTABLEID_LENGTH]; + uint32_t oem_revision; + uint32_t creator_id; + uint32_t creator_revision; + uint32_t total_entries; + uint16_t num_domains; + uint8_t entry[1]; +}; + +#pragma pack() + +#endif /* KFD_CRAT_H_INCLUDED */ diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_device.c new file mode 100644 index 000000000..ca7f2d3af --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -0,0 +1,522 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" +#include "kfd_pm4_headers.h" + +#define MQD_SIZE_ALIGNED 768 + +static const struct kfd_device_info kaveri_device_info = { + .asic_family = CHIP_KAVERI, + .max_pasid_bits = 16, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .mqd_size_aligned = MQD_SIZE_ALIGNED +}; + +static const struct kfd_device_info carrizo_device_info = { + .asic_family = CHIP_CARRIZO, + .max_pasid_bits = 16, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED +}; + +struct kfd_deviceid { + unsigned short did; + const struct kfd_device_info *device_info; +}; + +/* Please keep this sorted by increasing device id. */ +static const struct kfd_deviceid supported_devices[] = { + { 0x1304, &kaveri_device_info }, /* Kaveri */ + { 0x1305, &kaveri_device_info }, /* Kaveri */ + { 0x1306, &kaveri_device_info }, /* Kaveri */ + { 0x1307, &kaveri_device_info }, /* Kaveri */ + { 0x1309, &kaveri_device_info }, /* Kaveri */ + { 0x130A, &kaveri_device_info }, /* Kaveri */ + { 0x130B, &kaveri_device_info }, /* Kaveri */ + { 0x130C, &kaveri_device_info }, /* Kaveri */ + { 0x130D, &kaveri_device_info }, /* Kaveri */ + { 0x130E, &kaveri_device_info }, /* Kaveri */ + { 0x130F, &kaveri_device_info }, /* Kaveri */ + { 0x1310, &kaveri_device_info }, /* Kaveri */ + { 0x1311, &kaveri_device_info }, /* Kaveri */ + { 0x1312, &kaveri_device_info }, /* Kaveri */ + { 0x1313, &kaveri_device_info }, /* Kaveri */ + { 0x1315, &kaveri_device_info }, /* Kaveri */ + { 0x1316, &kaveri_device_info }, /* Kaveri */ + { 0x1317, &kaveri_device_info }, /* Kaveri */ + { 0x1318, &kaveri_device_info }, /* Kaveri */ + { 0x131B, &kaveri_device_info }, /* Kaveri */ + { 0x131C, &kaveri_device_info }, /* Kaveri */ + { 0x131D, &kaveri_device_info } /* Kaveri */ +}; + +static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, + unsigned int chunk_size); +static void kfd_gtt_sa_fini(struct kfd_dev *kfd); + +static const struct kfd_device_info *lookup_device_info(unsigned short did) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(supported_devices); i++) { + if (supported_devices[i].did == did) { + BUG_ON(supported_devices[i].device_info == NULL); + return supported_devices[i].device_info; + } + } + + return NULL; +} + +struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, + struct pci_dev *pdev, const struct kfd2kgd_calls *f2g) +{ + struct kfd_dev *kfd; + + const struct kfd_device_info *device_info = + lookup_device_info(pdev->device); + + if (!device_info) + return NULL; + + kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); + if (!kfd) + return NULL; + + kfd->kgd = kgd; + kfd->device_info = device_info; + kfd->pdev = pdev; + kfd->init_complete = false; + kfd->kfd2kgd = f2g; + + mutex_init(&kfd->doorbell_mutex); + memset(&kfd->doorbell_available_index, 0, + sizeof(kfd->doorbell_available_index)); + + return kfd; +} + +static bool device_iommu_pasid_init(struct kfd_dev *kfd) +{ + const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | + AMD_IOMMU_DEVICE_FLAG_PRI_SUP | + AMD_IOMMU_DEVICE_FLAG_PASID_SUP; + + struct amd_iommu_device_info iommu_info; + unsigned int pasid_limit; + int err; + + err = amd_iommu_device_info(kfd->pdev, &iommu_info); + if (err < 0) { + dev_err(kfd_device, + "error getting iommu info. is the iommu enabled?\n"); + return false; + } + + if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) { + dev_err(kfd_device, "error required iommu flags ats(%i), pri(%i), pasid(%i)\n", + (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP) != 0, + (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) != 0, + (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) != 0); + return false; + } + + pasid_limit = min_t(unsigned int, + (unsigned int)1 << kfd->device_info->max_pasid_bits, + iommu_info.max_pasids); + /* + * last pasid is used for kernel queues doorbells + * in the future the last pasid might be used for a kernel thread. + */ + pasid_limit = min_t(unsigned int, + pasid_limit, + kfd->doorbell_process_limit - 1); + + err = amd_iommu_init_device(kfd->pdev, pasid_limit); + if (err < 0) { + dev_err(kfd_device, "error initializing iommu device\n"); + return false; + } + + if (!kfd_set_pasid_limit(pasid_limit)) { + dev_err(kfd_device, "error setting pasid limit\n"); + amd_iommu_free_device(kfd->pdev); + return false; + } + + return true; +} + +static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) +{ + struct kfd_dev *dev = kfd_device_by_pci_dev(pdev); + + if (dev) + kfd_unbind_process_from_device(dev, pasid); +} + +bool kgd2kfd_device_init(struct kfd_dev *kfd, + const struct kgd2kfd_shared_resources *gpu_resources) +{ + unsigned int size; + + kfd->shared_resources = *gpu_resources; + + /* calculate max size of mqds needed for queues */ + size = max_num_of_queues_per_device * + kfd->device_info->mqd_size_aligned; + + /* + * calculate max size of runlist packet. + * There can be only 2 packets at once + */ + size += (KFD_MAX_NUM_OF_PROCESSES * sizeof(struct pm4_map_process) + + max_num_of_queues_per_device * + sizeof(struct pm4_map_queues) + sizeof(struct pm4_runlist)) * 2; + + /* Add size of HIQ & DIQ */ + size += KFD_KERNEL_QUEUE_SIZE * 2; + + /* add another 512KB for all other allocations on gart (HPD, fences) */ + size += 512 * 1024; + + if (kfd->kfd2kgd->init_gtt_mem_allocation( + kfd->kgd, size, &kfd->gtt_mem, + &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr)){ + dev_err(kfd_device, + "Could not allocate %d bytes for device (%x:%x)\n", + size, kfd->pdev->vendor, kfd->pdev->device); + goto out; + } + + dev_info(kfd_device, + "Allocated %d bytes on gart for device(%x:%x)\n", + size, kfd->pdev->vendor, kfd->pdev->device); + + /* Initialize GTT sa with 512 byte chunk size */ + if (kfd_gtt_sa_init(kfd, size, 512) != 0) { + dev_err(kfd_device, + "Error initializing gtt sub-allocator\n"); + goto kfd_gtt_sa_init_error; + } + + kfd_doorbell_init(kfd); + + if (kfd_topology_add_device(kfd) != 0) { + dev_err(kfd_device, + "Error adding device (%x:%x) to topology\n", + kfd->pdev->vendor, kfd->pdev->device); + goto kfd_topology_add_device_error; + } + + if (!device_iommu_pasid_init(kfd)) { + dev_err(kfd_device, + "Error initializing iommuv2 for device (%x:%x)\n", + kfd->pdev->vendor, kfd->pdev->device); + goto device_iommu_pasid_error; + } + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, + iommu_pasid_shutdown_callback); + + kfd->dqm = device_queue_manager_init(kfd); + if (!kfd->dqm) { + dev_err(kfd_device, + "Error initializing queue manager for device (%x:%x)\n", + kfd->pdev->vendor, kfd->pdev->device); + goto device_queue_manager_error; + } + + if (kfd->dqm->ops.start(kfd->dqm) != 0) { + dev_err(kfd_device, + "Error starting queuen manager for device (%x:%x)\n", + kfd->pdev->vendor, kfd->pdev->device); + goto dqm_start_error; + } + + kfd->init_complete = true; + dev_info(kfd_device, "added device (%x:%x)\n", kfd->pdev->vendor, + kfd->pdev->device); + + pr_debug("kfd: Starting kfd with the following scheduling policy %d\n", + sched_policy); + + goto out; + +dqm_start_error: + device_queue_manager_uninit(kfd->dqm); +device_queue_manager_error: + amd_iommu_free_device(kfd->pdev); +device_iommu_pasid_error: + kfd_topology_remove_device(kfd); +kfd_topology_add_device_error: + kfd_gtt_sa_fini(kfd); +kfd_gtt_sa_init_error: + kfd->kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem); + dev_err(kfd_device, + "device (%x:%x) NOT added due to errors\n", + kfd->pdev->vendor, kfd->pdev->device); +out: + return kfd->init_complete; +} + +void kgd2kfd_device_exit(struct kfd_dev *kfd) +{ + if (kfd->init_complete) { + device_queue_manager_uninit(kfd->dqm); + amd_iommu_free_device(kfd->pdev); + kfd_topology_remove_device(kfd); + kfd_gtt_sa_fini(kfd); + kfd->kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem); + } + + kfree(kfd); +} + +void kgd2kfd_suspend(struct kfd_dev *kfd) +{ + BUG_ON(kfd == NULL); + + if (kfd->init_complete) { + kfd->dqm->ops.stop(kfd->dqm); + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); + amd_iommu_free_device(kfd->pdev); + } +} + +int kgd2kfd_resume(struct kfd_dev *kfd) +{ + unsigned int pasid_limit; + int err; + + BUG_ON(kfd == NULL); + + pasid_limit = kfd_get_pasid_limit(); + + if (kfd->init_complete) { + err = amd_iommu_init_device(kfd->pdev, pasid_limit); + if (err < 0) + return -ENXIO; + amd_iommu_set_invalidate_ctx_cb(kfd->pdev, + iommu_pasid_shutdown_callback); + kfd->dqm->ops.start(kfd->dqm); + } + + return 0; +} + +/* This is called directly from KGD at ISR. */ +void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) +{ + /* Process interrupts / schedule work as necessary */ +} + +static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, + unsigned int chunk_size) +{ + unsigned int num_of_bits; + + BUG_ON(!kfd); + BUG_ON(!kfd->gtt_mem); + BUG_ON(buf_size < chunk_size); + BUG_ON(buf_size == 0); + BUG_ON(chunk_size == 0); + + kfd->gtt_sa_chunk_size = chunk_size; + kfd->gtt_sa_num_of_chunks = buf_size / chunk_size; + + num_of_bits = kfd->gtt_sa_num_of_chunks / BITS_PER_BYTE; + BUG_ON(num_of_bits == 0); + + kfd->gtt_sa_bitmap = kzalloc(num_of_bits, GFP_KERNEL); + + if (!kfd->gtt_sa_bitmap) + return -ENOMEM; + + pr_debug("kfd: gtt_sa_num_of_chunks = %d, gtt_sa_bitmap = %p\n", + kfd->gtt_sa_num_of_chunks, kfd->gtt_sa_bitmap); + + mutex_init(&kfd->gtt_sa_lock); + + return 0; + +} + +static void kfd_gtt_sa_fini(struct kfd_dev *kfd) +{ + mutex_destroy(&kfd->gtt_sa_lock); + kfree(kfd->gtt_sa_bitmap); +} + +static inline uint64_t kfd_gtt_sa_calc_gpu_addr(uint64_t start_addr, + unsigned int bit_num, + unsigned int chunk_size) +{ + return start_addr + bit_num * chunk_size; +} + +static inline uint32_t *kfd_gtt_sa_calc_cpu_addr(void *start_addr, + unsigned int bit_num, + unsigned int chunk_size) +{ + return (uint32_t *) ((uint64_t) start_addr + bit_num * chunk_size); +} + +int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, + struct kfd_mem_obj **mem_obj) +{ + unsigned int found, start_search, cur_size; + + BUG_ON(!kfd); + + if (size == 0) + return -EINVAL; + + if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) + return -ENOMEM; + + *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + if ((*mem_obj) == NULL) + return -ENOMEM; + + pr_debug("kfd: allocated mem_obj = %p for size = %d\n", *mem_obj, size); + + start_search = 0; + + mutex_lock(&kfd->gtt_sa_lock); + +kfd_gtt_restart_search: + /* Find the first chunk that is free */ + found = find_next_zero_bit(kfd->gtt_sa_bitmap, + kfd->gtt_sa_num_of_chunks, + start_search); + + pr_debug("kfd: found = %d\n", found); + + /* If there wasn't any free chunk, bail out */ + if (found == kfd->gtt_sa_num_of_chunks) + goto kfd_gtt_no_free_chunk; + + /* Update fields of mem_obj */ + (*mem_obj)->range_start = found; + (*mem_obj)->range_end = found; + (*mem_obj)->gpu_addr = kfd_gtt_sa_calc_gpu_addr( + kfd->gtt_start_gpu_addr, + found, + kfd->gtt_sa_chunk_size); + (*mem_obj)->cpu_ptr = kfd_gtt_sa_calc_cpu_addr( + kfd->gtt_start_cpu_ptr, + found, + kfd->gtt_sa_chunk_size); + + pr_debug("kfd: gpu_addr = %p, cpu_addr = %p\n", + (uint64_t *) (*mem_obj)->gpu_addr, (*mem_obj)->cpu_ptr); + + /* If we need only one chunk, mark it as allocated and get out */ + if (size <= kfd->gtt_sa_chunk_size) { + pr_debug("kfd: single bit\n"); + set_bit(found, kfd->gtt_sa_bitmap); + goto kfd_gtt_out; + } + + /* Otherwise, try to see if we have enough contiguous chunks */ + cur_size = size - kfd->gtt_sa_chunk_size; + do { + (*mem_obj)->range_end = + find_next_zero_bit(kfd->gtt_sa_bitmap, + kfd->gtt_sa_num_of_chunks, ++found); + /* + * If next free chunk is not contiguous than we need to + * restart our search from the last free chunk we found (which + * wasn't contiguous to the previous ones + */ + if ((*mem_obj)->range_end != found) { + start_search = found; + goto kfd_gtt_restart_search; + } + + /* + * If we reached end of buffer, bail out with error + */ + if (found == kfd->gtt_sa_num_of_chunks) + goto kfd_gtt_no_free_chunk; + + /* Check if we don't need another chunk */ + if (cur_size <= kfd->gtt_sa_chunk_size) + cur_size = 0; + else + cur_size -= kfd->gtt_sa_chunk_size; + + } while (cur_size > 0); + + pr_debug("kfd: range_start = %d, range_end = %d\n", + (*mem_obj)->range_start, (*mem_obj)->range_end); + + /* Mark the chunks as allocated */ + for (found = (*mem_obj)->range_start; + found <= (*mem_obj)->range_end; + found++) + set_bit(found, kfd->gtt_sa_bitmap); + +kfd_gtt_out: + mutex_unlock(&kfd->gtt_sa_lock); + return 0; + +kfd_gtt_no_free_chunk: + pr_debug("kfd: allocation failed with mem_obj = %p\n", mem_obj); + mutex_unlock(&kfd->gtt_sa_lock); + kfree(mem_obj); + return -ENOMEM; +} + +int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj) +{ + unsigned int bit; + + BUG_ON(!kfd); + + /* Act like kfree when trying to free a NULL object */ + if (!mem_obj) + return 0; + + pr_debug("kfd: free mem_obj = %p, range_start = %d, range_end = %d\n", + mem_obj, mem_obj->range_start, mem_obj->range_end); + + mutex_lock(&kfd->gtt_sa_lock); + + /* Mark the chunks as free */ + for (bit = mem_obj->range_start; + bit <= mem_obj->range_end; + bit++) + clear_bit(bit, kfd->gtt_sa_bitmap); + + mutex_unlock(&kfd->gtt_sa_lock); + + kfree(mem_obj); + return 0; +} diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c new file mode 100644 index 000000000..596ee5cd3 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -0,0 +1,1217 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" +#include "kfd_mqd_manager.h" +#include "cik_regs.h" +#include "kfd_kernel_queue.h" + +/* Size of the per-pipe EOP queue */ +#define CIK_HPD_EOP_BYTES_LOG2 11 +#define CIK_HPD_EOP_BYTES (1U << CIK_HPD_EOP_BYTES_LOG2) + +static int set_pasid_vmid_mapping(struct device_queue_manager *dqm, + unsigned int pasid, unsigned int vmid); + +static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); + +static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock); +static int destroy_queues_cpsch(struct device_queue_manager *dqm, bool lock); + +static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd); + +static void deallocate_sdma_queue(struct device_queue_manager *dqm, + unsigned int sdma_queue_id); + +static inline +enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) +{ + if (type == KFD_QUEUE_TYPE_SDMA) + return KFD_MQD_TYPE_SDMA; + return KFD_MQD_TYPE_CP; +} + +unsigned int get_first_pipe(struct device_queue_manager *dqm) +{ + BUG_ON(!dqm || !dqm->dev); + return dqm->dev->shared_resources.first_compute_pipe; +} + +unsigned int get_pipes_num(struct device_queue_manager *dqm) +{ + BUG_ON(!dqm || !dqm->dev); + return dqm->dev->shared_resources.compute_pipe_count; +} + +static inline unsigned int get_pipes_num_cpsch(void) +{ + return PIPE_PER_ME_CP_SCHEDULING; +} + +void program_sh_mem_settings(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + return dqm->dev->kfd2kgd->program_sh_mem_settings( + dqm->dev->kgd, qpd->vmid, + qpd->sh_mem_config, + qpd->sh_mem_ape1_base, + qpd->sh_mem_ape1_limit, + qpd->sh_mem_bases); +} + +static int allocate_vmid(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int bit, allocated_vmid; + + if (dqm->vmid_bitmap == 0) + return -ENOMEM; + + bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, CIK_VMID_NUM); + clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap); + + /* Kaveri kfd vmid's starts from vmid 8 */ + allocated_vmid = bit + KFD_VMID_START_OFFSET; + pr_debug("kfd: vmid allocation %d\n", allocated_vmid); + qpd->vmid = allocated_vmid; + q->properties.vmid = allocated_vmid; + + set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid); + program_sh_mem_settings(dqm, qpd); + + return 0; +} + +static void deallocate_vmid(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int bit = qpd->vmid - KFD_VMID_START_OFFSET; + + /* Release the vmid mapping */ + set_pasid_vmid_mapping(dqm, 0, qpd->vmid); + + set_bit(bit, (unsigned long *)&dqm->vmid_bitmap); + qpd->vmid = 0; + q->properties.vmid = 0; +} + +static int create_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd, + int *allocated_vmid) +{ + int retval; + + BUG_ON(!dqm || !q || !qpd || !allocated_vmid); + + pr_debug("kfd: In func %s\n", __func__); + print_queue(q); + + mutex_lock(&dqm->lock); + + if (dqm->total_queue_count >= max_num_of_queues_per_device) { + pr_warn("amdkfd: Can't create new usermode queue because %d queues were already created\n", + dqm->total_queue_count); + mutex_unlock(&dqm->lock); + return -EPERM; + } + + if (list_empty(&qpd->queues_list)) { + retval = allocate_vmid(dqm, qpd, q); + if (retval != 0) { + mutex_unlock(&dqm->lock); + return retval; + } + } + *allocated_vmid = qpd->vmid; + q->properties.vmid = qpd->vmid; + + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) + retval = create_compute_queue_nocpsch(dqm, q, qpd); + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + retval = create_sdma_queue_nocpsch(dqm, q, qpd); + + if (retval != 0) { + if (list_empty(&qpd->queues_list)) { + deallocate_vmid(dqm, qpd, q); + *allocated_vmid = 0; + } + mutex_unlock(&dqm->lock); + return retval; + } + + list_add(&q->list, &qpd->queues_list); + if (q->properties.is_active) + dqm->queue_count++; + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + dqm->sdma_queue_count++; + + /* + * Unconditionally increment this counter, regardless of the queue's + * type or whether the queue is active. + */ + dqm->total_queue_count++; + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + + mutex_unlock(&dqm->lock); + return 0; +} + +static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) +{ + bool set; + int pipe, bit, i; + + set = false; + + for (pipe = dqm->next_pipe_to_allocate, i = 0; i < get_pipes_num(dqm); + pipe = ((pipe + 1) % get_pipes_num(dqm)), ++i) { + if (dqm->allocated_queues[pipe] != 0) { + bit = find_first_bit( + (unsigned long *)&dqm->allocated_queues[pipe], + QUEUES_PER_PIPE); + + clear_bit(bit, + (unsigned long *)&dqm->allocated_queues[pipe]); + q->pipe = pipe; + q->queue = bit; + set = true; + break; + } + } + + if (set == false) + return -EBUSY; + + pr_debug("kfd: DQM %s hqd slot - pipe (%d) queue(%d)\n", + __func__, q->pipe, q->queue); + /* horizontal hqd allocation */ + dqm->next_pipe_to_allocate = (pipe + 1) % get_pipes_num(dqm); + + return 0; +} + +static inline void deallocate_hqd(struct device_queue_manager *dqm, + struct queue *q) +{ + set_bit(q->queue, (unsigned long *)&dqm->allocated_queues[q->pipe]); +} + +static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd) +{ + int retval; + struct mqd_manager *mqd; + + BUG_ON(!dqm || !q || !qpd); + + mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); + if (mqd == NULL) + return -ENOMEM; + + retval = allocate_hqd(dqm, q); + if (retval != 0) + return retval; + + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval != 0) { + deallocate_hqd(dqm, q); + return retval; + } + + pr_debug("kfd: loading mqd to hqd on pipe (%d) queue (%d)\n", + q->pipe, + q->queue); + + retval = mqd->load_mqd(mqd, q->mqd, q->pipe, + q->queue, (uint32_t __user *) q->properties.write_ptr); + if (retval != 0) { + deallocate_hqd(dqm, q); + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); + return retval; + } + + return 0; +} + +static int destroy_queue_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int retval; + struct mqd_manager *mqd; + + BUG_ON(!dqm || !q || !q->mqd || !qpd); + + retval = 0; + + pr_debug("kfd: In Func %s\n", __func__); + + mutex_lock(&dqm->lock); + + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) { + mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); + if (mqd == NULL) { + retval = -ENOMEM; + goto out; + } + deallocate_hqd(dqm, q); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA); + if (mqd == NULL) { + retval = -ENOMEM; + goto out; + } + dqm->sdma_queue_count--; + deallocate_sdma_queue(dqm, q->sdma_id); + } else { + pr_debug("q->properties.type is invalid (%d)\n", + q->properties.type); + retval = -EINVAL; + goto out; + } + + retval = mqd->destroy_mqd(mqd, q->mqd, + KFD_PREEMPT_TYPE_WAVEFRONT_RESET, + QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, + q->pipe, q->queue); + + if (retval != 0) + goto out; + + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); + + list_del(&q->list); + if (list_empty(&qpd->queues_list)) + deallocate_vmid(dqm, qpd, q); + if (q->properties.is_active) + dqm->queue_count--; + + /* + * Unconditionally decrement this counter, regardless of the queue's + * type + */ + dqm->total_queue_count--; + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + +out: + mutex_unlock(&dqm->lock); + return retval; +} + +static int update_queue(struct device_queue_manager *dqm, struct queue *q) +{ + int retval; + struct mqd_manager *mqd; + bool prev_active = false; + + BUG_ON(!dqm || !q || !q->mqd); + + mutex_lock(&dqm->lock); + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (mqd == NULL) { + mutex_unlock(&dqm->lock); + return -ENOMEM; + } + + if (q->properties.is_active == true) + prev_active = true; + + /* + * + * check active state vs. the previous state + * and modify counter accordingly + */ + retval = mqd->update_mqd(mqd, q->mqd, &q->properties); + if ((q->properties.is_active == true) && (prev_active == false)) + dqm->queue_count++; + else if ((q->properties.is_active == false) && (prev_active == true)) + dqm->queue_count--; + + if (sched_policy != KFD_SCHED_POLICY_NO_HWS) + retval = execute_queues_cpsch(dqm, false); + + mutex_unlock(&dqm->lock); + return retval; +} + +static struct mqd_manager *get_mqd_manager_nocpsch( + struct device_queue_manager *dqm, enum KFD_MQD_TYPE type) +{ + struct mqd_manager *mqd; + + BUG_ON(!dqm || type >= KFD_MQD_TYPE_MAX); + + pr_debug("kfd: In func %s mqd type %d\n", __func__, type); + + mqd = dqm->mqds[type]; + if (!mqd) { + mqd = mqd_manager_init(type, dqm->dev); + if (mqd == NULL) + pr_err("kfd: mqd manager is NULL"); + dqm->mqds[type] = mqd; + } + + return mqd; +} + +static int register_process_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct device_process_node *n; + int retval; + + BUG_ON(!dqm || !qpd); + + pr_debug("kfd: In func %s\n", __func__); + + n = kzalloc(sizeof(struct device_process_node), GFP_KERNEL); + if (!n) + return -ENOMEM; + + n->qpd = qpd; + + mutex_lock(&dqm->lock); + list_add(&n->list, &dqm->queues); + + retval = dqm->ops_asic_specific.register_process(dqm, qpd); + + dqm->processes_count++; + + mutex_unlock(&dqm->lock); + + return retval; +} + +static int unregister_process_nocpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + int retval; + struct device_process_node *cur, *next; + + BUG_ON(!dqm || !qpd); + + pr_debug("In func %s\n", __func__); + + pr_debug("qpd->queues_list is %s\n", + list_empty(&qpd->queues_list) ? "empty" : "not empty"); + + retval = 0; + mutex_lock(&dqm->lock); + + list_for_each_entry_safe(cur, next, &dqm->queues, list) { + if (qpd == cur->qpd) { + list_del(&cur->list); + kfree(cur); + dqm->processes_count--; + goto out; + } + } + /* qpd not found in dqm list */ + retval = 1; +out: + mutex_unlock(&dqm->lock); + return retval; +} + +static int +set_pasid_vmid_mapping(struct device_queue_manager *dqm, unsigned int pasid, + unsigned int vmid) +{ + uint32_t pasid_mapping; + + pasid_mapping = (pasid == 0) ? 0 : + (uint32_t)pasid | + ATC_VMID_PASID_MAPPING_VALID; + + return dqm->dev->kfd2kgd->set_pasid_vmid_mapping( + dqm->dev->kgd, pasid_mapping, + vmid); +} + +int init_pipelines(struct device_queue_manager *dqm, + unsigned int pipes_num, unsigned int first_pipe) +{ + void *hpdptr; + struct mqd_manager *mqd; + unsigned int i, err, inx; + uint64_t pipe_hpd_addr; + + BUG_ON(!dqm || !dqm->dev); + + pr_debug("kfd: In func %s\n", __func__); + + /* + * Allocate memory for the HPDs. This is hardware-owned per-pipe data. + * The driver never accesses this memory after zeroing it. + * It doesn't even have to be saved/restored on suspend/resume + * because it contains no data when there are no active queues. + */ + + err = kfd_gtt_sa_allocate(dqm->dev, CIK_HPD_EOP_BYTES * pipes_num, + &dqm->pipeline_mem); + + if (err) { + pr_err("kfd: error allocate vidmem num pipes: %d\n", + pipes_num); + return -ENOMEM; + } + + hpdptr = dqm->pipeline_mem->cpu_ptr; + dqm->pipelines_addr = dqm->pipeline_mem->gpu_addr; + + memset(hpdptr, 0, CIK_HPD_EOP_BYTES * pipes_num); + + mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); + if (mqd == NULL) { + kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem); + return -ENOMEM; + } + + for (i = 0; i < pipes_num; i++) { + inx = i + first_pipe; + /* + * HPD buffer on GTT is allocated by amdkfd, no need to waste + * space in GTT for pipelines we don't initialize + */ + pipe_hpd_addr = dqm->pipelines_addr + i * CIK_HPD_EOP_BYTES; + pr_debug("kfd: pipeline address %llX\n", pipe_hpd_addr); + /* = log2(bytes/4)-1 */ + dqm->dev->kfd2kgd->init_pipeline(dqm->dev->kgd, inx, + CIK_HPD_EOP_BYTES_LOG2 - 3, pipe_hpd_addr); + } + + return 0; +} + +static int init_scheduler(struct device_queue_manager *dqm) +{ + int retval; + + BUG_ON(!dqm); + + pr_debug("kfd: In %s\n", __func__); + + retval = init_pipelines(dqm, get_pipes_num(dqm), get_first_pipe(dqm)); + return retval; +} + +static int initialize_nocpsch(struct device_queue_manager *dqm) +{ + int i; + + BUG_ON(!dqm); + + pr_debug("kfd: In func %s num of pipes: %d\n", + __func__, get_pipes_num(dqm)); + + mutex_init(&dqm->lock); + INIT_LIST_HEAD(&dqm->queues); + dqm->queue_count = dqm->next_pipe_to_allocate = 0; + dqm->sdma_queue_count = 0; + dqm->allocated_queues = kcalloc(get_pipes_num(dqm), + sizeof(unsigned int), GFP_KERNEL); + if (!dqm->allocated_queues) { + mutex_destroy(&dqm->lock); + return -ENOMEM; + } + + for (i = 0; i < get_pipes_num(dqm); i++) + dqm->allocated_queues[i] = (1 << QUEUES_PER_PIPE) - 1; + + dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1; + dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; + + init_scheduler(dqm); + return 0; +} + +static void uninitialize_nocpsch(struct device_queue_manager *dqm) +{ + int i; + + BUG_ON(!dqm); + + BUG_ON(dqm->queue_count > 0 || dqm->processes_count > 0); + + kfree(dqm->allocated_queues); + for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++) + kfree(dqm->mqds[i]); + mutex_destroy(&dqm->lock); + kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem); +} + +static int start_nocpsch(struct device_queue_manager *dqm) +{ + return 0; +} + +static int stop_nocpsch(struct device_queue_manager *dqm) +{ + return 0; +} + +static int allocate_sdma_queue(struct device_queue_manager *dqm, + unsigned int *sdma_queue_id) +{ + int bit; + + if (dqm->sdma_bitmap == 0) + return -ENOMEM; + + bit = find_first_bit((unsigned long *)&dqm->sdma_bitmap, + CIK_SDMA_QUEUES); + + clear_bit(bit, (unsigned long *)&dqm->sdma_bitmap); + *sdma_queue_id = bit; + + return 0; +} + +static void deallocate_sdma_queue(struct device_queue_manager *dqm, + unsigned int sdma_queue_id) +{ + if (sdma_queue_id >= CIK_SDMA_QUEUES) + return; + set_bit(sdma_queue_id, (unsigned long *)&dqm->sdma_bitmap); +} + +static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + uint32_t value = SDMA_ATC; + + if (q->process->is_32bit_user_mode) + value |= SDMA_VA_PTR32 | get_sh_mem_bases_32(qpd_to_pdd(qpd)); + else + value |= SDMA_VA_SHARED_BASE(get_sh_mem_bases_nybble_64( + qpd_to_pdd(qpd))); + q->properties.sdma_vm_addr = value; +} + +static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd) +{ + struct mqd_manager *mqd; + int retval; + + mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA); + if (!mqd) + return -ENOMEM; + + retval = allocate_sdma_queue(dqm, &q->sdma_id); + if (retval != 0) + return retval; + + q->properties.sdma_queue_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; + q->properties.sdma_engine_id = q->sdma_id / CIK_SDMA_ENGINE_NUM; + + pr_debug("kfd: sdma id is: %d\n", q->sdma_id); + pr_debug(" sdma queue id: %d\n", q->properties.sdma_queue_id); + pr_debug(" sdma engine id: %d\n", q->properties.sdma_engine_id); + + init_sdma_vm(dqm, q, qpd); + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval != 0) { + deallocate_sdma_queue(dqm, q->sdma_id); + return retval; + } + + retval = mqd->load_mqd(mqd, q->mqd, 0, + 0, NULL); + if (retval != 0) { + deallocate_sdma_queue(dqm, q->sdma_id); + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); + return retval; + } + + return 0; +} + +/* + * Device Queue Manager implementation for cp scheduler + */ + +static int set_sched_resources(struct device_queue_manager *dqm) +{ + struct scheduling_resources res; + unsigned int queue_num, queue_mask; + + BUG_ON(!dqm); + + pr_debug("kfd: In func %s\n", __func__); + + queue_num = get_pipes_num_cpsch() * QUEUES_PER_PIPE; + queue_mask = (1 << queue_num) - 1; + res.vmid_mask = (1 << VMID_PER_DEVICE) - 1; + res.vmid_mask <<= KFD_VMID_START_OFFSET; + res.queue_mask = queue_mask << (get_first_pipe(dqm) * QUEUES_PER_PIPE); + res.gws_mask = res.oac_mask = res.gds_heap_base = + res.gds_heap_size = 0; + + pr_debug("kfd: scheduling resources:\n" + " vmid mask: 0x%8X\n" + " queue mask: 0x%8llX\n", + res.vmid_mask, res.queue_mask); + + return pm_send_set_resources(&dqm->packets, &res); +} + +static int initialize_cpsch(struct device_queue_manager *dqm) +{ + int retval; + + BUG_ON(!dqm); + + pr_debug("kfd: In func %s num of pipes: %d\n", + __func__, get_pipes_num_cpsch()); + + mutex_init(&dqm->lock); + INIT_LIST_HEAD(&dqm->queues); + dqm->queue_count = dqm->processes_count = 0; + dqm->sdma_queue_count = 0; + dqm->active_runlist = false; + retval = dqm->ops_asic_specific.initialize(dqm); + if (retval != 0) + goto fail_init_pipelines; + + return 0; + +fail_init_pipelines: + mutex_destroy(&dqm->lock); + return retval; +} + +static int start_cpsch(struct device_queue_manager *dqm) +{ + struct device_process_node *node; + int retval; + + BUG_ON(!dqm); + + retval = 0; + + retval = pm_init(&dqm->packets, dqm); + if (retval != 0) + goto fail_packet_manager_init; + + retval = set_sched_resources(dqm); + if (retval != 0) + goto fail_set_sched_resources; + + pr_debug("kfd: allocating fence memory\n"); + + /* allocate fence memory on the gart */ + retval = kfd_gtt_sa_allocate(dqm->dev, sizeof(*dqm->fence_addr), + &dqm->fence_mem); + + if (retval != 0) + goto fail_allocate_vidmem; + + dqm->fence_addr = dqm->fence_mem->cpu_ptr; + dqm->fence_gpu_addr = dqm->fence_mem->gpu_addr; + list_for_each_entry(node, &dqm->queues, list) + if (node->qpd->pqm->process && dqm->dev) + kfd_bind_process_to_device(dqm->dev, + node->qpd->pqm->process); + + execute_queues_cpsch(dqm, true); + + return 0; +fail_allocate_vidmem: +fail_set_sched_resources: + pm_uninit(&dqm->packets); +fail_packet_manager_init: + return retval; +} + +static int stop_cpsch(struct device_queue_manager *dqm) +{ + struct device_process_node *node; + struct kfd_process_device *pdd; + + BUG_ON(!dqm); + + destroy_queues_cpsch(dqm, true); + + list_for_each_entry(node, &dqm->queues, list) { + pdd = qpd_to_pdd(node->qpd); + pdd->bound = false; + } + kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); + pm_uninit(&dqm->packets); + + return 0; +} + +static int create_kernel_queue_cpsch(struct device_queue_manager *dqm, + struct kernel_queue *kq, + struct qcm_process_device *qpd) +{ + BUG_ON(!dqm || !kq || !qpd); + + pr_debug("kfd: In func %s\n", __func__); + + mutex_lock(&dqm->lock); + if (dqm->total_queue_count >= max_num_of_queues_per_device) { + pr_warn("amdkfd: Can't create new kernel queue because %d queues were already created\n", + dqm->total_queue_count); + mutex_unlock(&dqm->lock); + return -EPERM; + } + + /* + * Unconditionally increment this counter, regardless of the queue's + * type or whether the queue is active. + */ + dqm->total_queue_count++; + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + + list_add(&kq->list, &qpd->priv_queue_list); + dqm->queue_count++; + qpd->is_debug = true; + execute_queues_cpsch(dqm, false); + mutex_unlock(&dqm->lock); + + return 0; +} + +static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, + struct kernel_queue *kq, + struct qcm_process_device *qpd) +{ + BUG_ON(!dqm || !kq); + + pr_debug("kfd: In %s\n", __func__); + + mutex_lock(&dqm->lock); + destroy_queues_cpsch(dqm, false); + list_del(&kq->list); + dqm->queue_count--; + qpd->is_debug = false; + execute_queues_cpsch(dqm, false); + /* + * Unconditionally decrement this counter, regardless of the queue's + * type. + */ + dqm->total_queue_count--; + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + mutex_unlock(&dqm->lock); +} + +static void select_sdma_engine_id(struct queue *q) +{ + static int sdma_id; + + q->sdma_id = sdma_id; + sdma_id = (sdma_id + 1) % 2; +} + +static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd, int *allocate_vmid) +{ + int retval; + struct mqd_manager *mqd; + + BUG_ON(!dqm || !q || !qpd); + + retval = 0; + + if (allocate_vmid) + *allocate_vmid = 0; + + mutex_lock(&dqm->lock); + + if (dqm->total_queue_count >= max_num_of_queues_per_device) { + pr_warn("amdkfd: Can't create new usermode queue because %d queues were already created\n", + dqm->total_queue_count); + retval = -EPERM; + goto out; + } + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + select_sdma_engine_id(q); + + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + + if (mqd == NULL) { + mutex_unlock(&dqm->lock); + return -ENOMEM; + } + + init_sdma_vm(dqm, q, qpd); + + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval != 0) + goto out; + + list_add(&q->list, &qpd->queues_list); + if (q->properties.is_active) { + dqm->queue_count++; + retval = execute_queues_cpsch(dqm, false); + } + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + dqm->sdma_queue_count++; + /* + * Unconditionally increment this counter, regardless of the queue's + * type or whether the queue is active. + */ + dqm->total_queue_count++; + + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + +out: + mutex_unlock(&dqm->lock); + return retval; +} + +static int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + unsigned int fence_value, + unsigned long timeout) +{ + BUG_ON(!fence_addr); + timeout += jiffies; + + while (*fence_addr != fence_value) { + if (time_after(jiffies, timeout)) { + pr_err("kfd: qcm fence wait loop timeout expired\n"); + return -ETIME; + } + schedule(); + } + + return 0; +} + +static int destroy_sdma_queues(struct device_queue_manager *dqm, + unsigned int sdma_engine) +{ + return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, + KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES, 0, false, + sdma_engine); +} + +static int destroy_queues_cpsch(struct device_queue_manager *dqm, bool lock) +{ + int retval; + + BUG_ON(!dqm); + + retval = 0; + + if (lock) + mutex_lock(&dqm->lock); + if (dqm->active_runlist == false) + goto out; + + pr_debug("kfd: Before destroying queues, sdma queue count is : %u\n", + dqm->sdma_queue_count); + + if (dqm->sdma_queue_count > 0) { + destroy_sdma_queues(dqm, 0); + destroy_sdma_queues(dqm, 1); + } + + retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE, + KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES, 0, false, 0); + if (retval != 0) + goto out; + + *dqm->fence_addr = KFD_FENCE_INIT; + pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr, + KFD_FENCE_COMPLETED); + /* should be timed out */ + amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, + QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS); + pm_release_ib(&dqm->packets); + dqm->active_runlist = false; + +out: + if (lock) + mutex_unlock(&dqm->lock); + return retval; +} + +static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock) +{ + int retval; + + BUG_ON(!dqm); + + if (lock) + mutex_lock(&dqm->lock); + + retval = destroy_queues_cpsch(dqm, false); + if (retval != 0) { + pr_err("kfd: the cp might be in an unrecoverable state due to an unsuccessful queues preemption"); + goto out; + } + + if (dqm->queue_count <= 0 || dqm->processes_count <= 0) { + retval = 0; + goto out; + } + + if (dqm->active_runlist) { + retval = 0; + goto out; + } + + retval = pm_send_runlist(&dqm->packets, &dqm->queues); + if (retval != 0) { + pr_err("kfd: failed to execute runlist"); + goto out; + } + dqm->active_runlist = true; + +out: + if (lock) + mutex_unlock(&dqm->lock); + return retval; +} + +static int destroy_queue_cpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + int retval; + struct mqd_manager *mqd; + + BUG_ON(!dqm || !qpd || !q); + + retval = 0; + + /* remove queue from list to prevent rescheduling after preemption */ + mutex_lock(&dqm->lock); + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd) { + retval = -ENOMEM; + goto failed; + } + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) + dqm->sdma_queue_count--; + + list_del(&q->list); + if (q->properties.is_active) + dqm->queue_count--; + + execute_queues_cpsch(dqm, false); + + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); + + /* + * Unconditionally decrement this counter, regardless of the queue's + * type + */ + dqm->total_queue_count--; + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + + mutex_unlock(&dqm->lock); + + return 0; + +failed: + mutex_unlock(&dqm->lock); + return retval; +} + +/* + * Low bits must be 0000/FFFF as required by HW, high bits must be 0 to + * stay in user mode. + */ +#define APE1_FIXED_BITS_MASK 0xFFFF80000000FFFFULL +/* APE1 limit is inclusive and 64K aligned. */ +#define APE1_LIMIT_ALIGNMENT 0xFFFF + +static bool set_cache_memory_policy(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) +{ + bool retval; + + pr_debug("kfd: In func %s\n", __func__); + + mutex_lock(&dqm->lock); + + if (alternate_aperture_size == 0) { + /* base > limit disables APE1 */ + qpd->sh_mem_ape1_base = 1; + qpd->sh_mem_ape1_limit = 0; + } else { + /* + * In FSA64, APE1_Base[63:0] = { 16{SH_MEM_APE1_BASE[31]}, + * SH_MEM_APE1_BASE[31:0], 0x0000 } + * APE1_Limit[63:0] = { 16{SH_MEM_APE1_LIMIT[31]}, + * SH_MEM_APE1_LIMIT[31:0], 0xFFFF } + * Verify that the base and size parameters can be + * represented in this format and convert them. + * Additionally restrict APE1 to user-mode addresses. + */ + + uint64_t base = (uintptr_t)alternate_aperture_base; + uint64_t limit = base + alternate_aperture_size - 1; + + if (limit <= base) + goto out; + + if ((base & APE1_FIXED_BITS_MASK) != 0) + goto out; + + if ((limit & APE1_FIXED_BITS_MASK) != APE1_LIMIT_ALIGNMENT) + goto out; + + qpd->sh_mem_ape1_base = base >> 16; + qpd->sh_mem_ape1_limit = limit >> 16; + } + + retval = dqm->ops_asic_specific.set_cache_memory_policy( + dqm, + qpd, + default_policy, + alternate_policy, + alternate_aperture_base, + alternate_aperture_size); + + if ((sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0)) + program_sh_mem_settings(dqm, qpd); + + pr_debug("kfd: sh_mem_config: 0x%x, ape1_base: 0x%x, ape1_limit: 0x%x\n", + qpd->sh_mem_config, qpd->sh_mem_ape1_base, + qpd->sh_mem_ape1_limit); + + mutex_unlock(&dqm->lock); + return retval; + +out: + mutex_unlock(&dqm->lock); + return false; +} + +struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) +{ + struct device_queue_manager *dqm; + + BUG_ON(!dev); + + pr_debug("kfd: loading device queue manager\n"); + + dqm = kzalloc(sizeof(struct device_queue_manager), GFP_KERNEL); + if (!dqm) + return NULL; + + dqm->dev = dev; + switch (sched_policy) { + case KFD_SCHED_POLICY_HWS: + case KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION: + /* initialize dqm for cp scheduling */ + dqm->ops.create_queue = create_queue_cpsch; + dqm->ops.initialize = initialize_cpsch; + dqm->ops.start = start_cpsch; + dqm->ops.stop = stop_cpsch; + dqm->ops.destroy_queue = destroy_queue_cpsch; + dqm->ops.update_queue = update_queue; + dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch; + dqm->ops.register_process = register_process_nocpsch; + dqm->ops.unregister_process = unregister_process_nocpsch; + dqm->ops.uninitialize = uninitialize_nocpsch; + dqm->ops.create_kernel_queue = create_kernel_queue_cpsch; + dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch; + dqm->ops.set_cache_memory_policy = set_cache_memory_policy; + break; + case KFD_SCHED_POLICY_NO_HWS: + /* initialize dqm for no cp scheduling */ + dqm->ops.start = start_nocpsch; + dqm->ops.stop = stop_nocpsch; + dqm->ops.create_queue = create_queue_nocpsch; + dqm->ops.destroy_queue = destroy_queue_nocpsch; + dqm->ops.update_queue = update_queue; + dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch; + dqm->ops.register_process = register_process_nocpsch; + dqm->ops.unregister_process = unregister_process_nocpsch; + dqm->ops.initialize = initialize_nocpsch; + dqm->ops.uninitialize = uninitialize_nocpsch; + dqm->ops.set_cache_memory_policy = set_cache_memory_policy; + break; + default: + BUG(); + break; + } + + switch (dev->device_info->asic_family) { + case CHIP_CARRIZO: + device_queue_manager_init_vi(&dqm->ops_asic_specific); + break; + + case CHIP_KAVERI: + device_queue_manager_init_cik(&dqm->ops_asic_specific); + break; + } + + if (dqm->ops.initialize(dqm) != 0) { + kfree(dqm); + return NULL; + } + + return dqm; +} + +void device_queue_manager_uninit(struct device_queue_manager *dqm) +{ + BUG_ON(!dqm); + + dqm->ops.uninitialize(dqm); + kfree(dqm); +} diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h new file mode 100644 index 000000000..488f51d19 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -0,0 +1,180 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_DEVICE_QUEUE_MANAGER_H_ +#define KFD_DEVICE_QUEUE_MANAGER_H_ + +#include +#include +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" + +#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (500) +#define QUEUES_PER_PIPE (8) +#define PIPE_PER_ME_CP_SCHEDULING (3) +#define CIK_VMID_NUM (8) +#define KFD_VMID_START_OFFSET (8) +#define VMID_PER_DEVICE CIK_VMID_NUM +#define KFD_DQM_FIRST_PIPE (0) +#define CIK_SDMA_QUEUES (4) +#define CIK_SDMA_QUEUES_PER_ENGINE (2) +#define CIK_SDMA_ENGINE_NUM (2) + +struct device_process_node { + struct qcm_process_device *qpd; + struct list_head list; +}; + +/** + * struct device_queue_manager_ops + * + * @create_queue: Queue creation routine. + * + * @destroy_queue: Queue destruction routine. + * + * @update_queue: Queue update routine. + * + * @get_mqd_manager: Returns the mqd manager according to the mqd type. + * + * @exeute_queues: Dispatches the queues list to the H/W. + * + * @register_process: This routine associates a specific process with device. + * + * @unregister_process: destroys the associations between process to device. + * + * @initialize: Initializes the pipelines and memory module for that device. + * + * @start: Initializes the resources/modules the the device needs for queues + * execution. This function is called on device initialization and after the + * system woke up after suspension. + * + * @stop: This routine stops execution of all the active queue running on the + * H/W and basically this function called on system suspend. + * + * @uninitialize: Destroys all the device queue manager resources allocated in + * initialize routine. + * + * @create_kernel_queue: Creates kernel queue. Used for debug queue. + * + * @destroy_kernel_queue: Destroys kernel queue. Used for debug queue. + * + * @set_cache_memory_policy: Sets memory policy (cached/ non cached) for the + * memory apertures. + * + */ + +struct device_queue_manager_ops { + int (*create_queue)(struct device_queue_manager *dqm, + struct queue *q, + struct qcm_process_device *qpd, + int *allocate_vmid); + int (*destroy_queue)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q); + int (*update_queue)(struct device_queue_manager *dqm, + struct queue *q); + + struct mqd_manager * (*get_mqd_manager) + (struct device_queue_manager *dqm, + enum KFD_MQD_TYPE type); + + int (*register_process)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); + int (*unregister_process)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); + int (*initialize)(struct device_queue_manager *dqm); + int (*start)(struct device_queue_manager *dqm); + int (*stop)(struct device_queue_manager *dqm); + void (*uninitialize)(struct device_queue_manager *dqm); + int (*create_kernel_queue)(struct device_queue_manager *dqm, + struct kernel_queue *kq, + struct qcm_process_device *qpd); + void (*destroy_kernel_queue)(struct device_queue_manager *dqm, + struct kernel_queue *kq, + struct qcm_process_device *qpd); + bool (*set_cache_memory_policy)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size); +}; + +/** + * struct device_queue_manager + * + * This struct is a base class for the kfd queues scheduler in the + * device level. The device base class should expose the basic operations + * for queue creation and queue destruction. This base class hides the + * scheduling mode of the driver and the specific implementation of the + * concrete device. This class is the only class in the queues scheduler + * that configures the H/W. + * + */ + +struct device_queue_manager { + struct device_queue_manager_ops ops; + struct device_queue_manager_ops ops_asic_specific; + + struct mqd_manager *mqds[KFD_MQD_TYPE_MAX]; + struct packet_manager packets; + struct kfd_dev *dev; + struct mutex lock; + struct list_head queues; + unsigned int processes_count; + unsigned int queue_count; + unsigned int sdma_queue_count; + unsigned int total_queue_count; + unsigned int next_pipe_to_allocate; + unsigned int *allocated_queues; + unsigned int sdma_bitmap; + unsigned int vmid_bitmap; + uint64_t pipelines_addr; + struct kfd_mem_obj *pipeline_mem; + uint64_t fence_gpu_addr; + unsigned int *fence_addr; + struct kfd_mem_obj *fence_mem; + bool active_runlist; +}; + +void device_queue_manager_init_cik(struct device_queue_manager_ops *ops); +void device_queue_manager_init_vi(struct device_queue_manager_ops *ops); +void program_sh_mem_settings(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +int init_pipelines(struct device_queue_manager *dqm, + unsigned int pipes_num, unsigned int first_pipe); +unsigned int get_first_pipe(struct device_queue_manager *dqm); +unsigned int get_pipes_num(struct device_queue_manager *dqm); + +extern inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) +{ + return (pdd->lds_base >> 16) & 0xFF; +} + +extern inline unsigned int +get_sh_mem_bases_nybble_64(struct kfd_process_device *pdd) +{ + return (pdd->lds_base >> 60) & 0x0E; +} + +#endif /* KFD_DEVICE_QUEUE_MANAGER_H_ */ diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c new file mode 100644 index 000000000..5469efe05 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c @@ -0,0 +1,135 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_device_queue_manager.h" +#include "cik_regs.h" + +static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size); +static int register_process_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static int initialize_cpsch_cik(struct device_queue_manager *dqm); + +void device_queue_manager_init_cik(struct device_queue_manager_ops *ops) +{ + ops->set_cache_memory_policy = set_cache_memory_policy_cik; + ops->register_process = register_process_cik; + ops->initialize = initialize_cpsch_cik; +} + +static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) +{ + /* In 64-bit mode, we can only control the top 3 bits of the LDS, + * scratch and GPUVM apertures. + * The hardware fills in the remaining 59 bits according to the + * following pattern: + * LDS: X0000000'00000000 - X0000001'00000000 (4GB) + * Scratch: X0000001'00000000 - X0000002'00000000 (4GB) + * GPUVM: Y0010000'00000000 - Y0020000'00000000 (1TB) + * + * (where X/Y is the configurable nybble with the low-bit 0) + * + * LDS and scratch will have the same top nybble programmed in the + * top 3 bits of SH_MEM_BASES.PRIVATE_BASE. + * GPUVM can have a different top nybble programmed in the + * top 3 bits of SH_MEM_BASES.SHARED_BASE. + * We don't bother to support different top nybbles + * for LDS/Scratch and GPUVM. + */ + + BUG_ON((top_address_nybble & 1) || top_address_nybble > 0xE || + top_address_nybble == 0); + + return PRIVATE_BASE(top_address_nybble << 12) | + SHARED_BASE(top_address_nybble << 12); +} + +static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) +{ + uint32_t default_mtype; + uint32_t ape1_mtype; + + default_mtype = (default_policy == cache_policy_coherent) ? + MTYPE_NONCACHED : + MTYPE_CACHED; + + ape1_mtype = (alternate_policy == cache_policy_coherent) ? + MTYPE_NONCACHED : + MTYPE_CACHED; + + qpd->sh_mem_config = (qpd->sh_mem_config & PTR32) + | ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) + | DEFAULT_MTYPE(default_mtype) + | APE1_MTYPE(ape1_mtype); + + return true; +} + +static int register_process_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + unsigned int temp; + + BUG_ON(!dqm || !qpd); + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) | + DEFAULT_MTYPE(MTYPE_NONCACHED) | + APE1_MTYPE(MTYPE_NONCACHED); + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + if (qpd->pqm->process->is_32bit_user_mode) { + temp = get_sh_mem_bases_32(pdd); + qpd->sh_mem_bases = SHARED_BASE(temp); + qpd->sh_mem_config |= PTR32; + } else { + temp = get_sh_mem_bases_nybble_64(pdd); + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); + } + + pr_debug("kfd: is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", + qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); + + return 0; +} + +static int initialize_cpsch_cik(struct device_queue_manager *dqm) +{ + return init_pipelines(dqm, get_pipes_num(dqm), get_first_pipe(dqm)); +} diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c new file mode 100644 index 000000000..20553dcd2 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c @@ -0,0 +1,64 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_device_queue_manager.h" + +static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size); +static int register_process_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static int initialize_cpsch_vi(struct device_queue_manager *dqm); + +void device_queue_manager_init_vi(struct device_queue_manager_ops *ops) +{ + pr_warn("amdkfd: VI DQM is not currently supported\n"); + + ops->set_cache_memory_policy = set_cache_memory_policy_vi; + ops->register_process = register_process_vi; + ops->initialize = initialize_cpsch_vi; +} + +static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + enum cache_policy default_policy, + enum cache_policy alternate_policy, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) +{ + return false; +} + +static int register_process_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + return -1; +} + +static int initialize_cpsch_vi(struct device_queue_manager *dqm) +{ + return 0; +} diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c new file mode 100644 index 000000000..17e56dcc8 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -0,0 +1,249 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "kfd_priv.h" +#include +#include +#include +#include + +/* + * This extension supports a kernel level doorbells management for + * the kernel queues. + * Basically the last doorbells page is devoted to kernel queues + * and that's assures that any user process won't get access to the + * kernel doorbells page + */ + +#define KERNEL_DOORBELL_PASID 1 +#define KFD_SIZE_OF_DOORBELL_IN_BYTES 4 + +/* + * Each device exposes a doorbell aperture, a PCI MMIO aperture that + * receives 32-bit writes that are passed to queues as wptr values. + * The doorbells are intended to be written by applications as part + * of queueing work on user-mode queues. + * We assign doorbells to applications in PAGE_SIZE-sized and aligned chunks. + * We map the doorbell address space into user-mode when a process creates + * its first queue on each device. + * Although the mapping is done by KFD, it is equivalent to an mmap of + * the /dev/kfd with the particular device encoded in the mmap offset. + * There will be other uses for mmap of /dev/kfd, so only a range of + * offsets (KFD_MMAP_DOORBELL_START-END) is used for doorbells. + */ + +/* # of doorbell bytes allocated for each process. */ +static inline size_t doorbell_process_allocation(void) +{ + return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES * + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, + PAGE_SIZE); +} + +/* Doorbell calculations for device init. */ +void kfd_doorbell_init(struct kfd_dev *kfd) +{ + size_t doorbell_start_offset; + size_t doorbell_aperture_size; + size_t doorbell_process_limit; + + /* + * We start with calculations in bytes because the input data might + * only be byte-aligned. + * Only after we have done the rounding can we assume any alignment. + */ + + doorbell_start_offset = + roundup(kfd->shared_resources.doorbell_start_offset, + doorbell_process_allocation()); + + doorbell_aperture_size = + rounddown(kfd->shared_resources.doorbell_aperture_size, + doorbell_process_allocation()); + + if (doorbell_aperture_size > doorbell_start_offset) + doorbell_process_limit = + (doorbell_aperture_size - doorbell_start_offset) / + doorbell_process_allocation(); + else + doorbell_process_limit = 0; + + kfd->doorbell_base = kfd->shared_resources.doorbell_physical_address + + doorbell_start_offset; + + kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32); + kfd->doorbell_process_limit = doorbell_process_limit - 1; + + kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, + doorbell_process_allocation()); + + BUG_ON(!kfd->doorbell_kernel_ptr); + + pr_debug("kfd: doorbell initialization:\n"); + pr_debug("kfd: doorbell base == 0x%08lX\n", + (uintptr_t)kfd->doorbell_base); + + pr_debug("kfd: doorbell_id_offset == 0x%08lX\n", + kfd->doorbell_id_offset); + + pr_debug("kfd: doorbell_process_limit == 0x%08lX\n", + doorbell_process_limit); + + pr_debug("kfd: doorbell_kernel_offset == 0x%08lX\n", + (uintptr_t)kfd->doorbell_base); + + pr_debug("kfd: doorbell aperture size == 0x%08lX\n", + kfd->shared_resources.doorbell_aperture_size); + + pr_debug("kfd: doorbell kernel address == 0x%08lX\n", + (uintptr_t)kfd->doorbell_kernel_ptr); +} + +int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) +{ + phys_addr_t address; + struct kfd_dev *dev; + + /* + * For simplicitly we only allow mapping of the entire doorbell + * allocation of a single device & process. + */ + if (vma->vm_end - vma->vm_start != doorbell_process_allocation()) + return -EINVAL; + + /* Find kfd device according to gpu id */ + dev = kfd_device_by_id(vma->vm_pgoff); + if (dev == NULL) + return -EINVAL; + + /* Calculate physical address of doorbell */ + address = kfd_get_process_doorbells(dev, process); + + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | + VM_DONTDUMP | VM_PFNMAP; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + pr_debug("kfd: mapping doorbell page in kfd_doorbell_mmap\n" + " target user address == 0x%08llX\n" + " physical address == 0x%08llX\n" + " vm_flags == 0x%04lX\n" + " size == 0x%04lX\n", + (unsigned long long) vma->vm_start, address, vma->vm_flags, + doorbell_process_allocation()); + + + return io_remap_pfn_range(vma, + vma->vm_start, + address >> PAGE_SHIFT, + doorbell_process_allocation(), + vma->vm_page_prot); +} + + +/* get kernel iomem pointer for a doorbell */ +u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + unsigned int *doorbell_off) +{ + u32 inx; + + BUG_ON(!kfd || !doorbell_off); + + mutex_lock(&kfd->doorbell_mutex); + inx = find_first_zero_bit(kfd->doorbell_available_index, + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); + + __set_bit(inx, kfd->doorbell_available_index); + mutex_unlock(&kfd->doorbell_mutex); + + if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) + return NULL; + + /* + * Calculating the kernel doorbell offset using "faked" kernel + * pasid that allocated for kernel queues only + */ + *doorbell_off = KERNEL_DOORBELL_PASID * (doorbell_process_allocation() / + sizeof(u32)) + inx; + + pr_debug("kfd: get kernel queue doorbell\n" + " doorbell offset == 0x%08d\n" + " kernel address == 0x%08lX\n", + *doorbell_off, (uintptr_t)(kfd->doorbell_kernel_ptr + inx)); + + return kfd->doorbell_kernel_ptr + inx; +} + +void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) +{ + unsigned int inx; + + BUG_ON(!kfd || !db_addr); + + inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr); + + mutex_lock(&kfd->doorbell_mutex); + __clear_bit(inx, kfd->doorbell_available_index); + mutex_unlock(&kfd->doorbell_mutex); +} + +inline void write_kernel_doorbell(u32 __iomem *db, u32 value) +{ + if (db) { + writel(value, db); + pr_debug("writing %d to doorbell address 0x%p\n", value, db); + } +} + +/* + * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1 + * to doorbells with the process's doorbell page + */ +unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, + struct kfd_process *process, + unsigned int queue_id) +{ + /* + * doorbell_id_offset accounts for doorbells taken by KGD. + * pasid * doorbell_process_allocation/sizeof(u32) adjusts + * to the process's doorbells + */ + return kfd->doorbell_id_offset + + process->pasid * (doorbell_process_allocation()/sizeof(u32)) + + queue_id; +} + +uint64_t kfd_get_number_elems(struct kfd_dev *kfd) +{ + uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - + kfd->shared_resources.doorbell_start_offset) / + doorbell_process_allocation() + 1; + + return num_of_elems; + +} + +phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process) +{ + return dev->doorbell_base + + process->pasid * doorbell_process_allocation(); +} diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c new file mode 100644 index 000000000..35b987574 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -0,0 +1,355 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kfd_priv.h" +#include +#include +#include + +/* + * The primary memory I/O features being added for revisions of gfxip + * beyond 7.0 (Kaveri) are: + * + * Access to ATC/IOMMU mapped memory w/ associated extension of VA to 48b + * + * “Flat” shader memory access – These are new shader vector memory + * operations that do not reference a T#/V# so a “pointer” is what is + * sourced from the vector gprs for direct access to memory. + * This pointer space has the Shared(LDS) and Private(Scratch) memory + * mapped into this pointer space as apertures. + * The hardware then determines how to direct the memory request + * based on what apertures the request falls in. + * + * Unaligned support and alignment check + * + * + * System Unified Address - SUA + * + * The standard usage for GPU virtual addresses are that they are mapped by + * a set of page tables we call GPUVM and these page tables are managed by + * a combination of vidMM/driver software components. The current virtual + * address (VA) range for GPUVM is 40b. + * + * As of gfxip7.1 and beyond we’re adding the ability for compute memory + * clients (CP/RLC, DMA, SHADER(ifetch, scalar, and vector ops)) to access + * the same page tables used by host x86 processors and that are managed by + * the operating system. This is via a technique and hardware called ATC/IOMMU. + * The GPU has the capability of accessing both the GPUVM and ATC address + * spaces for a given VMID (process) simultaneously and we call this feature + * system unified address (SUA). + * + * There are three fundamental address modes of operation for a given VMID + * (process) on the GPU: + * + * HSA64 – 64b pointers and the default address space is ATC + * HSA32 – 32b pointers and the default address space is ATC + * GPUVM – 64b pointers and the default address space is GPUVM (driver + * model mode) + * + * + * HSA64 - ATC/IOMMU 64b + * + * A 64b pointer in the AMD64/IA64 CPU architecture is not fully utilized + * by the CPU so an AMD CPU can only access the high area + * (VA[63:47] == 0x1FFFF) and low area (VA[63:47 == 0) of the address space + * so the actual VA carried to translation is 48b. There is a “hole” in + * the middle of the 64b VA space. + * + * The GPU not only has access to all of the CPU accessible address space via + * ATC/IOMMU, but it also has access to the GPUVM address space. The “system + * unified address” feature (SUA) is the mapping of GPUVM and ATC address + * spaces into a unified pointer space. The method we take for 64b mode is + * to map the full 40b GPUVM address space into the hole of the 64b address + * space. + + * The GPUVM_Base/GPUVM_Limit defines the aperture in the 64b space where we + * direct requests to be translated via GPUVM page tables instead of the + * IOMMU path. + * + * + * 64b to 49b Address conversion + * + * Note that there are still significant portions of unused regions (holes) + * in the 64b address space even for the GPU. There are several places in + * the pipeline (sw and hw), we wish to compress the 64b virtual address + * to a 49b address. This 49b address is constituted of an “ATC” bit + * plus a 48b virtual address. This 49b address is what is passed to the + * translation hardware. ATC==0 means the 48b address is a GPUVM address + * (max of 2^40 – 1) intended to be translated via GPUVM page tables. + * ATC==1 means the 48b address is intended to be translated via IOMMU + * page tables. + * + * A 64b pointer is compared to the apertures that are defined (Base/Limit), in + * this case the GPUVM aperture (red) is defined and if a pointer falls in this + * aperture, we subtract the GPUVM_Base address and set the ATC bit to zero + * as part of the 64b to 49b conversion. + * + * Where this 64b to 49b conversion is done is a function of the usage. + * Most GPU memory access is via memory objects where the driver builds + * a descriptor which consists of a base address and a memory access by + * the GPU usually consists of some kind of an offset or Cartesian coordinate + * that references this memory descriptor. This is the case for shader + * instructions that reference the T# or V# constants, or for specified + * locations of assets (ex. the shader program location). In these cases + * the driver is what handles the 64b to 49b conversion and the base + * address in the descriptor (ex. V# or T# or shader program location) + * is defined as a 48b address w/ an ATC bit. For this usage a given + * memory object cannot straddle multiple apertures in the 64b address + * space. For example a shader program cannot jump in/out between ATC + * and GPUVM space. + * + * In some cases we wish to pass a 64b pointer to the GPU hardware and + * the GPU hw does the 64b to 49b conversion before passing memory + * requests to the cache/memory system. This is the case for the + * S_LOAD and FLAT_* shader memory instructions where we have 64b pointers + * in scalar and vector GPRs respectively. + * + * In all cases (no matter where the 64b -> 49b conversion is done), the gfxip + * hardware sends a 48b address along w/ an ATC bit, to the memory controller + * on the memory request interfaces. + * + * _MC_rdreq_atc // read request ATC bit + * + * 0 : _MC_rdreq_addr is a GPUVM VA + * + * 1 : _MC_rdreq_addr is a ATC VA + * + * + * “Spare” aperture (APE1) + * + * We use the GPUVM aperture to differentiate ATC vs. GPUVM, but we also use + * apertures to set the Mtype field for S_LOAD/FLAT_* ops which is input to the + * config tables for setting cache policies. The “spare” (APE1) aperture is + * motivated by getting a different Mtype from the default. + * The default aperture isn’t an actual base/limit aperture; it is just the + * address space that doesn’t hit any defined base/limit apertures. + * The following diagram is a complete picture of the gfxip7.x SUA apertures. + * The APE1 can be placed either below or above + * the hole (cannot be in the hole). + * + * + * General Aperture definitions and rules + * + * An aperture register definition consists of a Base, Limit, Mtype, and + * usually an ATC bit indicating which translation tables that aperture uses. + * In all cases (for SUA and DUA apertures discussed later), aperture base + * and limit definitions are 64KB aligned. + * + * _Base[63:0] = { _Base_register[63:16], 0x0000 } + * + * _Limit[63:0] = { _Limit_register[63:16], 0xFFFF } + * + * The base and limit are considered inclusive to an aperture so being + * inside an aperture means (address >= Base) AND (address <= Limit). + * + * In no case is a payload that straddles multiple apertures expected to work. + * For example a load_dword_x4 that starts in one aperture and ends in another, + * does not work. For the vector FLAT_* ops we have detection capability in + * the shader for reporting a “memory violation” back to the + * SQ block for use in traps. + * A memory violation results when an op falls into the hole, + * or a payload straddles multiple apertures. The S_LOAD instruction + * does not have this detection. + * + * Apertures cannot overlap. + * + * + * + * HSA32 - ATC/IOMMU 32b + * + * For HSA32 mode, the pointers are interpreted as 32 bits and use a single GPR + * instead of two for the S_LOAD and FLAT_* ops. The entire GPUVM space of 40b + * will not fit so there is only partial visibility to the GPUVM + * space (defined by the aperture) for S_LOAD and FLAT_* ops. + * There is no spare (APE1) aperture for HSA32 mode. + * + * + * GPUVM 64b mode (driver model) + * + * This mode is related to HSA64 in that the difference really is that + * the default aperture is GPUVM (ATC==0) and not ATC space. + * We have gfxip7.x hardware that has FLAT_* and S_LOAD support for + * SUA GPUVM mode, but does not support HSA32/HSA64. + * + * + * Device Unified Address - DUA + * + * Device unified address (DUA) is the name of the feature that maps the + * Shared(LDS) memory and Private(Scratch) memory into the overall address + * space for use by the new FLAT_* vector memory ops. The Shared and + * Private memories are mapped as apertures into the address space, + * and the hardware detects when a FLAT_* memory request is to be redirected + * to the LDS or Scratch memory when it falls into one of these apertures. + * Like the SUA apertures, the Shared/Private apertures are 64KB aligned and + * the base/limit is “in” the aperture. For both HSA64 and GPUVM SUA modes, + * the Shared/Private apertures are always placed in a limited selection of + * options in the hole of the 64b address space. For HSA32 mode, the + * Shared/Private apertures can be placed anywhere in the 32b space + * except at 0. + * + * + * HSA64 Apertures for FLAT_* vector ops + * + * For HSA64 SUA mode, the Shared and Private apertures are always placed + * in the hole w/ a limited selection of possible locations. The requests + * that fall in the private aperture are expanded as a function of the + * work-item id (tid) and redirected to the location of the + * “hidden private memory”. The hidden private can be placed in either GPUVM + * or ATC space. The addresses that fall in the shared aperture are + * re-directed to the on-chip LDS memory hardware. + * + * + * HSA32 Apertures for FLAT_* vector ops + * + * In HSA32 mode, the Private and Shared apertures can be placed anywhere + * in the 32b space except at 0 (Private or Shared Base at zero disables + * the apertures). If the base address of the apertures are non-zero + * (ie apertures exists), the size is always 64KB. + * + * + * GPUVM Apertures for FLAT_* vector ops + * + * In GPUVM mode, the Shared/Private apertures are specified identically + * to HSA64 mode where they are always in the hole at a limited selection + * of locations. + * + * + * Aperture Definitions for SUA and DUA + * + * The interpretation of the aperture register definitions for a given + * VMID is a function of the “SUA Mode” which is one of HSA64, HSA32, or + * GPUVM64 discussed in previous sections. The mode is first decoded, and + * then the remaining register decode is a function of the mode. + * + * + * SUA Mode Decode + * + * For the S_LOAD and FLAT_* shader operations, the SUA mode is decoded from + * the COMPUTE_DISPATCH_INITIATOR:DATA_ATC bit and + * the SH_MEM_CONFIG:PTR32 bits. + * + * COMPUTE_DISPATCH_INITIATOR:DATA_ATC SH_MEM_CONFIG:PTR32 Mode + * + * 1 0 HSA64 + * + * 1 1 HSA32 + * + * 0 X GPUVM64 + * + * In general the hardware will ignore the PTR32 bit and treat + * as “0” whenever DATA_ATC = “0”, but sw should set PTR32=0 + * when DATA_ATC=0. + * + * The DATA_ATC bit is only set for compute dispatches. + * All “Draw” dispatches are hardcoded to GPUVM64 mode + * for FLAT_* / S_LOAD operations. + */ + +#define MAKE_GPUVM_APP_BASE(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) + +#define MAKE_GPUVM_APP_LIMIT(base) \ + (((uint64_t)(base) & \ + 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL) + +#define MAKE_SCRATCH_APP_BASE(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x100000000L) + +#define MAKE_SCRATCH_APP_LIMIT(base) \ + (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + +#define MAKE_LDS_APP_BASE(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x0) +#define MAKE_LDS_APP_LIMIT(base) \ + (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + +int kfd_init_apertures(struct kfd_process *process) +{ + uint8_t id = 0; + struct kfd_dev *dev; + struct kfd_process_device *pdd; + + /*Iterating over all devices*/ + while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL && + id < NUM_OF_SUPPORTED_GPUS) { + + pdd = kfd_create_process_device_data(dev, process); + if (pdd == NULL) { + pr_err("Failed to create process device data\n"); + return -1; + } + /* + * For 64 bit process aperture will be statically reserved in + * the x86_64 non canonical process address space + * amdkfd doesn't currently support apertures for 32 bit process + */ + if (process->is_32bit_user_mode) { + pdd->lds_base = pdd->lds_limit = 0; + pdd->gpuvm_base = pdd->gpuvm_limit = 0; + pdd->scratch_base = pdd->scratch_limit = 0; + } else { + /* + * node id couldn't be 0 - the three MSB bits of + * aperture shoudn't be 0 + */ + pdd->lds_base = MAKE_LDS_APP_BASE(id + 1); + + pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); + + pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1); + + pdd->gpuvm_limit = + MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base); + + pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1); + + pdd->scratch_limit = + MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); + } + + dev_dbg(kfd_device, "node id %u\n", id); + dev_dbg(kfd_device, "gpu id %u\n", pdd->dev->id); + dev_dbg(kfd_device, "lds_base %llX\n", pdd->lds_base); + dev_dbg(kfd_device, "lds_limit %llX\n", pdd->lds_limit); + dev_dbg(kfd_device, "gpuvm_base %llX\n", pdd->gpuvm_base); + dev_dbg(kfd_device, "gpuvm_limit %llX\n", pdd->gpuvm_limit); + dev_dbg(kfd_device, "scratch_base %llX\n", pdd->scratch_base); + dev_dbg(kfd_device, "scratch_limit %llX\n", pdd->scratch_limit); + + id++; + } + + return 0; +} + + diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c new file mode 100644 index 000000000..c7d298e62 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -0,0 +1,340 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include "kfd_kernel_queue.h" +#include "kfd_priv.h" +#include "kfd_device_queue_manager.h" +#include "kfd_pm4_headers.h" +#include "kfd_pm4_opcodes.h" + +#define PM4_COUNT_ZERO (((1 << 15) - 1) << 16) + +static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size) +{ + struct queue_properties prop; + int retval; + union PM4_MES_TYPE_3_HEADER nop; + + BUG_ON(!kq || !dev); + BUG_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ); + + pr_debug("amdkfd: In func %s initializing queue type %d size %d\n", + __func__, KFD_QUEUE_TYPE_HIQ, queue_size); + + nop.opcode = IT_NOP; + nop.type = PM4_TYPE_3; + nop.u32all |= PM4_COUNT_ZERO; + + kq->dev = dev; + kq->nop_packet = nop.u32all; + switch (type) { + case KFD_QUEUE_TYPE_DIQ: + case KFD_QUEUE_TYPE_HIQ: + kq->mqd = dev->dqm->ops.get_mqd_manager(dev->dqm, + KFD_MQD_TYPE_HIQ); + break; + default: + BUG(); + break; + } + + if (kq->mqd == NULL) + return false; + + prop.doorbell_ptr = kfd_get_kernel_doorbell(dev, &prop.doorbell_off); + + if (prop.doorbell_ptr == NULL) { + pr_err("amdkfd: error init doorbell"); + goto err_get_kernel_doorbell; + } + + retval = kfd_gtt_sa_allocate(dev, queue_size, &kq->pq); + if (retval != 0) { + pr_err("amdkfd: error init pq queues size (%d)\n", queue_size); + goto err_pq_allocate_vidmem; + } + + kq->pq_kernel_addr = kq->pq->cpu_ptr; + kq->pq_gpu_addr = kq->pq->gpu_addr; + + retval = kq->ops_asic_specific.initialize(kq, dev, type, queue_size); + if (retval == false) + goto err_eop_allocate_vidmem; + + retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->rptr_kernel), + &kq->rptr_mem); + + if (retval != 0) + goto err_rptr_allocate_vidmem; + + kq->rptr_kernel = kq->rptr_mem->cpu_ptr; + kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; + + retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel), + &kq->wptr_mem); + + if (retval != 0) + goto err_wptr_allocate_vidmem; + + kq->wptr_kernel = kq->wptr_mem->cpu_ptr; + kq->wptr_gpu_addr = kq->wptr_mem->gpu_addr; + + memset(kq->pq_kernel_addr, 0, queue_size); + memset(kq->rptr_kernel, 0, sizeof(*kq->rptr_kernel)); + memset(kq->wptr_kernel, 0, sizeof(*kq->wptr_kernel)); + + prop.queue_size = queue_size; + prop.is_interop = false; + prop.priority = 1; + prop.queue_percent = 100; + prop.type = type; + prop.vmid = 0; + prop.queue_address = kq->pq_gpu_addr; + prop.read_ptr = (uint32_t *) kq->rptr_gpu_addr; + prop.write_ptr = (uint32_t *) kq->wptr_gpu_addr; + prop.eop_ring_buffer_address = kq->eop_gpu_addr; + prop.eop_ring_buffer_size = PAGE_SIZE; + + if (init_queue(&kq->queue, prop) != 0) + goto err_init_queue; + + kq->queue->device = dev; + kq->queue->process = kfd_get_process(current); + + retval = kq->mqd->init_mqd(kq->mqd, &kq->queue->mqd, + &kq->queue->mqd_mem_obj, + &kq->queue->gart_mqd_addr, + &kq->queue->properties); + if (retval != 0) + goto err_init_mqd; + + /* assign HIQ to HQD */ + if (type == KFD_QUEUE_TYPE_HIQ) { + pr_debug("assigning hiq to hqd\n"); + kq->queue->pipe = KFD_CIK_HIQ_PIPE; + kq->queue->queue = KFD_CIK_HIQ_QUEUE; + kq->mqd->load_mqd(kq->mqd, kq->queue->mqd, kq->queue->pipe, + kq->queue->queue, NULL); + } else { + /* allocate fence for DIQ */ + + retval = kfd_gtt_sa_allocate(dev, sizeof(uint32_t), + &kq->fence_mem_obj); + + if (retval != 0) + goto err_alloc_fence; + + kq->fence_kernel_address = kq->fence_mem_obj->cpu_ptr; + kq->fence_gpu_addr = kq->fence_mem_obj->gpu_addr; + } + + print_queue(kq->queue); + + return true; +err_alloc_fence: +err_init_mqd: + uninit_queue(kq->queue); +err_init_queue: + kfd_gtt_sa_free(dev, kq->wptr_mem); +err_wptr_allocate_vidmem: + kfd_gtt_sa_free(dev, kq->rptr_mem); +err_rptr_allocate_vidmem: + kfd_gtt_sa_free(dev, kq->eop_mem); +err_eop_allocate_vidmem: + kfd_gtt_sa_free(dev, kq->pq); +err_pq_allocate_vidmem: + kfd_release_kernel_doorbell(dev, prop.doorbell_ptr); +err_get_kernel_doorbell: + return false; + +} + +static void uninitialize(struct kernel_queue *kq) +{ + BUG_ON(!kq); + + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) + kq->mqd->destroy_mqd(kq->mqd, + NULL, + false, + QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS, + kq->queue->pipe, + kq->queue->queue); + else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ) + kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj); + + kq->mqd->uninit_mqd(kq->mqd, kq->queue->mqd, kq->queue->mqd_mem_obj); + + kfd_gtt_sa_free(kq->dev, kq->rptr_mem); + kfd_gtt_sa_free(kq->dev, kq->wptr_mem); + kq->ops_asic_specific.uninitialize(kq); + kfd_gtt_sa_free(kq->dev, kq->pq); + kfd_release_kernel_doorbell(kq->dev, + kq->queue->properties.doorbell_ptr); + uninit_queue(kq->queue); +} + +static int acquire_packet_buffer(struct kernel_queue *kq, + size_t packet_size_in_dwords, unsigned int **buffer_ptr) +{ + size_t available_size; + size_t queue_size_dwords; + uint32_t wptr, rptr; + unsigned int *queue_address; + + BUG_ON(!kq || !buffer_ptr); + + rptr = *kq->rptr_kernel; + wptr = *kq->wptr_kernel; + queue_address = (unsigned int *)kq->pq_kernel_addr; + queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t); + + pr_debug("amdkfd: In func %s\nrptr: %d\nwptr: %d\nqueue_address 0x%p\n", + __func__, rptr, wptr, queue_address); + + available_size = (rptr - 1 - wptr + queue_size_dwords) % + queue_size_dwords; + + if (packet_size_in_dwords >= queue_size_dwords || + packet_size_in_dwords >= available_size) { + /* + * make sure calling functions know + * acquire_packet_buffer() failed + */ + *buffer_ptr = NULL; + return -ENOMEM; + } + + if (wptr + packet_size_in_dwords >= queue_size_dwords) { + while (wptr > 0) { + queue_address[wptr] = kq->nop_packet; + wptr = (wptr + 1) % queue_size_dwords; + } + } + + *buffer_ptr = &queue_address[wptr]; + kq->pending_wptr = wptr + packet_size_in_dwords; + + return 0; +} + +static void submit_packet(struct kernel_queue *kq) +{ +#ifdef DEBUG + int i; +#endif + + BUG_ON(!kq); + +#ifdef DEBUG + for (i = *kq->wptr_kernel; i < kq->pending_wptr; i++) { + pr_debug("0x%2X ", kq->pq_kernel_addr[i]); + if (i % 15 == 0) + pr_debug("\n"); + } + pr_debug("\n"); +#endif + + *kq->wptr_kernel = kq->pending_wptr; + write_kernel_doorbell(kq->queue->properties.doorbell_ptr, + kq->pending_wptr); +} + +static void rollback_packet(struct kernel_queue *kq) +{ + BUG_ON(!kq); + kq->pending_wptr = *kq->queue->properties.write_ptr; +} + +struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + enum kfd_queue_type type) +{ + struct kernel_queue *kq; + + BUG_ON(!dev); + + kq = kzalloc(sizeof(struct kernel_queue), GFP_KERNEL); + if (!kq) + return NULL; + + kq->ops.initialize = initialize; + kq->ops.uninitialize = uninitialize; + kq->ops.acquire_packet_buffer = acquire_packet_buffer; + kq->ops.submit_packet = submit_packet; + kq->ops.rollback_packet = rollback_packet; + + switch (dev->device_info->asic_family) { + case CHIP_CARRIZO: + kernel_queue_init_vi(&kq->ops_asic_specific); + break; + + case CHIP_KAVERI: + kernel_queue_init_cik(&kq->ops_asic_specific); + break; + } + + if (kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE) == false) { + pr_err("amdkfd: failed to init kernel queue\n"); + kfree(kq); + return NULL; + } + return kq; +} + +void kernel_queue_uninit(struct kernel_queue *kq) +{ + BUG_ON(!kq); + + kq->ops.uninitialize(kq); + kfree(kq); +} + +static __attribute__((unused)) void test_kq(struct kfd_dev *dev) +{ + struct kernel_queue *kq; + uint32_t *buffer, i; + int retval; + + BUG_ON(!dev); + + pr_err("amdkfd: starting kernel queue test\n"); + + kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ); + BUG_ON(!kq); + + retval = kq->ops.acquire_packet_buffer(kq, 5, &buffer); + BUG_ON(retval != 0); + for (i = 0; i < 5; i++) + buffer[i] = kq->nop_packet; + kq->ops.submit_packet(kq); + + pr_err("amdkfd: ending kernel queue test\n"); +} + + diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h new file mode 100644 index 000000000..594053136 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h @@ -0,0 +1,101 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_KERNEL_QUEUE_H_ +#define KFD_KERNEL_QUEUE_H_ + +#include +#include +#include "kfd_priv.h" + +/** + * struct kernel_queue_ops + * + * @initialize: Initialize a kernel queue, including allocations of GART memory + * needed for the queue. + * + * @uninitialize: Uninitialize a kernel queue and free all its memory usages. + * + * @acquire_packet_buffer: Returns a pointer to the location in the kernel + * queue ring buffer where the calling function can write its packet. It is + * Guaranteed that there is enough space for that packet. It also updates the + * pending write pointer to that location so subsequent calls to + * acquire_packet_buffer will get a correct write pointer + * + * @submit_packet: Update the write pointer and doorbell of a kernel queue. + * + * @sync_with_hw: Wait until the write pointer and the read pointer of a kernel + * queue are equal, which means the CP has read all the submitted packets. + * + * @rollback_packet: This routine is called if we failed to build an acquired + * packet for some reason. It just overwrites the pending wptr with the current + * one + * + */ +struct kernel_queue_ops { + bool (*initialize)(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); + void (*uninitialize)(struct kernel_queue *kq); + int (*acquire_packet_buffer)(struct kernel_queue *kq, + size_t packet_size_in_dwords, + unsigned int **buffer_ptr); + + void (*submit_packet)(struct kernel_queue *kq); + void (*rollback_packet)(struct kernel_queue *kq); +}; + +struct kernel_queue { + struct kernel_queue_ops ops; + struct kernel_queue_ops ops_asic_specific; + + /* data */ + struct kfd_dev *dev; + struct mqd_manager *mqd; + struct queue *queue; + uint32_t pending_wptr; + unsigned int nop_packet; + + struct kfd_mem_obj *rptr_mem; + uint32_t *rptr_kernel; + uint64_t rptr_gpu_addr; + struct kfd_mem_obj *wptr_mem; + uint32_t *wptr_kernel; + uint64_t wptr_gpu_addr; + struct kfd_mem_obj *pq; + uint64_t pq_gpu_addr; + uint32_t *pq_kernel_addr; + struct kfd_mem_obj *eop_mem; + uint64_t eop_gpu_addr; + uint32_t *eop_kernel_addr; + + struct kfd_mem_obj *fence_mem_obj; + uint64_t fence_gpu_addr; + void *fence_kernel_address; + + struct list_head list; +}; + +void kernel_queue_init_cik(struct kernel_queue_ops *ops); +void kernel_queue_init_vi(struct kernel_queue_ops *ops); + +#endif /* KFD_KERNEL_QUEUE_H_ */ diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c new file mode 100644 index 000000000..a90eb440b --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c @@ -0,0 +1,44 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_kernel_queue.h" + +static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); +static void uninitialize_cik(struct kernel_queue *kq); + +void kernel_queue_init_cik(struct kernel_queue_ops *ops) +{ + ops->initialize = initialize_cik; + ops->uninitialize = uninitialize_cik; +} + +static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size) +{ + return true; +} + +static void uninitialize_cik(struct kernel_queue *kq) +{ +} diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c new file mode 100644 index 000000000..f1d48281e --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c @@ -0,0 +1,56 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_kernel_queue.h" + +static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); +static void uninitialize_vi(struct kernel_queue *kq); + +void kernel_queue_init_vi(struct kernel_queue_ops *ops) +{ + ops->initialize = initialize_vi; + ops->uninitialize = uninitialize_vi; +} + +static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size) +{ + int retval; + + retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); + if (retval != 0) + return false; + + kq->eop_gpu_addr = kq->eop_mem->gpu_addr; + kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; + + memset(kq->eop_kernel_addr, 0, PAGE_SIZE); + + return true; +} + +static void uninitialize_vi(struct kernel_queue *kq) +{ + kfd_gtt_sa_free(kq->dev, kq->eop_mem); +} diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_module.c new file mode 100644 index 000000000..4e0a68f13 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -0,0 +1,138 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include "kfd_priv.h" + +#define KFD_DRIVER_AUTHOR "AMD Inc. and others" + +#define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs" +#define KFD_DRIVER_DATE "20150122" +#define KFD_DRIVER_MAJOR 0 +#define KFD_DRIVER_MINOR 7 +#define KFD_DRIVER_PATCHLEVEL 1 + +static const struct kgd2kfd_calls kgd2kfd = { + .exit = kgd2kfd_exit, + .probe = kgd2kfd_probe, + .device_init = kgd2kfd_device_init, + .device_exit = kgd2kfd_device_exit, + .interrupt = kgd2kfd_interrupt, + .suspend = kgd2kfd_suspend, + .resume = kgd2kfd_resume, +}; + +int sched_policy = KFD_SCHED_POLICY_HWS; +module_param(sched_policy, int, 0444); +MODULE_PARM_DESC(sched_policy, + "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); + +int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; +module_param(max_num_of_queues_per_device, int, 0444); +MODULE_PARM_DESC(max_num_of_queues_per_device, + "Maximum number of supported queues per device (1 = Minimum, 4096 = default)"); + +bool kgd2kfd_init(unsigned interface_version, const struct kgd2kfd_calls **g2f) +{ + /* + * Only one interface version is supported, + * no kfd/kgd version skew allowed. + */ + if (interface_version != KFD_INTERFACE_VERSION) + return false; + + *g2f = &kgd2kfd; + + return true; +} +EXPORT_SYMBOL(kgd2kfd_init); + +void kgd2kfd_exit(void) +{ +} + +static int __init kfd_module_init(void) +{ + int err; + + /* Verify module parameters */ + if ((sched_policy < KFD_SCHED_POLICY_HWS) || + (sched_policy > KFD_SCHED_POLICY_NO_HWS)) { + pr_err("kfd: sched_policy has invalid value\n"); + return -1; + } + + /* Verify module parameters */ + if ((max_num_of_queues_per_device < 1) || + (max_num_of_queues_per_device > + KFD_MAX_NUM_OF_QUEUES_PER_DEVICE)) { + pr_err("kfd: max_num_of_queues_per_device must be between 1 to KFD_MAX_NUM_OF_QUEUES_PER_DEVICE\n"); + return -1; + } + + err = kfd_pasid_init(); + if (err < 0) + goto err_pasid; + + err = kfd_chardev_init(); + if (err < 0) + goto err_ioctl; + + err = kfd_topology_init(); + if (err < 0) + goto err_topology; + + kfd_process_create_wq(); + + dev_info(kfd_device, "Initialized module\n"); + + return 0; + +err_topology: + kfd_chardev_exit(); +err_ioctl: + kfd_pasid_exit(); +err_pasid: + return err; +} + +static void __exit kfd_module_exit(void) +{ + kfd_process_destroy_wq(); + kfd_topology_shutdown(); + kfd_chardev_exit(); + kfd_pasid_exit(); + dev_info(kfd_device, "Removed module\n"); +} + +module_init(kfd_module_init); +module_exit(kfd_module_exit); + +MODULE_AUTHOR(KFD_DRIVER_AUTHOR); +MODULE_DESCRIPTION(KFD_DRIVER_DESC); +MODULE_LICENSE("GPL and additional rights"); +MODULE_VERSION(__stringify(KFD_DRIVER_MAJOR) "." + __stringify(KFD_DRIVER_MINOR) "." + __stringify(KFD_DRIVER_PATCHLEVEL)); diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c new file mode 100644 index 000000000..b1ef1368c --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -0,0 +1,37 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_priv.h" + +struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + switch (dev->device_info->asic_family) { + case CHIP_KAVERI: + return mqd_manager_init_cik(type, dev); + case CHIP_CARRIZO: + return mqd_manager_init_vi(type, dev); + } + + return NULL; +} diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h new file mode 100644 index 000000000..213a71e0b --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -0,0 +1,91 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_MQD_MANAGER_H_ +#define KFD_MQD_MANAGER_H_ + +#include "kfd_priv.h" + +/** + * struct mqd_manager + * + * @init_mqd: Allocates the mqd buffer on local gpu memory and initialize it. + * + * @load_mqd: Loads the mqd to a concrete hqd slot. Used only for no cp + * scheduling mode. + * + * @update_mqd: Handles a update call for the MQD + * + * @destroy_mqd: Destroys the HQD slot and by that preempt the relevant queue. + * Used only for no cp scheduling. + * + * @uninit_mqd: Releases the mqd buffer from local gpu memory. + * + * @is_occupied: Checks if the relevant HQD slot is occupied. + * + * @mqd_mutex: Mqd manager mutex. + * + * @dev: The kfd device structure coupled with this module. + * + * MQD stands for Memory Queue Descriptor which represents the current queue + * state in the memory and initiate the HQD (Hardware Queue Descriptor) state. + * This structure is actually a base class for the different types of MQDs + * structures for the variant ASICs that should be supported in the future. + * This base class is also contains all the MQD specific operations. + * Another important thing to mention is that each queue has a MQD that keeps + * his state (or context) after each preemption or reassignment. + * Basically there are a instances of the mqd manager class per MQD type per + * ASIC. Currently the kfd driver supports only Kaveri so there are instances + * per KFD_MQD_TYPE for each device. + * + */ + +struct mqd_manager { + int (*init_mqd)(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q); + + int (*load_mqd)(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t __user *wptr); + + int (*update_mqd)(struct mqd_manager *mm, void *mqd, + struct queue_properties *q); + + int (*destroy_mqd)(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id); + + void (*uninit_mqd)(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj); + + bool (*is_occupied)(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id); + + struct mutex mqd_mutex; + struct kfd_dev *dev; +}; + +#endif /* KFD_MQD_MANAGER_H_ */ diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c new file mode 100644 index 000000000..434979428 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -0,0 +1,451 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include +#include +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "cik_regs.h" +#include "cik_structs.h" + +static inline struct cik_mqd *get_mqd(void *mqd) +{ + return (struct cik_mqd *)mqd; +} + +static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + uint64_t addr; + struct cik_mqd *m; + int retval; + + BUG_ON(!mm || !q || !mqd); + + pr_debug("kfd: In func %s\n", __func__); + + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr; + addr = (*mqd_mem_obj)->gpu_addr; + + memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256)); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + /* + * Make sure to use the last queue state saved on mqd when the cp + * reassigns the queue, so when queue is switched on/off (e.g over + * subscription or quantum timeout) the context will be consistent + */ + m->cp_hqd_persistent_state = + DEFAULT_CP_HQD_PERSISTENT_STATE | PRELOAD_REQ; + + m->cp_mqd_control = MQD_CONTROL_PRIV_STATE_EN; + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE | IB_ATC_EN; + /* Although WinKFD writes this, I suspect it should not be necessary */ + m->cp_hqd_ib_control = IB_ATC_EN | DEFAULT_MIN_IB_AVAIL_SIZE; + + m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | + QUANTUM_DURATION(10); + + /* + * Pipe Priority + * Identifies the pipe relative priority when this queue is connected + * to the pipeline. The pipe priority is against the GFX pipe and HP3D. + * In KFD we are using a fixed pipe priority set to CS_MEDIUM. + * 0 = CS_LOW (typically below GFX) + * 1 = CS_MEDIUM (typically between HP3D and GFX + * 2 = CS_HIGH (typically above HP3D) + */ + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + + if (q->format == KFD_QUEUE_FORMAT_AQL) + m->cp_hqd_iq_rptr = AQL_ENABLE; + + *mqd = m; + if (gart_addr != NULL) + *gart_addr = addr; + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + struct cik_sdma_rlc_registers *m; + + BUG_ON(!mm || !mqd || !mqd_mem_obj); + + retval = kfd_gtt_sa_allocate(mm->dev, + sizeof(struct cik_sdma_rlc_registers), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct cik_sdma_rlc_registers *) (*mqd_mem_obj)->cpu_ptr; + + memset(m, 0, sizeof(struct cik_sdma_rlc_registers)); + + *mqd = m; + if (gart_addr != NULL) + *gart_addr = (*mqd_mem_obj)->gpu_addr; + + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static void uninit_mqd(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + BUG_ON(!mm || !mqd); + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + BUG_ON(!mm || !mqd); + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, + uint32_t queue_id, uint32_t __user *wptr) +{ + return mm->dev->kfd2kgd->hqd_load + (mm->dev->kgd, mqd, pipe_id, queue_id, wptr); +} + +static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t __user *wptr) +{ + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); +} + +static int update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct cik_mqd *m; + + BUG_ON(!mm || !q || !mqd); + + pr_debug("kfd: In func %s\n", __func__); + + m = get_mqd(mqd); + m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | + DEFAULT_MIN_AVAIL_SIZE | PQ_ATC_EN; + + /* + * Calculating queue size which is log base 2 of actual queue size -1 + * dwords and another -1 for ffs + */ + m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) + - 1 - 1; + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_doorbell_control = DOORBELL_EN | + DOORBELL_OFFSET(q->doorbell_off); + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_pq_control |= NO_UPDATE_RPTR; + } + + m->cp_hqd_active = 0; + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0) { + m->cp_hqd_active = 1; + q->is_active = true; + } + + return 0; +} + +static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct cik_sdma_rlc_registers *m; + + BUG_ON(!mm || !mqd || !q); + + m = get_sdma_mqd(mqd); + m->sdma_rlc_rb_cntl = + SDMA_RB_SIZE((ffs(q->queue_size / sizeof(unsigned int)))) | + SDMA_RB_VMID(q->vmid) | + SDMA_RPTR_WRITEBACK_ENABLE | + SDMA_RPTR_WRITEBACK_TIMER(6); + + m->sdma_rlc_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdma_rlc_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdma_rlc_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdma_rlc_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdma_rlc_doorbell = SDMA_OFFSET(q->doorbell_off) | SDMA_DB_ENABLE; + m->sdma_rlc_virtual_addr = q->sdma_vm_addr; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0) { + m->sdma_rlc_rb_cntl |= SDMA_RB_ENABLE; + q->is_active = true; + } + + return 0; +} + +static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_destroy(mm->dev->kgd, type, timeout, + pipe_id, queue_id); +} + +/* + * preempt type here is ignored because there is only one way + * to preempt sdma queue + */ +static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +} + +static bool is_occupied(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + + return mm->dev->kfd2kgd->hqd_is_occupied(mm->dev->kgd, queue_address, + pipe_id, queue_id); + +} + +static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +} + +/* + * HIQ MQD Implementation, concrete implementation for HIQ MQD implementation. + * The HIQ queue in Kaveri is using the same MQD structure as all the user mode + * queues but with different initial values. + */ + +static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + uint64_t addr; + struct cik_mqd *m; + int retval; + + BUG_ON(!mm || !q || !mqd || !mqd_mem_obj); + + pr_debug("kfd: In func %s\n", __func__); + + retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr; + addr = (*mqd_mem_obj)->gpu_addr; + + memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256)); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + m->cp_hqd_persistent_state = DEFAULT_CP_HQD_PERSISTENT_STATE | + PRELOAD_REQ; + m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | + QUANTUM_DURATION(10); + + m->cp_mqd_control = MQD_CONTROL_PRIV_STATE_EN; + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE; + + /* + * Pipe Priority + * Identifies the pipe relative priority when this queue is connected + * to the pipeline. The pipe priority is against the GFX pipe and HP3D. + * In KFD we are using a fixed pipe priority set to CS_MEDIUM. + * 0 = CS_LOW (typically below GFX) + * 1 = CS_MEDIUM (typically between HP3D and GFX + * 2 = CS_HIGH (typically above HP3D) + */ + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + + *mqd = m; + if (gart_addr) + *gart_addr = addr; + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct cik_mqd *m; + + BUG_ON(!mm || !q || !mqd); + + pr_debug("kfd: In func %s\n", __func__); + + m = get_mqd(mqd); + m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | + DEFAULT_MIN_AVAIL_SIZE | + PRIV_STATE | + KMD_QUEUE; + + /* + * Calculating queue size which is log base 2 of actual queue + * size -1 dwords + */ + m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) + - 1 - 1; + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_doorbell_control = DOORBELL_EN | + DOORBELL_OFFSET(q->doorbell_off); + + m->cp_hqd_vmid = q->vmid; + + m->cp_hqd_active = 0; + q->is_active = false; + if (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0) { + m->cp_hqd_active = 1; + q->is_active = true; + } + + return 0; +} + +struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) +{ + struct cik_sdma_rlc_registers *m; + + BUG_ON(!mqd); + + m = (struct cik_sdma_rlc_registers *)mqd; + + return m; +} + +struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + BUG_ON(!dev); + BUG_ON(type >= KFD_MQD_TYPE_MAX); + + pr_debug("kfd: In func %s\n", __func__); + + mqd = kzalloc(sizeof(struct mqd_manager), GFP_KERNEL); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case KFD_MQD_TYPE_CP: + case KFD_MQD_TYPE_COMPUTE: + mqd->init_mqd = init_mqd; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + break; + case KFD_MQD_TYPE_HIQ: + mqd->init_mqd = init_mqd_hiq; + mqd->uninit_mqd = uninit_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + break; + case KFD_MQD_TYPE_SDMA: + mqd->init_mqd = init_mqd_sdma; + mqd->uninit_mqd = uninit_mqd_sdma; + mqd->load_mqd = load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} + diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c new file mode 100644 index 000000000..b3a7e3ba1 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -0,0 +1,33 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" + +struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + pr_warn("amdkfd: VI MQD is not currently supported\n"); + return NULL; +} diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c new file mode 100644 index 000000000..e2533d875 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -0,0 +1,557 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include +#include +#include "kfd_device_queue_manager.h" +#include "kfd_kernel_queue.h" +#include "kfd_priv.h" +#include "kfd_pm4_headers.h" +#include "kfd_pm4_opcodes.h" + +static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, + unsigned int buffer_size_bytes) +{ + unsigned int temp = *wptr + increment_bytes / sizeof(uint32_t); + + BUG_ON((temp * sizeof(uint32_t)) > buffer_size_bytes); + *wptr = temp; +} + +static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size) +{ + union PM4_MES_TYPE_3_HEADER header; + + header.u32all = 0; + header.opcode = opcode; + header.count = packet_size/sizeof(uint32_t) - 2; + header.type = PM4_TYPE_3; + + return header.u32all; +} + +static void pm_calc_rlib_size(struct packet_manager *pm, + unsigned int *rlib_size, + bool *over_subscription) +{ + unsigned int process_count, queue_count; + + BUG_ON(!pm || !rlib_size || !over_subscription); + + process_count = pm->dqm->processes_count; + queue_count = pm->dqm->queue_count; + + /* check if there is over subscription*/ + *over_subscription = false; + if ((process_count > 1) || + queue_count > PIPE_PER_ME_CP_SCHEDULING * QUEUES_PER_PIPE) { + *over_subscription = true; + pr_debug("kfd: over subscribed runlist\n"); + } + + /* calculate run list ib allocation size */ + *rlib_size = process_count * sizeof(struct pm4_map_process) + + queue_count * sizeof(struct pm4_map_queues); + + /* + * Increase the allocation size in case we need a chained run list + * when over subscription + */ + if (*over_subscription) + *rlib_size += sizeof(struct pm4_runlist); + + pr_debug("kfd: runlist ib size %d\n", *rlib_size); +} + +static int pm_allocate_runlist_ib(struct packet_manager *pm, + unsigned int **rl_buffer, + uint64_t *rl_gpu_buffer, + unsigned int *rl_buffer_size, + bool *is_over_subscription) +{ + int retval; + + BUG_ON(!pm); + BUG_ON(pm->allocated == true); + BUG_ON(is_over_subscription == NULL); + + pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); + + retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, + &pm->ib_buffer_obj); + + if (retval != 0) { + pr_err("kfd: failed to allocate runlist IB\n"); + return retval; + } + + *(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr; + *rl_gpu_buffer = pm->ib_buffer_obj->gpu_addr; + + memset(*rl_buffer, 0, *rl_buffer_size); + pm->allocated = true; + return retval; +} + +static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain) +{ + struct pm4_runlist *packet; + + BUG_ON(!pm || !buffer || !ib); + + packet = (struct pm4_runlist *)buffer; + + memset(buffer, 0, sizeof(struct pm4_runlist)); + packet->header.u32all = build_pm4_header(IT_RUN_LIST, + sizeof(struct pm4_runlist)); + + packet->bitfields4.ib_size = ib_size_in_dwords; + packet->bitfields4.chain = chain ? 1 : 0; + packet->bitfields4.offload_polling = 0; + packet->bitfields4.valid = 1; + packet->ordinal2 = lower_32_bits(ib); + packet->bitfields3.ib_base_hi = upper_32_bits(ib); + + return 0; +} + +static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer, + struct qcm_process_device *qpd) +{ + struct pm4_map_process *packet; + struct queue *cur; + uint32_t num_queues; + + BUG_ON(!pm || !buffer || !qpd); + + packet = (struct pm4_map_process *)buffer; + + pr_debug("kfd: In func %s\n", __func__); + + memset(buffer, 0, sizeof(struct pm4_map_process)); + + packet->header.u32all = build_pm4_header(IT_MAP_PROCESS, + sizeof(struct pm4_map_process)); + packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; + packet->bitfields2.process_quantum = 1; + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields3.page_table_base = qpd->page_table_base; + packet->bitfields10.gds_size = qpd->gds_size; + packet->bitfields10.num_gws = qpd->num_gws; + packet->bitfields10.num_oac = qpd->num_oac; + num_queues = 0; + list_for_each_entry(cur, &qpd->queues_list, list) + num_queues++; + packet->bitfields10.num_queues = num_queues; + + packet->sh_mem_config = qpd->sh_mem_config; + packet->sh_mem_bases = qpd->sh_mem_bases; + packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; + packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; + + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); + + return 0; +} + +static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer, + struct queue *q) +{ + struct pm4_map_queues *packet; + + BUG_ON(!pm || !buffer || !q); + + pr_debug("kfd: In func %s\n", __func__); + + packet = (struct pm4_map_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_map_queues)); + + packet->header.u32all = build_pm4_header(IT_MAP_QUEUES, + sizeof(struct pm4_map_queues)); + packet->bitfields2.alloc_format = + alloc_format__mes_map_queues__one_per_pipe; + packet->bitfields2.num_queues = 1; + packet->bitfields2.queue_sel = + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots; + + packet->bitfields2.vidmem = (q->properties.is_interop) ? + vidmem__mes_map_queues__uses_video_memory : + vidmem__mes_map_queues__uses_no_video_memory; + + switch (q->properties.type) { + case KFD_QUEUE_TYPE_COMPUTE: + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__compute; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__sdma0; + break; + default: + BUG(); + break; + } + + packet->mes_map_queues_ordinals[0].bitfields3.doorbell_offset = + q->properties.doorbell_off; + + packet->mes_map_queues_ordinals[0].mqd_addr_lo = + lower_32_bits(q->gart_mqd_addr); + + packet->mes_map_queues_ordinals[0].mqd_addr_hi = + upper_32_bits(q->gart_mqd_addr); + + packet->mes_map_queues_ordinals[0].wptr_addr_lo = + lower_32_bits((uint64_t)q->properties.write_ptr); + + packet->mes_map_queues_ordinals[0].wptr_addr_hi = + upper_32_bits((uint64_t)q->properties.write_ptr); + + return 0; +} + +static int pm_create_runlist_ib(struct packet_manager *pm, + struct list_head *queues, + uint64_t *rl_gpu_addr, + size_t *rl_size_bytes) +{ + unsigned int alloc_size_bytes; + unsigned int *rl_buffer, rl_wptr, i; + int retval, proccesses_mapped; + struct device_process_node *cur; + struct qcm_process_device *qpd; + struct queue *q; + struct kernel_queue *kq; + bool is_over_subscription; + + BUG_ON(!pm || !queues || !rl_size_bytes || !rl_gpu_addr); + + rl_wptr = retval = proccesses_mapped = 0; + + retval = pm_allocate_runlist_ib(pm, &rl_buffer, rl_gpu_addr, + &alloc_size_bytes, &is_over_subscription); + if (retval != 0) + return retval; + + *rl_size_bytes = alloc_size_bytes; + + pr_debug("kfd: In func %s\n", __func__); + pr_debug("kfd: building runlist ib process count: %d queues count %d\n", + pm->dqm->processes_count, pm->dqm->queue_count); + + /* build the run list ib packet */ + list_for_each_entry(cur, queues, list) { + qpd = cur->qpd; + /* build map process packet */ + if (proccesses_mapped >= pm->dqm->processes_count) { + pr_debug("kfd: not enough space left in runlist IB\n"); + pm_release_ib(pm); + return -ENOMEM; + } + retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd); + if (retval != 0) + return retval; + proccesses_mapped++; + inc_wptr(&rl_wptr, sizeof(struct pm4_map_process), + alloc_size_bytes); + + list_for_each_entry(kq, &qpd->priv_queue_list, list) { + if (kq->queue->properties.is_active != true) + continue; + retval = pm_create_map_queue(pm, &rl_buffer[rl_wptr], + kq->queue); + if (retval != 0) + return retval; + inc_wptr(&rl_wptr, sizeof(struct pm4_map_queues), + alloc_size_bytes); + } + + list_for_each_entry(q, &qpd->queues_list, list) { + if (q->properties.is_active != true) + continue; + retval = pm_create_map_queue(pm, + &rl_buffer[rl_wptr], q); + if (retval != 0) + return retval; + inc_wptr(&rl_wptr, sizeof(struct pm4_map_queues), + alloc_size_bytes); + } + } + + pr_debug("kfd: finished map process and queues to runlist\n"); + + if (is_over_subscription) + pm_create_runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, + alloc_size_bytes / sizeof(uint32_t), true); + + for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++) + pr_debug("0x%2X ", rl_buffer[i]); + pr_debug("\n"); + + return 0; +} + +int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) +{ + BUG_ON(!dqm); + + pm->dqm = dqm; + mutex_init(&pm->lock); + pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ); + if (pm->priv_queue == NULL) { + mutex_destroy(&pm->lock); + return -ENOMEM; + } + pm->allocated = false; + + return 0; +} + +void pm_uninit(struct packet_manager *pm) +{ + BUG_ON(!pm); + + mutex_destroy(&pm->lock); + kernel_queue_uninit(pm->priv_queue); +} + +int pm_send_set_resources(struct packet_manager *pm, + struct scheduling_resources *res) +{ + struct pm4_set_resources *packet; + + BUG_ON(!pm || !res); + + pr_debug("kfd: In func %s\n", __func__); + + mutex_lock(&pm->lock); + pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, + sizeof(*packet) / sizeof(uint32_t), + (unsigned int **)&packet); + if (packet == NULL) { + mutex_unlock(&pm->lock); + pr_err("kfd: failed to allocate buffer on kernel queue\n"); + return -ENOMEM; + } + + memset(packet, 0, sizeof(struct pm4_set_resources)); + packet->header.u32all = build_pm4_header(IT_SET_RESOURCES, + sizeof(struct pm4_set_resources)); + + packet->bitfields2.queue_type = + queue_type__mes_set_resources__hsa_interface_queue_hiq; + packet->bitfields2.vmid_mask = res->vmid_mask; + packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY; + packet->bitfields7.oac_mask = res->oac_mask; + packet->bitfields8.gds_heap_base = res->gds_heap_base; + packet->bitfields8.gds_heap_size = res->gds_heap_size; + + packet->gws_mask_lo = lower_32_bits(res->gws_mask); + packet->gws_mask_hi = upper_32_bits(res->gws_mask); + + packet->queue_mask_lo = lower_32_bits(res->queue_mask); + packet->queue_mask_hi = upper_32_bits(res->queue_mask); + + pm->priv_queue->ops.submit_packet(pm->priv_queue); + + mutex_unlock(&pm->lock); + + return 0; +} + +int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) +{ + uint64_t rl_gpu_ib_addr; + uint32_t *rl_buffer; + size_t rl_ib_size, packet_size_dwords; + int retval; + + BUG_ON(!pm || !dqm_queues); + + retval = pm_create_runlist_ib(pm, dqm_queues, &rl_gpu_ib_addr, + &rl_ib_size); + if (retval != 0) + goto fail_create_runlist_ib; + + pr_debug("kfd: runlist IB address: 0x%llX\n", rl_gpu_ib_addr); + + packet_size_dwords = sizeof(struct pm4_runlist) / sizeof(uint32_t); + mutex_lock(&pm->lock); + + retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, + packet_size_dwords, &rl_buffer); + if (retval != 0) + goto fail_acquire_packet_buffer; + + retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr, + rl_ib_size / sizeof(uint32_t), false); + if (retval != 0) + goto fail_create_runlist; + + pm->priv_queue->ops.submit_packet(pm->priv_queue); + + mutex_unlock(&pm->lock); + + return retval; + +fail_create_runlist: + pm->priv_queue->ops.rollback_packet(pm->priv_queue); +fail_acquire_packet_buffer: + mutex_unlock(&pm->lock); +fail_create_runlist_ib: + if (pm->allocated == true) + pm_release_ib(pm); + return retval; +} + +int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + uint32_t fence_value) +{ + int retval; + struct pm4_query_status *packet; + + BUG_ON(!pm || !fence_address); + + mutex_lock(&pm->lock); + retval = pm->priv_queue->ops.acquire_packet_buffer( + pm->priv_queue, + sizeof(struct pm4_query_status) / sizeof(uint32_t), + (unsigned int **)&packet); + if (retval != 0) + goto fail_acquire_packet_buffer; + + packet->header.u32all = build_pm4_header(IT_QUERY_STATUS, + sizeof(struct pm4_query_status)); + + packet->bitfields2.context_id = 0; + packet->bitfields2.interrupt_sel = + interrupt_sel__mes_query_status__completion_status; + packet->bitfields2.command = + command__mes_query_status__fence_only_after_write_ack; + + packet->addr_hi = upper_32_bits((uint64_t)fence_address); + packet->addr_lo = lower_32_bits((uint64_t)fence_address); + packet->data_hi = upper_32_bits((uint64_t)fence_value); + packet->data_lo = lower_32_bits((uint64_t)fence_value); + + pm->priv_queue->ops.submit_packet(pm->priv_queue); + mutex_unlock(&pm->lock); + + return 0; + +fail_acquire_packet_buffer: + mutex_unlock(&pm->lock); + return retval; +} + +int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + enum kfd_preempt_type_filter mode, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) +{ + int retval; + uint32_t *buffer; + struct pm4_unmap_queues *packet; + + BUG_ON(!pm); + + mutex_lock(&pm->lock); + retval = pm->priv_queue->ops.acquire_packet_buffer( + pm->priv_queue, + sizeof(struct pm4_unmap_queues) / sizeof(uint32_t), + &buffer); + if (retval != 0) + goto err_acquire_packet_buffer; + + packet = (struct pm4_unmap_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_unmap_queues)); + + packet->header.u32all = build_pm4_header(IT_UNMAP_QUEUES, + sizeof(struct pm4_unmap_queues)); + switch (type) { + case KFD_QUEUE_TYPE_COMPUTE: + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__compute; + break; + case KFD_QUEUE_TYPE_SDMA: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__sdma0 + sdma_engine; + break; + default: + BUG(); + break; + } + + if (reset) + packet->bitfields2.action = + action__mes_unmap_queues__reset_queues; + else + packet->bitfields2.action = + action__mes_unmap_queues__preempt_queues; + + switch (mode) { + case KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_specified_queues; + packet->bitfields2.num_queues = 1; + packet->bitfields3b.doorbell_offset0 = filter_param; + break; + case KFD_PREEMPT_TYPE_FILTER_BY_PASID: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; + packet->bitfields3a.pasid = filter_param; + break; + case KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_all_active_queues; + break; + default: + BUG(); + break; + }; + + pm->priv_queue->ops.submit_packet(pm->priv_queue); + + mutex_unlock(&pm->lock); + return 0; + +err_acquire_packet_buffer: + mutex_unlock(&pm->lock); + return retval; +} + +void pm_release_ib(struct packet_manager *pm) +{ + BUG_ON(!pm); + + mutex_lock(&pm->lock); + if (pm->allocated) { + kfd_gtt_sa_free(pm->dqm->dev, pm->ib_buffer_obj); + pm->allocated = false; + } + mutex_unlock(&pm->lock); +} diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c new file mode 100644 index 000000000..6cfe7f1f1 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c @@ -0,0 +1,96 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include "kfd_priv.h" + +static unsigned long *pasid_bitmap; +static unsigned int pasid_limit; +static DEFINE_MUTEX(pasid_mutex); + +int kfd_pasid_init(void) +{ + pasid_limit = KFD_MAX_NUM_OF_PROCESSES; + + pasid_bitmap = kcalloc(BITS_TO_LONGS(pasid_limit), sizeof(long), GFP_KERNEL); + if (!pasid_bitmap) + return -ENOMEM; + + set_bit(0, pasid_bitmap); /* PASID 0 is reserved. */ + + return 0; +} + +void kfd_pasid_exit(void) +{ + kfree(pasid_bitmap); +} + +bool kfd_set_pasid_limit(unsigned int new_limit) +{ + if (new_limit < pasid_limit) { + bool ok; + + mutex_lock(&pasid_mutex); + + /* ensure that no pasids >= new_limit are in-use */ + ok = (find_next_bit(pasid_bitmap, pasid_limit, new_limit) == + pasid_limit); + if (ok) + pasid_limit = new_limit; + + mutex_unlock(&pasid_mutex); + + return ok; + } + + return true; +} + +inline unsigned int kfd_get_pasid_limit(void) +{ + return pasid_limit; +} + +unsigned int kfd_pasid_alloc(void) +{ + unsigned int found; + + mutex_lock(&pasid_mutex); + + found = find_first_zero_bit(pasid_bitmap, pasid_limit); + if (found == pasid_limit) + found = 0; + else + set_bit(found, pasid_bitmap); + + mutex_unlock(&pasid_mutex); + + return found; +} + +void kfd_pasid_free(unsigned int pasid) +{ + BUG_ON(pasid == 0 || pasid >= pasid_limit); + clear_bit(pasid, pasid_bitmap); +} diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h new file mode 100644 index 000000000..071ad5724 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers.h @@ -0,0 +1,405 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef KFD_PM4_HEADERS_H_ +#define KFD_PM4_HEADERS_H_ + +#ifndef PM4_MES_HEADER_DEFINED +#define PM4_MES_HEADER_DEFINED +union PM4_MES_TYPE_3_HEADER { + struct { + uint32_t reserved1:8; /* < reserved */ + uint32_t opcode:8; /* < IT opcode */ + uint32_t count:14; /* < number of DWORDs - 1 + * in the information body. + */ + uint32_t type:2; /* < packet identifier. + * It should be 3 for type 3 packets + */ + }; + uint32_t u32all; +}; +#endif /* PM4_MES_HEADER_DEFINED */ + +/* --------------------MES_SET_RESOURCES-------------------- */ + +#ifndef PM4_MES_SET_RESOURCES_DEFINED +#define PM4_MES_SET_RESOURCES_DEFINED +enum set_resources_queue_type_enum { + queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, + queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, + queue_type__mes_set_resources__hsa_debug_interface_queue = 4 +}; + +struct pm4_set_resources { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t vmid_mask:16; + uint32_t unmap_latency:8; + uint32_t reserved1:5; + enum set_resources_queue_type_enum queue_type:3; + } bitfields2; + uint32_t ordinal2; + }; + + uint32_t queue_mask_lo; + uint32_t queue_mask_hi; + uint32_t gws_mask_lo; + uint32_t gws_mask_hi; + + union { + struct { + uint32_t oac_mask:16; + uint32_t reserved2:16; + } bitfields7; + uint32_t ordinal7; + }; + + union { + struct { + uint32_t gds_heap_base:6; + uint32_t reserved3:5; + uint32_t gds_heap_size:6; + uint32_t reserved4:15; + } bitfields8; + uint32_t ordinal8; + }; + +}; +#endif + +/*--------------------MES_RUN_LIST-------------------- */ + +#ifndef PM4_MES_RUN_LIST_DEFINED +#define PM4_MES_RUN_LIST_DEFINED + +struct pm4_runlist { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t reserved1:2; + uint32_t ib_base_lo:30; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t ib_base_hi:16; + uint32_t reserved2:16; + } bitfields3; + uint32_t ordinal3; + }; + + union { + struct { + uint32_t ib_size:20; + uint32_t chain:1; + uint32_t offload_polling:1; + uint32_t reserved3:1; + uint32_t valid:1; + uint32_t reserved4:8; + } bitfields4; + uint32_t ordinal4; + }; + +}; +#endif + +/*--------------------MES_MAP_PROCESS-------------------- */ + +#ifndef PM4_MES_MAP_PROCESS_DEFINED +#define PM4_MES_MAP_PROCESS_DEFINED + +struct pm4_map_process { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:8; + uint32_t diq_enable:1; + uint32_t process_quantum:7; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t page_table_base:28; + uint32_t reserved3:4; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t sh_mem_bases; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; + uint32_t sh_mem_config; + uint32_t gds_addr_lo; + uint32_t gds_addr_hi; + + union { + struct { + uint32_t num_gws:6; + uint32_t reserved4:2; + uint32_t num_oac:4; + uint32_t reserved5:4; + uint32_t gds_size:6; + uint32_t num_queues:10; + } bitfields10; + uint32_t ordinal10; + }; + +}; +#endif + +/*--------------------MES_MAP_QUEUES--------------------*/ + +#ifndef PM4_MES_MAP_QUEUES_DEFINED +#define PM4_MES_MAP_QUEUES_DEFINED +enum map_queues_queue_sel_enum { + queue_sel__mes_map_queues__map_to_specified_queue_slots = 0, + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots = 1, + queue_sel__mes_map_queues__enable_process_queues = 2 +}; + +enum map_queues_vidmem_enum { + vidmem__mes_map_queues__uses_no_video_memory = 0, + vidmem__mes_map_queues__uses_video_memory = 1 +}; + +enum map_queues_alloc_format_enum { + alloc_format__mes_map_queues__one_per_pipe = 0, + alloc_format__mes_map_queues__all_on_one_pipe = 1 +}; + +enum map_queues_engine_sel_enum { + engine_sel__mes_map_queues__compute = 0, + engine_sel__mes_map_queues__sdma0 = 2, + engine_sel__mes_map_queues__sdma1 = 3 +}; + +struct pm4_map_queues { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t reserved1:4; + enum map_queues_queue_sel_enum queue_sel:2; + uint32_t reserved2:2; + uint32_t vmid:4; + uint32_t reserved3:4; + enum map_queues_vidmem_enum vidmem:2; + uint32_t reserved4:6; + enum map_queues_alloc_format_enum alloc_format:2; + enum map_queues_engine_sel_enum engine_sel:3; + uint32_t num_queues:3; + } bitfields2; + uint32_t ordinal2; + }; + + struct { + union { + struct { + uint32_t reserved5:2; + uint32_t doorbell_offset:21; + uint32_t reserved6:3; + uint32_t queue:6; + } bitfields3; + uint32_t ordinal3; + }; + + uint32_t mqd_addr_lo; + uint32_t mqd_addr_hi; + uint32_t wptr_addr_lo; + uint32_t wptr_addr_hi; + + } mes_map_queues_ordinals[1]; /* 1..N of these ordinal groups */ + +}; +#endif + +/*--------------------MES_QUERY_STATUS--------------------*/ + +#ifndef PM4_MES_QUERY_STATUS_DEFINED +#define PM4_MES_QUERY_STATUS_DEFINED +enum query_status_interrupt_sel_enum { + interrupt_sel__mes_query_status__completion_status = 0, + interrupt_sel__mes_query_status__process_status = 1, + interrupt_sel__mes_query_status__queue_status = 2 +}; + +enum query_status_command_enum { + command__mes_query_status__interrupt_only = 0, + command__mes_query_status__fence_only_immediate = 1, + command__mes_query_status__fence_only_after_write_ack = 2, + command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 +}; + +enum query_status_engine_sel_enum { + engine_sel__mes_query_status__compute = 0, + engine_sel__mes_query_status__sdma0_queue = 2, + engine_sel__mes_query_status__sdma1_queue = 3 +}; + +struct pm4_query_status { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + uint32_t context_id:28; + enum query_status_interrupt_sel_enum interrupt_sel:2; + enum query_status_command_enum command:2; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved1:16; + } bitfields3a; + struct { + uint32_t reserved2:2; + uint32_t doorbell_offset:21; + uint32_t reserved3:3; + enum query_status_engine_sel_enum engine_sel:3; + uint32_t reserved4:3; + } bitfields3b; + uint32_t ordinal3; + }; + + uint32_t addr_lo; + uint32_t addr_hi; + uint32_t data_lo; + uint32_t data_hi; +}; +#endif + +/*--------------------MES_UNMAP_QUEUES--------------------*/ + +#ifndef PM4_MES_UNMAP_QUEUES_DEFINED +#define PM4_MES_UNMAP_QUEUES_DEFINED +enum unmap_queues_action_enum { + action__mes_unmap_queues__preempt_queues = 0, + action__mes_unmap_queues__reset_queues = 1, + action__mes_unmap_queues__disable_process_queues = 2 +}; + +enum unmap_queues_queue_sel_enum { + queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, + queue_sel__mes_unmap_queues__perform_request_on_all_active_queues = 2 +}; + +enum unmap_queues_engine_sel_enum { + engine_sel__mes_unmap_queues__compute = 0, + engine_sel__mes_unmap_queues__sdma0 = 2, + engine_sel__mes_unmap_queues__sdma1 = 3 +}; + +struct pm4_unmap_queues { + union { + union PM4_MES_TYPE_3_HEADER header; /* header */ + uint32_t ordinal1; + }; + + union { + struct { + enum unmap_queues_action_enum action:2; + uint32_t reserved1:2; + enum unmap_queues_queue_sel_enum queue_sel:2; + uint32_t reserved2:20; + enum unmap_queues_engine_sel_enum engine_sel:3; + uint32_t num_queues:3; + } bitfields2; + uint32_t ordinal2; + }; + + union { + struct { + uint32_t pasid:16; + uint32_t reserved3:16; + } bitfields3a; + struct { + uint32_t reserved4:2; + uint32_t doorbell_offset0:21; + uint32_t reserved5:9; + } bitfields3b; + uint32_t ordinal3; + }; + + union { + struct { + uint32_t reserved6:2; + uint32_t doorbell_offset1:21; + uint32_t reserved7:9; + } bitfields4; + uint32_t ordinal4; + }; + + union { + struct { + uint32_t reserved8:2; + uint32_t doorbell_offset2:21; + uint32_t reserved9:9; + } bitfields5; + uint32_t ordinal5; + }; + + union { + struct { + uint32_t reserved10:2; + uint32_t doorbell_offset3:21; + uint32_t reserved11:9; + } bitfields6; + uint32_t ordinal6; + }; + +}; +#endif + +enum { + CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 +}; + +#endif /* KFD_PM4_HEADERS_H_ */ diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h new file mode 100644 index 000000000..b72fa3b8c --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_pm4_opcodes.h @@ -0,0 +1,107 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + + +#ifndef KFD_PM4_OPCODES_H +#define KFD_PM4_OPCODES_H + +enum it_opcode_type { + IT_NOP = 0x10, + IT_SET_BASE = 0x11, + IT_CLEAR_STATE = 0x12, + IT_INDEX_BUFFER_SIZE = 0x13, + IT_DISPATCH_DIRECT = 0x15, + IT_DISPATCH_INDIRECT = 0x16, + IT_ATOMIC_GDS = 0x1D, + IT_OCCLUSION_QUERY = 0x1F, + IT_SET_PREDICATION = 0x20, + IT_REG_RMW = 0x21, + IT_COND_EXEC = 0x22, + IT_PRED_EXEC = 0x23, + IT_DRAW_INDIRECT = 0x24, + IT_DRAW_INDEX_INDIRECT = 0x25, + IT_INDEX_BASE = 0x26, + IT_DRAW_INDEX_2 = 0x27, + IT_CONTEXT_CONTROL = 0x28, + IT_INDEX_TYPE = 0x2A, + IT_DRAW_INDIRECT_MULTI = 0x2C, + IT_DRAW_INDEX_AUTO = 0x2D, + IT_NUM_INSTANCES = 0x2F, + IT_DRAW_INDEX_MULTI_AUTO = 0x30, + IT_INDIRECT_BUFFER_CNST = 0x33, + IT_STRMOUT_BUFFER_UPDATE = 0x34, + IT_DRAW_INDEX_OFFSET_2 = 0x35, + IT_DRAW_PREAMBLE = 0x36, + IT_WRITE_DATA = 0x37, + IT_DRAW_INDEX_INDIRECT_MULTI = 0x38, + IT_MEM_SEMAPHORE = 0x39, + IT_COPY_DW = 0x3B, + IT_WAIT_REG_MEM = 0x3C, + IT_INDIRECT_BUFFER = 0x3F, + IT_COPY_DATA = 0x40, + IT_PFP_SYNC_ME = 0x42, + IT_SURFACE_SYNC = 0x43, + IT_COND_WRITE = 0x45, + IT_EVENT_WRITE = 0x46, + IT_EVENT_WRITE_EOP = 0x47, + IT_EVENT_WRITE_EOS = 0x48, + IT_RELEASE_MEM = 0x49, + IT_PREAMBLE_CNTL = 0x4A, + IT_DMA_DATA = 0x50, + IT_ACQUIRE_MEM = 0x58, + IT_REWIND = 0x59, + IT_LOAD_UCONFIG_REG = 0x5E, + IT_LOAD_SH_REG = 0x5F, + IT_LOAD_CONFIG_REG = 0x60, + IT_LOAD_CONTEXT_REG = 0x61, + IT_SET_CONFIG_REG = 0x68, + IT_SET_CONTEXT_REG = 0x69, + IT_SET_CONTEXT_REG_INDIRECT = 0x73, + IT_SET_SH_REG = 0x76, + IT_SET_SH_REG_OFFSET = 0x77, + IT_SET_QUEUE_REG = 0x78, + IT_SET_UCONFIG_REG = 0x79, + IT_SCRATCH_RAM_WRITE = 0x7D, + IT_SCRATCH_RAM_READ = 0x7E, + IT_LOAD_CONST_RAM = 0x80, + IT_WRITE_CONST_RAM = 0x81, + IT_DUMP_CONST_RAM = 0x83, + IT_INCREMENT_CE_COUNTER = 0x84, + IT_INCREMENT_DE_COUNTER = 0x85, + IT_WAIT_ON_CE_COUNTER = 0x86, + IT_WAIT_ON_DE_COUNTER_DIFF = 0x88, + IT_SWITCH_BUFFER = 0x8B, + IT_SET_RESOURCES = 0xA0, + IT_MAP_PROCESS = 0xA1, + IT_MAP_QUEUES = 0xA2, + IT_UNMAP_QUEUES = 0xA3, + IT_QUERY_STATUS = 0xA4, + IT_RUN_LIST = 0xA5, +}; + +#define PM4_TYPE_0 0 +#define PM4_TYPE_2 2 +#define PM4_TYPE_3 3 + +#endif /* KFD_PM4_OPCODES_H */ + diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_priv.h new file mode 100644 index 000000000..f21fccebd --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -0,0 +1,645 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef KFD_PRIV_H_INCLUDED +#define KFD_PRIV_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define KFD_SYSFS_FILE_MODE 0444 + +/* + * When working with cp scheduler we should assign the HIQ manually or via + * the radeon driver to a fixed hqd slot, here are the fixed HIQ hqd slot + * definitions for Kaveri. In Kaveri only the first ME queues participates + * in the cp scheduling taking that in mind we set the HIQ slot in the + * second ME. + */ +#define KFD_CIK_HIQ_PIPE 4 +#define KFD_CIK_HIQ_QUEUE 0 + +/* GPU ID hash width in bits */ +#define KFD_GPU_ID_HASH_WIDTH 16 + +/* Macro for allocating structures */ +#define kfd_alloc_struct(ptr_to_struct) \ + ((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL)) + +#define KFD_MAX_NUM_OF_PROCESSES 512 +#define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024 + +/* + * Kernel module parameter to specify maximum number of supported queues per + * device + */ +extern int max_num_of_queues_per_device; + +#define KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT 4096 +#define KFD_MAX_NUM_OF_QUEUES_PER_DEVICE \ + (KFD_MAX_NUM_OF_PROCESSES * \ + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) + +#define KFD_KERNEL_QUEUE_SIZE 2048 + +/* Kernel module parameter to specify the scheduling policy */ +extern int sched_policy; + +/** + * enum kfd_sched_policy + * + * @KFD_SCHED_POLICY_HWS: H/W scheduling policy known as command processor (cp) + * scheduling. In this scheduling mode we're using the firmware code to + * schedule the user mode queues and kernel queues such as HIQ and DIQ. + * the HIQ queue is used as a special queue that dispatches the configuration + * to the cp and the user mode queues list that are currently running. + * the DIQ queue is a debugging queue that dispatches debugging commands to the + * firmware. + * in this scheduling mode user mode queues over subscription feature is + * enabled. + * + * @KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION: The same as above but the over + * subscription feature disabled. + * + * @KFD_SCHED_POLICY_NO_HWS: no H/W scheduling policy is a mode which directly + * set the command processor registers and sets the queues "manually". This + * mode is used *ONLY* for debugging proposes. + * + */ +enum kfd_sched_policy { + KFD_SCHED_POLICY_HWS = 0, + KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION, + KFD_SCHED_POLICY_NO_HWS +}; + +enum cache_policy { + cache_policy_coherent, + cache_policy_noncoherent +}; + +enum asic_family_type { + CHIP_KAVERI = 0, + CHIP_CARRIZO +}; + +struct kfd_device_info { + unsigned int asic_family; + unsigned int max_pasid_bits; + size_t ih_ring_entry_size; + uint8_t num_of_watch_points; + uint16_t mqd_size_aligned; +}; + +struct kfd_mem_obj { + uint32_t range_start; + uint32_t range_end; + uint64_t gpu_addr; + uint32_t *cpu_ptr; +}; + +struct kfd_dev { + struct kgd_dev *kgd; + + const struct kfd_device_info *device_info; + struct pci_dev *pdev; + + unsigned int id; /* topology stub index */ + + phys_addr_t doorbell_base; /* Start of actual doorbells used by + * KFD. It is aligned for mapping + * into user mode + */ + size_t doorbell_id_offset; /* Doorbell offset (from KFD doorbell + * to HW doorbell, GFX reserved some + * at the start) + */ + size_t doorbell_process_limit; /* Number of processes we have doorbell + * space for. + */ + u32 __iomem *doorbell_kernel_ptr; /* This is a pointer for a doorbells + * page used by kernel queue + */ + + struct kgd2kfd_shared_resources shared_resources; + + const struct kfd2kgd_calls *kfd2kgd; + struct mutex doorbell_mutex; + unsigned long doorbell_available_index[DIV_ROUND_UP( + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)]; + + void *gtt_mem; + uint64_t gtt_start_gpu_addr; + void *gtt_start_cpu_ptr; + void *gtt_sa_bitmap; + struct mutex gtt_sa_lock; + unsigned int gtt_sa_chunk_size; + unsigned int gtt_sa_num_of_chunks; + + /* QCM Device instance */ + struct device_queue_manager *dqm; + + bool init_complete; +}; + +/* KGD2KFD callbacks */ +void kgd2kfd_exit(void); +struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, + struct pci_dev *pdev, const struct kfd2kgd_calls *f2g); +bool kgd2kfd_device_init(struct kfd_dev *kfd, + const struct kgd2kfd_shared_resources *gpu_resources); +void kgd2kfd_device_exit(struct kfd_dev *kfd); + +enum kfd_mempool { + KFD_MEMPOOL_SYSTEM_CACHEABLE = 1, + KFD_MEMPOOL_SYSTEM_WRITECOMBINE = 2, + KFD_MEMPOOL_FRAMEBUFFER = 3, +}; + +/* Character device interface */ +int kfd_chardev_init(void); +void kfd_chardev_exit(void); +struct device *kfd_chardev(void); + +/** + * enum kfd_preempt_type_filter + * + * @KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: Preempts single queue. + * + * @KFD_PRERMPT_TYPE_FILTER_ALL_QUEUES: Preempts all queues in the + * running queues list. + * + * @KFD_PRERMPT_TYPE_FILTER_BY_PASID: Preempts queues that belongs to + * specific process. + * + */ +enum kfd_preempt_type_filter { + KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE, + KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES, + KFD_PREEMPT_TYPE_FILTER_BY_PASID +}; + +enum kfd_preempt_type { + KFD_PREEMPT_TYPE_WAVEFRONT, + KFD_PREEMPT_TYPE_WAVEFRONT_RESET +}; + +/** + * enum kfd_queue_type + * + * @KFD_QUEUE_TYPE_COMPUTE: Regular user mode queue type. + * + * @KFD_QUEUE_TYPE_SDMA: Sdma user mode queue type. + * + * @KFD_QUEUE_TYPE_HIQ: HIQ queue type. + * + * @KFD_QUEUE_TYPE_DIQ: DIQ queue type. + */ +enum kfd_queue_type { + KFD_QUEUE_TYPE_COMPUTE, + KFD_QUEUE_TYPE_SDMA, + KFD_QUEUE_TYPE_HIQ, + KFD_QUEUE_TYPE_DIQ +}; + +enum kfd_queue_format { + KFD_QUEUE_FORMAT_PM4, + KFD_QUEUE_FORMAT_AQL +}; + +/** + * struct queue_properties + * + * @type: The queue type. + * + * @queue_id: Queue identifier. + * + * @queue_address: Queue ring buffer address. + * + * @queue_size: Queue ring buffer size. + * + * @priority: Defines the queue priority relative to other queues in the + * process. + * This is just an indication and HW scheduling may override the priority as + * necessary while keeping the relative prioritization. + * the priority granularity is from 0 to f which f is the highest priority. + * currently all queues are initialized with the highest priority. + * + * @queue_percent: This field is partially implemented and currently a zero in + * this field defines that the queue is non active. + * + * @read_ptr: User space address which points to the number of dwords the + * cp read from the ring buffer. This field updates automatically by the H/W. + * + * @write_ptr: Defines the number of dwords written to the ring buffer. + * + * @doorbell_ptr: This field aim is to notify the H/W of new packet written to + * the queue ring buffer. This field should be similar to write_ptr and the user + * should update this field after he updated the write_ptr. + * + * @doorbell_off: The doorbell offset in the doorbell pci-bar. + * + * @is_interop: Defines if this is a interop queue. Interop queue means that the + * queue can access both graphics and compute resources. + * + * @is_active: Defines if the queue is active or not. + * + * @vmid: If the scheduling mode is no cp scheduling the field defines the vmid + * of the queue. + * + * This structure represents the queue properties for each queue no matter if + * it's user mode or kernel mode queue. + * + */ +struct queue_properties { + enum kfd_queue_type type; + enum kfd_queue_format format; + unsigned int queue_id; + uint64_t queue_address; + uint64_t queue_size; + uint32_t priority; + uint32_t queue_percent; + uint32_t *read_ptr; + uint32_t *write_ptr; + uint32_t __iomem *doorbell_ptr; + uint32_t doorbell_off; + bool is_interop; + bool is_active; + /* Not relevant for user mode queues in cp scheduling */ + unsigned int vmid; + /* Relevant only for sdma queues*/ + uint32_t sdma_engine_id; + uint32_t sdma_queue_id; + uint32_t sdma_vm_addr; + /* Relevant only for VI */ + uint64_t eop_ring_buffer_address; + uint32_t eop_ring_buffer_size; + uint64_t ctx_save_restore_area_address; + uint32_t ctx_save_restore_area_size; +}; + +/** + * struct queue + * + * @list: Queue linked list. + * + * @mqd: The queue MQD. + * + * @mqd_mem_obj: The MQD local gpu memory object. + * + * @gart_mqd_addr: The MQD gart mc address. + * + * @properties: The queue properties. + * + * @mec: Used only in no cp scheduling mode and identifies to micro engine id + * that the queue should be execute on. + * + * @pipe: Used only in no cp scheduling mode and identifies the queue's pipe id. + * + * @queue: Used only in no cp scheduliong mode and identifies the queue's slot. + * + * @process: The kfd process that created this queue. + * + * @device: The kfd device that created this queue. + * + * This structure represents user mode compute queues. + * It contains all the necessary data to handle such queues. + * + */ + +struct queue { + struct list_head list; + void *mqd; + struct kfd_mem_obj *mqd_mem_obj; + uint64_t gart_mqd_addr; + struct queue_properties properties; + + uint32_t mec; + uint32_t pipe; + uint32_t queue; + + unsigned int sdma_id; + + struct kfd_process *process; + struct kfd_dev *device; +}; + +/* + * Please read the kfd_mqd_manager.h description. + */ +enum KFD_MQD_TYPE { + KFD_MQD_TYPE_COMPUTE = 0, /* for no cp scheduling */ + KFD_MQD_TYPE_HIQ, /* for hiq */ + KFD_MQD_TYPE_CP, /* for cp queues and diq */ + KFD_MQD_TYPE_SDMA, /* for sdma queues */ + KFD_MQD_TYPE_MAX +}; + +struct scheduling_resources { + unsigned int vmid_mask; + enum kfd_queue_type type; + uint64_t queue_mask; + uint64_t gws_mask; + uint32_t oac_mask; + uint32_t gds_heap_base; + uint32_t gds_heap_size; +}; + +struct process_queue_manager { + /* data */ + struct kfd_process *process; + unsigned int num_concurrent_processes; + struct list_head queues; + unsigned long *queue_slot_bitmap; +}; + +struct qcm_process_device { + /* The Device Queue Manager that owns this data */ + struct device_queue_manager *dqm; + struct process_queue_manager *pqm; + /* Queues list */ + struct list_head queues_list; + struct list_head priv_queue_list; + + unsigned int queue_count; + unsigned int vmid; + bool is_debug; + /* + * All the memory management data should be here too + */ + uint64_t gds_context_area; + uint32_t sh_mem_config; + uint32_t sh_mem_bases; + uint32_t sh_mem_ape1_base; + uint32_t sh_mem_ape1_limit; + uint32_t page_table_base; + uint32_t gds_size; + uint32_t num_gws; + uint32_t num_oac; +}; + +/* Data that is per-process-per device. */ +struct kfd_process_device { + /* + * List of all per-device data for a process. + * Starts from kfd_process.per_device_data. + */ + struct list_head per_device_list; + + /* The device that owns this data. */ + struct kfd_dev *dev; + + + /* per-process-per device QCM data structure */ + struct qcm_process_device qpd; + + /*Apertures*/ + uint64_t lds_base; + uint64_t lds_limit; + uint64_t gpuvm_base; + uint64_t gpuvm_limit; + uint64_t scratch_base; + uint64_t scratch_limit; + + /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ + bool bound; +}; + +#define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) + +/* Process data */ +struct kfd_process { + /* + * kfd_process are stored in an mm_struct*->kfd_process* + * hash table (kfd_processes in kfd_process.c) + */ + struct hlist_node kfd_processes; + + struct mm_struct *mm; + + struct mutex mutex; + + /* + * In any process, the thread that started main() is the lead + * thread and outlives the rest. + * It is here because amd_iommu_bind_pasid wants a task_struct. + */ + struct task_struct *lead_thread; + + /* We want to receive a notification when the mm_struct is destroyed */ + struct mmu_notifier mmu_notifier; + + /* Use for delayed freeing of kfd_process structure */ + struct rcu_head rcu; + + unsigned int pasid; + + /* + * List of kfd_process_device structures, + * one for each device the process is using. + */ + struct list_head per_device_data; + + struct process_queue_manager pqm; + + /* The process's queues. */ + size_t queue_array_size; + + /* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */ + struct kfd_queue **queues; + + unsigned long allocated_queue_bitmap[DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, BITS_PER_LONG)]; + + /*Is the user space process 32 bit?*/ + bool is_32bit_user_mode; +}; + +/** + * Ioctl function type. + * + * \param filep pointer to file structure. + * \param p amdkfd process pointer. + * \param data pointer to arg that was copied from user. + */ +typedef int amdkfd_ioctl_t(struct file *filep, struct kfd_process *p, + void *data); + +struct amdkfd_ioctl_desc { + unsigned int cmd; + int flags; + amdkfd_ioctl_t *func; + unsigned int cmd_drv; + const char *name; +}; + +void kfd_process_create_wq(void); +void kfd_process_destroy_wq(void); +struct kfd_process *kfd_create_process(const struct task_struct *); +struct kfd_process *kfd_get_process(const struct task_struct *); + +struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + struct kfd_process *p); +void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid); +struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + struct kfd_process *p); +struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + struct kfd_process *p); + +/* Process device data iterator */ +struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p); +struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, + struct kfd_process_device *pdd); +bool kfd_has_process_device_data(struct kfd_process *p); + +/* PASIDs */ +int kfd_pasid_init(void); +void kfd_pasid_exit(void); +bool kfd_set_pasid_limit(unsigned int new_limit); +unsigned int kfd_get_pasid_limit(void); +unsigned int kfd_pasid_alloc(void); +void kfd_pasid_free(unsigned int pasid); + +/* Doorbells */ +void kfd_doorbell_init(struct kfd_dev *kfd); +int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma); +u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + unsigned int *doorbell_off); +void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); +u32 read_kernel_doorbell(u32 __iomem *db); +void write_kernel_doorbell(u32 __iomem *db, u32 value); +unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, + struct kfd_process *process, + unsigned int queue_id); + +/* GTT Sub-Allocator */ + +int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, + struct kfd_mem_obj **mem_obj); + +int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj); + +extern struct device *kfd_device; + +/* Topology */ +int kfd_topology_init(void); +void kfd_topology_shutdown(void); +int kfd_topology_add_device(struct kfd_dev *gpu); +int kfd_topology_remove_device(struct kfd_dev *gpu); +struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); +struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); +struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx); + +/* Interrupts */ +void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); + +/* Power Management */ +void kgd2kfd_suspend(struct kfd_dev *kfd); +int kgd2kfd_resume(struct kfd_dev *kfd); + +/* amdkfd Apertures */ +int kfd_init_apertures(struct kfd_process *process); + +/* Queue Context Management */ +inline uint32_t lower_32(uint64_t x); +inline uint32_t upper_32(uint64_t x); +struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd); +inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m); + +int init_queue(struct queue **q, struct queue_properties properties); +void uninit_queue(struct queue *q); +void print_queue_properties(struct queue_properties *q); +void print_queue(struct queue *q); + +struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); +struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); +void device_queue_manager_uninit(struct device_queue_manager *dqm); +struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + enum kfd_queue_type type); +void kernel_queue_uninit(struct kernel_queue *kq); + +/* Process Queue Manager */ +struct process_queue_node { + struct queue *q; + struct kernel_queue *kq; + struct list_head process_queue_list; +}; + +int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p); +void pqm_uninit(struct process_queue_manager *pqm); +int pqm_create_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, + struct file *f, + struct queue_properties *properties, + unsigned int flags, + enum kfd_queue_type type, + unsigned int *qid); +int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid); +int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p); + +/* Packet Manager */ + +#define KFD_HIQ_TIMEOUT (500) + +#define KFD_FENCE_COMPLETED (100) +#define KFD_FENCE_INIT (10) +#define KFD_UNMAP_LATENCY (150) + +struct packet_manager { + struct device_queue_manager *dqm; + struct kernel_queue *priv_queue; + struct mutex lock; + bool allocated; + struct kfd_mem_obj *ib_buffer_obj; +}; + +int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); +void pm_uninit(struct packet_manager *pm); +int pm_send_set_resources(struct packet_manager *pm, + struct scheduling_resources *res); +int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues); +int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + uint32_t fence_value); + +int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + enum kfd_preempt_type_filter mode, + uint32_t filter_param, bool reset, + unsigned int sdma_engine); + +void pm_release_ib(struct packet_manager *pm); + +uint64_t kfd_get_number_elems(struct kfd_dev *kfd); +phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process); + +#endif diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_process.c new file mode 100644 index 000000000..945d6226d --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -0,0 +1,433 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +struct mm_struct; + +#include "kfd_priv.h" + +/* + * Initial size for the array of queues. + * The allocated size is doubled each time + * it is exceeded up to MAX_PROCESS_QUEUES. + */ +#define INITIAL_QUEUE_ARRAY_SIZE 16 + +/* + * List of struct kfd_process (field kfd_process). + * Unique/indexed by mm_struct* + */ +#define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */ +static DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); +static DEFINE_MUTEX(kfd_processes_mutex); + +DEFINE_STATIC_SRCU(kfd_processes_srcu); + +static struct workqueue_struct *kfd_process_wq; + +struct kfd_process_release_work { + struct work_struct kfd_work; + struct kfd_process *p; +}; + +static struct kfd_process *find_process(const struct task_struct *thread); +static struct kfd_process *create_process(const struct task_struct *thread); + +void kfd_process_create_wq(void) +{ + if (!kfd_process_wq) + kfd_process_wq = create_workqueue("kfd_process_wq"); +} + +void kfd_process_destroy_wq(void) +{ + if (kfd_process_wq) { + flush_workqueue(kfd_process_wq); + destroy_workqueue(kfd_process_wq); + kfd_process_wq = NULL; + } +} + +struct kfd_process *kfd_create_process(const struct task_struct *thread) +{ + struct kfd_process *process; + + BUG_ON(!kfd_process_wq); + + if (thread->mm == NULL) + return ERR_PTR(-EINVAL); + + /* Only the pthreads threading model is supported. */ + if (thread->group_leader->mm != thread->mm) + return ERR_PTR(-EINVAL); + + /* Take mmap_sem because we call __mmu_notifier_register inside */ + down_write(&thread->mm->mmap_sem); + + /* + * take kfd processes mutex before starting of process creation + * so there won't be a case where two threads of the same process + * create two kfd_process structures + */ + mutex_lock(&kfd_processes_mutex); + + /* A prior open of /dev/kfd could have already created the process. */ + process = find_process(thread); + if (process) + pr_debug("kfd: process already found\n"); + + if (!process) + process = create_process(thread); + + mutex_unlock(&kfd_processes_mutex); + + up_write(&thread->mm->mmap_sem); + + return process; +} + +struct kfd_process *kfd_get_process(const struct task_struct *thread) +{ + struct kfd_process *process; + + if (thread->mm == NULL) + return ERR_PTR(-EINVAL); + + /* Only the pthreads threading model is supported. */ + if (thread->group_leader->mm != thread->mm) + return ERR_PTR(-EINVAL); + + process = find_process(thread); + + return process; +} + +static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) +{ + struct kfd_process *process; + + hash_for_each_possible_rcu(kfd_processes_table, process, + kfd_processes, (uintptr_t)mm) + if (process->mm == mm) + return process; + + return NULL; +} + +static struct kfd_process *find_process(const struct task_struct *thread) +{ + struct kfd_process *p; + int idx; + + idx = srcu_read_lock(&kfd_processes_srcu); + p = find_process_by_mm(thread->mm); + srcu_read_unlock(&kfd_processes_srcu, idx); + + return p; +} + +static void kfd_process_wq_release(struct work_struct *work) +{ + struct kfd_process_release_work *my_work; + struct kfd_process_device *pdd, *temp; + struct kfd_process *p; + + my_work = (struct kfd_process_release_work *) work; + + p = my_work->p; + + pr_debug("Releasing process (pasid %d) in workqueue\n", + p->pasid); + + mutex_lock(&p->mutex); + + list_for_each_entry_safe(pdd, temp, &p->per_device_data, + per_device_list) { + pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", + pdd->dev->id, p->pasid); + + amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); + list_del(&pdd->per_device_list); + + kfree(pdd); + } + + kfd_pasid_free(p->pasid); + + mutex_unlock(&p->mutex); + + mutex_destroy(&p->mutex); + + kfree(p->queues); + + kfree(p); + + kfree((void *)work); +} + +static void kfd_process_destroy_delayed(struct rcu_head *rcu) +{ + struct kfd_process_release_work *work; + struct kfd_process *p; + + BUG_ON(!kfd_process_wq); + + p = container_of(rcu, struct kfd_process, rcu); + BUG_ON(atomic_read(&p->mm->mm_count) <= 0); + + mmdrop(p->mm); + + work = (struct kfd_process_release_work *) + kmalloc(sizeof(struct kfd_process_release_work), GFP_ATOMIC); + + if (work) { + INIT_WORK((struct work_struct *) work, kfd_process_wq_release); + work->p = p; + queue_work(kfd_process_wq, (struct work_struct *) work); + } +} + +static void kfd_process_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct kfd_process *p; + + /* + * The kfd_process structure can not be free because the + * mmu_notifier srcu is read locked + */ + p = container_of(mn, struct kfd_process, mmu_notifier); + BUG_ON(p->mm != mm); + + mutex_lock(&kfd_processes_mutex); + hash_del_rcu(&p->kfd_processes); + mutex_unlock(&kfd_processes_mutex); + synchronize_srcu(&kfd_processes_srcu); + + mutex_lock(&p->mutex); + + /* In case our notifier is called before IOMMU notifier */ + pqm_uninit(&p->pqm); + + mutex_unlock(&p->mutex); + + /* + * Because we drop mm_count inside kfd_process_destroy_delayed + * and because the mmu_notifier_unregister function also drop + * mm_count we need to take an extra count here. + */ + atomic_inc(&p->mm->mm_count); + mmu_notifier_unregister_no_release(&p->mmu_notifier, p->mm); + mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed); +} + +static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { + .release = kfd_process_notifier_release, +}; + +static struct kfd_process *create_process(const struct task_struct *thread) +{ + struct kfd_process *process; + int err = -ENOMEM; + + process = kzalloc(sizeof(*process), GFP_KERNEL); + + if (!process) + goto err_alloc_process; + + process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE, + sizeof(process->queues[0]), GFP_KERNEL); + if (!process->queues) + goto err_alloc_queues; + + process->pasid = kfd_pasid_alloc(); + if (process->pasid == 0) + goto err_alloc_pasid; + + mutex_init(&process->mutex); + + process->mm = thread->mm; + + /* register notifier */ + process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; + err = __mmu_notifier_register(&process->mmu_notifier, process->mm); + if (err) + goto err_mmu_notifier; + + hash_add_rcu(kfd_processes_table, &process->kfd_processes, + (uintptr_t)process->mm); + + process->lead_thread = thread->group_leader; + + process->queue_array_size = INITIAL_QUEUE_ARRAY_SIZE; + + INIT_LIST_HEAD(&process->per_device_data); + + err = pqm_init(&process->pqm, process); + if (err != 0) + goto err_process_pqm_init; + + /* init process apertures*/ + process->is_32bit_user_mode = is_compat_task(); + if (kfd_init_apertures(process) != 0) + goto err_init_apretures; + + return process; + +err_init_apretures: + pqm_uninit(&process->pqm); +err_process_pqm_init: + hash_del_rcu(&process->kfd_processes); + synchronize_rcu(); + mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm); +err_mmu_notifier: + kfd_pasid_free(process->pasid); +err_alloc_pasid: + kfree(process->queues); +err_alloc_queues: + kfree(process); +err_alloc_process: + return ERR_PTR(err); +} + +struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + struct kfd_process *p) +{ + struct kfd_process_device *pdd = NULL; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) + if (pdd->dev == dev) + break; + + return pdd; +} + +struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + struct kfd_process *p) +{ + struct kfd_process_device *pdd = NULL; + + pdd = kzalloc(sizeof(*pdd), GFP_KERNEL); + if (pdd != NULL) { + pdd->dev = dev; + INIT_LIST_HEAD(&pdd->qpd.queues_list); + INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); + pdd->qpd.dqm = dev->dqm; + list_add(&pdd->per_device_list, &p->per_device_data); + } + + return pdd; +} + +/* + * Direct the IOMMU to bind the process (specifically the pasid->mm) + * to the device. + * Unbinding occurs when the process dies or the device is removed. + * + * Assumes that the process lock is held. + */ +struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + struct kfd_process *p) +{ + struct kfd_process_device *pdd; + int err; + + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + return ERR_PTR(-ENOMEM); + } + + if (pdd->bound) + return pdd; + + err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); + if (err < 0) + return ERR_PTR(err); + + pdd->bound = true; + + return pdd; +} + +void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid) +{ + struct kfd_process *p; + struct kfd_process_device *pdd; + int idx, i; + + BUG_ON(dev == NULL); + + idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, i, p, kfd_processes) + if (p->pasid == pasid) + break; + + srcu_read_unlock(&kfd_processes_srcu, idx); + + BUG_ON(p->pasid != pasid); + + mutex_lock(&p->mutex); + + pqm_uninit(&p->pqm); + + pdd = kfd_get_process_device_data(dev, p); + + /* + * Just mark pdd as unbound, because we still need it to call + * amd_iommu_unbind_pasid() in when the process exits. + * We don't call amd_iommu_unbind_pasid() here + * because the IOMMU called us. + */ + if (pdd) + pdd->bound = false; + + mutex_unlock(&p->mutex); +} + +struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p) +{ + return list_first_entry(&p->per_device_data, + struct kfd_process_device, + per_device_list); +} + +struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, + struct kfd_process_device *pdd) +{ + if (list_is_last(&pdd->per_device_list, &p->per_device_data)) + return NULL; + return list_next_entry(pdd, per_device_list); +} + +bool kfd_has_process_device_data(struct kfd_process *p) +{ + return !(list_empty(&p->per_device_data)); +} diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c new file mode 100644 index 000000000..530b82c4e --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -0,0 +1,359 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include +#include +#include "kfd_device_queue_manager.h" +#include "kfd_priv.h" +#include "kfd_kernel_queue.h" + +static inline struct process_queue_node *get_queue_by_qid( + struct process_queue_manager *pqm, unsigned int qid) +{ + struct process_queue_node *pqn; + + BUG_ON(!pqm); + + list_for_each_entry(pqn, &pqm->queues, process_queue_list) { + if (pqn->q && pqn->q->properties.queue_id == qid) + return pqn; + if (pqn->kq && pqn->kq->queue->properties.queue_id == qid) + return pqn; + } + + return NULL; +} + +static int find_available_queue_slot(struct process_queue_manager *pqm, + unsigned int *qid) +{ + unsigned long found; + + BUG_ON(!pqm || !qid); + + pr_debug("kfd: in %s\n", __func__); + + found = find_first_zero_bit(pqm->queue_slot_bitmap, + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); + + pr_debug("kfd: the new slot id %lu\n", found); + + if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { + pr_info("amdkfd: Can not open more queues for process with pasid %d\n", + pqm->process->pasid); + return -ENOMEM; + } + + set_bit(found, pqm->queue_slot_bitmap); + *qid = found; + + return 0; +} + +int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p) +{ + BUG_ON(!pqm); + + INIT_LIST_HEAD(&pqm->queues); + pqm->queue_slot_bitmap = + kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, + BITS_PER_BYTE), GFP_KERNEL); + if (pqm->queue_slot_bitmap == NULL) + return -ENOMEM; + pqm->process = p; + + return 0; +} + +void pqm_uninit(struct process_queue_manager *pqm) +{ + int retval; + struct process_queue_node *pqn, *next; + + BUG_ON(!pqm); + + pr_debug("In func %s\n", __func__); + + list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { + retval = pqm_destroy_queue( + pqm, + (pqn->q != NULL) ? + pqn->q->properties.queue_id : + pqn->kq->queue->properties.queue_id); + + if (retval != 0) { + pr_err("kfd: failed to destroy queue\n"); + return; + } + } + kfree(pqm->queue_slot_bitmap); + pqm->queue_slot_bitmap = NULL; +} + +static int create_cp_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, struct queue **q, + struct queue_properties *q_properties, + struct file *f, unsigned int qid) +{ + int retval; + + retval = 0; + + /* Doorbell initialized in user space*/ + q_properties->doorbell_ptr = NULL; + + q_properties->doorbell_off = + kfd_queue_id_to_doorbell(dev, pqm->process, qid); + + /* let DQM handle it*/ + q_properties->vmid = 0; + q_properties->queue_id = qid; + + retval = init_queue(q, *q_properties); + if (retval != 0) + goto err_init_queue; + + (*q)->device = dev; + (*q)->process = pqm->process; + + pr_debug("kfd: PQM After init queue"); + + return retval; + +err_init_queue: + return retval; +} + +int pqm_create_queue(struct process_queue_manager *pqm, + struct kfd_dev *dev, + struct file *f, + struct queue_properties *properties, + unsigned int flags, + enum kfd_queue_type type, + unsigned int *qid) +{ + int retval; + struct kfd_process_device *pdd; + struct queue_properties q_properties; + struct queue *q; + struct process_queue_node *pqn; + struct kernel_queue *kq; + + BUG_ON(!pqm || !dev || !properties || !qid); + + memset(&q_properties, 0, sizeof(struct queue_properties)); + memcpy(&q_properties, properties, sizeof(struct queue_properties)); + q = NULL; + kq = NULL; + + pdd = kfd_get_process_device_data(dev, pqm->process); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + return -1; + } + + retval = find_available_queue_slot(pqm, qid); + if (retval != 0) + return retval; + + if (list_empty(&pqm->queues)) { + pdd->qpd.pqm = pqm; + dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); + } + + pqn = kzalloc(sizeof(struct process_queue_node), GFP_KERNEL); + if (!pqn) { + retval = -ENOMEM; + goto err_allocate_pqn; + } + + switch (type) { + case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_COMPUTE: + /* check if there is over subscription */ + if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && + ((dev->dqm->processes_count >= VMID_PER_DEVICE) || + (dev->dqm->queue_count >= PIPE_PER_ME_CP_SCHEDULING * QUEUES_PER_PIPE))) { + pr_err("kfd: over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); + retval = -EPERM; + goto err_create_queue; + } + + retval = create_cp_queue(pqm, dev, &q, &q_properties, f, *qid); + if (retval != 0) + goto err_create_queue; + pqn->q = q; + pqn->kq = NULL; + retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, + &q->properties.vmid); + pr_debug("DQM returned %d for create_queue\n", retval); + print_queue(q); + break; + case KFD_QUEUE_TYPE_DIQ: + kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_DIQ); + if (kq == NULL) { + retval = -ENOMEM; + goto err_create_queue; + } + kq->queue->properties.queue_id = *qid; + pqn->kq = kq; + pqn->q = NULL; + retval = dev->dqm->ops.create_kernel_queue(dev->dqm, + kq, &pdd->qpd); + break; + default: + BUG(); + break; + } + + if (retval != 0) { + pr_debug("Error dqm create queue\n"); + goto err_create_queue; + } + + pr_debug("kfd: PQM After DQM create queue\n"); + + list_add(&pqn->process_queue_list, &pqm->queues); + + if (q) { + *properties = q->properties; + pr_debug("kfd: PQM done creating queue\n"); + print_queue_properties(properties); + } + + return retval; + +err_create_queue: + kfree(pqn); +err_allocate_pqn: + /* check if queues list is empty unregister process from device */ + clear_bit(*qid, pqm->queue_slot_bitmap); + if (list_empty(&pqm->queues)) + dev->dqm->ops.unregister_process(dev->dqm, &pdd->qpd); + return retval; +} + +int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) +{ + struct process_queue_node *pqn; + struct kfd_process_device *pdd; + struct device_queue_manager *dqm; + struct kfd_dev *dev; + int retval; + + dqm = NULL; + + BUG_ON(!pqm); + retval = 0; + + pr_debug("kfd: In Func %s\n", __func__); + + pqn = get_queue_by_qid(pqm, qid); + if (pqn == NULL) { + pr_err("kfd: queue id does not match any known queue\n"); + return -EINVAL; + } + + dev = NULL; + if (pqn->kq) + dev = pqn->kq->dev; + if (pqn->q) + dev = pqn->q->device; + BUG_ON(!dev); + + pdd = kfd_get_process_device_data(dev, pqm->process); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + return -1; + } + + if (pqn->kq) { + /* destroy kernel queue (DIQ) */ + dqm = pqn->kq->dev->dqm; + dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd); + kernel_queue_uninit(pqn->kq); + } + + if (pqn->q) { + dqm = pqn->q->device->dqm; + retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); + if (retval != 0) + return retval; + + uninit_queue(pqn->q); + } + + list_del(&pqn->process_queue_list); + kfree(pqn); + clear_bit(qid, pqm->queue_slot_bitmap); + + if (list_empty(&pqm->queues)) + dqm->ops.unregister_process(dqm, &pdd->qpd); + + return retval; +} + +int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p) +{ + int retval; + struct process_queue_node *pqn; + + BUG_ON(!pqm); + + pqn = get_queue_by_qid(pqm, qid); + if (!pqn) { + pr_debug("amdkfd: No queue %d exists for update operation\n", + qid); + return -EFAULT; + } + + pqn->q->properties.queue_address = p->queue_address; + pqn->q->properties.queue_size = p->queue_size; + pqn->q->properties.queue_percent = p->queue_percent; + pqn->q->properties.priority = p->priority; + + retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, + pqn->q); + if (retval != 0) + return retval; + + return 0; +} + +static __attribute__((unused)) struct kernel_queue *pqm_get_kernel_queue( + struct process_queue_manager *pqm, + unsigned int qid) +{ + struct process_queue_node *pqn; + + BUG_ON(!pqm); + + pqn = get_queue_by_qid(pqm, qid); + if (pqn && pqn->kq) + return pqn->kq; + + return NULL; +} + + diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_queue.c new file mode 100644 index 000000000..9a0c90b07 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -0,0 +1,85 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include +#include "kfd_priv.h" + +void print_queue_properties(struct queue_properties *q) +{ + if (!q) + return; + + pr_debug("Printing queue properties:\n"); + pr_debug("Queue Type: %u\n", q->type); + pr_debug("Queue Size: %llu\n", q->queue_size); + pr_debug("Queue percent: %u\n", q->queue_percent); + pr_debug("Queue Address: 0x%llX\n", q->queue_address); + pr_debug("Queue Id: %u\n", q->queue_id); + pr_debug("Queue Process Vmid: %u\n", q->vmid); + pr_debug("Queue Read Pointer: 0x%p\n", q->read_ptr); + pr_debug("Queue Write Pointer: 0x%p\n", q->write_ptr); + pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr); + pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off); +} + +void print_queue(struct queue *q) +{ + if (!q) + return; + pr_debug("Printing queue:\n"); + pr_debug("Queue Type: %u\n", q->properties.type); + pr_debug("Queue Size: %llu\n", q->properties.queue_size); + pr_debug("Queue percent: %u\n", q->properties.queue_percent); + pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address); + pr_debug("Queue Id: %u\n", q->properties.queue_id); + pr_debug("Queue Process Vmid: %u\n", q->properties.vmid); + pr_debug("Queue Read Pointer: 0x%p\n", q->properties.read_ptr); + pr_debug("Queue Write Pointer: 0x%p\n", q->properties.write_ptr); + pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr); + pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off); + pr_debug("Queue MQD Address: 0x%p\n", q->mqd); + pr_debug("Queue MQD Gart: 0x%llX\n", q->gart_mqd_addr); + pr_debug("Queue Process Address: 0x%p\n", q->process); + pr_debug("Queue Device Address: 0x%p\n", q->device); +} + +int init_queue(struct queue **q, struct queue_properties properties) +{ + struct queue *tmp; + + BUG_ON(!q); + + tmp = kzalloc(sizeof(struct queue), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + memcpy(&tmp->properties, &properties, sizeof(struct queue_properties)); + + *q = tmp; + return 0; +} + +void uninit_queue(struct queue *q) +{ + kfree(q); +} diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_topology.c new file mode 100644 index 000000000..c25728bc3 --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -0,0 +1,1254 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kfd_priv.h" +#include "kfd_crat.h" +#include "kfd_topology.h" + +static struct list_head topology_device_list; +static int topology_crat_parsed; +static struct kfd_system_properties sys_props; + +static DECLARE_RWSEM(topology_lock); + +struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) +{ + struct kfd_topology_device *top_dev; + struct kfd_dev *device = NULL; + + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) + if (top_dev->gpu_id == gpu_id) { + device = top_dev->gpu; + break; + } + + up_read(&topology_lock); + + return device; +} + +struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) +{ + struct kfd_topology_device *top_dev; + struct kfd_dev *device = NULL; + + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) + if (top_dev->gpu->pdev == pdev) { + device = top_dev->gpu; + break; + } + + up_read(&topology_lock); + + return device; +} + +static int kfd_topology_get_crat_acpi(void *crat_image, size_t *size) +{ + struct acpi_table_header *crat_table; + acpi_status status; + + if (!size) + return -EINVAL; + + /* + * Fetch the CRAT table from ACPI + */ + status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); + if (status == AE_NOT_FOUND) { + pr_warn("CRAT table not found\n"); + return -ENODATA; + } else if (ACPI_FAILURE(status)) { + const char *err = acpi_format_exception(status); + + pr_err("CRAT table error: %s\n", err); + return -EINVAL; + } + + if (*size >= crat_table->length && crat_image != NULL) + memcpy(crat_image, crat_table, crat_table->length); + + *size = crat_table->length; + + return 0; +} + +static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + BUG_ON(!dev); + BUG_ON(!cu); + + dev->node_props.cpu_cores_count = cu->num_cpu_cores; + dev->node_props.cpu_core_id_base = cu->processor_id_low; + if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) + dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + + pr_info("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, + cu->processor_id_low); +} + +static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + BUG_ON(!dev); + BUG_ON(!cu); + + dev->node_props.simd_id_base = cu->processor_id_low; + dev->node_props.simd_count = cu->num_simd_cores; + dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; + dev->node_props.max_waves_per_simd = cu->max_waves_simd; + dev->node_props.wave_front_size = cu->wave_front_size; + dev->node_props.mem_banks_count = cu->num_banks; + dev->node_props.array_count = cu->num_arrays; + dev->node_props.cu_per_simd_array = cu->num_cu_per_array; + dev->node_props.simd_per_cu = cu->num_simd_per_cu; + dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; + if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) + dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; + pr_info("CU GPU: simds=%d id_base=%d\n", cu->num_simd_cores, + cu->processor_id_low); +} + +/* kfd_parse_subtype_cu is called when the topology mutex is already acquired */ +static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu) +{ + struct kfd_topology_device *dev; + int i = 0; + + BUG_ON(!cu); + + pr_info("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", + cu->proximity_domain, cu->hsa_capability); + list_for_each_entry(dev, &topology_device_list, list) { + if (cu->proximity_domain == i) { + if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) + kfd_populated_cu_info_cpu(dev, cu); + + if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) + kfd_populated_cu_info_gpu(dev, cu); + break; + } + i++; + } + + return 0; +} + +/* + * kfd_parse_subtype_mem is called when the topology mutex is + * already acquired + */ +static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem) +{ + struct kfd_mem_properties *props; + struct kfd_topology_device *dev; + int i = 0; + + BUG_ON(!mem); + + pr_info("Found memory entry in CRAT table with proximity_domain=%d\n", + mem->promixity_domain); + list_for_each_entry(dev, &topology_device_list, list) { + if (mem->promixity_domain == i) { + props = kfd_alloc_struct(props); + if (props == NULL) + return -ENOMEM; + + if (dev->node_props.cpu_cores_count == 0) + props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE; + else + props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; + + if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) + props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; + if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) + props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; + + props->size_in_bytes = + ((uint64_t)mem->length_high << 32) + + mem->length_low; + props->width = mem->width; + + dev->mem_bank_count++; + list_add_tail(&props->list, &dev->mem_props); + + break; + } + i++; + } + + return 0; +} + +/* + * kfd_parse_subtype_cache is called when the topology mutex + * is already acquired + */ +static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache) +{ + struct kfd_cache_properties *props; + struct kfd_topology_device *dev; + uint32_t id; + + BUG_ON(!cache); + + id = cache->processor_id_low; + + pr_info("Found cache entry in CRAT table with processor_id=%d\n", id); + list_for_each_entry(dev, &topology_device_list, list) + if (id == dev->node_props.cpu_core_id_base || + id == dev->node_props.simd_id_base) { + props = kfd_alloc_struct(props); + if (props == NULL) + return -ENOMEM; + + props->processor_id_low = id; + props->cache_level = cache->cache_level; + props->cache_size = cache->cache_size; + props->cacheline_size = cache->cache_line_size; + props->cachelines_per_tag = cache->lines_per_tag; + props->cache_assoc = cache->associativity; + props->cache_latency = cache->cache_latency; + + if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) + props->cache_type |= HSA_CACHE_TYPE_DATA; + if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) + props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; + if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) + props->cache_type |= HSA_CACHE_TYPE_CPU; + if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) + props->cache_type |= HSA_CACHE_TYPE_HSACU; + + dev->cache_count++; + dev->node_props.caches_count++; + list_add_tail(&props->list, &dev->cache_props); + + break; + } + + return 0; +} + +/* + * kfd_parse_subtype_iolink is called when the topology mutex + * is already acquired + */ +static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink) +{ + struct kfd_iolink_properties *props; + struct kfd_topology_device *dev; + uint32_t i = 0; + uint32_t id_from; + uint32_t id_to; + + BUG_ON(!iolink); + + id_from = iolink->proximity_domain_from; + id_to = iolink->proximity_domain_to; + + pr_info("Found IO link entry in CRAT table with id_from=%d\n", id_from); + list_for_each_entry(dev, &topology_device_list, list) { + if (id_from == i) { + props = kfd_alloc_struct(props); + if (props == NULL) + return -ENOMEM; + + props->node_from = id_from; + props->node_to = id_to; + props->ver_maj = iolink->version_major; + props->ver_min = iolink->version_minor; + + /* + * weight factor (derived from CDIR), currently always 1 + */ + props->weight = 1; + + props->min_latency = iolink->minimum_latency; + props->max_latency = iolink->maximum_latency; + props->min_bandwidth = iolink->minimum_bandwidth_mbs; + props->max_bandwidth = iolink->maximum_bandwidth_mbs; + props->rec_transfer_size = + iolink->recommended_transfer_size; + + dev->io_link_count++; + dev->node_props.io_links_count++; + list_add_tail(&props->list, &dev->io_link_props); + + break; + } + i++; + } + + return 0; +} + +static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr) +{ + struct crat_subtype_computeunit *cu; + struct crat_subtype_memory *mem; + struct crat_subtype_cache *cache; + struct crat_subtype_iolink *iolink; + int ret = 0; + + BUG_ON(!sub_type_hdr); + + switch (sub_type_hdr->type) { + case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: + cu = (struct crat_subtype_computeunit *)sub_type_hdr; + ret = kfd_parse_subtype_cu(cu); + break; + case CRAT_SUBTYPE_MEMORY_AFFINITY: + mem = (struct crat_subtype_memory *)sub_type_hdr; + ret = kfd_parse_subtype_mem(mem); + break; + case CRAT_SUBTYPE_CACHE_AFFINITY: + cache = (struct crat_subtype_cache *)sub_type_hdr; + ret = kfd_parse_subtype_cache(cache); + break; + case CRAT_SUBTYPE_TLB_AFFINITY: + /* + * For now, nothing to do here + */ + pr_info("Found TLB entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: + /* + * For now, nothing to do here + */ + pr_info("Found CCOMPUTE entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_IOLINK_AFFINITY: + iolink = (struct crat_subtype_iolink *)sub_type_hdr; + ret = kfd_parse_subtype_iolink(iolink); + break; + default: + pr_warn("Unknown subtype (%d) in CRAT\n", + sub_type_hdr->type); + } + + return ret; +} + +static void kfd_release_topology_device(struct kfd_topology_device *dev) +{ + struct kfd_mem_properties *mem; + struct kfd_cache_properties *cache; + struct kfd_iolink_properties *iolink; + + BUG_ON(!dev); + + list_del(&dev->list); + + while (dev->mem_props.next != &dev->mem_props) { + mem = container_of(dev->mem_props.next, + struct kfd_mem_properties, list); + list_del(&mem->list); + kfree(mem); + } + + while (dev->cache_props.next != &dev->cache_props) { + cache = container_of(dev->cache_props.next, + struct kfd_cache_properties, list); + list_del(&cache->list); + kfree(cache); + } + + while (dev->io_link_props.next != &dev->io_link_props) { + iolink = container_of(dev->io_link_props.next, + struct kfd_iolink_properties, list); + list_del(&iolink->list); + kfree(iolink); + } + + kfree(dev); + + sys_props.num_devices--; +} + +static void kfd_release_live_view(void) +{ + struct kfd_topology_device *dev; + + while (topology_device_list.next != &topology_device_list) { + dev = container_of(topology_device_list.next, + struct kfd_topology_device, list); + kfd_release_topology_device(dev); +} + + memset(&sys_props, 0, sizeof(sys_props)); +} + +static struct kfd_topology_device *kfd_create_topology_device(void) +{ + struct kfd_topology_device *dev; + + dev = kfd_alloc_struct(dev); + if (dev == NULL) { + pr_err("No memory to allocate a topology device"); + return NULL; + } + + INIT_LIST_HEAD(&dev->mem_props); + INIT_LIST_HEAD(&dev->cache_props); + INIT_LIST_HEAD(&dev->io_link_props); + + list_add_tail(&dev->list, &topology_device_list); + sys_props.num_devices++; + + return dev; +} + +static int kfd_parse_crat_table(void *crat_image) +{ + struct kfd_topology_device *top_dev; + struct crat_subtype_generic *sub_type_hdr; + uint16_t node_id; + int ret; + struct crat_header *crat_table = (struct crat_header *)crat_image; + uint16_t num_nodes; + uint32_t image_len; + + if (!crat_image) + return -EINVAL; + + num_nodes = crat_table->num_domains; + image_len = crat_table->length; + + pr_info("Parsing CRAT table with %d nodes\n", num_nodes); + + for (node_id = 0; node_id < num_nodes; node_id++) { + top_dev = kfd_create_topology_device(); + if (!top_dev) { + kfd_release_live_view(); + return -ENOMEM; + } + } + + sys_props.platform_id = + (*((uint64_t *)crat_table->oem_id)) & CRAT_OEMID_64BIT_MASK; + sys_props.platform_oem = *((uint64_t *)crat_table->oem_table_id); + sys_props.platform_rev = crat_table->revision; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); + while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < + ((char *)crat_image) + image_len) { + if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { + ret = kfd_parse_subtype(sub_type_hdr); + if (ret != 0) { + kfd_release_live_view(); + return ret; + } + } + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + } + + sys_props.generation_count++; + topology_crat_parsed = 1; + + return 0; +} + + +#define sysfs_show_gen_prop(buffer, fmt, ...) \ + snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) +#define sysfs_show_32bit_prop(buffer, name, value) \ + sysfs_show_gen_prop(buffer, "%s %u\n", name, value) +#define sysfs_show_64bit_prop(buffer, name, value) \ + sysfs_show_gen_prop(buffer, "%s %llu\n", name, value) +#define sysfs_show_32bit_val(buffer, value) \ + sysfs_show_gen_prop(buffer, "%u\n", value) +#define sysfs_show_str_val(buffer, value) \ + sysfs_show_gen_prop(buffer, "%s\n", value) + +static ssize_t sysprops_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + if (attr == &sys_props.attr_genid) { + ret = sysfs_show_32bit_val(buffer, sys_props.generation_count); + } else if (attr == &sys_props.attr_props) { + sysfs_show_64bit_prop(buffer, "platform_oem", + sys_props.platform_oem); + sysfs_show_64bit_prop(buffer, "platform_id", + sys_props.platform_id); + ret = sysfs_show_64bit_prop(buffer, "platform_rev", + sys_props.platform_rev); + } else { + ret = -EINVAL; + } + + return ret; +} + +static const struct sysfs_ops sysprops_ops = { + .show = sysprops_show, +}; + +static struct kobj_type sysprops_type = { + .sysfs_ops = &sysprops_ops, +}; + +static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + struct kfd_iolink_properties *iolink; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + iolink = container_of(attr, struct kfd_iolink_properties, attr); + sysfs_show_32bit_prop(buffer, "type", iolink->iolink_type); + sysfs_show_32bit_prop(buffer, "version_major", iolink->ver_maj); + sysfs_show_32bit_prop(buffer, "version_minor", iolink->ver_min); + sysfs_show_32bit_prop(buffer, "node_from", iolink->node_from); + sysfs_show_32bit_prop(buffer, "node_to", iolink->node_to); + sysfs_show_32bit_prop(buffer, "weight", iolink->weight); + sysfs_show_32bit_prop(buffer, "min_latency", iolink->min_latency); + sysfs_show_32bit_prop(buffer, "max_latency", iolink->max_latency); + sysfs_show_32bit_prop(buffer, "min_bandwidth", iolink->min_bandwidth); + sysfs_show_32bit_prop(buffer, "max_bandwidth", iolink->max_bandwidth); + sysfs_show_32bit_prop(buffer, "recommended_transfer_size", + iolink->rec_transfer_size); + ret = sysfs_show_32bit_prop(buffer, "flags", iolink->flags); + + return ret; +} + +static const struct sysfs_ops iolink_ops = { + .show = iolink_show, +}; + +static struct kobj_type iolink_type = { + .sysfs_ops = &iolink_ops, +}; + +static ssize_t mem_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + struct kfd_mem_properties *mem; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + mem = container_of(attr, struct kfd_mem_properties, attr); + sysfs_show_32bit_prop(buffer, "heap_type", mem->heap_type); + sysfs_show_64bit_prop(buffer, "size_in_bytes", mem->size_in_bytes); + sysfs_show_32bit_prop(buffer, "flags", mem->flags); + sysfs_show_32bit_prop(buffer, "width", mem->width); + ret = sysfs_show_32bit_prop(buffer, "mem_clk_max", mem->mem_clk_max); + + return ret; +} + +static const struct sysfs_ops mem_ops = { + .show = mem_show, +}; + +static struct kobj_type mem_type = { + .sysfs_ops = &mem_ops, +}; + +static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + ssize_t ret; + uint32_t i; + struct kfd_cache_properties *cache; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + cache = container_of(attr, struct kfd_cache_properties, attr); + sysfs_show_32bit_prop(buffer, "processor_id_low", + cache->processor_id_low); + sysfs_show_32bit_prop(buffer, "level", cache->cache_level); + sysfs_show_32bit_prop(buffer, "size", cache->cache_size); + sysfs_show_32bit_prop(buffer, "cache_line_size", cache->cacheline_size); + sysfs_show_32bit_prop(buffer, "cache_lines_per_tag", + cache->cachelines_per_tag); + sysfs_show_32bit_prop(buffer, "association", cache->cache_assoc); + sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency); + sysfs_show_32bit_prop(buffer, "type", cache->cache_type); + snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer); + for (i = 0; i < KFD_TOPOLOGY_CPU_SIBLINGS; i++) + ret = snprintf(buffer, PAGE_SIZE, "%s%d%s", + buffer, cache->sibling_map[i], + (i == KFD_TOPOLOGY_CPU_SIBLINGS-1) ? + "\n" : ","); + + return ret; +} + +static const struct sysfs_ops cache_ops = { + .show = kfd_cache_show, +}; + +static struct kobj_type cache_type = { + .sysfs_ops = &cache_ops, +}; + +static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + struct kfd_topology_device *dev; + char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; + uint32_t i; + uint32_t log_max_watch_addr; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + + if (strcmp(attr->name, "gpu_id") == 0) { + dev = container_of(attr, struct kfd_topology_device, + attr_gpuid); + return sysfs_show_32bit_val(buffer, dev->gpu_id); + } + + if (strcmp(attr->name, "name") == 0) { + dev = container_of(attr, struct kfd_topology_device, + attr_name); + for (i = 0; i < KFD_TOPOLOGY_PUBLIC_NAME_SIZE; i++) { + public_name[i] = + (char)dev->node_props.marketing_name[i]; + if (dev->node_props.marketing_name[i] == 0) + break; + } + public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE-1] = 0x0; + return sysfs_show_str_val(buffer, public_name); + } + + dev = container_of(attr, struct kfd_topology_device, + attr_props); + sysfs_show_32bit_prop(buffer, "cpu_cores_count", + dev->node_props.cpu_cores_count); + sysfs_show_32bit_prop(buffer, "simd_count", + dev->node_props.simd_count); + + if (dev->mem_bank_count < dev->node_props.mem_banks_count) { + pr_warn("kfd: mem_banks_count truncated from %d to %d\n", + dev->node_props.mem_banks_count, + dev->mem_bank_count); + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->mem_bank_count); + } else { + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->node_props.mem_banks_count); + } + + sysfs_show_32bit_prop(buffer, "caches_count", + dev->node_props.caches_count); + sysfs_show_32bit_prop(buffer, "io_links_count", + dev->node_props.io_links_count); + sysfs_show_32bit_prop(buffer, "cpu_core_id_base", + dev->node_props.cpu_core_id_base); + sysfs_show_32bit_prop(buffer, "simd_id_base", + dev->node_props.simd_id_base); + sysfs_show_32bit_prop(buffer, "max_waves_per_simd", + dev->node_props.max_waves_per_simd); + sysfs_show_32bit_prop(buffer, "lds_size_in_kb", + dev->node_props.lds_size_in_kb); + sysfs_show_32bit_prop(buffer, "gds_size_in_kb", + dev->node_props.gds_size_in_kb); + sysfs_show_32bit_prop(buffer, "wave_front_size", + dev->node_props.wave_front_size); + sysfs_show_32bit_prop(buffer, "array_count", + dev->node_props.array_count); + sysfs_show_32bit_prop(buffer, "simd_arrays_per_engine", + dev->node_props.simd_arrays_per_engine); + sysfs_show_32bit_prop(buffer, "cu_per_simd_array", + dev->node_props.cu_per_simd_array); + sysfs_show_32bit_prop(buffer, "simd_per_cu", + dev->node_props.simd_per_cu); + sysfs_show_32bit_prop(buffer, "max_slots_scratch_cu", + dev->node_props.max_slots_scratch_cu); + sysfs_show_32bit_prop(buffer, "vendor_id", + dev->node_props.vendor_id); + sysfs_show_32bit_prop(buffer, "device_id", + dev->node_props.device_id); + sysfs_show_32bit_prop(buffer, "location_id", + dev->node_props.location_id); + + if (dev->gpu) { + log_max_watch_addr = + __ilog2_u32(dev->gpu->device_info->num_of_watch_points); + + if (log_max_watch_addr) { + dev->node_props.capability |= + HSA_CAP_WATCH_POINTS_SUPPORTED; + + dev->node_props.capability |= + ((log_max_watch_addr << + HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT) & + HSA_CAP_WATCH_POINTS_TOTALBITS_MASK); + } + + sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", + dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz( + dev->gpu->kgd)); + + sysfs_show_64bit_prop(buffer, "local_mem_size", + (unsigned long long int) 0); + + sysfs_show_32bit_prop(buffer, "fw_version", + dev->gpu->kfd2kgd->get_fw_version( + dev->gpu->kgd, + KGD_ENGINE_MEC1)); + sysfs_show_32bit_prop(buffer, "capability", + dev->node_props.capability); + } + + return sysfs_show_32bit_prop(buffer, "max_engine_clk_ccompute", + cpufreq_quick_get_max(0)/1000); +} + +static const struct sysfs_ops node_ops = { + .show = node_show, +}; + +static struct kobj_type node_type = { + .sysfs_ops = &node_ops, +}; + +static void kfd_remove_sysfs_file(struct kobject *kobj, struct attribute *attr) +{ + sysfs_remove_file(kobj, attr); + kobject_del(kobj); + kobject_put(kobj); +} + +static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) +{ + struct kfd_iolink_properties *iolink; + struct kfd_cache_properties *cache; + struct kfd_mem_properties *mem; + + BUG_ON(!dev); + + if (dev->kobj_iolink) { + list_for_each_entry(iolink, &dev->io_link_props, list) + if (iolink->kobj) { + kfd_remove_sysfs_file(iolink->kobj, + &iolink->attr); + iolink->kobj = NULL; + } + kobject_del(dev->kobj_iolink); + kobject_put(dev->kobj_iolink); + dev->kobj_iolink = NULL; + } + + if (dev->kobj_cache) { + list_for_each_entry(cache, &dev->cache_props, list) + if (cache->kobj) { + kfd_remove_sysfs_file(cache->kobj, + &cache->attr); + cache->kobj = NULL; + } + kobject_del(dev->kobj_cache); + kobject_put(dev->kobj_cache); + dev->kobj_cache = NULL; + } + + if (dev->kobj_mem) { + list_for_each_entry(mem, &dev->mem_props, list) + if (mem->kobj) { + kfd_remove_sysfs_file(mem->kobj, &mem->attr); + mem->kobj = NULL; + } + kobject_del(dev->kobj_mem); + kobject_put(dev->kobj_mem); + dev->kobj_mem = NULL; + } + + if (dev->kobj_node) { + sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid); + sysfs_remove_file(dev->kobj_node, &dev->attr_name); + sysfs_remove_file(dev->kobj_node, &dev->attr_props); + kobject_del(dev->kobj_node); + kobject_put(dev->kobj_node); + dev->kobj_node = NULL; + } +} + +static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, + uint32_t id) +{ + struct kfd_iolink_properties *iolink; + struct kfd_cache_properties *cache; + struct kfd_mem_properties *mem; + int ret; + uint32_t i; + + BUG_ON(!dev); + + /* + * Creating the sysfs folders + */ + BUG_ON(dev->kobj_node); + dev->kobj_node = kfd_alloc_struct(dev->kobj_node); + if (!dev->kobj_node) + return -ENOMEM; + + ret = kobject_init_and_add(dev->kobj_node, &node_type, + sys_props.kobj_nodes, "%d", id); + if (ret < 0) + return ret; + + dev->kobj_mem = kobject_create_and_add("mem_banks", dev->kobj_node); + if (!dev->kobj_mem) + return -ENOMEM; + + dev->kobj_cache = kobject_create_and_add("caches", dev->kobj_node); + if (!dev->kobj_cache) + return -ENOMEM; + + dev->kobj_iolink = kobject_create_and_add("io_links", dev->kobj_node); + if (!dev->kobj_iolink) + return -ENOMEM; + + /* + * Creating sysfs files for node properties + */ + dev->attr_gpuid.name = "gpu_id"; + dev->attr_gpuid.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&dev->attr_gpuid); + dev->attr_name.name = "name"; + dev->attr_name.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&dev->attr_name); + dev->attr_props.name = "properties"; + dev->attr_props.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&dev->attr_props); + ret = sysfs_create_file(dev->kobj_node, &dev->attr_gpuid); + if (ret < 0) + return ret; + ret = sysfs_create_file(dev->kobj_node, &dev->attr_name); + if (ret < 0) + return ret; + ret = sysfs_create_file(dev->kobj_node, &dev->attr_props); + if (ret < 0) + return ret; + + i = 0; + list_for_each_entry(mem, &dev->mem_props, list) { + mem->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!mem->kobj) + return -ENOMEM; + ret = kobject_init_and_add(mem->kobj, &mem_type, + dev->kobj_mem, "%d", i); + if (ret < 0) + return ret; + + mem->attr.name = "properties"; + mem->attr.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&mem->attr); + ret = sysfs_create_file(mem->kobj, &mem->attr); + if (ret < 0) + return ret; + i++; + } + + i = 0; + list_for_each_entry(cache, &dev->cache_props, list) { + cache->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!cache->kobj) + return -ENOMEM; + ret = kobject_init_and_add(cache->kobj, &cache_type, + dev->kobj_cache, "%d", i); + if (ret < 0) + return ret; + + cache->attr.name = "properties"; + cache->attr.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&cache->attr); + ret = sysfs_create_file(cache->kobj, &cache->attr); + if (ret < 0) + return ret; + i++; + } + + i = 0; + list_for_each_entry(iolink, &dev->io_link_props, list) { + iolink->kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!iolink->kobj) + return -ENOMEM; + ret = kobject_init_and_add(iolink->kobj, &iolink_type, + dev->kobj_iolink, "%d", i); + if (ret < 0) + return ret; + + iolink->attr.name = "properties"; + iolink->attr.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&iolink->attr); + ret = sysfs_create_file(iolink->kobj, &iolink->attr); + if (ret < 0) + return ret; + i++; +} + + return 0; +} + +static int kfd_build_sysfs_node_tree(void) +{ + struct kfd_topology_device *dev; + int ret; + uint32_t i = 0; + + list_for_each_entry(dev, &topology_device_list, list) { + ret = kfd_build_sysfs_node_entry(dev, i); + if (ret < 0) + return ret; + i++; + } + + return 0; +} + +static void kfd_remove_sysfs_node_tree(void) +{ + struct kfd_topology_device *dev; + + list_for_each_entry(dev, &topology_device_list, list) + kfd_remove_sysfs_node_entry(dev); +} + +static int kfd_topology_update_sysfs(void) +{ + int ret; + + pr_info("Creating topology SYSFS entries\n"); + if (sys_props.kobj_topology == NULL) { + sys_props.kobj_topology = + kfd_alloc_struct(sys_props.kobj_topology); + if (!sys_props.kobj_topology) + return -ENOMEM; + + ret = kobject_init_and_add(sys_props.kobj_topology, + &sysprops_type, &kfd_device->kobj, + "topology"); + if (ret < 0) + return ret; + + sys_props.kobj_nodes = kobject_create_and_add("nodes", + sys_props.kobj_topology); + if (!sys_props.kobj_nodes) + return -ENOMEM; + + sys_props.attr_genid.name = "generation_id"; + sys_props.attr_genid.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&sys_props.attr_genid); + ret = sysfs_create_file(sys_props.kobj_topology, + &sys_props.attr_genid); + if (ret < 0) + return ret; + + sys_props.attr_props.name = "system_properties"; + sys_props.attr_props.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&sys_props.attr_props); + ret = sysfs_create_file(sys_props.kobj_topology, + &sys_props.attr_props); + if (ret < 0) + return ret; + } + + kfd_remove_sysfs_node_tree(); + + return kfd_build_sysfs_node_tree(); +} + +static void kfd_topology_release_sysfs(void) +{ + kfd_remove_sysfs_node_tree(); + if (sys_props.kobj_topology) { + sysfs_remove_file(sys_props.kobj_topology, + &sys_props.attr_genid); + sysfs_remove_file(sys_props.kobj_topology, + &sys_props.attr_props); + if (sys_props.kobj_nodes) { + kobject_del(sys_props.kobj_nodes); + kobject_put(sys_props.kobj_nodes); + sys_props.kobj_nodes = NULL; + } + kobject_del(sys_props.kobj_topology); + kobject_put(sys_props.kobj_topology); + sys_props.kobj_topology = NULL; + } +} + +int kfd_topology_init(void) +{ + void *crat_image = NULL; + size_t image_size = 0; + int ret; + + /* + * Initialize the head for the topology device list + */ + INIT_LIST_HEAD(&topology_device_list); + init_rwsem(&topology_lock); + topology_crat_parsed = 0; + + memset(&sys_props, 0, sizeof(sys_props)); + + /* + * Get the CRAT image from the ACPI + */ + ret = kfd_topology_get_crat_acpi(crat_image, &image_size); + if (ret == 0 && image_size > 0) { + pr_info("Found CRAT image with size=%zd\n", image_size); + crat_image = kmalloc(image_size, GFP_KERNEL); + if (!crat_image) { + ret = -ENOMEM; + pr_err("No memory for allocating CRAT image\n"); + goto err; + } + ret = kfd_topology_get_crat_acpi(crat_image, &image_size); + + if (ret == 0) { + down_write(&topology_lock); + ret = kfd_parse_crat_table(crat_image); + if (ret == 0) + ret = kfd_topology_update_sysfs(); + up_write(&topology_lock); + } else { + pr_err("Couldn't get CRAT table size from ACPI\n"); + } + kfree(crat_image); + } else if (ret == -ENODATA) { + ret = 0; + } else { + pr_err("Couldn't get CRAT table size from ACPI\n"); + } + +err: + pr_info("Finished initializing topology ret=%d\n", ret); + return ret; +} + +void kfd_topology_shutdown(void) +{ + kfd_topology_release_sysfs(); + kfd_release_live_view(); +} + +static void kfd_debug_print_topology(void) +{ + struct kfd_topology_device *dev; + uint32_t i = 0; + + pr_info("DEBUG PRINT OF TOPOLOGY:"); + list_for_each_entry(dev, &topology_device_list, list) { + pr_info("Node: %d\n", i); + pr_info("\tGPU assigned: %s\n", (dev->gpu ? "yes" : "no")); + pr_info("\tCPU count: %d\n", dev->node_props.cpu_cores_count); + pr_info("\tSIMD count: %d", dev->node_props.simd_count); + i++; + } +} + +static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) +{ + uint32_t hashout; + uint32_t buf[7]; + int i; + + if (!gpu) + return 0; + + buf[0] = gpu->pdev->devfn; + buf[1] = gpu->pdev->subsystem_vendor; + buf[2] = gpu->pdev->subsystem_device; + buf[3] = gpu->pdev->device; + buf[4] = gpu->pdev->bus->number; + buf[5] = (uint32_t)(gpu->kfd2kgd->get_vmem_size(gpu->kgd) + & 0xffffffff); + buf[6] = (uint32_t)(gpu->kfd2kgd->get_vmem_size(gpu->kgd) >> 32); + + for (i = 0, hashout = 0; i < 7; i++) + hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH); + + return hashout; +} + +static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) +{ + struct kfd_topology_device *dev; + struct kfd_topology_device *out_dev = NULL; + + BUG_ON(!gpu); + + list_for_each_entry(dev, &topology_device_list, list) + if (dev->gpu == NULL && dev->node_props.simd_count > 0) { + dev->gpu = gpu; + out_dev = dev; + break; + } + + return out_dev; +} + +static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival) +{ + /* + * TODO: Generate an event for thunk about the arrival/removal + * of the GPU + */ +} + +int kfd_topology_add_device(struct kfd_dev *gpu) +{ + uint32_t gpu_id; + struct kfd_topology_device *dev; + int res; + + BUG_ON(!gpu); + + gpu_id = kfd_generate_gpu_id(gpu); + + pr_debug("kfd: Adding new GPU (ID: 0x%x) to topology\n", gpu_id); + + down_write(&topology_lock); + /* + * Try to assign the GPU to existing topology device (generated from + * CRAT table + */ + dev = kfd_assign_gpu(gpu); + if (!dev) { + pr_info("GPU was not found in the current topology. Extending.\n"); + kfd_debug_print_topology(); + dev = kfd_create_topology_device(); + if (!dev) { + res = -ENOMEM; + goto err; + } + dev->gpu = gpu; + + /* + * TODO: Make a call to retrieve topology information from the + * GPU vBIOS + */ + + /* + * Update the SYSFS tree, since we added another topology device + */ + if (kfd_topology_update_sysfs() < 0) + kfd_topology_release_sysfs(); + + } + + dev->gpu_id = gpu_id; + gpu->id = gpu_id; + dev->node_props.vendor_id = gpu->pdev->vendor; + dev->node_props.device_id = gpu->pdev->device; + dev->node_props.location_id = (gpu->pdev->bus->number << 24) + + (gpu->pdev->devfn & 0xffffff); + /* + * TODO: Retrieve max engine clock values from KGD + */ + + res = 0; + +err: + up_write(&topology_lock); + + if (res == 0) + kfd_notify_gpu_change(gpu_id, 1); + + return res; +} + +int kfd_topology_remove_device(struct kfd_dev *gpu) +{ + struct kfd_topology_device *dev; + uint32_t gpu_id; + int res = -ENODEV; + + BUG_ON(!gpu); + + down_write(&topology_lock); + + list_for_each_entry(dev, &topology_device_list, list) + if (dev->gpu == gpu) { + gpu_id = dev->gpu_id; + kfd_remove_sysfs_node_entry(dev); + kfd_release_topology_device(dev); + res = 0; + if (kfd_topology_update_sysfs() < 0) + kfd_topology_release_sysfs(); + break; + } + + up_write(&topology_lock); + + if (res == 0) + kfd_notify_gpu_change(gpu_id, 0); + + return res; +} + +/* + * When idx is out of bounds, the function will return NULL + */ +struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx) +{ + + struct kfd_topology_device *top_dev; + struct kfd_dev *device = NULL; + uint8_t device_idx = 0; + + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) { + if (device_idx == idx) { + device = top_dev->gpu; + break; + } + + device_idx++; + } + + up_read(&topology_lock); + + return device; + +} diff --git a/kernel/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_topology.h new file mode 100644 index 000000000..989624b3c --- /dev/null +++ b/kernel/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -0,0 +1,168 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef __KFD_TOPOLOGY_H__ +#define __KFD_TOPOLOGY_H__ + +#include +#include +#include "kfd_priv.h" + +#define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 128 + +#define HSA_CAP_HOT_PLUGGABLE 0x00000001 +#define HSA_CAP_ATS_PRESENT 0x00000002 +#define HSA_CAP_SHARED_WITH_GRAPHICS 0x00000004 +#define HSA_CAP_QUEUE_SIZE_POW2 0x00000008 +#define HSA_CAP_QUEUE_SIZE_32BIT 0x00000010 +#define HSA_CAP_QUEUE_IDLE_EVENT 0x00000020 +#define HSA_CAP_VA_LIMIT 0x00000040 +#define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080 +#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 +#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 +#define HSA_CAP_RESERVED 0xfffff000 + +struct kfd_node_properties { + uint32_t cpu_cores_count; + uint32_t simd_count; + uint32_t mem_banks_count; + uint32_t caches_count; + uint32_t io_links_count; + uint32_t cpu_core_id_base; + uint32_t simd_id_base; + uint32_t capability; + uint32_t max_waves_per_simd; + uint32_t lds_size_in_kb; + uint32_t gds_size_in_kb; + uint32_t wave_front_size; + uint32_t array_count; + uint32_t simd_arrays_per_engine; + uint32_t cu_per_simd_array; + uint32_t simd_per_cu; + uint32_t max_slots_scratch_cu; + uint32_t engine_id; + uint32_t vendor_id; + uint32_t device_id; + uint32_t location_id; + uint32_t max_engine_clk_fcompute; + uint32_t max_engine_clk_ccompute; + uint16_t marketing_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; +}; + +#define HSA_MEM_HEAP_TYPE_SYSTEM 0 +#define HSA_MEM_HEAP_TYPE_FB_PUBLIC 1 +#define HSA_MEM_HEAP_TYPE_FB_PRIVATE 2 +#define HSA_MEM_HEAP_TYPE_GPU_GDS 3 +#define HSA_MEM_HEAP_TYPE_GPU_LDS 4 +#define HSA_MEM_HEAP_TYPE_GPU_SCRATCH 5 + +#define HSA_MEM_FLAGS_HOT_PLUGGABLE 0x00000001 +#define HSA_MEM_FLAGS_NON_VOLATILE 0x00000002 +#define HSA_MEM_FLAGS_RESERVED 0xfffffffc + +struct kfd_mem_properties { + struct list_head list; + uint32_t heap_type; + uint64_t size_in_bytes; + uint32_t flags; + uint32_t width; + uint32_t mem_clk_max; + struct kobject *kobj; + struct attribute attr; +}; + +#define KFD_TOPOLOGY_CPU_SIBLINGS 256 + +#define HSA_CACHE_TYPE_DATA 0x00000001 +#define HSA_CACHE_TYPE_INSTRUCTION 0x00000002 +#define HSA_CACHE_TYPE_CPU 0x00000004 +#define HSA_CACHE_TYPE_HSACU 0x00000008 +#define HSA_CACHE_TYPE_RESERVED 0xfffffff0 + +struct kfd_cache_properties { + struct list_head list; + uint32_t processor_id_low; + uint32_t cache_level; + uint32_t cache_size; + uint32_t cacheline_size; + uint32_t cachelines_per_tag; + uint32_t cache_assoc; + uint32_t cache_latency; + uint32_t cache_type; + uint8_t sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS]; + struct kobject *kobj; + struct attribute attr; +}; + +struct kfd_iolink_properties { + struct list_head list; + uint32_t iolink_type; + uint32_t ver_maj; + uint32_t ver_min; + uint32_t node_from; + uint32_t node_to; + uint32_t weight; + uint32_t min_latency; + uint32_t max_latency; + uint32_t min_bandwidth; + uint32_t max_bandwidth; + uint32_t rec_transfer_size; + uint32_t flags; + struct kobject *kobj; + struct attribute attr; +}; + +struct kfd_topology_device { + struct list_head list; + uint32_t gpu_id; + struct kfd_node_properties node_props; + uint32_t mem_bank_count; + struct list_head mem_props; + uint32_t cache_count; + struct list_head cache_props; + uint32_t io_link_count; + struct list_head io_link_props; + struct kfd_dev *gpu; + struct kobject *kobj_node; + struct kobject *kobj_mem; + struct kobject *kobj_cache; + struct kobject *kobj_iolink; + struct attribute attr_gpuid; + struct attribute attr_name; + struct attribute attr_props; +}; + +struct kfd_system_properties { + uint32_t num_devices; /* Number of H-NUMA nodes */ + uint32_t generation_count; + uint64_t platform_oem; + uint64_t platform_id; + uint64_t platform_rev; + struct kobject *kobj_topology; + struct kobject *kobj_nodes; + struct attribute attr_genid; + struct attribute attr_props; +}; + + + +#endif /* __KFD_TOPOLOGY_H__ */ -- cgit 1.2.3-korg