diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index feed8a724b..baf936068b 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -182,6 +182,13 @@ parser.add_option("--num-hw-queues", type="int", default=10, parser.add_option("--reg-alloc-policy",type="string", default="simple", help="register allocation policy (simple/dynamic)") +parser.add_option("--dgpu", action="store_true", default=False, + help="Configure the system as a dGPU instead of an APU. " + "The dGPU config has its own local memory pool and is not " + "coherent with the host through hardware. Data is " + "transfered from host to device memory using runtime calls " + "that copy data over a PCIe-like IO bus.") + Ruby.define_options(parser) #add TLB options to the parser @@ -417,7 +424,7 @@ hsapp_gpu_map_size = 0x1000 hsapp_gpu_map_paddr = int(Addr(options.mem_size)) # HSA kernel mode driver -gpu_driver = GPUComputeDriver(filename="kfd") +gpu_driver = GPUComputeDriver(filename = "kfd", isdGPU = options.dgpu) # Creating the GPU kernel launching components: that is the HSA # packet processor (HSAPP), GPU command processor (CP), and the @@ -470,7 +477,15 @@ else: "/usr/lib/x86_64-linux-gnu" ]), 'HOME=%s' % os.getenv('HOME','/'), - "HSA_ENABLE_INTERRUPT=1"] + # Disabling the VM fault handler signal creation for dGPUs also + # forces the use of DefaultSignals instead of driver-controlled + # InterruptSignals throughout the runtime. DefaultSignals poll + # on memory in the runtime, while InterruptSignals call into the + # driver. + "HSA_ENABLE_INTERRUPT=1", + # We don't have an SDMA hardware model, so need to fall back to + # vector copy kernels for dGPU memcopies to/from host and device. 
+ "HSA_ENABLE_SDMA=0"] process = Process(executable = executable, cmd = [options.cmd] + options.options.split(), drivers = [gpu_driver], env = env) @@ -643,7 +658,12 @@ system.redirect_paths = redirect_paths root = Root(system=system, full_system=False) -hsaTopology.createHsaTopology(options) +# Create the /sys/devices filesystem for the simulator so that the HSA Runtime +# knows what type of GPU hardware we are simulating +if options.dgpu: + hsaTopology.createFijiTopology(options) +else: + hsaTopology.createCarrizoTopology(options) m5.ticks.setGlobalFrequency('1THz') if options.abs_max_tick: diff --git a/configs/example/hsaTopology.py b/configs/example/hsaTopology.py index 707a83df3d..a5e0d446fc 100644 --- a/configs/example/hsaTopology.py +++ b/configs/example/hsaTopology.py @@ -49,7 +49,177 @@ def remake_dir(path): rmtree(path) makedirs(path) -def createHsaTopology(options): +# This fakes out a dGPU setup so the runtime operates correctly. The spoofed +# system has a single dGPU and a single socket CPU. Note that more complex +# topologies (multi-GPU, multi-socket CPUs) need to have a different setup +# here or the runtime won't be able to issue Memcpies from one node to another. +# +# TODO: There is way too much hardcoded here. It doesn't affect anything in +# our current ROCm stack (1.6), but it is highly possible that it will in the +# future. We might need to scrub through this and extract the appropriate +# fields from the simulator in the future. +def createFijiTopology(options): + topology_dir = joinpath(m5.options.outdir, \ + 'fs/sys/devices/virtual/kfd/kfd/topology') + remake_dir(topology_dir) + + amdgpu_dir = joinpath(m5.options.outdir, \ + 'fs/sys/module/amdgpu/parameters') + remake_dir(amdgpu_dir) + + # Fiji reported VM size in GB. Used to reserve an allocation from CPU + # to implement SVM (i.e. 
GPUVM64 pointers and X86 pointers agree) + file_append((amdgpu_dir, 'vm_size'), 256) + + # Ripped from real Fiji platform to appease KMT version checks + file_append((topology_dir, 'generation_id'), 2) + + # Set up system properties. Register as ast-rocm server + sys_prop = 'platform_oem 35498446626881\n' + \ + 'platform_id 71791775140929\n' + \ + 'platform_rev 2\n' + file_append((topology_dir, 'system_properties'), sys_prop) + + # Populate the topology tree + # Our dGPU system is two nodes. Node 0 is a CPU and Node 1 is a dGPU + node_dir = joinpath(topology_dir, 'nodes/0') + remake_dir(node_dir) + + # Register as a CPU + file_append((node_dir, 'gpu_id'), 0) + file_append((node_dir, 'name'), '') + + # CPU links. Only thing that matters is we tell the runtime that GPU is + # connected through PCIe to CPU socket 0. + io_links = 1 + io_dir = joinpath(node_dir, 'io_links/0') + remake_dir(io_dir) + io_prop = 'type 2\n' + \ + 'version_major 0\n' + \ + 'version_minor 0\n' + \ + 'node_from 0\n' + \ + 'node_to 1\n' + \ + 'weight 20\n' + \ + 'min_latency 0\n' + \ + 'max_latency 0\n' + \ + 'min_bandwidth 0\n' + \ + 'max_bandwidth 0\n' + \ + 'recommended_transfer_size 0\n' + \ + 'flags 13\n' + file_append((io_dir, 'properties'), io_prop) + + # Populate CPU node properties + node_prop = 'cpu_cores_count %s\n' % options.num_cpus + \ + 'simd_count 0\n' + \ + 'mem_banks_count 1\n' + \ + 'caches_count 0\n' + \ + 'io_links_count %s\n' % io_links + \ + 'cpu_core_id_base 0\n' + \ + 'simd_id_base 0\n' + \ + 'max_waves_per_simd 0\n' + \ + 'lds_size_in_kb 0\n' + \ + 'gds_size_in_kb 0\n' + \ + 'wave_front_size 64\n' + \ + 'array_count 0\n' + \ + 'simd_arrays_per_engine 0\n' + \ + 'cu_per_simd_array 0\n' + \ + 'simd_per_cu 0\n' + \ + 'max_slots_scratch_cu 0\n' + \ + 'vendor_id 0\n' + \ + 'device_id 0\n' + \ + 'location_id 0\n' + \ + 'drm_render_minor 0\n' + \ + 'max_engine_clk_ccompute 3400\n' + + file_append((node_dir, 'properties'), node_prop) + + # CPU memory reporting + mem_dir = 
joinpath(node_dir, 'mem_banks/0') + remake_dir(mem_dir) + mem_prop = 'heap_type 0\n' + \ + 'size_in_bytes 33704329216\n' + \ + 'flags 0\n' + \ + 'width 72\n' + \ + 'mem_clk_max 2400\n' + + file_append((mem_dir, 'properties'), mem_prop) + + # Build the GPU node + node_dir = joinpath(topology_dir, 'nodes/1') + remake_dir(node_dir) + + # Register as a Fiji + file_append((node_dir, 'gpu_id'), 50156) + file_append((node_dir, 'name'), 'Fiji\n') + + # Real Fiji shows 96, but building that topology is complex and doesn't + # appear to be required for anything. + caches = 0 + + # GPU links. Only thing that matters is we tell the runtime that GPU is + # connected through PCIe to CPU socket 0. + io_links = 1 + io_dir = joinpath(node_dir, 'io_links/0') + remake_dir(io_dir) + io_prop = 'type 2\n' + \ + 'version_major 0\n' + \ + 'version_minor 0\n' + \ + 'node_from 1\n' + \ + 'node_to 0\n' + \ + 'weight 20\n' + \ + 'min_latency 0\n' + \ + 'max_latency 0\n' + \ + 'min_bandwidth 0\n' + \ + 'max_bandwidth 0\n' + \ + 'recommended_transfer_size 0\n' + \ + 'flags 1\n' + file_append((io_dir, 'properties'), io_prop) + + # Populate GPU node properties + node_prop = 'cpu_cores_count %s\n' % options.num_cpus + \ + 'simd_count %s\n' \ + % (options.num_compute_units * options.simds_per_cu) + \ + 'mem_banks_count 1\n' + \ + 'caches_count %s\n' % caches + \ + 'io_links_count %s\n' % io_links + \ + 'cpu_core_id_base 0\n' + \ + 'simd_id_base 2147487744\n' + \ + 'max_waves_per_simd %s\n' % options.wfs_per_simd + \ + 'lds_size_in_kb %s\n' % int(options.lds_size / 1024) + \ + 'gds_size_in_kb 0\n' + \ + 'wave_front_size %s\n' % options.wf_size + \ + 'array_count 4\n' + \ + 'simd_arrays_per_engine %s\n' % options.sa_per_complex + \ + 'cu_per_simd_array %s\n' % options.cu_per_sa + \ + 'simd_per_cu %s\n' % options.simds_per_cu + \ + 'max_slots_scratch_cu 32\n' + \ + 'vendor_id 4098\n' + \ + 'device_id 29440\n' + \ + 'location_id 512\n' + \ + 'max_engine_clk_fcompute %s\n' \ + % 
int(toFrequency(options.gpu_clock) / 1e6) + \ + 'local_mem_size 4294967296\n' + \ + 'fw_version 730\n' + \ + 'capability 4736\n' + \ + 'max_engine_clk_ccompute %s\n' \ + % int(toFrequency(options.CPUClock) / 1e6) + + file_append((node_dir, 'properties'), node_prop) + + # Fiji HBM reporting + # TODO: Extract size, clk, and width from sim parameters + mem_dir = joinpath(node_dir, 'mem_banks/0') + remake_dir(mem_dir) + mem_prop = 'heap_type 1\n' + \ + 'size_in_bytes 4294967296\n' + \ + 'flags 0\n' + \ + 'width 4096\n' + \ + 'mem_clk_max 500\n' + + file_append((mem_dir, 'properties'), mem_prop) + + +def createCarrizoTopology(options): topology_dir = joinpath(m5.options.outdir, \ 'fs/sys/devices/virtual/kfd/kfd/topology') remake_dir(topology_dir) diff --git a/src/dev/hsa/hsa_driver.cc b/src/dev/hsa/hsa_driver.cc index db31cbc103..f2db43635c 100644 --- a/src/dev/hsa/hsa_driver.cc +++ b/src/dev/hsa/hsa_driver.cc @@ -70,25 +70,25 @@ Addr HSADriver::mmap(ThreadContext *tc, Addr start, uint64_t length, int prot, int tgt_flags, int tgt_fd, off_t offset) { - // Is this a signal event mmap - bool is_event_mmap = false; - // If addr == 0, then we may need to do mmap. - bool should_mmap = (start == 0); - auto process = tc->getProcessPtr(); - auto mem_state = process->memState; - // Check if mmap is for signal events first - if (((offset >> PAGE_SHIFT) & KFD_MMAP_TYPE_MASK) == - KFD_MMAP_TYPE_EVENTS) { - is_event_mmap = true; - DPRINTF(HSADriver, "amdkfd mmap for events(start: %p, length: 0x%x," - "offset: 0x%x, )\n", start, length, offset); - panic_if(start != 0, - "Start address should be provided by KFD\n"); - panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT, - "Requested length %d, expected length %d; length mismatch\n", - length, 8 * KFD_SIGNAL_EVENT_LIMIT); - // For signal event, do mmap only is eventPage is uninitialized - should_mmap = (!eventPage); + // Is this a signal event mmap + bool is_event_mmap = false; + // If addr == 0, then we may need to do mmap. 
+ bool should_mmap = (start == 0); + auto process = tc->getProcessPtr(); + auto mem_state = process->memState; + // Check if mmap is for signal events first + if (((offset >> PAGE_SHIFT) & KFD_MMAP_TYPE_MASK) == + KFD_MMAP_TYPE_EVENTS) { + is_event_mmap = true; + DPRINTF(HSADriver, "amdkfd mmap for events(start: %p, length: 0x%x," + "offset: 0x%x, )\n", start, length, offset); + panic_if(start != 0, + "Start address should be provided by KFD\n"); + panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT, + "Requested length %d, expected length %d; length mismatch\n", + length, 8 * KFD_SIGNAL_EVENT_LIMIT); + // For signal event, do mmap only is eventPage is uninitialized + should_mmap = (!eventPage); } else { DPRINTF(HSADriver, "amdkfd doorbell mmap (start: %p, length: 0x%x," "offset: 0x%x)\n", start, length, offset); diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index d2959ac6b8..e54882330c 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -236,6 +236,7 @@ class Shader(ClockedObject): class GPUComputeDriver(HSADriver): type = 'GPUComputeDriver' cxx_header = 'gpu-compute/gpu_compute_driver.hh' + isdGPU = Param.Bool(False, 'Driver is for a dGPU') class GPUDispatcher(SimObject): type = 'GPUDispatcher' diff --git a/src/gpu-compute/gpu_compute_driver.cc b/src/gpu-compute/gpu_compute_driver.cc index 664afa9257..6c4639a2b2 100644 --- a/src/gpu-compute/gpu_compute_driver.cc +++ b/src/gpu-compute/gpu_compute_driver.cc @@ -40,10 +40,11 @@ #include "dev/hsa/kfd_event_defines.h" #include "dev/hsa/kfd_ioctl.h" #include "params/GPUComputeDriver.hh" +#include "sim/process.hh" #include "sim/syscall_emul_buf.hh" GPUComputeDriver::GPUComputeDriver(const Params &p) - : HSADriver(p) + : HSADriver(p), isdGPU(p.isdGPU) { device->attachDriver(this); DPRINTF(GPUDriver, "Constructing KFD: device\n"); @@ -86,6 +87,19 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) break; case AMDKFD_IOC_SET_MEMORY_POLICY: { + /** + * This is where the runtime 
requests MTYPE from an aperture. + * Basically, the global memory aperture is divided up into + * a default aperture and an alternate aperture each of which has + * its own MTYPE policies. This is done to mark a small piece + * of the global memory as uncacheable. Host memory mappings will + * be carved out of this uncacheable aperture, which is how they + * implement 'coherent' host/device memory on dGPUs. + * + * TODO: Need to reflect per-aperture MTYPE policies based on this + * call. + * + */ warn("unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n"); } break; @@ -145,7 +159,10 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) gpuVmApeLimit(args->process_apertures[i].gpuvm_base); // NOTE: Must match ID populated by hsaTopology.py - args->process_apertures[i].gpu_id = 2765; + if (isdGPU) + args->process_apertures[i].gpu_id = 50156; + else + args->process_apertures[i].gpu_id = 2765; DPRINTF(GPUDriver, "GPUVM base for node[%i] = %#x\n", i, args->process_apertures[i].gpuvm_base); @@ -351,9 +368,91 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) warn("unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n"); } break; + /** + * In real hardware, this IOCTL maps host memory, dGPU memory, or dGPU + * doorbells into GPUVM space. Essentially, ROCm implements SVM by + * carving out a region of free VA space that both the host and GPUVM + * can agree upon. The entire GPU VA space is reserved on the host + * using a fixed mmap at a low VA range that is also directly + * accessible by the GPU's limited number of VA bits. When we actually + * call memory allocation later in the program, this IOCTL is invoked + * to create BOs/VMAs in the driver and bind them to physical + * memory/doorbells. + * + * For gem5, we don't need to carve out any GPUVM space here (we don't + * support GPUVM and use host page tables on the GPU directly). We can + * just use the existing host SVM region. 
We comment on each memory + * type separately. + */ case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU: { - warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n"); + DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n"); + TypedBufferArg args(ioc_buf); + args.copyIn(virt_proxy); + + assert(isdGPU); + assert((args->va_addr % TheISA::PageBytes) == 0); + Addr mmap_offset = 0; + + if (KFD_IOC_ALLOC_MEM_FLAGS_VRAM & args->flags) { + DPRINTF(GPUDriver, "amdkfd allocation type: VRAM\n"); + args->mmap_offset = args->va_addr; + // VRAM allocations are device memory mapped into GPUVM + // space. + // + // We can't rely on the lazy host allocator (fixupFault) to + // handle this mapping since it needs to be placed in dGPU + // framebuffer memory. The lazy allocator will try to place + // this in host memory. + // + // TODO: We don't have the appropriate bifurcation of the + // physical address space with different memory controllers + // yet. This is where we will explicitly add the PT maps to + // dGPU memory in the future. + } else if (KFD_IOC_ALLOC_MEM_FLAGS_USERPTR & args->flags) { + DPRINTF(GPUDriver, "amdkfd allocation type: USERPTR\n"); + mmap_offset = args->mmap_offset; + // USERPTR allocations are system memory mapped into GPUVM + // space. The user provides the driver with the pointer. + // + // No action needs to be taken for this memory type. We will + // lazily map it into host memory on first touch. + } else if (KFD_IOC_ALLOC_MEM_FLAGS_GTT & args->flags) { + DPRINTF(GPUDriver, "amdkfd allocation type: GTT\n"); + args->mmap_offset = args->va_addr; + // GTT allocations are system memory mapped into GPUVM space. + // It's different than a USERPTR allocation since the driver + // itself allocates the physical memory on the host. + // + // No action needs to be taken for this memory type. We will + // lazily map it into host memory on first touch. The + // fixupFault will find the original SVM aperture mapped to the + // host. 
+ // + // Note that for GTT the thunk layer needs to call mmap on the + // driver FD later if it wants the host to have access to this + // memory (which it probably does). + } else if (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL & args->flags) { + DPRINTF(GPUDriver, "amdkfd allocation type: DOORBELL\n"); + // DOORBELL allocations are the queue doorbells that are + // memory mapped into GPUVM space. + // + // Explicitly map this virtual address to our PIO doorbell + // interface in the page tables (non-cacheable) + tc->getProcessPtr()->pTable->map(args->va_addr, + device->hsaPacketProc().pioAddr, + args->size, false); + break; + } + + DPRINTF(GPUDriver, "amdkfd allocation arguments: va_addr %p " + "size %lu, mmap_offset %p, gpu_id %d\n", + args->va_addr, args->size, mmap_offset, args->gpu_id); + + // TODO: Not sure where the handle is used yet. Set it to an + // easily trackable value. + args->handle= 0xdeadbeef; + args.copyOut(virt_proxy); } break; case AMDKFD_IOC_FREE_MEMORY_OF_GPU: @@ -361,6 +460,13 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) warn("unimplemented ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n"); } break; + /** + * Called to map an already allocated region of memory to this GPU's + * GPUVM VA space. We don't need to implement this in the simulator + * since we only have a single VM system. If the region has already + * been allocated somewhere like the CPU, then it's already visible + * to the device. 
+ */ case AMDKFD_IOC_MAP_MEMORY_TO_GPU: { warn("unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n"); @@ -415,7 +521,11 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) ape_args->gpuvm_base = gpuVmApeBase(i + 1); ape_args->gpuvm_limit = gpuVmApeLimit(ape_args->gpuvm_base); - ape_args->gpu_id = 2765; + // NOTE: Must match ID populated by hsaTopology.py + if (isdGPU) + ape_args->gpu_id = 50156; + else + ape_args->gpu_id = 2765; assert(bits(ape_args->scratch_base, 63, 47) != 0x1ffff); assert(bits(ape_args->scratch_base, 63, 47) != 0); diff --git a/src/gpu-compute/gpu_compute_driver.hh b/src/gpu-compute/gpu_compute_driver.hh index d2c822d588..f8c02b2d01 100644 --- a/src/gpu-compute/gpu_compute_driver.hh +++ b/src/gpu-compute/gpu_compute_driver.hh @@ -55,6 +55,7 @@ class GPUComputeDriver final : public HSADriver void sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout); private: + bool isdGPU; /** * The aperture (APE) base/limit pairs are set * statically at startup by the real KFD. AMD