From 63caa780c293bee059d4c53a10a250acc929fdb8 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 17 Jan 2024 10:45:18 -0600 Subject: [PATCH] misc: Remove all references to GCN3 Replace instances of "GCN3" with Vega. Remove gfx801 and gfx803. Rename FIJI to Vega and Carrizo to Raven. Using misc since there is not enough room to fit all the tags. Change-Id: Ibafc939d49a69be9068107a906e878408c7a5891 --- configs/example/apu_se.py | 10 +- configs/example/hsaTopology.py | 194 +------------------------- src/cpu/testers/gpu_ruby_test/README | 4 +- src/dev/amdgpu/hwreg_defines.hh | 2 +- src/dev/hsa/hsa_packet_processor.hh | 25 +--- src/gpu-compute/GPU.py | 4 +- src/gpu-compute/gpu_compute_driver.cc | 35 ----- src/gpu-compute/gpu_compute_driver.hh | 2 - src/gpu-compute/gpu_dyn_inst.cc | 2 +- src/gpu-compute/hsa_queue_entry.hh | 11 +- src/gpu-compute/schedule_stage.cc | 2 - src/mem/request.hh | 4 +- src/mem/ruby/system/VIPERCoalescer.cc | 2 +- 13 files changed, 22 insertions(+), 275 deletions(-) diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index 9483a6d291..9c023fe5c6 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -375,7 +375,7 @@ parser.add_argument( parser.add_argument( "--gfx-version", type=str, - default="gfx801", + default="gfx902", choices=GfxVersion.vals, help="Gfx version for gpuNote: gfx902 is not fully supported by ROCm", ) @@ -951,19 +951,15 @@ root = Root(system=system, full_system=False) # knows what type of GPU hardware we are simulating if args.dgpu: assert args.gfx_version in [ - "gfx803", "gfx900", ], "Incorrect gfx version for dGPU" - if args.gfx_version == "gfx803": - hsaTopology.createFijiTopology(args) - elif args.gfx_version == "gfx900": + if args.gfx_version == "gfx900": hsaTopology.createVegaTopology(args) else: assert args.gfx_version in [ - "gfx801", "gfx902", ], "Incorrect gfx version for APU" - hsaTopology.createCarrizoTopology(args) + hsaTopology.createRavenTopology(args) 
m5.ticks.setGlobalFrequency("1THz") if args.abs_max_tick: diff --git a/configs/example/hsaTopology.py b/configs/example/hsaTopology.py index 2dcbdeca01..4540293482 100644 --- a/configs/example/hsaTopology.py +++ b/configs/example/hsaTopology.py @@ -243,7 +243,7 @@ def createVegaTopology(options): file_append((node_dir, "properties"), node_prop) - # Fiji HBM reporting + # Vega HBM reporting # TODO: Extract size, clk, and width from sim paramters mem_dir = joinpath(node_dir, "mem_banks/0") remake_dir(mem_dir) @@ -260,196 +260,7 @@ def createVegaTopology(options): file_append((mem_dir, "properties"), mem_prop) -# This fakes out a dGPU setup so the runtime correctly operations. The spoofed -# system has a single dGPU and a single socket CPU. Note that more complex -# topologies (multi-GPU, multi-socket CPUs) need to have a different setup -# here or the runtime won't be able to issue Memcpies from one node to another. -# -# TODO: There is way too much hardcoded here. It doesn't effect anything in -# our current ROCm stack (1.6), but it is highly possible that it will in the -# future. We might need to scrub through this and extract the appropriate -# fields from the simulator in the future. -def createFijiTopology(options): - topology_dir = joinpath( - m5.options.outdir, "fs/sys/devices/virtual/kfd/kfd/topology" - ) - remake_dir(topology_dir) - - amdgpu_dir = joinpath(m5.options.outdir, "fs/sys/module/amdgpu/parameters") - remake_dir(amdgpu_dir) - - # Fiji reported VM size in GB. Used to reserve an allocation from CPU - # to implement SVM (i.e. GPUVM64 pointers and X86 pointers agree) - file_append((amdgpu_dir, "vm_size"), 256) - - # Ripped from real Fiji platform to appease KMT version checks - file_append((topology_dir, "generation_id"), 2) - - # Set up system properties. 
Regiter as ast-rocm server - sys_prop = ( - "platform_oem 35498446626881\n" - + "platform_id 71791775140929\n" - + "platform_rev 2\n" - ) - file_append((topology_dir, "system_properties"), sys_prop) - - # Populate the topology tree - # Our dGPU system is two nodes. Node 0 is a CPU and Node 1 is a dGPU - node_dir = joinpath(topology_dir, "nodes/0") - remake_dir(node_dir) - - # Register as a CPU - file_append((node_dir, "gpu_id"), 0) - file_append((node_dir, "name"), "") - - # CPU links. Only thing that matters is we tell the runtime that GPU is - # connected through PCIe to CPU socket 0. - io_links = 1 - io_dir = joinpath(node_dir, "io_links/0") - remake_dir(io_dir) - io_prop = ( - "type 2\n" - + "version_major 0\n" - + "version_minor 0\n" - + "node_from 0\n" - + "node_to 1\n" - + "weight 20\n" - + "min_latency 0\n" - + "max_latency 0\n" - + "min_bandwidth 0\n" - + "max_bandwidth 0\n" - + "recommended_transfer_size 0\n" - + "flags 13\n" - ) - file_append((io_dir, "properties"), io_prop) - - # Populate CPU node properties - node_prop = ( - f"cpu_cores_count {options.num_cpus}\n" - + "simd_count 0\n" - + "mem_banks_count 1\n" - + "caches_count 0\n" - + f"io_links_count {io_links}\n" - + "cpu_core_id_base 0\n" - + "simd_id_base 0\n" - + "max_waves_per_simd 0\n" - + "lds_size_in_kb 0\n" - + "gds_size_in_kb 0\n" - + "wave_front_size 64\n" - + "array_count 0\n" - + "simd_arrays_per_engine 0\n" - + "cu_per_simd_array 0\n" - + "simd_per_cu 0\n" - + "max_slots_scratch_cu 0\n" - + "vendor_id 0\n" - + "device_id 0\n" - + "location_id 0\n" - + "drm_render_minor 0\n" - + "max_engine_clk_ccompute 3400\n" - ) - - file_append((node_dir, "properties"), node_prop) - - # CPU memory reporting - mem_dir = joinpath(node_dir, "mem_banks/0") - remake_dir(mem_dir) - # Heap type value taken from real system, heap type values: - # https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/roc-4.0.x/include/hsakmttypes.h#L317 - mem_prop = ( - "heap_type 0\n" - + "size_in_bytes 
33704329216\n" - + "flags 0\n" - + "width 72\n" - + "mem_clk_max 2400\n" - ) - - file_append((mem_dir, "properties"), mem_prop) - - # Build the GPU node - node_dir = joinpath(topology_dir, "nodes/1") - remake_dir(node_dir) - - # Register as a Fiji - file_append((node_dir, "gpu_id"), 50156) - file_append((node_dir, "name"), "Fiji\n") - - # Should be the same as the render driver filename (dri/renderD) - drm_num = 128 - - # Real Fiji shows 96, but building that topology is complex and doesn't - # appear to be required for anything. - caches = 0 - - # GPU links. Only thing that matters is we tell the runtime that GPU is - # connected through PCIe to CPU socket 0. - io_links = 1 - io_dir = joinpath(node_dir, "io_links/0") - remake_dir(io_dir) - io_prop = ( - "type 2\n" - + "version_major 0\n" - + "version_minor 0\n" - + "node_from 1\n" - + "node_to 0\n" - + "weight 20\n" - + "min_latency 0\n" - + "max_latency 0\n" - + "min_bandwidth 0\n" - + "max_bandwidth 0\n" - + "recommended_transfer_size 0\n" - + "flags 1\n" - ) - file_append((io_dir, "properties"), io_prop) - - # Populate GPU node properties - node_prop = ( - "cpu_cores_count 0\n" - + f"simd_count {options.num_compute_units * options.simds_per_cu}\n" - + "mem_banks_count 1\n" - + f"caches_count {caches}\n" - + f"io_links_count {io_links}\n" - + "cpu_core_id_base 0\n" - + "simd_id_base 2147487744\n" - + f"max_waves_per_simd {options.wfs_per_simd}\n" - + f"lds_size_in_kb {int(options.lds_size / 1024)}\n" - + "gds_size_in_kb 0\n" - + f"wave_front_size {options.wf_size}\n" - + "array_count 4\n" - + f"simd_arrays_per_engine {options.sa_per_complex}\n" - + f"cu_per_simd_array {options.cu_per_sa}\n" - + f"simd_per_cu {options.simds_per_cu}\n" - + "max_slots_scratch_cu 32\n" - + "vendor_id 4098\n" - + "device_id 29440\n" - + "location_id 512\n" - + f"drm_render_minor {drm_num}\n" - + f"max_engine_clk_fcompute {int(toFrequency(options.gpu_clock) / 1000000.0)}\n" - + "local_mem_size 4294967296\n" - + "fw_version 730\n" - + 
"capability 4736\n" - + f"max_engine_clk_ccompute {int(toFrequency(options.CPUClock) / 1000000.0)}\n" - ) - - file_append((node_dir, "properties"), node_prop) - - # Fiji HBM reporting - # TODO: Extract size, clk, and width from sim paramters - mem_dir = joinpath(node_dir, "mem_banks/0") - remake_dir(mem_dir) - # Heap type value taken from real system, heap type values: - # https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/roc-4.0.x/include/hsakmttypes.h#L317 - mem_prop = ( - "heap_type 1\n" - + "size_in_bytes 4294967296\n" - + "flags 0\n" - + "width 4096\n" - + "mem_clk_max 500\n" - ) - - file_append((mem_dir, "properties"), mem_prop) - - -def createCarrizoTopology(options): +def createRavenTopology(options): topology_dir = joinpath( m5.options.outdir, "fs/sys/devices/virtual/kfd/kfd/topology" ) @@ -476,7 +287,6 @@ def createCarrizoTopology(options): file_append((node_dir, "gpu_id"), 2765) gfx_dict = { - "gfx801": {"name": "Carrizo\n", "id": 39028}, "gfx902": {"name": "Raven\n", "id": 5597}, } diff --git a/src/cpu/testers/gpu_ruby_test/README b/src/cpu/testers/gpu_ruby_test/README index 00e4c8e781..db7230a402 100644 --- a/src/cpu/testers/gpu_ruby_test/README +++ b/src/cpu/testers/gpu_ruby_test/README @@ -38,11 +38,11 @@ assumes tested protocols supports release consistency. To start using the tester quickly, you can use the following example command line to get running immediately: -build/GCN3_X86/gem5.opt configs/example/ruby_gpu_random_test.py \ +build/VEGA_X86/gem5.opt configs/example/ruby_gpu_random_test.py \ --test-length=1000 --system-size=medium --cache-size=small An overview of the main command line options is as follows. For all options -use `build/GCN3_X86/gem5.opt configs/example/ruby_gpu_random_test.py --help` +use `build/VEGA_X86/gem5.opt configs/example/ruby_gpu_random_test.py --help` or see the configuration file. * --cache-size (small, large): Use smaller sizes for testing evict, etc. 
diff --git a/src/dev/amdgpu/hwreg_defines.hh b/src/dev/amdgpu/hwreg_defines.hh index f5097c8994..30ad58457d 100644 --- a/src/dev/amdgpu/hwreg_defines.hh +++ b/src/dev/amdgpu/hwreg_defines.hh @@ -44,7 +44,7 @@ namespace gem5 /* * Further descriptions can be found in the "Hardware Register Values" table - * in any of the GCN3, Vega, CDNA1, CNDA2, or RDNA ISA manuals. + * in any of the Vega, CDNA, or RDNA ISA manuals. */ enum amdgpu_hwreg { diff --git a/src/dev/hsa/hsa_packet_processor.hh b/src/dev/hsa/hsa_packet_processor.hh index b72092538d..d70b964ba2 100644 --- a/src/dev/hsa/hsa_packet_processor.hh +++ b/src/dev/hsa/hsa_packet_processor.hh @@ -115,28 +115,13 @@ class HSAQueueDescriptor 10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/ rocr/src/core/runtime/amd_aql_queue.cpp#L624 * - * GFX7 and GFX8 will allocate twice as much space for their HSA - * queues as they actually access (using mod operations to map the - * virtual addresses from the upper half of the queue to the same - * virtual addresses as the lower half). Thus, we need to check if - * the ISA is GFX8 and mod the address by half of the queue size if - * so. 
*/ uint64_t retAddr = 0ll; - if ((gfxVersion == GfxVersion::gfx801) || - (gfxVersion == GfxVersion::gfx803)) { - retAddr = basePointer + ((ix % (numElts/2)) * objSize()); - DPRINTF(HSAPacketProcessor, "ptr() gfx8: base: 0x%x, " - "index: 0x%x, numElts: 0x%x, numElts/2: 0x%x, " - "objSize: 0x%x, retAddr: 0x%x\n", basePointer, ix, - numElts, numElts/2, objSize(), retAddr); - } else { - retAddr = basePointer + ((ix % numElts) * objSize()); - DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, " - "index: 0x%x, numElts: 0x%x, objSize: 0x%x, " - "retAddr: 0x%x\n", basePointer, ix, numElts, objSize(), - retAddr); - } + retAddr = basePointer + ((ix % numElts) * objSize()); + DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, " + "index: 0x%x, numElts: 0x%x, objSize: 0x%x, " + "retAddr: 0x%x\n", basePointer, ix, numElts, objSize(), + retAddr); return retAddr; } }; diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index de0adf8b07..78baa596a7 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -45,7 +45,7 @@ class PrefetchType(Enum): class GfxVersion(ScopedEnum): - vals = ["gfx801", "gfx803", "gfx900", "gfx902", "gfx908", "gfx90a"] + vals = ["gfx900", "gfx902", "gfx908", "gfx90a"] class PoolManager(SimObject): @@ -320,7 +320,7 @@ class GPUComputeDriver(EmulatedDriver): cxx_header = "gpu-compute/gpu_compute_driver.hh" device = Param.GPUCommandProcessor("GPU controlled by this driver") isdGPU = Param.Bool(False, "Driver is for a dGPU") - gfxVersion = Param.GfxVersion("gfx801", "ISA of gpu to model") + gfxVersion = Param.GfxVersion("gfx902", "ISA of gpu to model") dGPUPoolID = Param.Int(0, "Pool ID for dGPU.") # Default Mtype for caches # -- 1 1 1 C_RW_S (Cached-ReadWrite-Shared) diff --git a/src/gpu-compute/gpu_compute_driver.cc b/src/gpu-compute/gpu_compute_driver.cc index 6c843c654f..6170abdc7b 100644 --- a/src/gpu-compute/gpu_compute_driver.cc +++ b/src/gpu-compute/gpu_compute_driver.cc @@ -327,13 +327,6 @@ 
GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) */ switch (gfxVersion) { - case GfxVersion::gfx801: - case GfxVersion::gfx803: - args->process_apertures[i].scratch_base = - scratchApeBase(i + 1); - args->process_apertures[i].lds_base = - ldsApeBase(i + 1); - break; case GfxVersion::gfx900: case GfxVersion::gfx902: args->process_apertures[i].scratch_base = @@ -345,7 +338,6 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) fatal("Invalid gfx version\n"); } - // GFX8 and GFX9 set lds and scratch limits the same way args->process_apertures[i].scratch_limit = scratchApeLimit(args->process_apertures[i].scratch_base); @@ -353,13 +345,6 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) ldsApeLimit(args->process_apertures[i].lds_base); switch (gfxVersion) { - case GfxVersion::gfx801: - args->process_apertures[i].gpuvm_base = - gpuVmApeBase(i + 1); - args->process_apertures[i].gpuvm_limit = - gpuVmApeLimit(args->process_apertures[i].gpuvm_base); - break; - case GfxVersion::gfx803: case GfxVersion::gfx900: case GfxVersion::gfx902: // Taken from SVM_USE_BASE in Linux kernel @@ -383,9 +368,6 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) // id composed out of a non-zero base and an offset. 
if (isdGPU) { switch (gfxVersion) { - case GfxVersion::gfx803: - args->process_apertures[i].gpu_id = 50156; - break; case GfxVersion::gfx900: args->process_apertures[i].gpu_id = 22124; break; @@ -394,7 +376,6 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) } } else { switch (gfxVersion) { - case GfxVersion::gfx801: case GfxVersion::gfx902: args->process_apertures[i].gpu_id = 2765; break; @@ -630,11 +611,6 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) (ioc_args->kfd_process_device_apertures_ptr); switch (gfxVersion) { - case GfxVersion::gfx801: - case GfxVersion::gfx803: - ape_args->scratch_base = scratchApeBase(i + 1); - ape_args->lds_base = ldsApeBase(i + 1); - break; case GfxVersion::gfx900: case GfxVersion::gfx902: ape_args->scratch_base = scratchApeBaseV9(); @@ -644,18 +620,11 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) fatal("Invalid gfx version\n"); } - // GFX8 and GFX9 set lds and scratch limits the same way ape_args->scratch_limit = scratchApeLimit(ape_args->scratch_base); ape_args->lds_limit = ldsApeLimit(ape_args->lds_base); switch (gfxVersion) { - case GfxVersion::gfx801: - ape_args->gpuvm_base = gpuVmApeBase(i + 1); - ape_args->gpuvm_limit = - gpuVmApeLimit(ape_args->gpuvm_base); - break; - case GfxVersion::gfx803: case GfxVersion::gfx900: case GfxVersion::gfx902: // Taken from SVM_USE_BASE in Linux kernel @@ -670,9 +639,6 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) // NOTE: Must match ID populated by hsaTopology.py if (isdGPU) { switch (gfxVersion) { - case GfxVersion::gfx803: - ape_args->gpu_id = 50156; - break; case GfxVersion::gfx900: ape_args->gpu_id = 22124; break; @@ -681,7 +647,6 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) } } else { switch (gfxVersion) { - case GfxVersion::gfx801: case GfxVersion::gfx902: ape_args->gpu_id = 2765; break; diff --git a/src/gpu-compute/gpu_compute_driver.hh 
b/src/gpu-compute/gpu_compute_driver.hh index 9a3c6479c3..a455a607e2 100644 --- a/src/gpu-compute/gpu_compute_driver.hh +++ b/src/gpu-compute/gpu_compute_driver.hh @@ -86,8 +86,6 @@ class GPUComputeDriver final : public EmulatedDriver doorbellSize() { switch (gfxVersion) { - case GfxVersion::gfx801: - case GfxVersion::gfx803: case GfxVersion::gfx902: return 4; case GfxVersion::gfx900: diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index c59317d2c4..66b2b8ec49 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -56,7 +56,7 @@ GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, a_data = new uint8_t[computeUnit()->wfSize() * 8]; x_data = new uint8_t[computeUnit()->wfSize() * 8]; // scalar loads can read up to 16 Dwords of data (see publicly - // available GCN3 ISA manual) + // available Vega ISA manual) scalar_data = new uint8_t[16 * sizeof(uint32_t)]; for (int i = 0; i < (16 * sizeof(uint32_t)); ++i) { scalar_data[i] = 0; diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh index d81b879594..a464e4882d 100644 --- a/src/gpu-compute/hsa_queue_entry.hh +++ b/src/gpu-compute/hsa_queue_entry.hh @@ -102,14 +102,9 @@ class HSAQueueEntry numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4; } - // SGPR allocation granularies: - // - GFX8: 8 - // - GFX9: 16 - // Source: https://llvm.org/docs/.html - if (gfx_version == GfxVersion::gfx801 || - gfx_version == GfxVersion::gfx803) { - numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8; - } else if (gfx_version == GfxVersion::gfx900 || + // SGPR allocation granularity is 16 in GFX9 + // Source: https://llvm.org/docs/AMDGPUUsage.html + if (gfx_version == GfxVersion::gfx900 || gfx_version == GfxVersion::gfx902 || gfx_version == GfxVersion::gfx908 || gfx_version == GfxVersion::gfx90a) { diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc index 13dc423897..af9ce538e3 100644 --- 
a/src/gpu-compute/schedule_stage.cc +++ b/src/gpu-compute/schedule_stage.cc @@ -626,8 +626,6 @@ void ScheduleStage::arbitrateVrfToLdsBus() { // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops - // Note: a Flat instruction in GFx8 reserves both VRF->Glb memory bus - // and a VRF->LDS bus. In GFx9, this is not the case. // iterate the GM pipelines for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) { diff --git a/src/mem/request.hh b/src/mem/request.hh index df249ac249..5e359febf7 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -292,8 +292,8 @@ class Request : public Extensible /** * These bits are used to set the coherence policy for the GPU and are - * encoded in the GCN3 instructions. The GCN3 ISA defines two cache levels - * See the AMD GCN3 ISA Architecture Manual for more details. + * encoded in the Vega instructions. The Vega ISA defines two cache levels. + * See the AMD Vega ISA Architecture Manual for more details. * * INV_L1: L1 cache invalidation * FLUSH_L2: L2 cache flush diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc index a5198cce63..2adc41b578 100644 --- a/src/mem/ruby/system/VIPERCoalescer.cc +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -77,7 +77,7 @@ VIPERCoalescer::makeRequest(PacketPtr pkt) // AtomicOp : cache atomic // Flush : flush and invalidate cache // - // VIPER does not expect MemSyncReq & Release since in GCN3, compute unit + // VIPER does not expect MemSyncReq & Release since compute unit // does not specify an equivalent type of memory request. assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) || pkt->cmd == MemCmd::ReadReq ||