diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index 9483a6d291..9c023fe5c6 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -375,7 +375,7 @@ parser.add_argument( parser.add_argument( "--gfx-version", type=str, - default="gfx801", + default="gfx902", choices=GfxVersion.vals, help="Gfx version for gpuNote: gfx902 is not fully supported by ROCm", ) @@ -951,19 +951,15 @@ root = Root(system=system, full_system=False) # knows what type of GPU hardware we are simulating if args.dgpu: assert args.gfx_version in [ - "gfx803", "gfx900", ], "Incorrect gfx version for dGPU" - if args.gfx_version == "gfx803": - hsaTopology.createFijiTopology(args) - elif args.gfx_version == "gfx900": + if args.gfx_version == "gfx900": hsaTopology.createVegaTopology(args) else: assert args.gfx_version in [ - "gfx801", "gfx902", ], "Incorrect gfx version for APU" - hsaTopology.createCarrizoTopology(args) + hsaTopology.createRavenTopology(args) m5.ticks.setGlobalFrequency("1THz") if args.abs_max_tick: diff --git a/configs/example/hsaTopology.py b/configs/example/hsaTopology.py index 2dcbdeca01..4540293482 100644 --- a/configs/example/hsaTopology.py +++ b/configs/example/hsaTopology.py @@ -243,7 +243,7 @@ def createVegaTopology(options): file_append((node_dir, "properties"), node_prop) - # Fiji HBM reporting + # Vega HBM reporting # TODO: Extract size, clk, and width from sim paramters mem_dir = joinpath(node_dir, "mem_banks/0") remake_dir(mem_dir) @@ -260,196 +260,7 @@ def createVegaTopology(options): file_append((mem_dir, "properties"), mem_prop) -# This fakes out a dGPU setup so the runtime correctly operations. The spoofed -# system has a single dGPU and a single socket CPU. Note that more complex -# topologies (multi-GPU, multi-socket CPUs) need to have a different setup -# here or the runtime won't be able to issue Memcpies from one node to another. -# -# TODO: There is way too much hardcoded here. 
It doesn't effect anything in -# our current ROCm stack (1.6), but it is highly possible that it will in the -# future. We might need to scrub through this and extract the appropriate -# fields from the simulator in the future. -def createFijiTopology(options): - topology_dir = joinpath( - m5.options.outdir, "fs/sys/devices/virtual/kfd/kfd/topology" - ) - remake_dir(topology_dir) - - amdgpu_dir = joinpath(m5.options.outdir, "fs/sys/module/amdgpu/parameters") - remake_dir(amdgpu_dir) - - # Fiji reported VM size in GB. Used to reserve an allocation from CPU - # to implement SVM (i.e. GPUVM64 pointers and X86 pointers agree) - file_append((amdgpu_dir, "vm_size"), 256) - - # Ripped from real Fiji platform to appease KMT version checks - file_append((topology_dir, "generation_id"), 2) - - # Set up system properties. Regiter as ast-rocm server - sys_prop = ( - "platform_oem 35498446626881\n" - + "platform_id 71791775140929\n" - + "platform_rev 2\n" - ) - file_append((topology_dir, "system_properties"), sys_prop) - - # Populate the topology tree - # Our dGPU system is two nodes. Node 0 is a CPU and Node 1 is a dGPU - node_dir = joinpath(topology_dir, "nodes/0") - remake_dir(node_dir) - - # Register as a CPU - file_append((node_dir, "gpu_id"), 0) - file_append((node_dir, "name"), "") - - # CPU links. Only thing that matters is we tell the runtime that GPU is - # connected through PCIe to CPU socket 0. 
- io_links = 1 - io_dir = joinpath(node_dir, "io_links/0") - remake_dir(io_dir) - io_prop = ( - "type 2\n" - + "version_major 0\n" - + "version_minor 0\n" - + "node_from 0\n" - + "node_to 1\n" - + "weight 20\n" - + "min_latency 0\n" - + "max_latency 0\n" - + "min_bandwidth 0\n" - + "max_bandwidth 0\n" - + "recommended_transfer_size 0\n" - + "flags 13\n" - ) - file_append((io_dir, "properties"), io_prop) - - # Populate CPU node properties - node_prop = ( - f"cpu_cores_count {options.num_cpus}\n" - + "simd_count 0\n" - + "mem_banks_count 1\n" - + "caches_count 0\n" - + f"io_links_count {io_links}\n" - + "cpu_core_id_base 0\n" - + "simd_id_base 0\n" - + "max_waves_per_simd 0\n" - + "lds_size_in_kb 0\n" - + "gds_size_in_kb 0\n" - + "wave_front_size 64\n" - + "array_count 0\n" - + "simd_arrays_per_engine 0\n" - + "cu_per_simd_array 0\n" - + "simd_per_cu 0\n" - + "max_slots_scratch_cu 0\n" - + "vendor_id 0\n" - + "device_id 0\n" - + "location_id 0\n" - + "drm_render_minor 0\n" - + "max_engine_clk_ccompute 3400\n" - ) - - file_append((node_dir, "properties"), node_prop) - - # CPU memory reporting - mem_dir = joinpath(node_dir, "mem_banks/0") - remake_dir(mem_dir) - # Heap type value taken from real system, heap type values: - # https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/roc-4.0.x/include/hsakmttypes.h#L317 - mem_prop = ( - "heap_type 0\n" - + "size_in_bytes 33704329216\n" - + "flags 0\n" - + "width 72\n" - + "mem_clk_max 2400\n" - ) - - file_append((mem_dir, "properties"), mem_prop) - - # Build the GPU node - node_dir = joinpath(topology_dir, "nodes/1") - remake_dir(node_dir) - - # Register as a Fiji - file_append((node_dir, "gpu_id"), 50156) - file_append((node_dir, "name"), "Fiji\n") - - # Should be the same as the render driver filename (dri/renderD) - drm_num = 128 - - # Real Fiji shows 96, but building that topology is complex and doesn't - # appear to be required for anything. - caches = 0 - - # GPU links. 
Only thing that matters is we tell the runtime that GPU is - # connected through PCIe to CPU socket 0. - io_links = 1 - io_dir = joinpath(node_dir, "io_links/0") - remake_dir(io_dir) - io_prop = ( - "type 2\n" - + "version_major 0\n" - + "version_minor 0\n" - + "node_from 1\n" - + "node_to 0\n" - + "weight 20\n" - + "min_latency 0\n" - + "max_latency 0\n" - + "min_bandwidth 0\n" - + "max_bandwidth 0\n" - + "recommended_transfer_size 0\n" - + "flags 1\n" - ) - file_append((io_dir, "properties"), io_prop) - - # Populate GPU node properties - node_prop = ( - "cpu_cores_count 0\n" - + f"simd_count {options.num_compute_units * options.simds_per_cu}\n" - + "mem_banks_count 1\n" - + f"caches_count {caches}\n" - + f"io_links_count {io_links}\n" - + "cpu_core_id_base 0\n" - + "simd_id_base 2147487744\n" - + f"max_waves_per_simd {options.wfs_per_simd}\n" - + f"lds_size_in_kb {int(options.lds_size / 1024)}\n" - + "gds_size_in_kb 0\n" - + f"wave_front_size {options.wf_size}\n" - + "array_count 4\n" - + f"simd_arrays_per_engine {options.sa_per_complex}\n" - + f"cu_per_simd_array {options.cu_per_sa}\n" - + f"simd_per_cu {options.simds_per_cu}\n" - + "max_slots_scratch_cu 32\n" - + "vendor_id 4098\n" - + "device_id 29440\n" - + "location_id 512\n" - + f"drm_render_minor {drm_num}\n" - + f"max_engine_clk_fcompute {int(toFrequency(options.gpu_clock) / 1000000.0)}\n" - + "local_mem_size 4294967296\n" - + "fw_version 730\n" - + "capability 4736\n" - + f"max_engine_clk_ccompute {int(toFrequency(options.CPUClock) / 1000000.0)}\n" - ) - - file_append((node_dir, "properties"), node_prop) - - # Fiji HBM reporting - # TODO: Extract size, clk, and width from sim paramters - mem_dir = joinpath(node_dir, "mem_banks/0") - remake_dir(mem_dir) - # Heap type value taken from real system, heap type values: - # https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/blob/roc-4.0.x/include/hsakmttypes.h#L317 - mem_prop = ( - "heap_type 1\n" - + "size_in_bytes 4294967296\n" - + "flags 0\n" - + 
"width 4096\n" - + "mem_clk_max 500\n" - ) - - file_append((mem_dir, "properties"), mem_prop) - - -def createCarrizoTopology(options): +def createRavenTopology(options): topology_dir = joinpath( m5.options.outdir, "fs/sys/devices/virtual/kfd/kfd/topology" ) @@ -476,7 +287,6 @@ def createCarrizoTopology(options): file_append((node_dir, "gpu_id"), 2765) gfx_dict = { - "gfx801": {"name": "Carrizo\n", "id": 39028}, "gfx902": {"name": "Raven\n", "id": 5597}, } diff --git a/src/cpu/testers/gpu_ruby_test/README b/src/cpu/testers/gpu_ruby_test/README index 00e4c8e781..db7230a402 100644 --- a/src/cpu/testers/gpu_ruby_test/README +++ b/src/cpu/testers/gpu_ruby_test/README @@ -38,11 +38,11 @@ assumes tested protocols supports release consistency. To start using the tester quickly, you can use the following example command line to get running immediately: -build/GCN3_X86/gem5.opt configs/example/ruby_gpu_random_test.py \ +build/VEGA_X86/gem5.opt configs/example/ruby_gpu_random_test.py \ --test-length=1000 --system-size=medium --cache-size=small An overview of the main command line options is as follows. For all options -use `build/GCN3_X86/gem5.opt configs/example/ruby_gpu_random_test.py --help` +use `build/VEGA_X86/gem5.opt configs/example/ruby_gpu_random_test.py --help` or see the configuration file. * --cache-size (small, large): Use smaller sizes for testing evict, etc. diff --git a/src/dev/amdgpu/hwreg_defines.hh b/src/dev/amdgpu/hwreg_defines.hh index f5097c8994..30ad58457d 100644 --- a/src/dev/amdgpu/hwreg_defines.hh +++ b/src/dev/amdgpu/hwreg_defines.hh @@ -44,7 +44,7 @@ namespace gem5 /* * Further descriptions can be found in the "Hardware Register Values" table - * in any of the GCN3, Vega, CDNA1, CNDA2, or RDNA ISA manuals. + * in any of the Vega, CDNA, or RDNA ISA manuals. 
*/ enum amdgpu_hwreg { diff --git a/src/dev/hsa/hsa_packet_processor.hh b/src/dev/hsa/hsa_packet_processor.hh index b72092538d..d70b964ba2 100644 --- a/src/dev/hsa/hsa_packet_processor.hh +++ b/src/dev/hsa/hsa_packet_processor.hh @@ -115,28 +115,13 @@ class HSAQueueDescriptor 10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/ rocr/src/core/runtime/amd_aql_queue.cpp#L624 * - * GFX7 and GFX8 will allocate twice as much space for their HSA - * queues as they actually access (using mod operations to map the - * virtual addresses from the upper half of the queue to the same - * virtual addresses as the lower half). Thus, we need to check if - * the ISA is GFX8 and mod the address by half of the queue size if - * so. */ uint64_t retAddr = 0ll; - if ((gfxVersion == GfxVersion::gfx801) || - (gfxVersion == GfxVersion::gfx803)) { - retAddr = basePointer + ((ix % (numElts/2)) * objSize()); - DPRINTF(HSAPacketProcessor, "ptr() gfx8: base: 0x%x, " - "index: 0x%x, numElts: 0x%x, numElts/2: 0x%x, " - "objSize: 0x%x, retAddr: 0x%x\n", basePointer, ix, - numElts, numElts/2, objSize(), retAddr); - } else { - retAddr = basePointer + ((ix % numElts) * objSize()); - DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, " - "index: 0x%x, numElts: 0x%x, objSize: 0x%x, " - "retAddr: 0x%x\n", basePointer, ix, numElts, objSize(), - retAddr); - } + retAddr = basePointer + ((ix % numElts) * objSize()); + DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, " + "index: 0x%x, numElts: 0x%x, objSize: 0x%x, " + "retAddr: 0x%x\n", basePointer, ix, numElts, objSize(), + retAddr); return retAddr; } }; diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index de0adf8b07..78baa596a7 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -45,7 +45,7 @@ class PrefetchType(Enum): class GfxVersion(ScopedEnum): - vals = ["gfx801", "gfx803", "gfx900", "gfx902", "gfx908", "gfx90a"] + vals = ["gfx900", "gfx902", "gfx908", "gfx90a"] class PoolManager(SimObject): @@ -320,7 +320,7 @@ 
class GPUComputeDriver(EmulatedDriver): cxx_header = "gpu-compute/gpu_compute_driver.hh" device = Param.GPUCommandProcessor("GPU controlled by this driver") isdGPU = Param.Bool(False, "Driver is for a dGPU") - gfxVersion = Param.GfxVersion("gfx801", "ISA of gpu to model") + gfxVersion = Param.GfxVersion("gfx902", "ISA of gpu to model") dGPUPoolID = Param.Int(0, "Pool ID for dGPU.") # Default Mtype for caches # -- 1 1 1 C_RW_S (Cached-ReadWrite-Shared) diff --git a/src/gpu-compute/gpu_compute_driver.cc b/src/gpu-compute/gpu_compute_driver.cc index 6c843c654f..6170abdc7b 100644 --- a/src/gpu-compute/gpu_compute_driver.cc +++ b/src/gpu-compute/gpu_compute_driver.cc @@ -327,13 +327,6 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) */ switch (gfxVersion) { - case GfxVersion::gfx801: - case GfxVersion::gfx803: - args->process_apertures[i].scratch_base = - scratchApeBase(i + 1); - args->process_apertures[i].lds_base = - ldsApeBase(i + 1); - break; case GfxVersion::gfx900: case GfxVersion::gfx902: args->process_apertures[i].scratch_base = @@ -345,7 +338,6 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) fatal("Invalid gfx version\n"); } - // GFX8 and GFX9 set lds and scratch limits the same way args->process_apertures[i].scratch_limit = scratchApeLimit(args->process_apertures[i].scratch_base); @@ -353,13 +345,6 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) ldsApeLimit(args->process_apertures[i].lds_base); switch (gfxVersion) { - case GfxVersion::gfx801: - args->process_apertures[i].gpuvm_base = - gpuVmApeBase(i + 1); - args->process_apertures[i].gpuvm_limit = - gpuVmApeLimit(args->process_apertures[i].gpuvm_base); - break; - case GfxVersion::gfx803: case GfxVersion::gfx900: case GfxVersion::gfx902: // Taken from SVM_USE_BASE in Linux kernel @@ -383,9 +368,6 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) // id composed out of a non-zero base and an offset. 
if (isdGPU) { switch (gfxVersion) { - case GfxVersion::gfx803: - args->process_apertures[i].gpu_id = 50156; - break; case GfxVersion::gfx900: args->process_apertures[i].gpu_id = 22124; break; @@ -394,7 +376,6 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) } } else { switch (gfxVersion) { - case GfxVersion::gfx801: case GfxVersion::gfx902: args->process_apertures[i].gpu_id = 2765; break; @@ -630,11 +611,6 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) (ioc_args->kfd_process_device_apertures_ptr); switch (gfxVersion) { - case GfxVersion::gfx801: - case GfxVersion::gfx803: - ape_args->scratch_base = scratchApeBase(i + 1); - ape_args->lds_base = ldsApeBase(i + 1); - break; case GfxVersion::gfx900: case GfxVersion::gfx902: ape_args->scratch_base = scratchApeBaseV9(); @@ -644,18 +620,11 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) fatal("Invalid gfx version\n"); } - // GFX8 and GFX9 set lds and scratch limits the same way ape_args->scratch_limit = scratchApeLimit(ape_args->scratch_base); ape_args->lds_limit = ldsApeLimit(ape_args->lds_base); switch (gfxVersion) { - case GfxVersion::gfx801: - ape_args->gpuvm_base = gpuVmApeBase(i + 1); - ape_args->gpuvm_limit = - gpuVmApeLimit(ape_args->gpuvm_base); - break; - case GfxVersion::gfx803: case GfxVersion::gfx900: case GfxVersion::gfx902: // Taken from SVM_USE_BASE in Linux kernel @@ -670,9 +639,6 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) // NOTE: Must match ID populated by hsaTopology.py if (isdGPU) { switch (gfxVersion) { - case GfxVersion::gfx803: - ape_args->gpu_id = 50156; - break; case GfxVersion::gfx900: ape_args->gpu_id = 22124; break; @@ -681,7 +647,6 @@ GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) } } else { switch (gfxVersion) { - case GfxVersion::gfx801: case GfxVersion::gfx902: ape_args->gpu_id = 2765; break; diff --git a/src/gpu-compute/gpu_compute_driver.hh 
b/src/gpu-compute/gpu_compute_driver.hh
index 9a3c6479c3..a455a607e2 100644
--- a/src/gpu-compute/gpu_compute_driver.hh
+++ b/src/gpu-compute/gpu_compute_driver.hh
@@ -86,8 +86,6 @@ class GPUComputeDriver final : public EmulatedDriver
     doorbellSize()
     {
         switch (gfxVersion) {
-          case GfxVersion::gfx801:
-          case GfxVersion::gfx803:
           case GfxVersion::gfx902:
             return 4;
           case GfxVersion::gfx900:
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
index c59317d2c4..66b2b8ec49 100644
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -56,7 +56,7 @@ GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
     a_data = new uint8_t[computeUnit()->wfSize() * 8];
     x_data = new uint8_t[computeUnit()->wfSize() * 8];
     // scalar loads can read up to 16 Dwords of data (see publicly
-    // available GCN3 ISA manual)
+    // available Vega ISA manual)
     scalar_data = new uint8_t[16 * sizeof(uint32_t)];
     for (int i = 0; i < (16 * sizeof(uint32_t)); ++i) {
         scalar_data[i] = 0;
diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh
index d81b879594..a464e4882d 100644
--- a/src/gpu-compute/hsa_queue_entry.hh
+++ b/src/gpu-compute/hsa_queue_entry.hh
@@ -102,14 +102,9 @@ class HSAQueueEntry
             numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
         }
-        // SGPR allocation granularies:
-        // - GFX8: 8
-        // - GFX9: 16
-        // Source: https://llvm.org/docs/.html
-        if (gfx_version == GfxVersion::gfx801 ||
-            gfx_version == GfxVersion::gfx803) {
-            numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8;
-        } else if (gfx_version == GfxVersion::gfx900 ||
+        // SGPR allocation granularity is 16 in GFX9
+        // Source: https://llvm.org/docs/AMDGPUUsage.html
+        if (gfx_version == GfxVersion::gfx900 ||
             gfx_version == GfxVersion::gfx902 ||
             gfx_version == GfxVersion::gfx908 ||
             gfx_version == GfxVersion::gfx90a) {
diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc
index 13dc423897..af9ce538e3 100644
--- a/src/gpu-compute/schedule_stage.cc
+++ b/src/gpu-compute/schedule_stage.cc
@@ -626,8 +626,6 @@ void
 ScheduleStage::arbitrateVrfToLdsBus()
 {
     // Arbitrate the VRF->GM and VRF->LDS buses for Flat memory ops
-    // Note: a Flat instruction in GFx8 reserves both VRF->Glb memory bus
-    // and a VRF->LDS bus. In GFx9, this is not the case.
 
     // iterate the GM pipelines
     for (int i = 0; i < computeUnit.numVectorGlobalMemUnits; i++) {
diff --git a/src/mem/request.hh b/src/mem/request.hh
index df249ac249..5e359febf7 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -292,8 +292,8 @@ class Request : public Extensible
 
     /**
      * These bits are used to set the coherence policy for the GPU and are
-     * encoded in the GCN3 instructions. The GCN3 ISA defines two cache levels
-     * See the AMD GCN3 ISA Architecture Manual for more details.
+     * encoded in the Vega instructions. The Vega ISA defines two cache levels
+     * See the AMD Vega ISA Architecture Manual for more details.
      *
      * INV_L1: L1 cache invalidation
      * FLUSH_L2: L2 cache flush
diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc
index a5198cce63..2adc41b578 100644
--- a/src/mem/ruby/system/VIPERCoalescer.cc
+++ b/src/mem/ruby/system/VIPERCoalescer.cc
@@ -77,7 +77,7 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
     // AtomicOp : cache atomic
     // Flush : flush and invalidate cache
     //
-    // VIPER does not expect MemSyncReq & Release since in GCN3, compute unit
+    // VIPER does not expect MemSyncReq & Release since the compute unit
     // does not specify an equivalent type of memory request.
     assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) ||
            pkt->cmd == MemCmd::ReadReq ||