diff --git a/configs/example/gpufs/mi300.py b/configs/example/gpufs/mi300.py new file mode 100644 index 0000000000..9e0e0da622 --- /dev/null +++ b/configs/example/gpufs/mi300.py @@ -0,0 +1,172 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +""" This file creates an X86 system with a KVM CPU and GPU device capable of +running the MI300 ISA (gfx942). 
Most of this file sets up a runscript which +will load in a binary, shell script, or python file from the host and run that +within gem5. Jump to line 146 for list of system parameters to configure. +""" + +import argparse +import base64 +import os +import sys +import tempfile +from typing import Optional + +import runfs +from amd import AmdGPUOptions +from common import ( + GPUTLBOptions, + Options, +) +from ruby import Ruby + +import m5 + +from gem5.resources.resource import AbstractResource + +demo_runscript_without_checkpoint = """\ +export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH +export HSA_ENABLE_INTERRUPT=0 +export HCC_AMDGPU_TARGET=gfx942 +export HSA_OVERRIDE_GFX_VERSION="9.4.2" +dmesg -n8 +cat /proc/cpuinfo +dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128 +if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then + echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5." + /sbin/m5 exit +fi +modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0 +echo "Running {} {}" +echo "{}" | base64 -d > myapp +chmod +x myapp +./myapp {} +/sbin/m5 exit +""" + +demo_runscript_with_checkpoint = """\ +export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH +export HSA_ENABLE_INTERRUPT=0 +export HCC_AMDGPU_TARGET=gfx942 +export HSA_OVERRIDE_GFX_VERSION="9.4.2" +dmesg -n8 +dd if=/root/roms/mi200.rom of=/dev/mem bs=1k seek=768 count=128 +if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then + echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5." 
+ /sbin/m5 exit +fi +modprobe -v amdgpu ip_block_mask=0x6f ppfeaturemask=0 dpm=0 audio=0 ras_enable=0 +echo "Running {} {}" +echo "{}" | base64 -d > myapp +chmod +x myapp +/sbin/m5 checkpoint +./myapp {} +/sbin/m5 exit +""" + + +def addDemoOptions(parser): + parser.add_argument( + "-a", "--app", default=None, help="GPU application to run" + ) + parser.add_argument( + "-o", "--opts", default="", help="GPU application arguments" + ) + + +def runMI300GPUFS( + cpu_type, + disk: Optional[AbstractResource] = None, + kernel: Optional[AbstractResource] = None, + app: Optional[AbstractResource] = None, +): + parser = argparse.ArgumentParser() + runfs.addRunFSOptions(parser) + Options.addCommonOptions(parser) + AmdGPUOptions.addAmdGPUOptions(parser) + Ruby.define_options(parser) + GPUTLBOptions.tlb_options(parser) + addDemoOptions(parser) + + # Parse now so we can override options + args = parser.parse_args() + demo_runscript = "" + + if disk != None: + args.disk_image = disk.get_local_path() + if kernel != None: + args.kernel = kernel.get_local_path() + if app != None: + args.app = app.get_local_path() + + # Create temp script to run application + if not os.path.isfile(args.app): + print("Could not find application", args.app) + sys.exit(1) + + # Choose runscript based on whether any checkpointing args are set + if args.checkpoint_dir is not None: + demo_runscript = demo_runscript_with_checkpoint + else: + demo_runscript = demo_runscript_without_checkpoint + + with open(os.path.abspath(args.app), "rb") as binfile: + encodedBin = base64.b64encode(binfile.read()).decode() + + _, tempRunscript = tempfile.mkstemp() + with open(tempRunscript, "w") as b64file: + runscriptStr = demo_runscript.format( + args.app, args.opts, encodedBin, args.opts + ) + b64file.write(runscriptStr) + + args.script = tempRunscript + + # Defaults for CPU + args.cpu_type = "X86KvmCPU" + args.mem_size = "8GB" + + # Defaults for MI300X + args.gpu_device = "MI300X" + args.dgpu_mem_size = "16GB" # GPU memory 
size, must be 16GB currently. + + # See: https://rocm.docs.amd.com/en/latest/conceptual/gpu-arch/mi300.html + # Topology for one XCD. Number of CUs is approximately 304 / 8, rounded + # up to 40 due to gem5 restriction of 4 CUs per SQC / scalar cache. + args.num_compute_units = 40 + args.gpu_topology = "Crossbar" + + # Run gem5 + runfs.runGpuFSSystem(args) + + +if __name__ == "__m5_main__": + runMI300GPUFS("X86KvmCPU") diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py index fed155bc44..866fa89822 100644 --- a/configs/example/gpufs/runfs.py +++ b/configs/example/gpufs/runfs.py @@ -134,9 +134,9 @@ def addRunFSOptions(parser): parser.add_argument( "--gpu-device", default="Vega10", - choices=["Vega10", "MI100", "MI200"], - help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), or " - "MI200 (gfx90a)", + choices=["Vega10", "MI100", "MI200", "MI300X"], + help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), MI200 " + "(gfx90a), or MI300X (gfx942).", ) parser.add_argument( diff --git a/configs/example/gpufs/system/amdgpu.py b/configs/example/gpufs/system/amdgpu.py index 0813759e2a..55937cd255 100644 --- a/configs/example/gpufs/system/amdgpu.py +++ b/configs/example/gpufs/system/amdgpu.py @@ -191,10 +191,14 @@ def connectGPU(system, args): system.pc.south_bridge.gpu.DeviceID = 0x740F system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002 system.pc.south_bridge.gpu.SubsystemID = 0x0C34 + elif args.gpu_device == "MI300X": + system.pc.south_bridge.gpu.DeviceID = 0x740F + system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002 + system.pc.south_bridge.gpu.SubsystemID = 0x0C34 elif args.gpu_device == "Vega10": system.pc.south_bridge.gpu.DeviceID = 0x6863 else: - panic(f"Unknown GPU device: {args.gpu_device}") + m5.util.panic(f"Unknown GPU device: {args.gpu_device}") # Use the gem5 default of 0x280 OR'd with 0x10 which tells Linux there is # a PCI capabilities list to travse. 
diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index 7c596f0ccf..1322650964 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -161,7 +161,7 @@ def makeGpuFSSystem(args): 0x7D000, ] sdma_sizes = [0x1000] * 8 - elif args.gpu_device == "MI200": + elif args.gpu_device == "MI200" or args.gpu_device == "MI300X": num_sdmas = 5 sdma_bases = [ 0x4980, diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc index 6f34301f48..eb5a5bb309 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.cc +++ b/src/arch/amdgpu/vega/gpu_decoder.cc @@ -9922,29 +9922,25 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORD(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORD(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX2(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORDX2(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX3(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORDX3(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_LOAD_DWORDX4(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_LOAD_DWORDX4(&iFmt->iFmt_FLAT); } GPUStaticInst* @@ -9977,29 +9973,25 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORD(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_DWORD(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX2(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return 
nullptr; + return new Inst_FLAT__FLAT_STORE_DWORDX2(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX3(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_DWORDX3(&iFmt->iFmt_FLAT); } GPUStaticInst* Decoder::decode_OP_SCRATCH__SCRATCH_STORE_DWORDX4(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_FLAT__FLAT_STORE_DWORDX4(&iFmt->iFmt_FLAT); } GPUStaticInst* diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh index 9ab7b84974..5861f296ff 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.hh +++ b/src/arch/amdgpu/vega/insts/op_encodings.hh @@ -1258,13 +1258,12 @@ namespace VegaISA // If saddr = 0x7f there is no scalar reg to read and address will // be a 64-bit address. Otherwise, saddr is the reg index for a // scalar reg used as the base address for a 32-bit address. - if ((saddr == 0x7f && (isFlatGlobal() || isFlatScratch())) - || isFlat()) { + if ((saddr == 0x7f && isFlatGlobal()) || isFlat()) { ConstVecOperandU64 vbase(gpuDynInst, vaddr); vbase.read(); calcAddrVgpr(gpuDynInst, vbase, offset); - } else { + } else if (isFlatGlobal()) { // Assume we are operating in 64-bit mode and read a pair of // SGPRs for the address base. ConstScalarOperandU64 sbase(gpuDynInst, saddr); @@ -1274,6 +1273,57 @@ namespace VegaISA voffset.read(); calcAddrSgpr(gpuDynInst, voffset, sbase, offset); + // For scratch, saddr = 0x7f there is no scalar reg to read and + // a vgpr will be used for address offset. Otherwise, saddr is + // the sgpr index holding the address offset. For scratch + // instructions the offset GPR is always 32-bits. 
+ } else if (saddr != 0x7f) { + assert(isFlatScratch()); + + ConstScalarOperandU32 soffset(gpuDynInst, saddr); + soffset.read(); + + Addr flat_scratch_addr = readFlatScratch(gpuDynInst); + + int elemSize; + auto staticInst = gpuDynInst->staticInstruction(); + if (gpuDynInst->isLoad()) { + elemSize = staticInst->getOperandSize(2); + } else { + assert(gpuDynInst->isStore()); + elemSize = staticInst->getOperandSize(1); + } + + unsigned swizzleOffset = soffset.rawData() + offset; + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + gpuDynInst->addr.at(lane) = flat_scratch_addr + + swizzle(swizzleOffset, lane, elemSize); + } + } + } else { + assert(isFlatScratch()); + + ConstVecOperandU32 voffset(gpuDynInst, vaddr); + voffset.read(); + + Addr flat_scratch_addr = readFlatScratch(gpuDynInst); + + int elemSize; + auto staticInst = gpuDynInst->staticInstruction(); + if (gpuDynInst->isLoad()) { + elemSize = staticInst->getOperandSize(2); + } else { + assert(gpuDynInst->isStore()); + elemSize = staticInst->getOperandSize(1); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + gpuDynInst->addr.at(lane) = flat_scratch_addr + + swizzle(voffset[lane] + offset, lane, elemSize); + } + } } if (isFlat()) { @@ -1285,6 +1335,7 @@ namespace VegaISA assert(isFlatScratch()); gpuDynInst->staticInstruction()->executed_as = enums::SC_PRIVATE; + gpuDynInst->resolveFlatSegment(gpuDynInst->exec_mask); } } @@ -1421,6 +1472,23 @@ namespace VegaISA } } } + + VecElemU32 + swizzle(VecElemU32 offset, int lane, int elem_size) + { + // This is not described in the spec. We use the swizzle from + // buffer memory instructions and fix the stride to 4. Multiply + // the thread ID by the storage size to avoid threads clobbering + // their data. 
+ return ((offset / 4) * 4 * 64) + + (offset % 4) + (lane * elem_size); + } + + Addr + readFlatScratch(GPUDynInstPtr gpuDynInst) + { + return gpuDynInst->computeUnit()->shader->getScratchBase(); + } }; // Inst_FLAT } // namespace VegaISA } // namespace gem5 diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 6bb5f9c2c5..b3a91830fe 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -86,6 +86,8 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) gfx_version = GfxVersion::gfx908; } else if (p.device_name == "MI200") { gfx_version = GfxVersion::gfx90a; + } else if (p.device_name == "MI300X") { + gfx_version = GfxVersion::gfx942; } else { panic("Unknown GPU device %s\n", p.device_name); } @@ -124,7 +126,8 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo}); sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize}); sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo}); - } else if (p.device_name == "MI100" || p.device_name == "MI200") { + } else if (p.device_name == "MI100" || p.device_name == "MI200" + || p.device_name == "MI300X") { sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo}); sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo}); sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi}); @@ -195,6 +198,10 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24); setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24); setRegVal(MI200_MEM_SIZE_REG, mem_size); + } else if (p.device_name == "MI300X") { + setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24); + setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24); + setRegVal(MI200_MEM_SIZE_REG, mem_size); } else { panic("Unknown GPU device %s\n", p.device_name); } diff --git a/src/dev/amdgpu/pm4_defines.hh b/src/dev/amdgpu/pm4_defines.hh index a303f8ef84..d00dc3730d 100644 --- a/src/dev/amdgpu/pm4_defines.hh +++ b/src/dev/amdgpu/pm4_defines.hh @@ 
-328,8 +328,8 @@ typedef struct GEM5_PACKED }; uint64_t completionSignal; }; -} PM4MapProcessMI200; -static_assert(sizeof(PM4MapProcessMI200) == 80); +} PM4MapProcessV2; +static_assert(sizeof(PM4MapProcessV2) == 80); typedef struct GEM5_PACKED { diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index a921942678..9a8ba13914 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -290,18 +290,19 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header) dmaBuffer); } break; case IT_MAP_PROCESS: { - if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a) { - dmaBuffer = new PM4MapProcessMI200(); + if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a || + gpuDevice->getGfxVersion() == GfxVersion::gfx942) { + dmaBuffer = new PM4MapProcessV2(); cb = new DmaVirtCallback( [ = ] (const uint64_t &) - { mapProcessGfx90a(q, (PM4MapProcessMI200 *)dmaBuffer); }); - dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessMI200), + { mapProcessV2(q, (PM4MapProcessV2 *)dmaBuffer); }); + dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessV2), cb, dmaBuffer); } else { dmaBuffer = new PM4MapProcess(); cb = new DmaVirtCallback( [ = ] (const uint64_t &) - { mapProcessGfx9(q, (PM4MapProcess *)dmaBuffer); }); + { mapProcessV1(q, (PM4MapProcess *)dmaBuffer); }); dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb, dmaBuffer); } @@ -701,7 +702,7 @@ PM4PacketProcessor::mapProcess(uint32_t pasid, uint64_t ptBase, } void -PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt) +PM4PacketProcessor::mapProcessV1(PM4Queue *q, PM4MapProcess *pkt) { q->incRptr(sizeof(PM4MapProcess)); @@ -716,9 +717,9 @@ PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt) } void -PM4PacketProcessor::mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt) +PM4PacketProcessor::mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt) { - q->incRptr(sizeof(PM4MapProcessMI200)); + 
q->incRptr(sizeof(PM4MapProcessV2)); DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: " "%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum, diff --git a/src/dev/amdgpu/pm4_packet_processor.hh b/src/dev/amdgpu/pm4_packet_processor.hh index 82c3c2716f..71271415fd 100644 --- a/src/dev/amdgpu/pm4_packet_processor.hh +++ b/src/dev/amdgpu/pm4_packet_processor.hh @@ -146,8 +146,8 @@ class PM4PacketProcessor : public DmaVirtDevice void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt); void doneMQDWrite(Addr mqdAddr, Addr addr); void mapProcess(uint32_t pasid, uint64_t ptBase, uint32_t shMemBases); - void mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt); - void mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt); + void mapProcessV1(PM4Queue *q, PM4MapProcess *pkt); + void mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt); void processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, QueueDesc *mqd, uint16_t vmid); void processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index 41ff9e7893..8cb40f1c87 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -45,7 +45,7 @@ class PrefetchType(Enum): class GfxVersion(ScopedEnum): - vals = ["gfx900", "gfx902", "gfx908", "gfx90a"] + vals = ["gfx900", "gfx902", "gfx908", "gfx90a", "gfx942"] class PoolManager(SimObject): diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 66b2b8ec49..80f18d2fa2 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -910,35 +910,63 @@ GPUDynInst::resolveFlatSegment(const VectorMask &mask) * #flat-addressing */ - uint32_t numSgprs = wavefront()->maxSgprs; - uint32_t physSgprIdx = - wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), - numSgprs - 4); - uint32_t offset = - wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); - physSgprIdx = - wavefront()->computeUnit->registerManager->mapSgpr(wavefront(), - numSgprs - 3); - 
uint32_t size = - wavefront()->computeUnit->srf[simdId]->read(physSgprIdx); - for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { - if (mask[lane]) { - addr[lane] = addr[lane] + lane * size + offset + - wavefront()->computeUnit->shader->getHiddenPrivateBase() - - wavefront()->computeUnit->shader->getScratchBase(); + ComputeUnit *cu = wavefront()->computeUnit; + + if (wavefront()->gfxVersion == GfxVersion::gfx942) { + // Architected flat scratch base address in FLAT_SCRATCH registers + uint32_t fs_lo = cu->srf[simdId]->read( + VegaISA::REG_FLAT_SCRATCH_LO); + uint32_t fs_hi = cu->srf[simdId]->read( + VegaISA::REG_FLAT_SCRATCH_HI); + + Addr arch_flat_scratch = ((Addr)(fs_hi) << 32) | fs_lo; + + for (int lane = 0; lane < cu->wfSize(); ++lane) { + if (mask[lane]) { + // The scratch base is added for other gfx versions, + // otherwise this would simply add the register base. + addr[lane] = addr[lane] - cu->shader->getScratchBase() + + arch_flat_scratch; + } + } + } else { + // In absolute flat scratch the program needs to place scratch + // address in SGPRn-3,4. 
+ uint32_t numSgprs = wavefront()->maxSgprs; + uint32_t physSgprIdx = + cu->registerManager->mapSgpr(wavefront(), numSgprs - 4); + uint32_t offset = cu->srf[simdId]->read(physSgprIdx); + physSgprIdx = + cu->registerManager->mapSgpr(wavefront(), numSgprs - 3); + uint32_t size = cu->srf[simdId]->read(physSgprIdx); + + + for (int lane = 0; lane < cu->wfSize(); ++lane) { + if (mask[lane]) { + addr[lane] = addr[lane] + lane * size + offset + + cu->shader->getHiddenPrivateBase() - + cu->shader->getScratchBase(); + } } } - wavefront()->execUnitId = wavefront()->flatLmUnitId; - wavefront()->decLGKMInstsIssued(); - if (isLoad()) { - wavefront()->rdLmReqsInPipe--; - } else if (isStore()) { - wavefront()->wrLmReqsInPipe--; - } else if (isAtomic() || isMemSync()) { - wavefront()->wrLmReqsInPipe--; - wavefront()->rdLmReqsInPipe--; - } else { - panic("Invalid memory operation!\n"); + + wavefront()->execUnitId = wavefront()->flatLmUnitId; + + // For FLAT the local memory pipe counters are incremented, but they + // are not incremented for explicit scratch_* instructions. Only + // decrement these counters if we are explicitly a FLAT instruction. 
+ if (isFlat()) { + wavefront()->decLGKMInstsIssued(); + if (isLoad()) { + wavefront()->rdLmReqsInPipe--; + } else if (isStore()) { + wavefront()->wrLmReqsInPipe--; + } else if (isAtomic() || isMemSync()) { + wavefront()->wrLmReqsInPipe--; + wavefront()->rdLmReqsInPipe--; + } else { + panic("Invalid memory operation!\n"); + } } } else { for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) { diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index 6132ab2d29..1ec06dc7d3 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -179,7 +179,8 @@ class GPUStaticInst : public GPUStaticInstFlags { return _flags[MemoryRef] && (_flags[GlobalSegment] || _flags[PrivateSegment] || _flags[ReadOnlySegment] || - _flags[SpillSegment] || _flags[FlatGlobal]); + _flags[SpillSegment] || _flags[FlatGlobal] || + _flags[FlatScratch]); } bool diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh index f015b091fc..44de1a8d32 100644 --- a/src/gpu-compute/hsa_queue_entry.hh +++ b/src/gpu-compute/hsa_queue_entry.hh @@ -94,9 +94,10 @@ class HSAQueueEntry // LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html // #code-object-v3-kernel-descriptor // - // Currently, the only supported gfx version in gem5 that computes - // VGPR count differently is gfx90a. - if (gfx_version == GfxVersion::gfx90a) { + // Currently, the only supported gfx versions in gem5 that compute + // VGPR count differently are gfx90a and gfx942. 
+ if (gfx_version == GfxVersion::gfx90a || + gfx_version == GfxVersion::gfx942) { numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8; } else { numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4; @@ -107,7 +108,8 @@ class HSAQueueEntry if (gfx_version == GfxVersion::gfx900 || gfx_version == GfxVersion::gfx902 || gfx_version == GfxVersion::gfx908 || - gfx_version == GfxVersion::gfx90a) { + gfx_version == GfxVersion::gfx90a || + gfx_version == GfxVersion::gfx942) { numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2; } else { panic("Saw unknown gfx version setting up GPR counts\n"); diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index 98d882b20e..de7c2333c2 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -118,6 +118,7 @@ void Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) { int regInitIdx = 0; + gfxVersion = task->gfxVersion(); // Iterate over all the init fields and check which // bits are enabled. Useful information can be found here: @@ -378,8 +379,29 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) wfSlotId, wfDynId, physSgprIdx, workGroupId[2]); break; case PrivSegWaveByteOffset: + + // For architected flat scratch, this enable is reused to set + // the FLAT_SCRATCH register pair to the scratch backing + // memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch + if (task->gfxVersion() == GfxVersion::gfx942) { + Addr arch_flat_scratch = + task->amdQueue.scratch_backing_memory_location; + computeUnit->srf[simdId]->write( + VegaISA::REG_FLAT_SCRATCH_HI, + bits(arch_flat_scratch, 63, 32)); + computeUnit->srf[simdId]->write( + VegaISA::REG_FLAT_SCRATCH_LO, + bits(arch_flat_scratch, 31, 0)); + + break; + } + + // Not architected flat scratch. 
Write the scratch wavefront + // offset: https://llvm.org/docs/AMDGPUUsage.html + // #amdgpu-amdhsa-initial-kernel-execution-state physSgprIdx = computeUnit->registerManager->mapSgpr(this, regInitIdx); + /** * the compute_tmpring_size_wavesize specifies the number of * kB allocated per wavefront, hence the multiplication by @@ -442,7 +464,8 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) // Default to false and set to true for gem5 supported ISAs. bool packed_work_item_id = false; - if (task->gfxVersion() == GfxVersion::gfx90a) { + if (task->gfxVersion() == GfxVersion::gfx90a || + task->gfxVersion() == GfxVersion::gfx942) { packed_work_item_id = true; } diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh index 82035f7d47..b7dff4617b 100644 --- a/src/gpu-compute/wavefront.hh +++ b/src/gpu-compute/wavefront.hh @@ -92,6 +92,8 @@ class Wavefront : public SimObject S_BARRIER }; + // gfx version wavefront is executing + GfxVersion gfxVersion; // HW slot id where the WF is mapped to inside a SIMD unit const int wfSlotId; int kernId;