diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py index 4c906019c1..f8ef70d5a2 100644 --- a/configs/example/gpufs/runfs.py +++ b/configs/example/gpufs/runfs.py @@ -132,8 +132,9 @@ def addRunFSOptions(parser): parser.add_argument( "--gpu-device", default="Vega10", - choices=["Vega10", "MI100"], - help="GPU model to run: Vega10 (gfx900) or MI100 (gfx908)", + choices=["Vega10", "MI100", "MI200"], + help="GPU model to run: Vega10 (gfx900), MI100 (gfx908), or " + "MI200 (gfx90a)", ) diff --git a/configs/example/gpufs/system/amdgpu.py b/configs/example/gpufs/system/amdgpu.py index 5f98b55c32..9697e50a04 100644 --- a/configs/example/gpufs/system/amdgpu.py +++ b/configs/example/gpufs/system/amdgpu.py @@ -177,6 +177,10 @@ def connectGPU(system, args): system.pc.south_bridge.gpu.DeviceID = 0x738C system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002 system.pc.south_bridge.gpu.SubsystemID = 0x0C34 + elif args.gpu_device == "MI200": + system.pc.south_bridge.gpu.DeviceID = 0x740F + system.pc.south_bridge.gpu.SubsystemVendorID = 0x1002 + system.pc.south_bridge.gpu.SubsystemID = 0x0C34 elif args.gpu_device == "Vega10": system.pc.south_bridge.gpu.DeviceID = 0x6863 else: diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index 90c5c01091..263ffc0a43 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -152,6 +152,16 @@ def makeGpuFSSystem(args): 0x7D000, ] sdma_sizes = [0x1000] * 8 + elif args.gpu_device == "MI200": + num_sdmas = 5 + sdma_bases = [ + 0x4980, + 0x6180, + 0x78000, + 0x79000, + 0x7A000, + ] + sdma_sizes = [0x1000] * 5 else: m5.util.panic(f"Unknown GPU device {args.gpu_device}") diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 7037e6fb1c..3260d058b0 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -115,7 +115,7 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) 
sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo}); sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize}); sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo}); - } else if (p.device_name == "MI100") { + } else if (p.device_name == "MI100" || p.device_name == "MI200") { sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo}); sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo}); sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi}); @@ -144,10 +144,19 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) if (p.device_name == "Vega10") { setRegVal(VEGA10_FB_LOCATION_BASE, mmhubBase >> 24); setRegVal(VEGA10_FB_LOCATION_TOP, mmhubTop >> 24); + gfx_version = GfxVersion::gfx900; } else if (p.device_name == "MI100") { setRegVal(MI100_FB_LOCATION_BASE, mmhubBase >> 24); setRegVal(MI100_FB_LOCATION_TOP, mmhubTop >> 24); setRegVal(MI100_MEM_SIZE_REG, 0x3ff0); // 16GB of memory + gfx_version = GfxVersion::gfx908; + } else if (p.device_name == "MI200") { + // This device can have either 64GB or 128GB of device memory. + // This limits to 16GB for simulation. 
+ setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24); + setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24); + setRegVal(MI200_MEM_SIZE_REG, 0x3ff0); + gfx_version = GfxVersion::gfx90a; } else { panic("Unknown GPU device %s\n", p.device_name); } diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh index cab799147e..56ed2f4fa8 100644 --- a/src/dev/amdgpu/amdgpu_device.hh +++ b/src/dev/amdgpu/amdgpu_device.hh @@ -42,6 +42,7 @@ #include "dev/amdgpu/mmio_reader.hh" #include "dev/io_device.hh" #include "dev/pci/device.hh" +#include "enums/GfxVersion.hh" #include "params/AMDGPUDevice.hh" namespace gem5 @@ -145,6 +146,9 @@ class AMDGPUDevice : public PciDevice */ memory::PhysicalMemory deviceMem; + /* Device information */ + GfxVersion gfx_version = GfxVersion::gfx900; + public: AMDGPUDevice(const AMDGPUDeviceParams &p); @@ -206,6 +210,9 @@ class AMDGPUDevice : public PciDevice uint16_t getVMID(Addr doorbell) { return doorbellVMIDMap[doorbell]; } std::unordered_map<uint16_t, std::set<int>>& getUsedVMIDs(); void insertQId(uint16_t vmid, int id); + + /* Device information */ + GfxVersion getGfxVersion() const { return gfx_version; } }; } // namespace gem5 diff --git a/src/dev/amdgpu/amdgpu_nbio.cc b/src/dev/amdgpu/amdgpu_nbio.cc index 69e4373e64..07027c3765 100644 --- a/src/dev/amdgpu/amdgpu_nbio.cc +++ b/src/dev/amdgpu/amdgpu_nbio.cc @@ -75,12 +75,14 @@ AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset) case VEGA10_INV_ENG17_ACK2: case MI100_INV_ENG17_ACK2: case MI100_INV_ENG17_ACK3: + case MI200_INV_ENG17_ACK2: pkt->setLE(0x10001); break; case VEGA10_INV_ENG17_SEM1: case VEGA10_INV_ENG17_SEM2: case MI100_INV_ENG17_SEM2: case MI100_INV_ENG17_SEM3: + case MI200_INV_ENG17_SEM2: pkt->setLE(0x1); break; // PSP responds with bit 31 set when ready diff --git a/src/dev/amdgpu/amdgpu_nbio.hh b/src/dev/amdgpu/amdgpu_nbio.hh index d1e5391ec4..dc95443916 100644 --- a/src/dev/amdgpu/amdgpu_nbio.hh +++ b/src/dev/amdgpu/amdgpu_nbio.hh @@ -80,6 +80,11 @@ class AMDGPUDevice; #define 
MI100_INV_ENG17_SEM2 0x6a888 #define MI100_INV_ENG17_SEM3 0x76888 +#define MI200_INV_ENG17_ACK1 0x0a318 +#define MI200_INV_ENG17_ACK2 0x6b018 +#define MI200_INV_ENG17_SEM1 0x0a288 +#define MI200_INV_ENG17_SEM2 0x6af88 + class AMDGPUNbio { public: diff --git a/src/dev/amdgpu/amdgpu_vm.hh b/src/dev/amdgpu/amdgpu_vm.hh index ac35a11968..f35a735111 100644 --- a/src/dev/amdgpu/amdgpu_vm.hh +++ b/src/dev/amdgpu/amdgpu_vm.hh @@ -81,6 +81,10 @@ #define MI100_FB_LOCATION_BASE 0x6ac00 #define MI100_FB_LOCATION_TOP 0x6ac04 +#define MI200_MEM_SIZE_REG 0x0378c +#define MI200_FB_LOCATION_BASE 0x6b300 +#define MI200_FB_LOCATION_TOP 0x6b304 + // AMD GPUs support 16 different virtual address spaces static constexpr int AMDGPU_VM_COUNT = 16; diff --git a/src/dev/amdgpu/pm4_defines.hh b/src/dev/amdgpu/pm4_defines.hh index 42832d50bf..a303f8ef84 100644 --- a/src/dev/amdgpu/pm4_defines.hh +++ b/src/dev/amdgpu/pm4_defines.hh @@ -273,6 +273,64 @@ typedef struct GEM5_PACKED } PM4MapProcess; static_assert(sizeof(PM4MapProcess) == 60); +typedef struct GEM5_PACKED +{ + uint32_t pasid : 16; + uint32_t reserved0 : 8; + uint32_t diq : 1; + uint32_t processQuantum : 7; + union + { + struct + { + uint32_t ptBaseLo; + uint32_t ptBaseHi; + }; + uint64_t ptBase; + }; + uint32_t shMemBases; + uint32_t shMemConfig; + uint32_t sqShaderTbaLo; + uint32_t sqShaderTbaHi; + uint32_t sqShaderTmaLo; + uint32_t sqShaderTmaHi; + uint32_t reserved1; + union + { + struct + { + uint32_t gdsAddrLo; + uint32_t gdsAddrHi; + }; + uint64_t gdsAddr; + }; + union + { + struct + { + uint32_t numGws : 7; + uint32_t sdma_enable : 1; + uint32_t numOac : 4; + uint32_t reserved3 : 4; + uint32_t gdsSize : 6; + uint32_t numQueues : 10; + }; + uint32_t ordinal14; + }; + uint32_t spiGdbgPerVmidCntl; + uint32_t tcpWatchCntl[4]; + union + { + struct + { + uint32_t completionSignalLo; + uint32_t completionSignalHi; + }; + uint64_t completionSignal; + }; +} PM4MapProcessMI200; +static_assert(sizeof(PM4MapProcessMI200) == 80); + 
typedef struct GEM5_PACKED { uint32_t function : 4; diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index 3690113ac4..e7b846529e 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -271,12 +271,21 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header) dmaBuffer); } break; case IT_MAP_PROCESS: { - dmaBuffer = new PM4MapProcess(); - cb = new DmaVirtCallback<uint64_t>( - [ = ] (const uint64_t &) - { mapProcess(q, (PM4MapProcess *)dmaBuffer); }); - dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb, - dmaBuffer); + if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a) { + dmaBuffer = new PM4MapProcessMI200(); + cb = new DmaVirtCallback<uint64_t>( + [ = ] (const uint64_t &) + { mapProcessGfx90a(q, (PM4MapProcessMI200 *)dmaBuffer); }); + dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessMI200), + cb, dmaBuffer); + } else { + dmaBuffer = new PM4MapProcess(); + cb = new DmaVirtCallback<uint64_t>( + [ = ] (const uint64_t &) + { mapProcessGfx9(q, (PM4MapProcess *)dmaBuffer); }); + dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb, + dmaBuffer); + } } break; case IT_UNMAP_QUEUES: { @@ -613,27 +622,50 @@ PM4PacketProcessor::doneMQDWrite(Addr mqdAddr, Addr addr) { } void -PM4PacketProcessor::mapProcess(PM4Queue *q, PM4MapProcess *pkt) +PM4PacketProcessor::mapProcess(uint32_t pasid, uint64_t ptBase, + uint32_t shMemBases) { - q->incRptr(sizeof(PM4MapProcess)); - uint16_t vmid = gpuDevice->allocateVMID(pkt->pasid); + uint16_t vmid = gpuDevice->allocateVMID(pasid); - DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p vmid: %d quantum: " "%d pt: %p signal: %p\n", pkt->pasid, vmid, pkt->processQuantum, pkt->ptBase, pkt->completionSignal); - - gpuDevice->getVM().setPageTableBase(vmid, pkt->ptBase); - gpuDevice->CP()->shader()->setHwReg(HW_REG_SH_MEM_BASES, pkt->shMemBases); + gpuDevice->getVM().setPageTableBase(vmid, ptBase); + 
gpuDevice->CP()->shader()->setHwReg(HW_REG_SH_MEM_BASES, shMemBases); // Setup the apertures that gem5 uses. These values are bits [63:48]. - Addr lds_base = (Addr)bits(pkt->shMemBases, 31, 16) << 48; - Addr scratch_base = (Addr)bits(pkt->shMemBases, 15, 0) << 48; + Addr lds_base = (Addr)bits(shMemBases, 31, 16) << 48; + Addr scratch_base = (Addr)bits(shMemBases, 15, 0) << 48; // There does not seem to be any register for the limit, but the driver // assumes scratch and LDS have a 4GB aperture, so use that. gpuDevice->CP()->shader()->setLdsApe(lds_base, lds_base + 0xFFFFFFFF); gpuDevice->CP()->shader()->setScratchApe(scratch_base, scratch_base + 0xFFFFFFFF); +} + +void +PM4PacketProcessor::mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt) +{ + q->incRptr(sizeof(PM4MapProcess)); + + DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: " + "%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum, + pkt->ptBase, pkt->completionSignal); + + mapProcess(pkt->pasid, pkt->ptBase, pkt->shMemBases); + + delete pkt; + decodeNext(q); +} + +void +PM4PacketProcessor::mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt) +{ + q->incRptr(sizeof(PM4MapProcessMI200)); + + DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: " + "%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum, + pkt->ptBase, pkt->completionSignal); + + mapProcess(pkt->pasid, pkt->ptBase, pkt->shMemBases); delete pkt; decodeNext(q); diff --git a/src/dev/amdgpu/pm4_packet_processor.hh b/src/dev/amdgpu/pm4_packet_processor.hh index 4617a21a06..3fb055148c 100644 --- a/src/dev/amdgpu/pm4_packet_processor.hh +++ b/src/dev/amdgpu/pm4_packet_processor.hh @@ -141,7 +141,9 @@ class PM4PacketProcessor : public DmaVirtDevice void mapQueues(PM4Queue *q, PM4MapQueues *pkt); void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt); void doneMQDWrite(Addr mqdAddr, Addr addr); - void mapProcess(PM4Queue *q, PM4MapProcess *pkt); + void mapProcess(uint32_t pasid, uint64_t ptBase, uint32_t shMemBases); + void 
mapProcessGfx9(PM4Queue *q, PM4MapProcess *pkt); + void mapProcessGfx90a(PM4Queue *q, PM4MapProcessMI200 *pkt); void processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, QueueDesc *mqd, uint16_t vmid); void processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index 3a87186a30..c5449cc398 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -45,7 +45,7 @@ class PrefetchType(Enum): class GfxVersion(ScopedEnum): - vals = ["gfx801", "gfx803", "gfx900", "gfx902"] + vals = ["gfx801", "gfx803", "gfx900", "gfx902", "gfx908", "gfx90a"] class PoolManager(SimObject): diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index af59b7822b..9755180204 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -228,7 +228,8 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str()); HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id, - dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr); + dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr, + gpuDevice->getGfxVersion()); DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), " "grid size (%dx%dx%d) kernarg addr: %#x, completion " diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh index fbe0efef21..4083c1c85a 100644 --- a/src/gpu-compute/hsa_queue_entry.hh +++ b/src/gpu-compute/hsa_queue_entry.hh @@ -51,6 +51,7 @@ #include "base/types.hh" #include "dev/hsa/hsa_packet.hh" #include "dev/hsa/hsa_queue.hh" +#include "enums/GfxVersion.hh" #include "gpu-compute/kernel_code.hh" namespace gem5 @@ -61,7 +62,7 @@ class HSAQueueEntry public: HSAQueueEntry(std::string kernel_name, uint32_t queue_id, int dispatch_id, void *disp_pkt, AMDKernelCode *akc, - Addr host_pkt_addr, Addr code_addr) + Addr host_pkt_addr, 
Addr code_addr, GfxVersion gfx_version) : kernName(kernel_name), _wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x, (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y, @@ -92,9 +93,19 @@ class HSAQueueEntry // we need to rip register usage from the resource registers. // // We can't get an exact number of registers from the resource - // registers because they round, but we can get an upper bound on it - if (!numVgprs) - numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4; + // registers because they round, but we can get an upper bound on it. + // We determine the number of registers by solving for "vgprs_used" + // in the LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html + // #code-object-v3-kernel-descriptor + // Currently, the only supported gfx version in gem5 that computes + // this differently is gfx90a. + if (!numVgprs) { + if (gfx_version == GfxVersion::gfx90a) { + numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8; + } else { + numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4; + } + } if (!numSgprs || numSgprs == std::numeric_limits<decltype(akc->wavefront_sgpr_count)>::max()) {