From 8b91ac6f8d82cc02660adad3513f03c41413435c Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 21 Apr 2023 19:15:40 -0500 Subject: [PATCH] dev-amdgpu: Refactor MMIO interface for SDMA engines Currently the amdgpu simulated device is assumed to be a Vega10. As a result there are a few things that are hardcoded. One of those is the number of SDMAs. In order to add a newer device, such as MI100+, we need to enable a flexible number of SDMAs. In order to support a variable number of SDMAs and with the MMIO offsets of each device being potentially different, the MMIO interface for SDMAs is changed to use an SDMA class method dispatch table which forwards a 32-bit value from the MMIO packet to the MMIO functions in SDMA of the format `void method(uint32_t)`. Several changes are made to enable this: - Allow the SDMA to have a variable MMIO base and size. These are configured in python. - An SDMA class method dispatch table which contains the MMIO offset relative to the SDMA's MMIO base address. - An updated writeMMIO method to iterate over the SDMA MMIO address ranges and call the appropriate SDMA MMIO method which matches the MMIO offset. - Moved all SDMA related MMIO data bit twiddling, masking, etc. into the MMIO methods themselves instead of in the writeMMIO method in SDMAEngine. 
Change-Id: Ifce626f84d52f9e27e4438ba4e685e30dbf06dbc Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70040 Maintainer: Matt Sinclair Tested-by: kokoro Reviewed-by: Matt Sinclair --- configs/example/gpufs/system/system.py | 56 +++++++++++---- src/dev/amdgpu/AMDGPU.py | 11 +-- src/dev/amdgpu/amdgpu_device.cc | 97 ++++++++++++++++++-------- src/dev/amdgpu/amdgpu_device.hh | 13 +++- src/dev/amdgpu/interrupt_handler.cc | 6 ++ src/dev/amdgpu/interrupt_handler.hh | 6 ++ src/dev/amdgpu/sdma_engine.cc | 35 ++++++++-- src/dev/amdgpu/sdma_engine.hh | 15 +++- 8 files changed, 182 insertions(+), 57 deletions(-) diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index 93f0194efb..90c5c01091 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -129,15 +129,45 @@ def makeGpuFSSystem(args): device_ih = AMDGPUInterruptHandler() system.pc.south_bridge.gpu.device_ih = device_ih - # Setup the SDMA engines - sdma0_pt_walker = VegaPagetableWalker() - sdma1_pt_walker = VegaPagetableWalker() + # Setup the SDMA engines depending on device. 
The MMIO base addresses + # can be found in the driver code under: + # include/asic_reg/sdmaX/sdmaX_Y_Z_offset.h + num_sdmas = 2 + sdma_bases = [] + sdma_sizes = [] + if args.gpu_device == "Vega10": + num_sdmas = 2 + sdma_bases = [0x4980, 0x5180] + sdma_sizes = [0x800] * 2 + elif args.gpu_device == "MI100": + num_sdmas = 8 + sdma_bases = [ + 0x4980, + 0x6180, + 0x78000, + 0x79000, + 0x7A000, + 0x7B000, + 0x7C000, + 0x7D000, + ] + sdma_sizes = [0x1000] * 8 + else: + m5.util.panic(f"Unknown GPU device {args.gpu_device}") - sdma0 = SDMAEngine(walker=sdma0_pt_walker) - sdma1 = SDMAEngine(walker=sdma1_pt_walker) + sdma_pt_walkers = [] + sdma_engines = [] + for sdma_idx in range(num_sdmas): + sdma_pt_walker = VegaPagetableWalker() + sdma_engine = SDMAEngine( + walker=sdma_pt_walker, + mmio_base=sdma_bases[sdma_idx], + mmio_size=sdma_sizes[sdma_idx], + ) + sdma_pt_walkers.append(sdma_pt_walker) + sdma_engines.append(sdma_engine) - system.pc.south_bridge.gpu.sdma0 = sdma0 - system.pc.south_bridge.gpu.sdma1 = sdma1 + system.pc.south_bridge.gpu.sdmas = sdma_engines # Setup PM4 packet processor pm4_pkt_proc = PM4PacketProcessor() @@ -155,22 +185,22 @@ def makeGpuFSSystem(args): system._dma_ports.append(gpu_hsapp) system._dma_ports.append(gpu_cmd_proc) system._dma_ports.append(system.pc.south_bridge.gpu) - system._dma_ports.append(sdma0) - system._dma_ports.append(sdma1) + for sdma in sdma_engines: + system._dma_ports.append(sdma) system._dma_ports.append(device_ih) system._dma_ports.append(pm4_pkt_proc) system._dma_ports.append(system_hub) system._dma_ports.append(gpu_mem_mgr) system._dma_ports.append(hsapp_pt_walker) system._dma_ports.append(cp_pt_walker) - system._dma_ports.append(sdma0_pt_walker) - system._dma_ports.append(sdma1_pt_walker) + for sdma_pt_walker in sdma_pt_walkers: + system._dma_ports.append(sdma_pt_walker) gpu_hsapp.pio = system.iobus.mem_side_ports gpu_cmd_proc.pio = system.iobus.mem_side_ports system.pc.south_bridge.gpu.pio = system.iobus.mem_side_ports - 
sdma0.pio = system.iobus.mem_side_ports - sdma1.pio = system.iobus.mem_side_ports + for sdma in sdma_engines: + sdma.pio = system.iobus.mem_side_ports device_ih.pio = system.iobus.mem_side_ports pm4_pkt_proc.pio = system.iobus.mem_side_ports system_hub.pio = system.iobus.mem_side_ports diff --git a/src/dev/amdgpu/AMDGPU.py b/src/dev/amdgpu/AMDGPU.py index 1e786726c9..616c501c63 100644 --- a/src/dev/amdgpu/AMDGPU.py +++ b/src/dev/amdgpu/AMDGPU.py @@ -79,11 +79,9 @@ class AMDGPUDevice(PciDevice): False, "Take a checkpoint before the device begins sending MMIOs" ) - # Specific to Vega10: Vega10 has two SDMA engines these do not have any - # assigned function and are referenced by ID so they are given the generic - # names sdma0, sdma1, ... sdmaN. - sdma0 = Param.SDMAEngine("SDMA Engine 0") - sdma1 = Param.SDMAEngine("SDMA Engine 1") + # SDMA engines. There are a different number depending on device, + # therefore an array is used. + sdmas = VectorParam.SDMAEngine("All SDMA Engines") # The cp is needed here to handle certain packets the device may receive. 
# The config script should not create a new cp here but rather assign the @@ -100,6 +98,9 @@ class SDMAEngine(DmaVirtDevice): cxx_header = "dev/amdgpu/sdma_engine.hh" cxx_class = "gem5::SDMAEngine" + mmio_base = Param.Addr(0x0, "Base MMIO Address") + mmio_size = Param.Addr(0x800, "Size of MMIO range") + gpu_device = Param.AMDGPUDevice(NULL, "GPU Controller") walker = Param.VegaPagetableWalker("Page table walker") diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 7e6304afa1..2acf1f4af3 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -53,7 +53,7 @@ namespace gem5 AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) : PciDevice(p), gpuMemMgr(p.memory_manager), deviceIH(p.device_ih), - sdma0(p.sdma0), sdma1(p.sdma1), pm4PktProc(p.pm4_pkt_proc), cp(p.cp), + pm4PktProc(p.pm4_pkt_proc), cp(p.cp), checkpoint_before_mmios(p.checkpoint_before_mmios), init_interrupt_count(0), _lastVMID(0), deviceMem(name() + ".deviceMem", p.memories, false, "", false) @@ -84,10 +84,47 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) mmioReader.readMMIOTrace(p.trace_file); } - sdma0->setGPUDevice(this); - sdma0->setId(0); - sdma1->setGPUDevice(this); - sdma1->setId(1); + int sdma_id = 0; + for (auto& s : p.sdmas) { + s->setGPUDevice(this); + s->setId(sdma_id); + sdmaIds.insert({sdma_id, s}); + sdmaMmios.insert({sdma_id, + RangeSize(s->getMmioBase(), s->getMmioSize())}); + DPRINTF(AMDGPUDevice, "SDMA%d has MMIO range %s\n", sdma_id, + sdmaMmios[sdma_id].to_string().c_str()); + sdma_id++; + } + + // Map SDMA MMIO addresses to functions + sdmaFunc.insert({0x81, &SDMAEngine::setGfxBaseLo}); + sdmaFunc.insert({0x82, &SDMAEngine::setGfxBaseHi}); + sdmaFunc.insert({0x88, &SDMAEngine::setGfxRptrHi}); + sdmaFunc.insert({0x89, &SDMAEngine::setGfxRptrLo}); + sdmaFunc.insert({0x92, &SDMAEngine::setGfxDoorbellLo}); + sdmaFunc.insert({0xab, &SDMAEngine::setGfxDoorbellOffsetLo}); + sdmaFunc.insert({0x80, 
&SDMAEngine::setGfxSize}); + sdmaFunc.insert({0xb2, &SDMAEngine::setGfxWptrLo}); + sdmaFunc.insert({0xb3, &SDMAEngine::setGfxWptrHi}); + if (p.device_name == "Vega10") { + sdmaFunc.insert({0xe1, &SDMAEngine::setPageBaseLo}); + sdmaFunc.insert({0xe9, &SDMAEngine::setPageRptrLo}); + sdmaFunc.insert({0xe8, &SDMAEngine::setPageRptrHi}); + sdmaFunc.insert({0xf2, &SDMAEngine::setPageDoorbellLo}); + sdmaFunc.insert({0x10b, &SDMAEngine::setPageDoorbellOffsetLo}); + sdmaFunc.insert({0xe0, &SDMAEngine::setPageSize}); + sdmaFunc.insert({0x113, &SDMAEngine::setPageWptrLo}); + } else if (p.device_name == "MI100") { + sdmaFunc.insert({0xd9, &SDMAEngine::setPageBaseLo}); + sdmaFunc.insert({0xe1, &SDMAEngine::setPageRptrLo}); + sdmaFunc.insert({0xe0, &SDMAEngine::setPageRptrHi}); + sdmaFunc.insert({0xea, &SDMAEngine::setPageDoorbellLo}); + sdmaFunc.insert({0xd8, &SDMAEngine::setPageDoorbellOffsetLo}); + sdmaFunc.insert({0x10b, &SDMAEngine::setPageWptrLo}); + } else { + panic("Unknown GPU device %s\n", p.device_name); + } + deviceIH->setGPUDevice(this); pm4PktProc->setGPUDevice(this); cp->hsaPacketProc().setGPUDevice(this); @@ -351,15 +388,25 @@ AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset) DPRINTF(AMDGPUDevice, "Wrote MMIO %#lx\n", offset); + // Check SDMA functions first, then fallback to switch statement + for (int idx = 0; idx < sdmaIds.size(); ++idx) { + if (sdmaMmios[idx].contains(offset)) { + Addr sdma_offset = (offset - sdmaMmios[idx].start()) >> 2; + if (sdmaFunc.count(sdma_offset)) { + DPRINTF(AMDGPUDevice, "Calling SDMA%d MMIO function %lx\n", + idx, sdma_offset); + sdmaFuncPtr mptr = sdmaFunc[sdma_offset]; + (getSDMAById(idx)->*mptr)(pkt->getLE()); + } else { + DPRINTF(AMDGPUDevice, "Unknown SDMA%d MMIO: %#lx\n", idx, + sdma_offset); + } + + return; + } + } + switch (aperture) { - /* Write a register to the first System DMA. */ - case SDMA0_BASE: - sdma0->writeMMIO(pkt, aperture_offset >> SDMA_OFFSET_SHIFT); - break; - /* Write a register to the second System DMA. 
*/ - case SDMA1_BASE: - sdma1->writeMMIO(pkt, aperture_offset >> SDMA_OFFSET_SHIFT); - break; /* Write a general register to the graphics register bus manager. */ case GRBM_BASE: gpuvm.writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT); @@ -483,19 +530,9 @@ AMDGPUDevice::getSDMAById(int id) * PM4 packets selected SDMAs using an integer ID. This method simply maps * the integer ID to a pointer to the SDMA and checks for invalid IDs. */ - switch (id) { - case 0: - return sdma0; - break; - case 1: - return sdma1; - break; - default: - panic("No SDMA with id %d\n", id); - break; - } + assert(sdmaIds.count(id)); - return nullptr; + return sdmaIds[id]; } SDMAEngine* @@ -549,7 +586,7 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const idx = 0; for (auto & it : sdmaEngs) { sdma_engs_offset[idx] = it.first; - sdma_engs[idx] = it.second == sdma0 ? 0 : 1; + sdma_engs[idx] = idx; ++idx; } @@ -620,7 +657,8 @@ AMDGPUDevice::unserialize(CheckpointIn &cp) UNSERIALIZE_ARRAY(sdma_engs, sizeof(sdma_engs)/sizeof(sdma_engs[0])); for (int idx = 0; idx < sdma_engs_size; ++idx) { - SDMAEngine *sdma = sdma_engs[idx] == 0 ? 
sdma0 : sdma1; + assert(sdmaIds.count(idx)); + SDMAEngine *sdma = sdmaIds[idx]; sdmaEngs.insert(std::make_pair(sdma_engs_offset[idx], sdma)); } } @@ -669,8 +707,9 @@ AMDGPUDevice::deallocateAllQueues() idMap.erase(idMap.begin(), idMap.end()); usedVMIDs.erase(usedVMIDs.begin(), usedVMIDs.end()); - sdma0->deallocateRLCQueues(); - sdma1->deallocateRLCQueues(); + for (auto& it : sdmaEngs) { + it.second->deallocateRLCQueues(); + } } void diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh index b64067a158..0e58f29038 100644 --- a/src/dev/amdgpu/amdgpu_device.hh +++ b/src/dev/amdgpu/amdgpu_device.hh @@ -109,12 +109,19 @@ class AMDGPUDevice : public PciDevice AMDGPUMemoryManager *gpuMemMgr; AMDGPUInterruptHandler *deviceIH; AMDGPUVM gpuvm; - SDMAEngine *sdma0; - SDMAEngine *sdma1; - std::unordered_map sdmaEngs; PM4PacketProcessor *pm4PktProc; GPUCommandProcessor *cp; + // SDMAs mapped by doorbell offset + std::unordered_map sdmaEngs; + // SDMAs mapped by ID + std::unordered_map sdmaIds; + // SDMA ID to MMIO range + std::unordered_map sdmaMmios; + // SDMA ID to function + typedef void (SDMAEngine::*sdmaFuncPtr)(uint32_t); + std::unordered_map sdmaFunc; + /** * Initial checkpoint support variables. 
*/ diff --git a/src/dev/amdgpu/interrupt_handler.cc b/src/dev/amdgpu/interrupt_handler.cc index a771976d98..6f277a1618 100644 --- a/src/dev/amdgpu/interrupt_handler.cc +++ b/src/dev/amdgpu/interrupt_handler.cc @@ -80,6 +80,12 @@ AMDGPUInterruptHandler::prepareInterruptCookie(ContextID cntxt_id, assert(client_id == SOC15_IH_CLIENTID_RLC || client_id == SOC15_IH_CLIENTID_SDMA0 || client_id == SOC15_IH_CLIENTID_SDMA1 || + client_id == SOC15_IH_CLIENTID_SDMA2 || + client_id == SOC15_IH_CLIENTID_SDMA3 || + client_id == SOC15_IH_CLIENTID_SDMA4 || + client_id == SOC15_IH_CLIENTID_SDMA5 || + client_id == SOC15_IH_CLIENTID_SDMA6 || + client_id == SOC15_IH_CLIENTID_SDMA7 || client_id == SOC15_IH_CLIENTID_GRBM_CP); assert(source_id == CP_EOP || source_id == TRAP_ID); diff --git a/src/dev/amdgpu/interrupt_handler.hh b/src/dev/amdgpu/interrupt_handler.hh index ab8a853074..9b80e081cc 100644 --- a/src/dev/amdgpu/interrupt_handler.hh +++ b/src/dev/amdgpu/interrupt_handler.hh @@ -58,6 +58,12 @@ enum soc15_ih_clientid SOC15_IH_CLIENTID_RLC = 0x07, SOC15_IH_CLIENTID_SDMA0 = 0x08, SOC15_IH_CLIENTID_SDMA1 = 0x09, + SOC15_IH_CLIENTID_SDMA2 = 0x01, + SOC15_IH_CLIENTID_SDMA3 = 0x04, + SOC15_IH_CLIENTID_SDMA4 = 0x05, + SOC15_IH_CLIENTID_SDMA5 = 0x11, + SOC15_IH_CLIENTID_SDMA6 = 0x13, + SOC15_IH_CLIENTID_SDMA7 = 0x18, SOC15_IH_CLIENTID_GRBM_CP = 0x14 }; diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index 736df45d9d..e99d694634 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -49,7 +49,8 @@ SDMAEngine::SDMAEngine(const SDMAEngineParams &p) : DmaVirtDevice(p), id(0), gfxBase(0), gfxRptr(0), gfxDoorbell(0), gfxDoorbellOffset(0), gfxWptr(0), pageBase(0), pageRptr(0), pageDoorbell(0), pageDoorbellOffset(0), - pageWptr(0), gpuDevice(nullptr), walker(p.walker) + pageWptr(0), gpuDevice(nullptr), walker(p.walker), + mmioBase(p.mmio_base), mmioSize(p.mmio_size) { gfx.ib(&gfxIb); gfxIb.parent(&gfx); @@ -87,6 +88,18 @@ 
SDMAEngine::getIHClientId() return SOC15_IH_CLIENTID_SDMA0; case 1: return SOC15_IH_CLIENTID_SDMA1; + case 2: + return SOC15_IH_CLIENTID_SDMA2; + case 3: + return SOC15_IH_CLIENTID_SDMA3; + case 4: + return SOC15_IH_CLIENTID_SDMA4; + case 5: + return SOC15_IH_CLIENTID_SDMA5; + case 6: + return SOC15_IH_CLIENTID_SDMA6; + case 7: + return SOC15_IH_CLIENTID_SDMA7; default: panic("Unknown SDMA id"); } @@ -1240,6 +1253,10 @@ SDMAEngine::setGfxDoorbellOffsetLo(uint32_t data) { gfxDoorbellOffset = insertBits(gfxDoorbellOffset, 31, 0, 0); gfxDoorbellOffset |= data; + if (bits(gfxDoorbell, 28, 28)) { + gpuDevice->setDoorbellType(gfxDoorbellOffset, QueueType::SDMAGfx); + gpuDevice->setSDMAEngine(gfxDoorbellOffset, this); + } } void @@ -1250,9 +1267,11 @@ SDMAEngine::setGfxDoorbellOffsetHi(uint32_t data) } void -SDMAEngine::setGfxSize(uint64_t data) +SDMAEngine::setGfxSize(uint32_t data) { - gfx.size(data); + uint32_t rb_size = bits(data, 6, 1); + assert(rb_size >= 6 && rb_size <= 62); + gfx.size(1 << (rb_size + 2)); } void @@ -1320,6 +1339,10 @@ SDMAEngine::setPageDoorbellOffsetLo(uint32_t data) { pageDoorbellOffset = insertBits(pageDoorbellOffset, 31, 0, 0); pageDoorbellOffset |= data; + if (bits(pageDoorbell, 28, 28)) { + gpuDevice->setDoorbellType(pageDoorbellOffset, QueueType::SDMAPage); + gpuDevice->setSDMAEngine(pageDoorbellOffset, this); + } } void @@ -1330,9 +1353,11 @@ SDMAEngine::setPageDoorbellOffsetHi(uint32_t data) } void -SDMAEngine::setPageSize(uint64_t data) +SDMAEngine::setPageSize(uint32_t data) { - page.size(data); + uint32_t rb_size = bits(data, 6, 1); + assert(rb_size >= 6 && rb_size <= 62); + page.size(1 << (rb_size + 2)); } void diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh index 27c169193b..1e4f965920 100644 --- a/src/dev/amdgpu/sdma_engine.hh +++ b/src/dev/amdgpu/sdma_engine.hh @@ -156,6 +156,9 @@ class SDMAEngine : public DmaVirtDevice void processRLC0(Addr wptrOffset); void processRLC1(Addr wptrOffset); + Addr mmioBase 
= 0; + Addr mmioSize = 0; + public: SDMAEngine(const SDMAEngineParams &p); @@ -242,6 +245,14 @@ class SDMAEngine : public DmaVirtDevice void atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt, uint64_t *dmaBuffer); + /** + * Methods for getting SDMA MMIO base address and size. These are set by + * the python configuration depending on device to allow for flexible base + * addresses depending on what GPU is being simulated. + */ + Addr getMmioBase() { return mmioBase; } + Addr getMmioSize() { return mmioSize; } + /** * Methods for getting the values of SDMA MMIO registers. */ @@ -269,7 +280,7 @@ class SDMAEngine : public DmaVirtDevice void setGfxDoorbellHi(uint32_t data); void setGfxDoorbellOffsetLo(uint32_t data); void setGfxDoorbellOffsetHi(uint32_t data); - void setGfxSize(uint64_t data); + void setGfxSize(uint32_t data); void setGfxWptrLo(uint32_t data); void setGfxWptrHi(uint32_t data); void setPageBaseLo(uint32_t data); @@ -280,7 +291,7 @@ class SDMAEngine : public DmaVirtDevice void setPageDoorbellHi(uint32_t data); void setPageDoorbellOffsetLo(uint32_t data); void setPageDoorbellOffsetHi(uint32_t data); - void setPageSize(uint64_t data); + void setPageSize(uint32_t data); void setPageWptrLo(uint32_t data); void setPageWptrHi(uint32_t data);