From 823b5a6eb87e45f2cb54d3b1c736dad11e4e70e4 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 13 Feb 2024 17:43:23 -0600 Subject: [PATCH] dev-amdgpu: Support multiple CPs and MMIO AddrRanges Currently gem5 assumes that there is only one command processor (CP), which contains the PM4 packet processor. Some GPU devices have multiple CPs, each of which the driver tests individually during POST whether or not it is used. Therefore, these additional CPs need to be supported. This commit allows for multiple PM4 packet processors which represent multiple CPs. Each of these processors will have its own independent MMIO address range. To more easily support ranges, the MMIO addresses now use AddrRange to index a PM4 packet processor instead of the hard-coded constexpr MMIO start and size pairs. By default only one PM4 packet processor is created, meaning the functionality of the simulation is unchanged for devices currently supported in gem5. Change-Id: I977f4fd3a169ef4a78671a4fb58c8ea0e19bf52c --- configs/example/gpufs/system/system.py | 18 ++- src/dev/amdgpu/AMDGPU.py | 6 +- src/dev/amdgpu/amdgpu_defines.hh | 37 ++---- src/dev/amdgpu/amdgpu_device.cc | 160 +++++++++++++++++-------- src/dev/amdgpu/amdgpu_device.hh | 16 ++- src/dev/amdgpu/amdgpu_vm.cc | 30 +++++ src/dev/amdgpu/amdgpu_vm.hh | 53 ++++---- src/dev/amdgpu/pm4_mmio.hh | 54 ++++----- src/dev/amdgpu/pm4_packet_processor.cc | 15 ++- src/dev/amdgpu/pm4_packet_processor.hh | 7 ++ 10 files changed, 245 insertions(+), 151 deletions(-) diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index 671d4efdc9..1f89bd935b 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -188,9 +188,15 @@ def makeGpuFSSystem(args): system.pc.south_bridge.gpu.sdmas = sdma_engines - # Setup PM4 packet processor - pm4_pkt_proc = PM4PacketProcessor() - system.pc.south_bridge.gpu.pm4_pkt_proc = pm4_pkt_proc + # Setup PM4 packet processors + pm4_procs = [] + 
pm4_procs.append( + PM4PacketProcessor( + ip_id=0, mmio_range=AddrRange(start=0xC000, end=0xD000) + ) + ) + + system.pc.south_bridge.gpu.pm4_pkt_procs = pm4_procs # GPU data path gpu_mem_mgr = AMDGPUMemoryManager() @@ -207,7 +213,8 @@ def makeGpuFSSystem(args): for sdma in sdma_engines: system._dma_ports.append(sdma) system._dma_ports.append(device_ih) - system._dma_ports.append(pm4_pkt_proc) + for pm4_proc in pm4_procs: + system._dma_ports.append(pm4_proc) system._dma_ports.append(system_hub) system._dma_ports.append(gpu_mem_mgr) system._dma_ports.append(hsapp_pt_walker) @@ -221,7 +228,8 @@ def makeGpuFSSystem(args): for sdma in sdma_engines: sdma.pio = system.iobus.mem_side_ports device_ih.pio = system.iobus.mem_side_ports - pm4_pkt_proc.pio = system.iobus.mem_side_ports + for pm4_proc in pm4_procs: + pm4_proc.pio = system.iobus.mem_side_ports system_hub.pio = system.iobus.mem_side_ports # Full system needs special TLBs for SQC, Scalar, and vector data ports diff --git a/src/dev/amdgpu/AMDGPU.py b/src/dev/amdgpu/AMDGPU.py index 0370f09e01..0e0f597927 100644 --- a/src/dev/amdgpu/AMDGPU.py +++ b/src/dev/amdgpu/AMDGPU.py @@ -95,7 +95,7 @@ class AMDGPUDevice(PciDevice): # The config script should not create a new cp here but rather assign the # same cp that is assigned to the Shader SimObject. 
cp = Param.GPUCommandProcessor(NULL, "Command Processor") - pm4_pkt_proc = Param.PM4PacketProcessor("PM4 Packet Processor") + pm4_pkt_procs = VectorParam.PM4PacketProcessor("PM4 Packet Processor") memory_manager = Param.AMDGPUMemoryManager("GPU Memory Manager") memories = VectorParam.AbstractMemory([], "All memories in the device") device_ih = Param.AMDGPUInterruptHandler("GPU Interrupt handler") @@ -118,6 +118,10 @@ class PM4PacketProcessor(DmaVirtDevice): cxx_header = "dev/amdgpu/pm4_packet_processor.hh" cxx_class = "gem5::PM4PacketProcessor" + # Default to 0 as the common case is one PM4 packet processor + ip_id = Param.Int(0, "Instance ID of this PM4 processor") + mmio_range = Param.AddrRange("Range of MMIO addresses") + class AMDGPUMemoryManager(ClockedObject): type = "AMDGPUMemoryManager" diff --git a/src/dev/amdgpu/amdgpu_defines.hh b/src/dev/amdgpu/amdgpu_defines.hh index bc6377fbbc..883501b84d 100644 --- a/src/dev/amdgpu/amdgpu_defines.hh +++ b/src/dev/amdgpu/amdgpu_defines.hh @@ -49,6 +49,16 @@ enum QueueType RLC }; +/* + * Hold information about doorbells including queue type and the IP + * block ID if the IP can have multiple instances. + */ +typedef struct +{ + QueueType qtype; + int ip_id; +} DoorbellInfo; + // AMD GPUs support 16 different virtual address spaces static constexpr int AMDGPU_VM_COUNT = 16; @@ -61,36 +71,11 @@ constexpr int MMIO_BAR = 5; constexpr uint32_t VGA_ROM_DEFAULT = 0xc0000; constexpr uint32_t ROM_SIZE = 0x20000; // 128kB -/* SDMA base, size, mmio offset shift. */ -static constexpr uint32_t SDMA0_BASE = 0x4980; -static constexpr uint32_t SDMA1_BASE = 0x5180; -static constexpr uint32_t SDMA_SIZE = 0x800; -static constexpr uint32_t SDMA_OFFSET_SHIFT = 2; - -/* Interrupt handler base, size, mmio offset shift. */ -static constexpr uint32_t IH_BASE = 0x4280; -static constexpr uint32_t IH_SIZE = 0x700; +/* Most MMIOs use DWORD addresses and thus need to be shifted. 
*/ static constexpr uint32_t IH_OFFSET_SHIFT = 2; - -/* Graphics register bus manager base, size, mmio offset shift. */ -static constexpr uint32_t GRBM_BASE = 0x8000; -static constexpr uint32_t GRBM_SIZE = 0x5000; static constexpr uint32_t GRBM_OFFSET_SHIFT = 2; - -/* GFX base, size, mmio offset shift. */ -static constexpr uint32_t GFX_BASE = 0x28000; -static constexpr uint32_t GFX_SIZE = 0x17000; -static constexpr uint32_t GFX_OFFSET_SHIFT = 2; - -/* MMHUB base, size, mmio offset shift. */ -static constexpr uint32_t MMHUB_BASE = 0x68000; -static constexpr uint32_t MMHUB_SIZE = 0x2120; static constexpr uint32_t MMHUB_OFFSET_SHIFT = 2; -/* NBIO base and size. */ -static constexpr uint32_t NBIO_BASE = 0x0; -static constexpr uint32_t NBIO_SIZE = 0x4280; - } // namespace gem5 #endif // __DEV_AMDGPU_AMDGPU_DEFINES_HH__ diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 4b684aa221..5ddd7756ba 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -54,8 +54,7 @@ namespace gem5 AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) : PciDevice(p), gpuMemMgr(p.memory_manager), deviceIH(p.device_ih), - pm4PktProc(p.pm4_pkt_proc), cp(p.cp), - checkpoint_before_mmios(p.checkpoint_before_mmios), + cp(p.cp), checkpoint_before_mmios(p.checkpoint_before_mmios), init_interrupt_count(0), _lastVMID(0), deviceMem(name() + ".deviceMem", p.memories, false, "", false) { @@ -81,6 +80,16 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) romRange = RangeSize(VGA_ROM_DEFAULT, ROM_SIZE); } + if (p.device_name == "Vega10") { + gfx_version = GfxVersion::gfx900; + } else if (p.device_name == "MI100") { + gfx_version = GfxVersion::gfx908; + } else if (p.device_name == "MI200") { + gfx_version = GfxVersion::gfx90a; + } else { + panic("Unknown GPU device %s\n", p.device_name); + } + if (p.trace_file != "") { mmioReader.readMMIOTrace(p.trace_file); } @@ -126,8 +135,22 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) 
panic("Unknown GPU device %s\n", p.device_name); } + // Setup PM4 packet processors and sanity check IDs + std::set pm4_ids; + for (auto& pm4 : p.pm4_pkt_procs) { + pm4->setGPUDevice(this); + fatal_if(pm4_ids.count(pm4->getIpId()), + "Two PM4s with same IP IDs is not allowed"); + pm4_ids.insert(pm4->getIpId()); + pm4PktProcs.insert({pm4->getIpId(), pm4}); + + pm4Ranges.insert({pm4->getMMIORange(), pm4}); + } + + // There should be at least one PM4 packet processor with ID 0 + fatal_if(!pm4PktProcs.count(0), "No default PM4 processor found"); + deviceIH->setGPUDevice(this); - pm4PktProc->setGPUDevice(this); cp->hsaPacketProc().setGPUDevice(this); cp->setGPUDevice(this); nbio.setGPUDevice(this); @@ -136,6 +159,23 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) // could possibly be anything, but these are the values used by hardware. uint64_t mmhubBase = 0x8000ULL << 24; uint64_t mmhubTop = 0x83ffULL << 24; + uint64_t mem_size = 0x3ff0; // 16 GB of memory + + gpuvm.setMMHUBBase(mmhubBase); + gpuvm.setMMHUBTop(mmhubTop); + + // Map other MMIO apertures based on gfx version. This must be done before + // any calls to get/setRegVal. 
+ // NBIO 0x0 - 0x4280 + // IH 0x4280 - 0x4980 + // GRBM 0x8000 - 0xC000 + // GFX 0x28000 - 0x3F000 + // MMHUB 0x68000 - 0x6a120 + gpuvm.setMMIOAperture(NBIO_MMIO_RANGE, AddrRange(0x0, 0x4280)); + gpuvm.setMMIOAperture(IH_MMIO_RANGE, AddrRange(0x4280, 0x4980)); + gpuvm.setMMIOAperture(GRBM_MMIO_RANGE, AddrRange(0x8000, 0xC000)); + gpuvm.setMMIOAperture(GFX_MMIO_RANGE, AddrRange(0x28000, 0x3F000)); + gpuvm.setMMIOAperture(MMHUB_MMIO_RANGE, AddrRange(0x68000, 0x6A120)); // These are hardcoded register values to return what the driver expects setRegVal(AMDGPU_MP0_SMN_C2PMSG_33, 0x80000000); @@ -145,25 +185,19 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) if (p.device_name == "Vega10") { setRegVal(VEGA10_FB_LOCATION_BASE, mmhubBase >> 24); setRegVal(VEGA10_FB_LOCATION_TOP, mmhubTop >> 24); - gfx_version = GfxVersion::gfx900; } else if (p.device_name == "MI100") { setRegVal(MI100_FB_LOCATION_BASE, mmhubBase >> 24); setRegVal(MI100_FB_LOCATION_TOP, mmhubTop >> 24); - setRegVal(MI100_MEM_SIZE_REG, 0x3ff0); // 16GB of memory - gfx_version = GfxVersion::gfx908; + setRegVal(MI100_MEM_SIZE_REG, mem_size); } else if (p.device_name == "MI200") { // This device can have either 64GB or 128GB of device memory. // This limits to 16GB for simulation. setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24); setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24); - setRegVal(MI200_MEM_SIZE_REG, 0x3ff0); - gfx_version = GfxVersion::gfx90a; + setRegVal(MI200_MEM_SIZE_REG, mem_size); } else { panic("Unknown GPU device %s\n", p.device_name); } - - gpuvm.setMMHUBBase(mmhubBase); - gpuvm.setMMHUBTop(mmhubTop); } void @@ -356,29 +390,28 @@ AMDGPUDevice::readDoorbell(PacketPtr pkt, Addr offset) void AMDGPUDevice::readMMIO(PacketPtr pkt, Addr offset) { - Addr aperture = gpuvm.getMmioAperture(offset); - Addr aperture_offset = offset - aperture; + AddrRange aperture = gpuvm.getMMIOAperture(offset); + Addr aperture_offset = offset - aperture.start(); // By default read from MMIO trace. 
Overwrite the packet for a select // few more dynamic MMIOs. DPRINTF(AMDGPUDevice, "Read MMIO %#lx\n", offset); mmioReader.readFromTrace(pkt, MMIO_BAR, offset); - switch (aperture) { - case NBIO_BASE: + if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "NBIO base\n"); nbio.readMMIO(pkt, aperture_offset); - break; - case GRBM_BASE: + } else if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "GRBM base\n"); gpuvm.readMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT); - break; - case GFX_BASE: + } else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "GFX base\n"); gfx.readMMIO(pkt, aperture_offset); - break; - case MMHUB_BASE: + } else if (aperture == gpuvm.getMMIORange(MMHUB_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "MMHUB base\n"); gpuvm.readMMIO(pkt, aperture_offset >> MMHUB_OFFSET_SHIFT); - break; - default: - break; + } else { + DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for read %#x\n", offset); } } @@ -422,17 +455,22 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset) DPRINTF(AMDGPUDevice, "Wrote doorbell %#lx\n", offset); if (doorbells.find(offset) != doorbells.end()) { - QueueType q_type = doorbells[offset]; + QueueType q_type = doorbells[offset].qtype; + int ip_id = doorbells[offset].ip_id; DPRINTF(AMDGPUDevice, "Doorbell offset %p queue: %d\n", offset, q_type); switch (q_type) { case Compute: - pm4PktProc->process(pm4PktProc->getQueue(offset), - pkt->getLE()); + assert(pm4PktProcs.count(ip_id)); + pm4PktProcs[ip_id]->process( + pm4PktProcs[ip_id]->getQueue(offset), + pkt->getLE()); break; case Gfx: - pm4PktProc->process(pm4PktProc->getQueue(offset, true), - pkt->getLE()); + assert(pm4PktProcs.count(ip_id)); + pm4PktProcs[ip_id]->process( + pm4PktProcs[ip_id]->getQueue(offset, true), + pkt->getLE()); break; case SDMAGfx: { SDMAEngine *sdmaEng = getSDMAEngine(offset); @@ -443,9 +481,11 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset) 
sdmaEng->processPage(pkt->getLE()); } break; case ComputeAQL: { + assert(pm4PktProcs.count(ip_id)); cp->hsaPacketProc().hwScheduler()->write(offset, pkt->getLE() + 1); - pm4PktProc->updateReadIndex(offset, pkt->getLE() + 1); + pm4PktProcs[ip_id]->updateReadIndex(offset, + pkt->getLE() + 1); } break; case InterruptHandler: deviceIH->updateRptr(pkt->getLE()); @@ -475,12 +515,12 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset) void AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset) { - Addr aperture = gpuvm.getMmioAperture(offset); - Addr aperture_offset = offset - aperture; + AddrRange aperture = gpuvm.getMMIOAperture(offset); + Addr aperture_offset = offset - aperture.start(); DPRINTF(AMDGPUDevice, "Wrote MMIO %#lx\n", offset); - // Check SDMA functions first, then fallback to switch statement + // Check SDMA functions first, then fallback to MMIO ranges. for (int idx = 0; idx < sdmaIds.size(); ++idx) { if (sdmaMmios[idx].contains(offset)) { Addr sdma_offset = (offset - sdmaMmios[idx].start()) >> 2; @@ -498,26 +538,31 @@ AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset) } } - switch (aperture) { - /* Write a general register to the graphics register bus manager. */ - case GRBM_BASE: + // Check PM4s next, returning to avoid duplicate writes. + for (auto& [range, pm4_proc] : pm4Ranges) { + if (range.contains(offset)) { + // PM4 MMIOs are offset based on the MMIO range start + Addr ip_offset = offset - range.start(); + pm4_proc->writeMMIO(pkt, ip_offset >> GRBM_OFFSET_SHIFT); + + return; + } + } + + if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "GRBM base\n"); gpuvm.writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT); - pm4PktProc->writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT); - break; - /* Write a register to the interrupt handler. 
*/ - case IH_BASE: + } else if (aperture == gpuvm.getMMIORange(IH_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "IH base\n"); deviceIH->writeMMIO(pkt, aperture_offset >> IH_OFFSET_SHIFT); - break; - /* Write an IO space register */ - case NBIO_BASE: + } else if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "NBIO base\n"); nbio.writeMMIO(pkt, aperture_offset); - break; - case GFX_BASE: + } else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "GFX base\n"); gfx.writeMMIO(pkt, aperture_offset); - break; - default: - DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for %#x\n", offset); - break; + } else { + DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for write %#x\n", offset); } } @@ -638,10 +683,11 @@ AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value) } void -AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt) +AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt, int ip_id) { DPRINTF(AMDGPUDevice, "Setting doorbell type for %x\n", offset); - doorbells[offset] = qt; + doorbells[offset].qtype = qt; + doorbells[offset].ip_id = ip_id; } void @@ -692,6 +738,7 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const // Make a c-style array of the regs to serialize uint32_t doorbells_offset[doorbells_size]; QueueType doorbells_queues[doorbells_size]; + int doorbells_ip_ids[doorbells_size]; uint32_t sdma_engs_offset[sdma_engs_size]; int sdma_engs[sdma_engs_size]; int used_vmids[used_vmid_map_size]; @@ -701,7 +748,8 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const int idx = 0; for (auto & it : doorbells) { doorbells_offset[idx] = it.first; - doorbells_queues[idx] = it.second; + doorbells_queues[idx] = it.second.qtype; + doorbells_ip_ids[idx] = it.second.ip_id; ++idx; } @@ -730,6 +778,8 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const sizeof(doorbells_offset[0])); SERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/ sizeof(doorbells_queues[0])); + SERIALIZE_ARRAY(doorbells_ip_ids, 
sizeof(doorbells_ip_ids)/ + sizeof(doorbells_ip_ids[0])); SERIALIZE_ARRAY(sdma_engs_offset, sizeof(sdma_engs_offset)/ sizeof(sdma_engs_offset[0])); SERIALIZE_ARRAY(sdma_engs, sizeof(sdma_engs)/sizeof(sdma_engs[0])); @@ -768,14 +818,18 @@ AMDGPUDevice::unserialize(CheckpointIn &cp) if (doorbells_size > 0) { uint32_t doorbells_offset[doorbells_size]; QueueType doorbells_queues[doorbells_size]; + int doorbells_ip_ids[doorbells_size]; UNSERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/ sizeof(doorbells_offset[0])); UNSERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/ sizeof(doorbells_queues[0])); + UNSERIALIZE_ARRAY(doorbells_ip_ids, sizeof(doorbells_ip_ids)/ + sizeof(doorbells_ip_ids[0])); for (int idx = 0; idx < doorbells_size; ++idx) { - doorbells[doorbells_offset[idx]] = doorbells_queues[idx]; + doorbells[doorbells_offset[idx]].qtype = doorbells_queues[idx]; + doorbells[doorbells_offset[idx]].ip_id = doorbells_ip_ids[idx]; } } diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh index fface5fb3e..33b6a9f3e7 100644 --- a/src/dev/amdgpu/amdgpu_device.hh +++ b/src/dev/amdgpu/amdgpu_device.hh @@ -87,7 +87,7 @@ class AMDGPUDevice : public PciDevice /** * Structures to hold registers, doorbells, and some frame memory */ - std::unordered_map doorbells; + std::unordered_map doorbells; std::unordered_map pendingDoorbellPkts; /** @@ -113,9 +113,19 @@ class AMDGPUDevice : public PciDevice AMDGPUMemoryManager *gpuMemMgr; AMDGPUInterruptHandler *deviceIH; AMDGPUVM gpuvm; - PM4PacketProcessor *pm4PktProc; GPUCommandProcessor *cp; + struct AddrRangeHasher + { + std::size_t operator()(const AddrRange& k) const + { + return k.start(); + } + }; + std::unordered_map pm4PktProcs; + std::unordered_map pm4Ranges; + // SDMAs mapped by doorbell offset std::unordered_map sdmaEngs; // SDMAs mapped by ID @@ -185,7 +195,7 @@ class AMDGPUDevice : public PciDevice /** * Set handles to GPU blocks. 
*/ - void setDoorbellType(uint32_t offset, QueueType qt); + void setDoorbellType(uint32_t offset, QueueType qt, int ip_id = 0); void processPendingDoorbells(uint32_t offset); void setSDMAEngine(Addr offset, SDMAEngine *eng); diff --git a/src/dev/amdgpu/amdgpu_vm.cc b/src/dev/amdgpu/amdgpu_vm.cc index 5a13ac9ba0..0eea590c5a 100644 --- a/src/dev/amdgpu/amdgpu_vm.cc +++ b/src/dev/amdgpu/amdgpu_vm.cc @@ -37,6 +37,7 @@ #include "base/trace.hh" #include "debug/AMDGPUDevice.hh" #include "dev/amdgpu/amdgpu_defines.hh" +#include "dev/amdgpu/amdgpu_device.hh" #include "mem/packet_access.hh" namespace gem5 @@ -51,6 +52,35 @@ AMDGPUVM::AMDGPUVM() for (int i = 0; i < AMDGPU_VM_COUNT; ++i) { memset(&vmContexts[0], 0, sizeof(AMDGPUVMContext)); } + + for (int i = 0; i < NUM_MMIO_RANGES; ++i) { + mmioRanges[i] = AddrRange(); + } +} + +void +AMDGPUVM::setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range) +{ + mmioRanges[mmio_aperture] = range; +} + +AddrRange +AMDGPUVM::getMMIORange(mmio_range_t mmio_aperture) +{ + return mmioRanges[mmio_aperture]; +} + +const AddrRange& +AMDGPUVM::getMMIOAperture(Addr offset) +{ + for (int i = 0; i < NUM_MMIO_RANGES; ++i) { + if (mmioRanges[i].contains(offset)) { + return mmioRanges[i]; + } + } + + // Default to NBIO + return mmioRanges[NBIO_MMIO_RANGE]; } Addr diff --git a/src/dev/amdgpu/amdgpu_vm.hh b/src/dev/amdgpu/amdgpu_vm.hh index 5af666f379..857ef724da 100644 --- a/src/dev/amdgpu/amdgpu_vm.hh +++ b/src/dev/amdgpu/amdgpu_vm.hh @@ -99,9 +99,23 @@ static constexpr int AMDGPU_USER_PAGE_SIZE = 4096; namespace gem5 { +typedef enum : int +{ + NBIO_MMIO_RANGE, + MMHUB_MMIO_RANGE, + GFX_MMIO_RANGE, + GRBM_MMIO_RANGE, + IH_MMIO_RANGE, + NUM_MMIO_RANGES +} mmio_range_t; + +class AMDGPUDevice; + class AMDGPUVM : public Serializable { private: + AMDGPUDevice *gpuDevice; + typedef struct GEM5_PACKED { // Page table addresses: from (Base + Start) to (End) @@ -160,9 +174,13 @@ class AMDGPUVM : public Serializable */ std::vector gpu_tlbs; + std::array 
mmioRanges; + public: AMDGPUVM(); + void setGPUDevice(AMDGPUDevice *gpu_device) { gpuDevice = gpu_device; } + /** * Return base address of GART table in framebuffer. */ @@ -232,38 +250,11 @@ class AMDGPUVM : public Serializable Addr getSysAddrRangeLow () { return vmContext0.sysAddrL; } Addr getSysAddrRangeHigh () { return vmContext0.sysAddrH; } - Addr - getMmioAperture(Addr addr) - { - // Aperture ranges: - // NBIO 0x0 - 0x4280 - // IH 0x4280 - 0x4980 - // SDMA0 0x4980 - 0x5180 - // SDMA1 0x5180 - 0x5980 - // GRBM 0x8000 - 0xD000 - // GFX 0x28000 - 0x3F000 - // MMHUB 0x68000 - 0x6a120 + void setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range); + const AddrRange& getMMIOAperture(Addr addr); + AddrRange getMMIORange(mmio_range_t mmio_aperture); - if (IH_BASE <= addr && addr < IH_BASE + IH_SIZE) - return IH_BASE; - else if (SDMA0_BASE <= addr && addr < SDMA0_BASE + SDMA_SIZE) - return SDMA0_BASE; - else if (SDMA1_BASE <= addr && addr < SDMA1_BASE + SDMA_SIZE) - return SDMA1_BASE; - else if (GRBM_BASE <= addr && addr < GRBM_BASE + GRBM_SIZE) - return GRBM_BASE; - else if (GFX_BASE <= addr && addr < GFX_BASE + GFX_SIZE) - return GFX_BASE; - else if (MMHUB_BASE <= addr && addr < MMHUB_BASE + MMHUB_SIZE) - return MMHUB_BASE; - else { - warn_once("Accessing unsupported MMIO aperture! 
Assuming NBIO\n"); - return NBIO_BASE; - } - - } - - // Gettig mapped aperture base addresses + // Getting mapped aperture base addresses Addr getFrameAperture(Addr addr) { diff --git a/src/dev/amdgpu/pm4_mmio.hh b/src/dev/amdgpu/pm4_mmio.hh index 3801223175..e9e504c3cd 100644 --- a/src/dev/amdgpu/pm4_mmio.hh +++ b/src/dev/amdgpu/pm4_mmio.hh @@ -36,34 +36,34 @@ namespace gem5 { -#define mmCP_RB0_BASE 0x1040 -#define mmCP_RB0_CNTL 0x1041 -#define mmCP_RB_WPTR_POLL_ADDR_LO 0x1046 -#define mmCP_RB_WPTR_POLL_ADDR_HI 0x1047 -#define mmCP_RB_VMID 0x1051 -#define mmCP_RB0_RPTR_ADDR 0x1043 -#define mmCP_RB0_RPTR_ADDR_HI 0x1044 -#define mmCP_RB0_WPTR 0x1054 -#define mmCP_RB0_WPTR_HI 0x1055 -#define mmCP_RB_DOORBELL_CONTROL 0x1059 -#define mmCP_RB_DOORBELL_RANGE_LOWER 0x105a -#define mmCP_RB_DOORBELL_RANGE_UPPER 0x105b -#define mmCP_RB0_BASE_HI 0x10b1 +#define mmCP_RB0_BASE 0x040 +#define mmCP_RB0_CNTL 0x041 +#define mmCP_RB_WPTR_POLL_ADDR_LO 0x046 +#define mmCP_RB_WPTR_POLL_ADDR_HI 0x047 +#define mmCP_RB_VMID 0x051 +#define mmCP_RB0_RPTR_ADDR 0x043 +#define mmCP_RB0_RPTR_ADDR_HI 0x044 +#define mmCP_RB0_WPTR 0x054 +#define mmCP_RB0_WPTR_HI 0x055 +#define mmCP_RB_DOORBELL_CONTROL 0x059 +#define mmCP_RB_DOORBELL_RANGE_LOWER 0x05a +#define mmCP_RB_DOORBELL_RANGE_UPPER 0x05b +#define mmCP_RB0_BASE_HI 0x0b1 -#define mmCP_HQD_ACTIVE 0x1247 -#define mmCP_HQD_VMID 0x1248 -#define mmCP_HQD_PQ_BASE 0x124d -#define mmCP_HQD_PQ_BASE_HI 0x124e -#define mmCP_HQD_PQ_DOORBELL_CONTROL 0x1254 -#define mmCP_HQD_PQ_RPTR 0x124f -#define mmCP_HQD_PQ_RPTR_REPORT_ADDR 0x1250 -#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI 0x1251 -#define mmCP_HQD_PQ_WPTR_POLL_ADDR 0x1252 -#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI 0x1253 -#define mmCP_HQD_PQ_CONTROL 0x1256 -#define mmCP_HQD_IB_CONTROL 0x125a -#define mmCP_HQD_PQ_WPTR_LO 0x127b -#define mmCP_HQD_PQ_WPTR_HI 0x127c +#define mmCP_HQD_ACTIVE 0x247 +#define mmCP_HQD_VMID 0x248 +#define mmCP_HQD_PQ_BASE 0x24d +#define mmCP_HQD_PQ_BASE_HI 0x24e +#define 
mmCP_HQD_PQ_DOORBELL_CONTROL 0x254 +#define mmCP_HQD_PQ_RPTR 0x24f +#define mmCP_HQD_PQ_RPTR_REPORT_ADDR 0x250 +#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI 0x251 +#define mmCP_HQD_PQ_WPTR_POLL_ADDR 0x252 +#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI 0x253 +#define mmCP_HQD_PQ_CONTROL 0x256 +#define mmCP_HQD_IB_CONTROL 0x25a +#define mmCP_HQD_PQ_WPTR_LO 0x27b +#define mmCP_HQD_PQ_WPTR_HI 0x27c } // namespace gem5 diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index c8baa5eab4..62e817aa98 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -49,7 +49,7 @@ namespace gem5 { PM4PacketProcessor::PM4PacketProcessor(const PM4PacketProcessorParams &p) - : DmaVirtDevice(p) + : DmaVirtDevice(p), _ipId(p.ip_id), _mmioRange(p.mmio_range) { memset(&kiq, 0, sizeof(QueueDesc)); memset(&pq, 0, sizeof(QueueDesc)); @@ -144,7 +144,7 @@ PM4PacketProcessor::newQueue(QueueDesc *mqd, Addr offset, QueueType qt; qt = mqd->aql ? QueueType::ComputeAQL : QueueType::Compute; - gpuDevice->setDoorbellType(offset, qt); + gpuDevice->setDoorbellType(offset, qt, getIpId()); DPRINTF(PM4PacketProcessor, "New PM4 queue %d, base: %p offset: %p, me: " "%d, pipe %d queue: %d size: %d\n", id, q->base(), q->offset(), @@ -521,7 +521,7 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, // Register doorbell with GPU device gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng); - gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC); + gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC, getIpId()); gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2); } @@ -774,9 +774,14 @@ PM4PacketProcessor::setUconfigReg(PM4Queue *q, PM4SetUconfigReg *pkt) { q->incRptr(sizeof(PM4SetUconfigReg)); + DPRINTF(PM4PacketProcessor, "SetUconfig offset %x data %x\n", + pkt->offset, pkt->data); + // SET_UCONFIG_REG_START and pkt->offset are dword addresses uint32_t reg_addr = 
(PACKET3_SET_UCONFIG_REG_START + pkt->offset) * 4; + // Additional CPs respond to addresses 0x40000 apart. + reg_addr += 0x40000 * getIpId(); gpuDevice->setRegVal(reg_addr, pkt->data); decodeNext(q); @@ -851,7 +856,7 @@ PM4PacketProcessor::writeMMIO(PacketPtr pkt, Addr mmio_offset) break; case mmCP_HQD_PQ_DOORBELL_CONTROL: setHqdPqDoorbellCtrl(pkt->getLE()); - gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute); + gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute, getIpId()); break; case mmCP_HQD_PQ_RPTR: setHqdPqPtr(pkt->getLE()); @@ -913,7 +918,7 @@ PM4PacketProcessor::writeMMIO(PacketPtr pkt, Addr mmio_offset) break; case mmCP_RB_DOORBELL_CONTROL: setRbDoorbellCntrl(pkt->getLE()); - gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx); + gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx, getIpId()); break; case mmCP_RB_DOORBELL_RANGE_LOWER: setRbDoorbellRangeLo(pkt->getLE()); diff --git a/src/dev/amdgpu/pm4_packet_processor.hh b/src/dev/amdgpu/pm4_packet_processor.hh index 4782e70829..82c3c2716f 100644 --- a/src/dev/amdgpu/pm4_packet_processor.hh +++ b/src/dev/amdgpu/pm4_packet_processor.hh @@ -63,6 +63,10 @@ class PM4PacketProcessor : public DmaVirtDevice std::unordered_map queues; /* A map of PM4 queues based on doorbell offset */ std::unordered_map queuesMap; + + int _ipId; + AddrRange _mmioRange; + public: PM4PacketProcessor(const PM4PacketProcessorParams &p); @@ -188,6 +192,9 @@ class PM4PacketProcessor : public DmaVirtDevice void setRbDoorbellCntrl(uint32_t data); void setRbDoorbellRangeLo(uint32_t data); void setRbDoorbellRangeHi(uint32_t data); + + int getIpId() const { return _ipId; } + AddrRange getMMIORange() const { return _mmioRange; } }; } // namespace gem5