From 6bbde8fbb885abae5d7f3ed630d19a9b982dd302 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Thu, 8 Feb 2024 12:26:27 -0600 Subject: [PATCH] dev-amdgpu: Rework handling of unknown registers The top level AMDGPUDevice currently reads/writes all unknown registers to/from a map containing the previously written value. This is intended as a way to handle registers that are not part of the model but the driver requires for functionality. Since this is at the top level, it can mask changes to register values which do not go through the same interface. For example, reading an MMIO, changing via PM4 queue, and reading again returns the stale cached value. This commit removes the usage of the regs map in AMDGPUDevice, implements some important MMIOs that were previously handled by it, and moves the unknown register handling to the NBIO aperture only. To reduce the number of additional MMIOs to implement, the display manager in vega10 is now disabled. Change-Id: Iff0a599dd82d663c7e710b79c6ef6d0ad1fc44a2 --- configs/example/gpufs/vega10.py | 2 +- src/dev/amdgpu/amdgpu_device.cc | 75 +++++++++++---------------------- src/dev/amdgpu/amdgpu_device.hh | 7 +-- src/dev/amdgpu/amdgpu_gfx.cc | 13 ++++++ src/dev/amdgpu/amdgpu_gfx.hh | 11 ++++- src/dev/amdgpu/amdgpu_nbio.cc | 41 ++++++++++++++---- src/dev/amdgpu/amdgpu_nbio.hh | 14 +++++- 7 files changed, 96 insertions(+), 67 deletions(-) diff --git a/configs/example/gpufs/vega10.py b/configs/example/gpufs/vega10.py index ae74efd39b..9c3116d415 100644 --- a/configs/example/gpufs/vega10.py +++ b/configs/example/gpufs/vega10.py @@ -52,7 +52,7 @@ if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5." /sbin/m5 exit fi -modprobe -v amdgpu ip_block_mask=0xff ppfeaturemask=0 dpm=0 audio=0 +modprobe -v amdgpu ip_block_mask=0xdf ppfeaturemask=0 dpm=0 audio=0 echo "Running {} {}" echo "{}" | base64 -d > myapp chmod +x myapp diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 48f450c2b2..4b684aa221 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -130,6 +130,7 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) pm4PktProc->setGPUDevice(this); cp->hsaPacketProc().setGPUDevice(this); cp->setGPUDevice(this); + nbio.setGPUDevice(this); // Address aperture for device memory. We tell this to the driver and // could possibly be anything, but these are the values used by hardware. @@ -163,8 +164,6 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) gpuvm.setMMHUBBase(mmhubBase); gpuvm.setMMHUBTop(mmhubTop); - - nbio.setGPUDevice(this); } void @@ -365,13 +364,6 @@ AMDGPUDevice::readMMIO(PacketPtr pkt, Addr offset) DPRINTF(AMDGPUDevice, "Read MMIO %#lx\n", offset); mmioReader.readFromTrace(pkt, MMIO_BAR, offset); - if (regs.find(offset) != regs.end()) { - uint64_t value = regs[offset]; - DPRINTF(AMDGPUDevice, "Reading what kernel wrote before: %#x\n", - value); - pkt->setUintX(value, ByteOrder::little); - } - switch (aperture) { case NBIO_BASE: nbio.readMMIO(pkt, aperture_offset); @@ -610,26 +602,39 @@ AMDGPUDevice::processPendingDoorbells(uint32_t offset) } } -bool -AMDGPUDevice::haveRegVal(uint32_t addr) -{ - return regs.count(addr); -} - uint32_t -AMDGPUDevice::getRegVal(uint32_t addr) +AMDGPUDevice::getRegVal(uint64_t addr) { + // This is somewhat of a guess based on amdgpu_device_mm_access + // in amdgpu_device.c in the ROCk driver. If bit 32 is 1 then + // assume VRAM and use full address, otherwise assume register + // address and only user lower 31 bits. + Addr fixup_addr = bits(addr, 31, 31) ? addr : addr & 0x7fffffff; + + uint32_t pkt_data = 0; + RequestPtr request = std::make_shared(fixup_addr, + sizeof(uint32_t), 0 /* flags */, vramRequestorId()); + PacketPtr pkt = Packet::createRead(request); + pkt->dataStatic((uint8_t *)&pkt_data); + readMMIO(pkt, addr); DPRINTF(AMDGPUDevice, "Getting register 0x%lx = %x\n", - addr, regs[addr]); - return regs[addr]; + fixup_addr, pkt->getLE()); + + return pkt->getLE(); } void -AMDGPUDevice::setRegVal(uint32_t addr, uint32_t value) +AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value) { DPRINTF(AMDGPUDevice, "Setting register 0x%lx to %x\n", addr, value); - regs[addr] = value; + + uint32_t pkt_data = value; + RequestPtr request = std::make_shared(addr, + sizeof(uint32_t), 0 /* flags */, vramRequestorId()); + PacketPtr pkt = Packet::createWrite(request); + pkt->dataStatic((uint8_t *)&pkt_data); + writeMMIO(pkt, addr); } void @@ -675,20 +680,16 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const // Serialize the PciDevice base class PciDevice::serialize(cp); - uint64_t regs_size = regs.size(); uint64_t doorbells_size = doorbells.size(); uint64_t sdma_engs_size = sdmaEngs.size(); uint64_t used_vmid_map_size = usedVMIDs.size(); - SERIALIZE_SCALAR(regs_size); SERIALIZE_SCALAR(doorbells_size); SERIALIZE_SCALAR(sdma_engs_size); // Save the number of vmids used SERIALIZE_SCALAR(used_vmid_map_size); // Make a c-style array of the regs to serialize - uint32_t reg_addrs[regs_size]; - uint64_t reg_values[regs_size]; uint32_t doorbells_offset[doorbells_size]; QueueType doorbells_queues[doorbells_size]; uint32_t sdma_engs_offset[sdma_engs_size]; @@ -698,13 +699,6 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const std::vector used_vmid_sets; int idx = 0; - for (auto & it : regs) { - reg_addrs[idx] = it.first; - reg_values[idx] = it.second; - ++idx; - } - - idx = 0; for (auto & it : doorbells) { doorbells_offset[idx] = it.first; doorbells_queues[idx] = it.second; @@ -732,8 +726,6 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const int* vmid_array = new int[num_queue_id]; std::copy(used_vmid_sets.begin(), used_vmid_sets.end(), vmid_array); - SERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0])); - SERIALIZE_ARRAY(reg_values, sizeof(reg_values)/sizeof(reg_values[0])); SERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/ sizeof(doorbells_offset[0])); SERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/ @@ -764,30 +756,15 @@ AMDGPUDevice::unserialize(CheckpointIn &cp) // Unserialize the PciDevice base class PciDevice::unserialize(cp); - uint64_t regs_size = 0; uint64_t doorbells_size = 0; uint64_t sdma_engs_size = 0; uint64_t used_vmid_map_size = 0; - UNSERIALIZE_SCALAR(regs_size); UNSERIALIZE_SCALAR(doorbells_size); UNSERIALIZE_SCALAR(sdma_engs_size); UNSERIALIZE_SCALAR(used_vmid_map_size); - if (regs_size > 0) { - uint32_t reg_addrs[regs_size]; - uint64_t reg_values[regs_size]; - - UNSERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0])); - UNSERIALIZE_ARRAY(reg_values, - sizeof(reg_values)/sizeof(reg_values[0])); - - for (int idx = 0; idx < regs_size; ++idx) { - regs.insert(std::make_pair(reg_addrs[idx], reg_values[idx])); - } - } - if (doorbells_size > 0) { uint32_t doorbells_offset[doorbells_size]; QueueType doorbells_queues[doorbells_size]; @@ -798,8 +775,6 @@ AMDGPUDevice::unserialize(CheckpointIn &cp) sizeof(doorbells_queues[0])); for (int idx = 0; idx < doorbells_size; ++idx) { - regs.insert(std::make_pair(doorbells_offset[idx], - doorbells_queues[idx])); doorbells[doorbells_offset[idx]] = doorbells_queues[idx]; } } diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh index b6b6e2a81a..fface5fb3e 100644 --- a/src/dev/amdgpu/amdgpu_device.hh +++ b/src/dev/amdgpu/amdgpu_device.hh @@ -87,8 +87,6 @@ class AMDGPUDevice : public PciDevice /** * Structures to hold registers, doorbells, and some frame memory */ - using GPURegMap = std::unordered_map; - GPURegMap regs; std::unordered_map doorbells; std::unordered_map pendingDoorbellPkts; @@ -195,9 +193,8 @@ class AMDGPUDevice : public PciDevice * Register value getter/setter. Used by other GPU blocks to change * values from incoming driver/user packets. */ - bool haveRegVal(uint32_t addr); - uint32_t getRegVal(uint32_t addr); - void setRegVal(uint32_t addr, uint32_t value); + uint32_t getRegVal(uint64_t addr); + void setRegVal(uint64_t addr, uint32_t value); /** * Methods related to translations and system/device memory. diff --git a/src/dev/amdgpu/amdgpu_gfx.cc b/src/dev/amdgpu/amdgpu_gfx.cc index 3d5b274b86..60fabaf31d 100644 --- a/src/dev/amdgpu/amdgpu_gfx.cc +++ b/src/dev/amdgpu/amdgpu_gfx.cc @@ -37,6 +37,13 @@ namespace gem5 { +AMDGPUGfx::AMDGPUGfx() +{ + for (int i = 0; i < SCRATCH_REGS; ++i) { + scratchRegs[i] = 0; + } +} + void AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset) { @@ -47,6 +54,9 @@ AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset) case AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB: pkt->setLE(captured_clock_count >> 32); break; + case AMDGPU_MM_SCRATCH_REG0: + pkt->setLE(scratchRegs[0]); + break; default: break; } @@ -65,6 +75,9 @@ AMDGPUGfx::writeMMIO(PacketPtr pkt, Addr offset) captured_clock_count = curTick() / sim_clock::as_int::ns; } break; + case AMDGPU_MM_SCRATCH_REG0: + scratchRegs[0] = pkt->getLE(); + break; default: break; } diff --git a/src/dev/amdgpu/amdgpu_gfx.hh b/src/dev/amdgpu/amdgpu_gfx.hh index c32b8624cf..9fb1d82553 100644 --- a/src/dev/amdgpu/amdgpu_gfx.hh +++ b/src/dev/amdgpu/amdgpu_gfx.hh @@ -52,13 +52,16 @@ #define AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB 0x13094 #define AMDGPU_MM_RLC_CAPTURE_GPU_CLOCK_COUNT 0x13098 +// Scratch registers used for GPU post +#define AMDGPU_MM_SCRATCH_REG0 0x08100 + namespace gem5 { class AMDGPUGfx { public: - AMDGPUGfx() { } + AMDGPUGfx(); void readMMIO(PacketPtr pkt, Addr offset); void writeMMIO(PacketPtr pkt, Addr offset); @@ -68,6 +71,12 @@ class AMDGPUGfx * GPU clock count at the time capture MMIO is received. */ uint64_t captured_clock_count = 1; + + /* + * Scratch registers. + */ + static constexpr int SCRATCH_REGS = 8; + std::array scratchRegs; }; } // namespace gem5 diff --git a/src/dev/amdgpu/amdgpu_nbio.cc b/src/dev/amdgpu/amdgpu_nbio.cc index 07027c3765..89b1682631 100644 --- a/src/dev/amdgpu/amdgpu_nbio.cc +++ b/src/dev/amdgpu/amdgpu_nbio.cc @@ -54,13 +54,21 @@ void AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset) { switch (offset) { + case AMDGPU_PCIE_DATA: + { + uint32_t value = gpuDevice->getRegVal(pcie_index_reg); + DPRINTF(AMDGPUDevice, "Read PCIe index %lx data %x\n", + pcie_index_reg, value); + pkt->setLE(value); + } + break; // This is a PCIe status register. At some point during driver init // the driver checks that interrupts are enabled. This is only // checked once, so if the MMIO trace does not exactly line up with // what the driver is doing in gem5, this may still have the first // bit zero causing driver to fail. Therefore, we always set this // bit to one as there is no harm to do so. - case AMDGPU_PCIE_DATA_REG: + case AMDGPU_PCIE_DATA2: { uint32_t value = pkt->getLE() | 0x1; DPRINTF(AMDGPUDevice, "Marking interrupts enabled: %#lx\n", value); @@ -68,7 +76,6 @@ AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset) } break; case AMDGPU_MM_DATA: - //pkt->setLE(regs[mm_index_reg]); pkt->setLE(gpuDevice->getRegVal(mm_index_reg)); break; case VEGA10_INV_ENG17_ACK1: @@ -89,17 +96,17 @@ AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset) case AMDGPU_MP0_SMN_C2PMSG_35: pkt->setLE(0x80000000); break; + case AMDGPU_MP1_SMN_C2PMSG_90: + pkt->setLE(0x1); + break; default: if (triggered_reads.count(offset)) { DPRINTF(AMDGPUDevice, "Found triggered read for %#x\n", offset); pkt->setLE(triggered_reads[offset]); - } else if (gpuDevice->haveRegVal(offset)) { - uint32_t reg_val = gpuDevice->getRegVal(offset); - - DPRINTF(AMDGPUDevice, "Reading value of %#lx from regs: %#lx\n", - offset, reg_val); - - pkt->setLE(reg_val); + } else if (regs.count(offset)) { + DPRINTF(AMDGPUDevice, "Returning value of unknown MMIO offset " + "%x: %x\n", offset, regs[offset]); + pkt->setLE(regs[offset]); } else { DPRINTF(AMDGPUDevice, "NBIO Unknown MMIO %#x (%#x)\n", offset, pkt->getAddr()); @@ -123,6 +130,14 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset) DPRINTF(AMDGPUDevice, "MM write to reg %#lx data %#lx\n", mm_index_reg, pkt->getLE()); gpuDevice->setRegVal(AMDGPU_MM_DATA, pkt->getLE()); + } else if (offset == AMDGPU_PCIE_INDEX) { + assert(pkt->getSize() == 4); + pcie_index_reg = insertBits(pcie_index_reg, 31, 0, + pkt->getLE()); + } else if (offset == AMDGPU_PCIE_INDEX2) { + assert(pkt->getSize() == 4); + pcie_index_reg = insertBits(pcie_index_reg, 63, 32, + pkt->getLE()); } else if (offset == AMDGPU_MP0_SMN_C2PMSG_35) { // See psp_v3_1_bootloader_load_sos in amdgpu driver code. if (pkt->getLE() == 0x10000) { @@ -144,6 +159,14 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset) } else if (offset == AMDGPU_MP0_SMN_C2PMSG_71) { // PSP ring size psp_ring_size = pkt->getLE(); + } else { + // Fallback to a map of register values. This was previously in the + // AMDGPUDevice, however that short-circuited some reads from other + // IP blocks. Since this is an end point IP block it is safer to use + // here. + regs[offset] = pkt->getLE(); + DPRINTF(AMDGPUDevice, "Writing value of unknown MMIO offset " + "%x: %x\n", offset, regs[offset]); } } diff --git a/src/dev/amdgpu/amdgpu_nbio.hh b/src/dev/amdgpu/amdgpu_nbio.hh index dc95443916..0d839d0e22 100644 --- a/src/dev/amdgpu/amdgpu_nbio.hh +++ b/src/dev/amdgpu/amdgpu_nbio.hh @@ -56,7 +56,11 @@ class AMDGPUDevice; #define AMDGPU_MM_INDEX 0x00000 #define AMDGPU_MM_INDEX_HI 0x00018 #define AMDGPU_MM_DATA 0x00004 -#define AMDGPU_PCIE_DATA_REG 0x0003c + +#define AMDGPU_PCIE_INDEX 0x00030 +#define AMDGPU_PCIE_INDEX2 0x00038 +#define AMDGPU_PCIE_DATA 0x00034 +#define AMDGPU_PCIE_DATA2 0x0003c // Message bus related to psp #define AMDGPU_MP0_SMN_C2PMSG_33 0x58184 @@ -66,6 +70,7 @@ class AMDGPUDevice; #define AMDGPU_MP0_SMN_C2PMSG_70 0x58218 #define AMDGPU_MP0_SMN_C2PMSG_71 0x5821c #define AMDGPU_MP0_SMN_C2PMSG_81 0x58244 +#define AMDGPU_MP1_SMN_C2PMSG_90 0x58a68 // Device specific invalidation engines used during initialization #define VEGA10_INV_ENG17_ACK1 0x0a318 @@ -105,6 +110,7 @@ class AMDGPUNbio * Driver initialization sequence helper variables. */ uint64_t mm_index_reg = 0; + uint64_t pcie_index_reg = 0; std::unordered_map triggered_reads; /* @@ -115,6 +121,12 @@ class AMDGPUNbio Addr psp_ring_listen_addr = 0; int psp_ring_size = 0; int psp_ring_value = 0; + + /* + * Hold values of other registers not explicitly modelled by other blocks. + */ + using GPURegMap = std::unordered_map; + GPURegMap regs; }; } // namespace gem5