diff --git a/configs/example/gpufs/vega10.py b/configs/example/gpufs/vega10.py index ae74efd39b..9c3116d415 100644 --- a/configs/example/gpufs/vega10.py +++ b/configs/example/gpufs/vega10.py @@ -52,7 +52,7 @@ if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5." /sbin/m5 exit fi -modprobe -v amdgpu ip_block_mask=0xff ppfeaturemask=0 dpm=0 audio=0 +modprobe -v amdgpu ip_block_mask=0xdf ppfeaturemask=0 dpm=0 audio=0 echo "Running {} {}" echo "{}" | base64 -d > myapp chmod +x myapp diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 48f450c2b2..4b684aa221 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -130,6 +130,7 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) pm4PktProc->setGPUDevice(this); cp->hsaPacketProc().setGPUDevice(this); cp->setGPUDevice(this); + nbio.setGPUDevice(this); // Address aperture for device memory. We tell this to the driver and // could possibly be anything, but these are the values used by hardware. @@ -163,8 +164,6 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) gpuvm.setMMHUBBase(mmhubBase); gpuvm.setMMHUBTop(mmhubTop); - - nbio.setGPUDevice(this); } void @@ -365,13 +364,6 @@ AMDGPUDevice::readMMIO(PacketPtr pkt, Addr offset) DPRINTF(AMDGPUDevice, "Read MMIO %#lx\n", offset); mmioReader.readFromTrace(pkt, MMIO_BAR, offset); - if (regs.find(offset) != regs.end()) { - uint64_t value = regs[offset]; - DPRINTF(AMDGPUDevice, "Reading what kernel wrote before: %#x\n", - value); - pkt->setUintX(value, ByteOrder::little); - } - switch (aperture) { case NBIO_BASE: nbio.readMMIO(pkt, aperture_offset); @@ -610,26 +602,39 @@ AMDGPUDevice::processPendingDoorbells(uint32_t offset) } } -bool -AMDGPUDevice::haveRegVal(uint32_t addr) -{ - return regs.count(addr); -} - uint32_t -AMDGPUDevice::getRegVal(uint32_t addr) +AMDGPUDevice::getRegVal(uint64_t addr) { + // This is somewhat of a guess based on amdgpu_device_mm_access + // in amdgpu_device.c in the ROCk driver. If bit 32 is 1 then + // assume VRAM and use full address, otherwise assume register + // address and only user lower 31 bits. + Addr fixup_addr = bits(addr, 31, 31) ? addr : addr & 0x7fffffff; + + uint32_t pkt_data = 0; + RequestPtr request = std::make_shared(fixup_addr, + sizeof(uint32_t), 0 /* flags */, vramRequestorId()); + PacketPtr pkt = Packet::createRead(request); + pkt->dataStatic((uint8_t *)&pkt_data); + readMMIO(pkt, addr); DPRINTF(AMDGPUDevice, "Getting register 0x%lx = %x\n", - addr, regs[addr]); - return regs[addr]; + fixup_addr, pkt->getLE()); + + return pkt->getLE(); } void -AMDGPUDevice::setRegVal(uint32_t addr, uint32_t value) +AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value) { DPRINTF(AMDGPUDevice, "Setting register 0x%lx to %x\n", addr, value); - regs[addr] = value; + + uint32_t pkt_data = value; + RequestPtr request = std::make_shared(addr, + sizeof(uint32_t), 0 /* flags */, vramRequestorId()); + PacketPtr pkt = Packet::createWrite(request); + pkt->dataStatic((uint8_t *)&pkt_data); + writeMMIO(pkt, addr); } void @@ -675,20 +680,16 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const // Serialize the PciDevice base class PciDevice::serialize(cp); - uint64_t regs_size = regs.size(); uint64_t doorbells_size = doorbells.size(); uint64_t sdma_engs_size = sdmaEngs.size(); uint64_t used_vmid_map_size = usedVMIDs.size(); - SERIALIZE_SCALAR(regs_size); SERIALIZE_SCALAR(doorbells_size); SERIALIZE_SCALAR(sdma_engs_size); // Save the number of vmids used SERIALIZE_SCALAR(used_vmid_map_size); // Make a c-style array of the regs to serialize - uint32_t reg_addrs[regs_size]; - uint64_t reg_values[regs_size]; uint32_t doorbells_offset[doorbells_size]; QueueType doorbells_queues[doorbells_size]; uint32_t sdma_engs_offset[sdma_engs_size]; @@ -698,13 +699,6 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const std::vector used_vmid_sets; int idx = 0; - for (auto & it : regs) { - reg_addrs[idx] = it.first; - reg_values[idx] = it.second; - ++idx; - } - - idx = 0; for (auto & it : doorbells) { doorbells_offset[idx] = it.first; doorbells_queues[idx] = it.second; @@ -732,8 +726,6 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const int* vmid_array = new int[num_queue_id]; std::copy(used_vmid_sets.begin(), used_vmid_sets.end(), vmid_array); - SERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0])); - SERIALIZE_ARRAY(reg_values, sizeof(reg_values)/sizeof(reg_values[0])); SERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/ sizeof(doorbells_offset[0])); SERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/ @@ -764,30 +756,15 @@ AMDGPUDevice::unserialize(CheckpointIn &cp) // Unserialize the PciDevice base class PciDevice::unserialize(cp); - uint64_t regs_size = 0; uint64_t doorbells_size = 0; uint64_t sdma_engs_size = 0; uint64_t used_vmid_map_size = 0; - UNSERIALIZE_SCALAR(regs_size); UNSERIALIZE_SCALAR(doorbells_size); UNSERIALIZE_SCALAR(sdma_engs_size); UNSERIALIZE_SCALAR(used_vmid_map_size); - if (regs_size > 0) { - uint32_t reg_addrs[regs_size]; - uint64_t reg_values[regs_size]; - - UNSERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0])); - UNSERIALIZE_ARRAY(reg_values, - sizeof(reg_values)/sizeof(reg_values[0])); - - for (int idx = 0; idx < regs_size; ++idx) { - regs.insert(std::make_pair(reg_addrs[idx], reg_values[idx])); - } - } - if (doorbells_size > 0) { uint32_t doorbells_offset[doorbells_size]; QueueType doorbells_queues[doorbells_size]; @@ -798,8 +775,6 @@ AMDGPUDevice::unserialize(CheckpointIn &cp) sizeof(doorbells_queues[0])); for (int idx = 0; idx < doorbells_size; ++idx) { - regs.insert(std::make_pair(doorbells_offset[idx], - doorbells_queues[idx])); doorbells[doorbells_offset[idx]] = doorbells_queues[idx]; } } diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh index b6b6e2a81a..fface5fb3e 100644 --- a/src/dev/amdgpu/amdgpu_device.hh +++ b/src/dev/amdgpu/amdgpu_device.hh @@ -87,8 +87,6 @@ class AMDGPUDevice : public PciDevice /** * Structures to hold registers, doorbells, and some frame memory */ - using GPURegMap = std::unordered_map; - GPURegMap regs; std::unordered_map doorbells; std::unordered_map pendingDoorbellPkts; @@ -195,9 +193,8 @@ class AMDGPUDevice : public PciDevice * Register value getter/setter. Used by other GPU blocks to change * values from incoming driver/user packets. */ - bool haveRegVal(uint32_t addr); - uint32_t getRegVal(uint32_t addr); - void setRegVal(uint32_t addr, uint32_t value); + uint32_t getRegVal(uint64_t addr); + void setRegVal(uint64_t addr, uint32_t value); /** * Methods related to translations and system/device memory. diff --git a/src/dev/amdgpu/amdgpu_gfx.cc b/src/dev/amdgpu/amdgpu_gfx.cc index 3d5b274b86..60fabaf31d 100644 --- a/src/dev/amdgpu/amdgpu_gfx.cc +++ b/src/dev/amdgpu/amdgpu_gfx.cc @@ -37,6 +37,13 @@ namespace gem5 { +AMDGPUGfx::AMDGPUGfx() +{ + for (int i = 0; i < SCRATCH_REGS; ++i) { + scratchRegs[i] = 0; + } +} + void AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset) { @@ -47,6 +54,9 @@ AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset) case AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB: pkt->setLE(captured_clock_count >> 32); break; + case AMDGPU_MM_SCRATCH_REG0: + pkt->setLE(scratchRegs[0]); + break; default: break; } @@ -65,6 +75,9 @@ AMDGPUGfx::writeMMIO(PacketPtr pkt, Addr offset) captured_clock_count = curTick() / sim_clock::as_int::ns; } break; + case AMDGPU_MM_SCRATCH_REG0: + scratchRegs[0] = pkt->getLE(); + break; default: break; } diff --git a/src/dev/amdgpu/amdgpu_gfx.hh b/src/dev/amdgpu/amdgpu_gfx.hh index c32b8624cf..9fb1d82553 100644 --- a/src/dev/amdgpu/amdgpu_gfx.hh +++ b/src/dev/amdgpu/amdgpu_gfx.hh @@ -52,13 +52,16 @@ #define AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB 0x13094 #define AMDGPU_MM_RLC_CAPTURE_GPU_CLOCK_COUNT 0x13098 +// Scratch registers used for GPU post +#define AMDGPU_MM_SCRATCH_REG0 0x08100 + namespace gem5 { class AMDGPUGfx { public: - AMDGPUGfx() { } + AMDGPUGfx(); void readMMIO(PacketPtr pkt, Addr offset); void writeMMIO(PacketPtr pkt, Addr offset); @@ -68,6 +71,12 @@ class AMDGPUGfx * GPU clock count at the time capture MMIO is received. */ uint64_t captured_clock_count = 1; + + /* + * Scratch registers. + */ + static constexpr int SCRATCH_REGS = 8; + std::array scratchRegs; }; } // namespace gem5 diff --git a/src/dev/amdgpu/amdgpu_nbio.cc b/src/dev/amdgpu/amdgpu_nbio.cc index 07027c3765..89b1682631 100644 --- a/src/dev/amdgpu/amdgpu_nbio.cc +++ b/src/dev/amdgpu/amdgpu_nbio.cc @@ -54,13 +54,21 @@ void AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset) { switch (offset) { + case AMDGPU_PCIE_DATA: + { + uint32_t value = gpuDevice->getRegVal(pcie_index_reg); + DPRINTF(AMDGPUDevice, "Read PCIe index %lx data %x\n", + pcie_index_reg, value); + pkt->setLE(value); + } + break; // This is a PCIe status register. At some point during driver init // the driver checks that interrupts are enabled. This is only // checked once, so if the MMIO trace does not exactly line up with // what the driver is doing in gem5, this may still have the first // bit zero causing driver to fail. Therefore, we always set this // bit to one as there is no harm to do so. - case AMDGPU_PCIE_DATA_REG: + case AMDGPU_PCIE_DATA2: { uint32_t value = pkt->getLE() | 0x1; DPRINTF(AMDGPUDevice, "Marking interrupts enabled: %#lx\n", value); @@ -68,7 +76,6 @@ AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset) } break; case AMDGPU_MM_DATA: - //pkt->setLE(regs[mm_index_reg]); pkt->setLE(gpuDevice->getRegVal(mm_index_reg)); break; case VEGA10_INV_ENG17_ACK1: @@ -89,17 +96,17 @@ AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset) case AMDGPU_MP0_SMN_C2PMSG_35: pkt->setLE(0x80000000); break; + case AMDGPU_MP1_SMN_C2PMSG_90: + pkt->setLE(0x1); + break; default: if (triggered_reads.count(offset)) { DPRINTF(AMDGPUDevice, "Found triggered read for %#x\n", offset); pkt->setLE(triggered_reads[offset]); - } else if (gpuDevice->haveRegVal(offset)) { - uint32_t reg_val = gpuDevice->getRegVal(offset); - - DPRINTF(AMDGPUDevice, "Reading value of %#lx from regs: %#lx\n", - offset, reg_val); - - pkt->setLE(reg_val); + } else if (regs.count(offset)) { + DPRINTF(AMDGPUDevice, "Returning value of unknown MMIO offset " + "%x: %x\n", offset, regs[offset]); + pkt->setLE(regs[offset]); } else { DPRINTF(AMDGPUDevice, "NBIO Unknown MMIO %#x (%#x)\n", offset, pkt->getAddr()); @@ -123,6 +130,14 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset) DPRINTF(AMDGPUDevice, "MM write to reg %#lx data %#lx\n", mm_index_reg, pkt->getLE()); gpuDevice->setRegVal(AMDGPU_MM_DATA, pkt->getLE()); + } else if (offset == AMDGPU_PCIE_INDEX) { + assert(pkt->getSize() == 4); + pcie_index_reg = insertBits(pcie_index_reg, 31, 0, + pkt->getLE()); + } else if (offset == AMDGPU_PCIE_INDEX2) { + assert(pkt->getSize() == 4); + pcie_index_reg = insertBits(pcie_index_reg, 63, 32, + pkt->getLE()); } else if (offset == AMDGPU_MP0_SMN_C2PMSG_35) { // See psp_v3_1_bootloader_load_sos in amdgpu driver code. if (pkt->getLE() == 0x10000) { @@ -144,6 +159,14 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset) } else if (offset == AMDGPU_MP0_SMN_C2PMSG_71) { // PSP ring size psp_ring_size = pkt->getLE(); + } else { + // Fallback to a map of register values. This was previously in the + // AMDGPUDevice, however that short-circuited some reads from other + // IP blocks. Since this is an end point IP block it is safer to use + // here. + regs[offset] = pkt->getLE(); + DPRINTF(AMDGPUDevice, "Writing value of unknown MMIO offset " + "%x: %x\n", offset, regs[offset]); } } diff --git a/src/dev/amdgpu/amdgpu_nbio.hh b/src/dev/amdgpu/amdgpu_nbio.hh index dc95443916..0d839d0e22 100644 --- a/src/dev/amdgpu/amdgpu_nbio.hh +++ b/src/dev/amdgpu/amdgpu_nbio.hh @@ -56,7 +56,11 @@ class AMDGPUDevice; #define AMDGPU_MM_INDEX 0x00000 #define AMDGPU_MM_INDEX_HI 0x00018 #define AMDGPU_MM_DATA 0x00004 -#define AMDGPU_PCIE_DATA_REG 0x0003c + +#define AMDGPU_PCIE_INDEX 0x00030 +#define AMDGPU_PCIE_INDEX2 0x00038 +#define AMDGPU_PCIE_DATA 0x00034 +#define AMDGPU_PCIE_DATA2 0x0003c // Message bus related to psp #define AMDGPU_MP0_SMN_C2PMSG_33 0x58184 @@ -66,6 +70,7 @@ class AMDGPUDevice; #define AMDGPU_MP0_SMN_C2PMSG_70 0x58218 #define AMDGPU_MP0_SMN_C2PMSG_71 0x5821c #define AMDGPU_MP0_SMN_C2PMSG_81 0x58244 +#define AMDGPU_MP1_SMN_C2PMSG_90 0x58a68 // Device specific invalidation engines used during initialization #define VEGA10_INV_ENG17_ACK1 0x0a318 @@ -105,6 +110,7 @@ class AMDGPUNbio * Driver initialization sequence helper variables. */ uint64_t mm_index_reg = 0; + uint64_t pcie_index_reg = 0; std::unordered_map triggered_reads; /* @@ -115,6 +121,12 @@ class AMDGPUNbio Addr psp_ring_listen_addr = 0; int psp_ring_size = 0; int psp_ring_value = 0; + + /* + * Hold values of other registers not explicitly modelled by other blocks. + */ + using GPURegMap = std::unordered_map; + GPURegMap regs; }; } // namespace gem5