dev-amdgpu: Support for ROCm 6.0 (#926)

Implement several features new in ROCm 6.0 and features required for
future devices. Includes the following:

- Support for multiple command processors
- Improve handling of unknown register addresses
- Use AddrRange for MMIO address regions
- Handle GART writes through SDMA copy
- Implement PCIe indirect reads and writes
- Improve PM4 write to check dword count
- Implement common MI300X instruction
This commit is contained in:
Matthew Poremba
2024-03-21 21:12:09 -07:00
committed by GitHub
23 changed files with 565 additions and 261 deletions

View File

@@ -188,9 +188,15 @@ def makeGpuFSSystem(args):
system.pc.south_bridge.gpu.sdmas = sdma_engines
# Setup PM4 packet processor
pm4_pkt_proc = PM4PacketProcessor()
system.pc.south_bridge.gpu.pm4_pkt_proc = pm4_pkt_proc
# Setup PM4 packet processors
pm4_procs = []
pm4_procs.append(
PM4PacketProcessor(
ip_id=0, mmio_range=AddrRange(start=0xC000, end=0xD000)
)
)
system.pc.south_bridge.gpu.pm4_pkt_procs = pm4_procs
# GPU data path
gpu_mem_mgr = AMDGPUMemoryManager()
@@ -207,7 +213,8 @@ def makeGpuFSSystem(args):
for sdma in sdma_engines:
system._dma_ports.append(sdma)
system._dma_ports.append(device_ih)
system._dma_ports.append(pm4_pkt_proc)
for pm4_proc in pm4_procs:
system._dma_ports.append(pm4_proc)
system._dma_ports.append(system_hub)
system._dma_ports.append(gpu_mem_mgr)
system._dma_ports.append(hsapp_pt_walker)
@@ -221,7 +228,8 @@ def makeGpuFSSystem(args):
for sdma in sdma_engines:
sdma.pio = system.iobus.mem_side_ports
device_ih.pio = system.iobus.mem_side_ports
pm4_pkt_proc.pio = system.iobus.mem_side_ports
for pm4_proc in pm4_procs:
pm4_proc.pio = system.iobus.mem_side_ports
system_hub.pio = system.iobus.mem_side_ports
# Full system needs special TLBs for SQC, Scalar, and vector data ports

View File

@@ -52,7 +52,7 @@ if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0xff ppfeaturemask=0 dpm=0 audio=0
modprobe -v amdgpu ip_block_mask=0xdf ppfeaturemask=0 dpm=0 audio=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp

View File

@@ -500,10 +500,10 @@ namespace VegaISA
&Decoder::subDecode_OP_FLAT,
&Decoder::subDecode_OP_FLAT,
&Decoder::subDecode_OP_FLAT,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::subDecode_OP_FLAT,
&Decoder::subDecode_OP_FLAT,
&Decoder::subDecode_OP_FLAT,
&Decoder::subDecode_OP_FLAT,
&Decoder::subDecode_OP_MUBUF,
&Decoder::subDecode_OP_MUBUF,
&Decoder::subDecode_OP_MUBUF,
@@ -1091,7 +1091,7 @@ namespace VegaISA
&Decoder::decode_OPU_VOP3__V_MAD_I16,
&Decoder::decode_OPU_VOP3__V_FMA_F16,
&Decoder::decode_OPU_VOP3__V_DIV_FIXUP_F16,
&Decoder::decode_invalid,
&Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
@@ -7053,6 +7053,12 @@ namespace VegaISA
return new Inst_VOP3__V_DIV_FIXUP_F16(&iFmt->iFmt_VOP3A);
}
GPUStaticInst*
Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst iFmt)
{
return new Inst_VOP3__V_LSHL_ADD_U64(&iFmt->iFmt_VOP3A);
}
GPUStaticInst*
Decoder::decode_OPU_VOP3__V_INTERP_P1_F32(MachInst iFmt)
{

View File

@@ -470,6 +470,7 @@ namespace VegaISA
GPUStaticInst* decode_OPU_VOP3__V_MAD_I16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_FMA_F16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_DIV_FIXUP_F16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_INTERP_P1_F32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_INTERP_P2_F32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_INTERP_MOV_F32(MachInst);

View File

@@ -30192,6 +30192,42 @@ namespace VegaISA
void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_DIV_FIXUP_F16
// VOP3 instruction v_lshl_add_u64 (shift-left then 64-bit add), added for
// MI300-class GPUs. Per the .arch description: D.u = (S0.u << S1.u[4:0]) + S2.u.
// src0, src2, and vdst are 64-bit operands; src1 (the shift amount) is 32-bit.
class Inst_VOP3__V_LSHL_ADD_U64 : public Inst_VOP3A
{
public:
Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A*);
~Inst_VOP3__V_LSHL_ADD_U64();
// Total operand count: one destination plus three sources.
int
getNumOperands() override
{
return numDstRegOperands() + numSrcRegOperands();
} // getNumOperands
int numDstRegOperands() override { return 1; }
int numSrcRegOperands() override { return 3; }
// Operand sizes in bytes, indexed in src0/src1/src2/vdst order.
// Note the asymmetry: only src1 (the shift amount) is 4 bytes.
int
getOperandSize(int opIdx) override
{
switch (opIdx) {
case 0: //src_0
return 8;
case 1: //src_1
return 4;
case 2: //src_2
return 8;
case 3: //vdst
return 8;
default:
fatal("op idx %i out of bounds\n", opIdx);
return -1;
}
} // getOperandSize
void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_LSHL_ADD_U64
class Inst_VOP3__V_CVT_PKACCUM_U8_F32 : public Inst_VOP3A
{
public:

View File

@@ -7630,6 +7630,54 @@ namespace VegaISA
{
panicUnimplemented();
} // execute
// --- Inst_VOP3__V_LSHL_ADD_U64 class methods ---
Inst_VOP3__V_LSHL_ADD_U64::Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A *iFmt)
: Inst_VOP3A(iFmt, "v_lshl_add_u64", false)
{
setFlag(ALU);
} // Inst_VOP3__V_LSHL_ADD_U64
Inst_VOP3__V_LSHL_ADD_U64::~Inst_VOP3__V_LSHL_ADD_U64()
{
} // ~Inst_VOP3__V_LSHL_ADD_U64
// --- description from .arch file ---
// D.u = (S0.u << S1.u[4:0]) + S2.u.
void
Inst_VOP3__V_LSHL_ADD_U64::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
// 64-bit sources/destination; the shift amount (src1) is 32-bit.
ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
VecOperandU64 vdst(gpuDynInst, instData.VDST);
src0.readSrc();
src1.readSrc();
src2.readSrc();
/**
* input modifiers are supported by FP operations only
*/
assert(!(instData.ABS & 0x1));
assert(!(instData.ABS & 0x2));
assert(!(instData.ABS & 0x4));
assert(!(extData.NEG & 0x1));
assert(!(extData.NEG & 0x2));
assert(!(extData.NEG & 0x4));
// Per-lane: shift src0 left and add src2, for lanes enabled by EXEC.
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
// Only S1[2:0] is consumed and shift amounts above 4 act as 0,
// which is narrower than the S1.u[4:0] in the description above.
// NOTE(review): looks deliberate for this device family --
// confirm against the MI300 ISA manual.
int shift_amount = bits(src1[lane], 2, 0);
shift_amount = shift_amount > 4 ? 0 : shift_amount;
vdst[lane] = (src0[lane] << shift_amount)
+ src2[lane];
}
}
vdst.write();
} // execute
// --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods ---
Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32(

View File

@@ -95,7 +95,7 @@ class AMDGPUDevice(PciDevice):
# The config script should not create a new cp here but rather assign the
# same cp that is assigned to the Shader SimObject.
cp = Param.GPUCommandProcessor(NULL, "Command Processor")
pm4_pkt_proc = Param.PM4PacketProcessor("PM4 Packet Processor")
pm4_pkt_procs = VectorParam.PM4PacketProcessor("PM4 Packet Processor")
memory_manager = Param.AMDGPUMemoryManager("GPU Memory Manager")
memories = VectorParam.AbstractMemory([], "All memories in the device")
device_ih = Param.AMDGPUInterruptHandler("GPU Interrupt handler")
@@ -118,6 +118,10 @@ class PM4PacketProcessor(DmaVirtDevice):
cxx_header = "dev/amdgpu/pm4_packet_processor.hh"
cxx_class = "gem5::PM4PacketProcessor"
# Default to 0 as the common case is one PM4 packet processor
ip_id = Param.Int(0, "Instance ID of this PM4 processor")
mmio_range = Param.AddrRange("Range of MMIO addresses")
class AMDGPUMemoryManager(ClockedObject):
type = "AMDGPUMemoryManager"

View File

@@ -49,6 +49,16 @@ enum QueueType
RLC
};
/*
* Hold information about doorbells including queue type and the IP
* block ID if the IP can have multiple instances.
*/
typedef struct
{
QueueType qtype; // Which queue type this doorbell rings (Compute, Gfx, ...)
int ip_id; // IP instance ID used to select among multiple PM4 processors
} DoorbellInfo;
// AMD GPUs support 16 different virtual address spaces
static constexpr int AMDGPU_VM_COUNT = 16;
@@ -61,36 +71,11 @@ constexpr int MMIO_BAR = 5;
constexpr uint32_t VGA_ROM_DEFAULT = 0xc0000;
constexpr uint32_t ROM_SIZE = 0x20000; // 128kB
/* SDMA base, size, mmio offset shift. */
static constexpr uint32_t SDMA0_BASE = 0x4980;
static constexpr uint32_t SDMA1_BASE = 0x5180;
static constexpr uint32_t SDMA_SIZE = 0x800;
static constexpr uint32_t SDMA_OFFSET_SHIFT = 2;
/* Interrupt handler base, size, mmio offset shift. */
static constexpr uint32_t IH_BASE = 0x4280;
static constexpr uint32_t IH_SIZE = 0x700;
/* Most MMIOs use DWORD addresses and thus need to be shifted. */
static constexpr uint32_t IH_OFFSET_SHIFT = 2;
/* Graphics register bus manager base, size, mmio offset shift. */
static constexpr uint32_t GRBM_BASE = 0x8000;
static constexpr uint32_t GRBM_SIZE = 0x5000;
static constexpr uint32_t GRBM_OFFSET_SHIFT = 2;
/* GFX base, size, mmio offset shift. */
static constexpr uint32_t GFX_BASE = 0x28000;
static constexpr uint32_t GFX_SIZE = 0x17000;
static constexpr uint32_t GFX_OFFSET_SHIFT = 2;
/* MMHUB base, size, mmio offset shift. */
static constexpr uint32_t MMHUB_BASE = 0x68000;
static constexpr uint32_t MMHUB_SIZE = 0x2120;
static constexpr uint32_t MMHUB_OFFSET_SHIFT = 2;
/* NBIO base and size. */
static constexpr uint32_t NBIO_BASE = 0x0;
static constexpr uint32_t NBIO_SIZE = 0x4280;
} // namespace gem5
#endif // __DEV_AMDGPU_AMDGPU_DEFINES_HH__

View File

@@ -54,8 +54,7 @@ namespace gem5
AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
: PciDevice(p), gpuMemMgr(p.memory_manager), deviceIH(p.device_ih),
pm4PktProc(p.pm4_pkt_proc), cp(p.cp),
checkpoint_before_mmios(p.checkpoint_before_mmios),
cp(p.cp), checkpoint_before_mmios(p.checkpoint_before_mmios),
init_interrupt_count(0), _lastVMID(0),
deviceMem(name() + ".deviceMem", p.memories, false, "", false)
{
@@ -81,6 +80,16 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
romRange = RangeSize(VGA_ROM_DEFAULT, ROM_SIZE);
}
if (p.device_name == "Vega10") {
gfx_version = GfxVersion::gfx900;
} else if (p.device_name == "MI100") {
gfx_version = GfxVersion::gfx908;
} else if (p.device_name == "MI200") {
gfx_version = GfxVersion::gfx90a;
} else {
panic("Unknown GPU device %s\n", p.device_name);
}
if (p.trace_file != "") {
mmioReader.readMMIOTrace(p.trace_file);
}
@@ -126,15 +135,47 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
panic("Unknown GPU device %s\n", p.device_name);
}
// Setup PM4 packet processors and sanity check IDs
std::set<int> pm4_ids;
for (auto& pm4 : p.pm4_pkt_procs) {
pm4->setGPUDevice(this);
fatal_if(pm4_ids.count(pm4->getIpId()),
"Two PM4s with same IP IDs is not allowed");
pm4_ids.insert(pm4->getIpId());
pm4PktProcs.insert({pm4->getIpId(), pm4});
pm4Ranges.insert({pm4->getMMIORange(), pm4});
}
// There should be at least one PM4 packet processor with ID 0
fatal_if(!pm4PktProcs.count(0), "No default PM4 processor found");
deviceIH->setGPUDevice(this);
pm4PktProc->setGPUDevice(this);
cp->hsaPacketProc().setGPUDevice(this);
cp->setGPUDevice(this);
nbio.setGPUDevice(this);
// Address aperture for device memory. We tell this to the driver and
// could possibly be anything, but these are the values used by hardware.
uint64_t mmhubBase = 0x8000ULL << 24;
uint64_t mmhubTop = 0x83ffULL << 24;
uint64_t mem_size = 0x3ff0; // 16 GB of memory
gpuvm.setMMHUBBase(mmhubBase);
gpuvm.setMMHUBTop(mmhubTop);
// Map other MMIO apertures based on gfx version. This must be done before
// any calls to get/setRegVal.
// NBIO 0x0 - 0x4280
// IH 0x4280 - 0x4980
// GRBM 0x8000 - 0xC000
// GFX 0x28000 - 0x3F000
// MMHUB 0x68000 - 0x6a120
gpuvm.setMMIOAperture(NBIO_MMIO_RANGE, AddrRange(0x0, 0x4280));
gpuvm.setMMIOAperture(IH_MMIO_RANGE, AddrRange(0x4280, 0x4980));
gpuvm.setMMIOAperture(GRBM_MMIO_RANGE, AddrRange(0x8000, 0xC000));
gpuvm.setMMIOAperture(GFX_MMIO_RANGE, AddrRange(0x28000, 0x3F000));
gpuvm.setMMIOAperture(MMHUB_MMIO_RANGE, AddrRange(0x68000, 0x6A120));
// These are hardcoded register values to return what the driver expects
setRegVal(AMDGPU_MP0_SMN_C2PMSG_33, 0x80000000);
@@ -144,27 +185,19 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
if (p.device_name == "Vega10") {
setRegVal(VEGA10_FB_LOCATION_BASE, mmhubBase >> 24);
setRegVal(VEGA10_FB_LOCATION_TOP, mmhubTop >> 24);
gfx_version = GfxVersion::gfx900;
} else if (p.device_name == "MI100") {
setRegVal(MI100_FB_LOCATION_BASE, mmhubBase >> 24);
setRegVal(MI100_FB_LOCATION_TOP, mmhubTop >> 24);
setRegVal(MI100_MEM_SIZE_REG, 0x3ff0); // 16GB of memory
gfx_version = GfxVersion::gfx908;
setRegVal(MI100_MEM_SIZE_REG, mem_size);
} else if (p.device_name == "MI200") {
// This device can have either 64GB or 128GB of device memory.
// This limits it to 16GB for simulation.
setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
setRegVal(MI200_MEM_SIZE_REG, 0x3ff0);
gfx_version = GfxVersion::gfx90a;
setRegVal(MI200_MEM_SIZE_REG, mem_size);
} else {
panic("Unknown GPU device %s\n", p.device_name);
}
gpuvm.setMMHUBBase(mmhubBase);
gpuvm.setMMHUBTop(mmhubTop);
nbio.setGPUDevice(this);
}
void
@@ -357,36 +390,28 @@ AMDGPUDevice::readDoorbell(PacketPtr pkt, Addr offset)
void
AMDGPUDevice::readMMIO(PacketPtr pkt, Addr offset)
{
Addr aperture = gpuvm.getMmioAperture(offset);
Addr aperture_offset = offset - aperture;
AddrRange aperture = gpuvm.getMMIOAperture(offset);
Addr aperture_offset = offset - aperture.start();
// By default read from MMIO trace. Overwrite the packet for a select
// few more dynamic MMIOs.
DPRINTF(AMDGPUDevice, "Read MMIO %#lx\n", offset);
mmioReader.readFromTrace(pkt, MMIO_BAR, offset);
if (regs.find(offset) != regs.end()) {
uint64_t value = regs[offset];
DPRINTF(AMDGPUDevice, "Reading what kernel wrote before: %#x\n",
value);
pkt->setUintX(value, ByteOrder::little);
}
switch (aperture) {
case NBIO_BASE:
if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "NBIO base\n");
nbio.readMMIO(pkt, aperture_offset);
break;
case GRBM_BASE:
} else if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "GRBM base\n");
gpuvm.readMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
break;
case GFX_BASE:
} else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "GFX base\n");
gfx.readMMIO(pkt, aperture_offset);
break;
case MMHUB_BASE:
} else if (aperture == gpuvm.getMMIORange(MMHUB_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "MMHUB base\n");
gpuvm.readMMIO(pkt, aperture_offset >> MMHUB_OFFSET_SHIFT);
break;
default:
break;
} else {
DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for read %#x\n", offset);
}
}
@@ -430,17 +455,22 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
DPRINTF(AMDGPUDevice, "Wrote doorbell %#lx\n", offset);
if (doorbells.find(offset) != doorbells.end()) {
QueueType q_type = doorbells[offset];
QueueType q_type = doorbells[offset].qtype;
int ip_id = doorbells[offset].ip_id;
DPRINTF(AMDGPUDevice, "Doorbell offset %p queue: %d\n",
offset, q_type);
switch (q_type) {
case Compute:
pm4PktProc->process(pm4PktProc->getQueue(offset),
pkt->getLE<uint64_t>());
assert(pm4PktProcs.count(ip_id));
pm4PktProcs[ip_id]->process(
pm4PktProcs[ip_id]->getQueue(offset),
pkt->getLE<uint64_t>());
break;
case Gfx:
pm4PktProc->process(pm4PktProc->getQueue(offset, true),
pkt->getLE<uint64_t>());
assert(pm4PktProcs.count(ip_id));
pm4PktProcs[ip_id]->process(
pm4PktProcs[ip_id]->getQueue(offset, true),
pkt->getLE<uint64_t>());
break;
case SDMAGfx: {
SDMAEngine *sdmaEng = getSDMAEngine(offset);
@@ -451,9 +481,11 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
sdmaEng->processPage(pkt->getLE<uint64_t>());
} break;
case ComputeAQL: {
assert(pm4PktProcs.count(ip_id));
cp->hsaPacketProc().hwScheduler()->write(offset,
pkt->getLE<uint64_t>() + 1);
pm4PktProc->updateReadIndex(offset, pkt->getLE<uint64_t>() + 1);
pm4PktProcs[ip_id]->updateReadIndex(offset,
pkt->getLE<uint64_t>() + 1);
} break;
case InterruptHandler:
deviceIH->updateRptr(pkt->getLE<uint32_t>());
@@ -483,12 +515,12 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
void
AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset)
{
Addr aperture = gpuvm.getMmioAperture(offset);
Addr aperture_offset = offset - aperture;
AddrRange aperture = gpuvm.getMMIOAperture(offset);
Addr aperture_offset = offset - aperture.start();
DPRINTF(AMDGPUDevice, "Wrote MMIO %#lx\n", offset);
// Check SDMA functions first, then fallback to switch statement
// Check SDMA functions first, then fallback to MMIO ranges.
for (int idx = 0; idx < sdmaIds.size(); ++idx) {
if (sdmaMmios[idx].contains(offset)) {
Addr sdma_offset = (offset - sdmaMmios[idx].start()) >> 2;
@@ -506,26 +538,31 @@ AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset)
}
}
switch (aperture) {
/* Write a general register to the graphics register bus manager. */
case GRBM_BASE:
// Check PM4s next, returning to avoid duplicate writes.
for (auto& [range, pm4_proc] : pm4Ranges) {
if (range.contains(offset)) {
// PM4 MMIOs are offset based on the MMIO range start
Addr ip_offset = offset - range.start();
pm4_proc->writeMMIO(pkt, ip_offset >> GRBM_OFFSET_SHIFT);
return;
}
}
if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "GRBM base\n");
gpuvm.writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
pm4PktProc->writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
break;
/* Write a register to the interrupt handler. */
case IH_BASE:
} else if (aperture == gpuvm.getMMIORange(IH_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "IH base\n");
deviceIH->writeMMIO(pkt, aperture_offset >> IH_OFFSET_SHIFT);
break;
/* Write an IO space register */
case NBIO_BASE:
} else if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "NBIO base\n");
nbio.writeMMIO(pkt, aperture_offset);
break;
case GFX_BASE:
} else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "GFX base\n");
gfx.writeMMIO(pkt, aperture_offset);
break;
default:
DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for %#x\n", offset);
break;
} else {
DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for write %#x\n", offset);
}
}
@@ -610,33 +647,47 @@ AMDGPUDevice::processPendingDoorbells(uint32_t offset)
}
}
bool
AMDGPUDevice::haveRegVal(uint32_t addr)
{
return regs.count(addr);
}
uint32_t
AMDGPUDevice::getRegVal(uint32_t addr)
AMDGPUDevice::getRegVal(uint64_t addr)
{
// This is somewhat of a guess based on amdgpu_device_mm_access
// in amdgpu_device.c in the ROCk driver. If bit 32 is 1 then
// assume VRAM and use full address, otherwise assume register
// address and only use the lower 31 bits.
Addr fixup_addr = bits(addr, 31, 31) ? addr : addr & 0x7fffffff;
uint32_t pkt_data = 0;
RequestPtr request = std::make_shared<Request>(fixup_addr,
sizeof(uint32_t), 0 /* flags */, vramRequestorId());
PacketPtr pkt = Packet::createRead(request);
pkt->dataStatic((uint8_t *)&pkt_data);
readMMIO(pkt, addr);
DPRINTF(AMDGPUDevice, "Getting register 0x%lx = %x\n",
addr, regs[addr]);
return regs[addr];
fixup_addr, pkt->getLE<uint32_t>());
return pkt->getLE<uint32_t>();
}
void
AMDGPUDevice::setRegVal(uint32_t addr, uint32_t value)
AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value)
{
DPRINTF(AMDGPUDevice, "Setting register 0x%lx to %x\n",
addr, value);
regs[addr] = value;
uint32_t pkt_data = value;
RequestPtr request = std::make_shared<Request>(addr,
sizeof(uint32_t), 0 /* flags */, vramRequestorId());
PacketPtr pkt = Packet::createWrite(request);
pkt->dataStatic((uint8_t *)&pkt_data);
writeMMIO(pkt, addr);
}
void
AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt)
AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt, int ip_id)
{
DPRINTF(AMDGPUDevice, "Setting doorbell type for %x\n", offset);
doorbells[offset] = qt;
doorbells[offset].qtype = qt;
doorbells[offset].ip_id = ip_id;
}
void
@@ -675,22 +726,19 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
// Serialize the PciDevice base class
PciDevice::serialize(cp);
uint64_t regs_size = regs.size();
uint64_t doorbells_size = doorbells.size();
uint64_t sdma_engs_size = sdmaEngs.size();
uint64_t used_vmid_map_size = usedVMIDs.size();
SERIALIZE_SCALAR(regs_size);
SERIALIZE_SCALAR(doorbells_size);
SERIALIZE_SCALAR(sdma_engs_size);
// Save the number of vmids used
SERIALIZE_SCALAR(used_vmid_map_size);
// Make a c-style array of the regs to serialize
uint32_t reg_addrs[regs_size];
uint64_t reg_values[regs_size];
uint32_t doorbells_offset[doorbells_size];
QueueType doorbells_queues[doorbells_size];
int doorbells_ip_ids[doorbells_size];
uint32_t sdma_engs_offset[sdma_engs_size];
int sdma_engs[sdma_engs_size];
int used_vmids[used_vmid_map_size];
@@ -698,16 +746,10 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
std::vector<int> used_vmid_sets;
int idx = 0;
for (auto & it : regs) {
reg_addrs[idx] = it.first;
reg_values[idx] = it.second;
++idx;
}
idx = 0;
for (auto & it : doorbells) {
doorbells_offset[idx] = it.first;
doorbells_queues[idx] = it.second;
doorbells_queues[idx] = it.second.qtype;
doorbells_ip_ids[idx] = it.second.ip_id;
++idx;
}
@@ -732,12 +774,12 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
int* vmid_array = new int[num_queue_id];
std::copy(used_vmid_sets.begin(), used_vmid_sets.end(), vmid_array);
SERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0]));
SERIALIZE_ARRAY(reg_values, sizeof(reg_values)/sizeof(reg_values[0]));
SERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/
sizeof(doorbells_offset[0]));
SERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/
sizeof(doorbells_queues[0]));
SERIALIZE_ARRAY(doorbells_ip_ids, sizeof(doorbells_ip_ids)/
sizeof(doorbells_ip_ids[0]));
SERIALIZE_ARRAY(sdma_engs_offset, sizeof(sdma_engs_offset)/
sizeof(sdma_engs_offset[0]));
SERIALIZE_ARRAY(sdma_engs, sizeof(sdma_engs)/sizeof(sdma_engs[0]));
@@ -764,43 +806,30 @@ AMDGPUDevice::unserialize(CheckpointIn &cp)
// Unserialize the PciDevice base class
PciDevice::unserialize(cp);
uint64_t regs_size = 0;
uint64_t doorbells_size = 0;
uint64_t sdma_engs_size = 0;
uint64_t used_vmid_map_size = 0;
UNSERIALIZE_SCALAR(regs_size);
UNSERIALIZE_SCALAR(doorbells_size);
UNSERIALIZE_SCALAR(sdma_engs_size);
UNSERIALIZE_SCALAR(used_vmid_map_size);
if (regs_size > 0) {
uint32_t reg_addrs[regs_size];
uint64_t reg_values[regs_size];
UNSERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0]));
UNSERIALIZE_ARRAY(reg_values,
sizeof(reg_values)/sizeof(reg_values[0]));
for (int idx = 0; idx < regs_size; ++idx) {
regs.insert(std::make_pair(reg_addrs[idx], reg_values[idx]));
}
}
if (doorbells_size > 0) {
uint32_t doorbells_offset[doorbells_size];
QueueType doorbells_queues[doorbells_size];
int doorbells_ip_ids[doorbells_size];
UNSERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/
sizeof(doorbells_offset[0]));
UNSERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/
sizeof(doorbells_queues[0]));
UNSERIALIZE_ARRAY(doorbells_ip_ids, sizeof(doorbells_ip_ids)/
sizeof(doorbells_ip_ids[0]));
for (int idx = 0; idx < doorbells_size; ++idx) {
regs.insert(std::make_pair(doorbells_offset[idx],
doorbells_queues[idx]));
doorbells[doorbells_offset[idx]] = doorbells_queues[idx];
doorbells[doorbells_offset[idx]].qtype = doorbells_queues[idx];
doorbells[doorbells_offset[idx]].ip_id = doorbells_ip_ids[idx];
}
}

View File

@@ -87,9 +87,7 @@ class AMDGPUDevice : public PciDevice
/**
* Structures to hold registers, doorbells, and some frame memory
*/
using GPURegMap = std::unordered_map<uint32_t, uint64_t>;
GPURegMap regs;
std::unordered_map<uint32_t, QueueType> doorbells;
std::unordered_map<uint32_t, DoorbellInfo> doorbells;
std::unordered_map<uint32_t, PacketPtr> pendingDoorbellPkts;
/**
@@ -115,9 +113,19 @@ class AMDGPUDevice : public PciDevice
AMDGPUMemoryManager *gpuMemMgr;
AMDGPUInterruptHandler *deviceIH;
AMDGPUVM gpuvm;
PM4PacketProcessor *pm4PktProc;
GPUCommandProcessor *cp;
// Hasher so AddrRange can key an unordered_map. Hashes only the range's
// start address -- presumably the stored ranges are disjoint, so starts
// are unique; verify if overlapping ranges are ever inserted.
struct AddrRangeHasher
{
std::size_t operator()(const AddrRange& k) const
{
return k.start();
}
};
std::unordered_map<int, PM4PacketProcessor *> pm4PktProcs;
std::unordered_map<AddrRange, PM4PacketProcessor *,
AddrRangeHasher> pm4Ranges;
// SDMAs mapped by doorbell offset
std::unordered_map<uint32_t, SDMAEngine *> sdmaEngs;
// SDMAs mapped by ID
@@ -187,7 +195,7 @@ class AMDGPUDevice : public PciDevice
/**
* Set handles to GPU blocks.
*/
void setDoorbellType(uint32_t offset, QueueType qt);
void setDoorbellType(uint32_t offset, QueueType qt, int ip_id = 0);
void processPendingDoorbells(uint32_t offset);
void setSDMAEngine(Addr offset, SDMAEngine *eng);
@@ -195,9 +203,8 @@ class AMDGPUDevice : public PciDevice
* Register value getter/setter. Used by other GPU blocks to change
* values from incoming driver/user packets.
*/
bool haveRegVal(uint32_t addr);
uint32_t getRegVal(uint32_t addr);
void setRegVal(uint32_t addr, uint32_t value);
uint32_t getRegVal(uint64_t addr);
void setRegVal(uint64_t addr, uint32_t value);
/**
* Methods related to translations and system/device memory.

View File

@@ -37,6 +37,13 @@
namespace gem5
{
AMDGPUGfx::AMDGPUGfx()
{
// Zero-initialize the scratch registers (used for GPU post) so reads
// before any write return a defined value.
for (int i = 0; i < SCRATCH_REGS; ++i) {
scratchRegs[i] = 0;
}
}
void
AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset)
{
@@ -47,6 +54,9 @@ AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset)
case AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB:
pkt->setLE<uint32_t>(captured_clock_count >> 32);
break;
case AMDGPU_MM_SCRATCH_REG0:
pkt->setLE<uint32_t>(scratchRegs[0]);
break;
default:
break;
}
@@ -65,6 +75,9 @@ AMDGPUGfx::writeMMIO(PacketPtr pkt, Addr offset)
captured_clock_count = curTick() / sim_clock::as_int::ns;
}
break;
case AMDGPU_MM_SCRATCH_REG0:
scratchRegs[0] = pkt->getLE<uint32_t>();
break;
default:
break;
}

View File

@@ -52,13 +52,16 @@
#define AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB 0x13094
#define AMDGPU_MM_RLC_CAPTURE_GPU_CLOCK_COUNT 0x13098
// Scratch registers used for GPU post
#define AMDGPU_MM_SCRATCH_REG0 0x08100
namespace gem5
{
class AMDGPUGfx
{
public:
AMDGPUGfx() { }
AMDGPUGfx();
void readMMIO(PacketPtr pkt, Addr offset);
void writeMMIO(PacketPtr pkt, Addr offset);
@@ -68,6 +71,12 @@ class AMDGPUGfx
* GPU clock count at the time capture MMIO is received.
*/
uint64_t captured_clock_count = 1;
/*
* Scratch registers.
*/
static constexpr int SCRATCH_REGS = 8;
std::array<uint32_t, SCRATCH_REGS> scratchRegs;
};
} // namespace gem5

View File

@@ -53,22 +53,44 @@ AMDGPUNbio::setGPUDevice(AMDGPUDevice *gpu_device)
void
AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset)
{
// For Vega10 we rely on the golden values in an MMIO trace. Return
// immediately as to not clobber those values.
if (gpuDevice->getGfxVersion() == GfxVersion::gfx900) {
if (offset == AMDGPU_PCIE_DATA || offset == AMDGPU_PCIE_DATA2) {
return;
}
}
switch (offset) {
// This is a PCIe status register. At some point during driver init
// the driver checks that interrupts are enabled. This is only
// checked once, so if the MMIO trace does not exactly line up with
// what the driver is doing in gem5, this may still have the first
// bit zero causing driver to fail. Therefore, we always set this
// bit to one as there is no harm to do so.
case AMDGPU_PCIE_DATA_REG:
// PCIE_DATA, PCIE_DATA2, PCIE_INDEX, and PCIE_INDEX2 handle "indirect
// register" reads/writes from the driver. This provides a way to read
// any register by providing a 32-bit address to one of the two INDEX
// registers and then reading the corresponding DATA register. See:
// https://github.com/ROCm/ROCK-Kernel-Driver/blob/roc-6.0.x/drivers/
// gpu/drm/amd/amdgpu/amdgpu_device.c#L459
case AMDGPU_PCIE_DATA:
{
uint32_t value = pkt->getLE<uint32_t>() | 0x1;
DPRINTF(AMDGPUDevice, "Marking interrupts enabled: %#lx\n", value);
uint32_t value = gpuDevice->getRegVal(pcie_index_reg);
DPRINTF(AMDGPUDevice, "Read PCIe index %lx data %x\n",
pcie_index_reg, value);
pkt->setLE<uint32_t>(value);
}
break;
case AMDGPU_PCIE_DATA2:
{
uint32_t value = gpuDevice->getRegVal(pcie_index2_reg);
DPRINTF(AMDGPUDevice, "Read PCIe index2 %lx data2 %x\n",
pcie_index2_reg, value);
pkt->setLE<uint32_t>(value);
}
break;
case AMDGPU_PCIE_INDEX:
pkt->setLE<uint32_t>(pcie_index_reg);
break;
case AMDGPU_PCIE_INDEX2:
pkt->setLE<uint32_t>(pcie_index2_reg);
break;
case AMDGPU_MM_DATA:
//pkt->setLE<uint32_t>(regs[mm_index_reg]);
pkt->setLE<uint32_t>(gpuDevice->getRegVal(mm_index_reg));
break;
case VEGA10_INV_ENG17_ACK1:
@@ -89,17 +111,17 @@ AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset)
case AMDGPU_MP0_SMN_C2PMSG_35:
pkt->setLE<uint32_t>(0x80000000);
break;
case AMDGPU_MP1_SMN_C2PMSG_90:
pkt->setLE<uint32_t>(0x1);
break;
default:
if (triggered_reads.count(offset)) {
DPRINTF(AMDGPUDevice, "Found triggered read for %#x\n", offset);
pkt->setLE<uint32_t>(triggered_reads[offset]);
} else if (gpuDevice->haveRegVal(offset)) {
uint32_t reg_val = gpuDevice->getRegVal(offset);
DPRINTF(AMDGPUDevice, "Reading value of %#lx from regs: %#lx\n",
offset, reg_val);
pkt->setLE<uint32_t>(reg_val);
} else if (regs.count(offset)) {
DPRINTF(AMDGPUDevice, "Returning value of unknown MMIO offset "
"%x: %x\n", offset, regs[offset]);
pkt->setLE<uint32_t>(regs[offset]);
} else {
DPRINTF(AMDGPUDevice, "NBIO Unknown MMIO %#x (%#x)\n", offset,
pkt->getAddr());
@@ -123,6 +145,24 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset)
DPRINTF(AMDGPUDevice, "MM write to reg %#lx data %#lx\n",
mm_index_reg, pkt->getLE<uint32_t>());
gpuDevice->setRegVal(AMDGPU_MM_DATA, pkt->getLE<uint32_t>());
// PCIE_DATA, PCIE_DATA2, PCIE_INDEX, and PCIE_INDEX2 handle "indirect
// register" reads/writes from the driver. This provides a way to read
// any register by providing a 32-bit address to one of the two INDEX
// registers and then reading the corresponding DATA register. See:
// https://github.com/ROCm/ROCK-Kernel-Driver/blob/roc-6.0.x/drivers/
// gpu/drm/amd/amdgpu/amdgpu_device.c#L459
} else if (offset == AMDGPU_PCIE_INDEX) {
assert(pkt->getSize() == 4);
pcie_index_reg = pkt->getLE<uint32_t>();
} else if (offset == AMDGPU_PCIE_DATA) {
assert(pkt->getSize() == 4);
gpuDevice->setRegVal(pcie_index_reg, pkt->getLE<uint32_t>());
} else if (offset == AMDGPU_PCIE_INDEX2) {
assert(pkt->getSize() == 4);
pcie_index2_reg = pkt->getLE<uint32_t>();
} else if (offset == AMDGPU_PCIE_DATA2) {
assert(pkt->getSize() == 4);
gpuDevice->setRegVal(pcie_index2_reg, pkt->getLE<uint32_t>());
} else if (offset == AMDGPU_MP0_SMN_C2PMSG_35) {
// See psp_v3_1_bootloader_load_sos in amdgpu driver code.
if (pkt->getLE<uint32_t>() == 0x10000) {
@@ -144,6 +184,14 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset)
} else if (offset == AMDGPU_MP0_SMN_C2PMSG_71) {
// PSP ring size
psp_ring_size = pkt->getLE<uint32_t>();
} else {
// Fallback to a map of register values. This was previously in the
// AMDGPUDevice, however that short-circuited some reads from other
// IP blocks. Since this is an end point IP block it is safer to use
// here.
regs[offset] = pkt->getLE<uint32_t>();
DPRINTF(AMDGPUDevice, "Writing value of unknown MMIO offset "
"%x: %x\n", offset, regs[offset]);
}
}

View File

@@ -56,7 +56,11 @@ class AMDGPUDevice;
#define AMDGPU_MM_INDEX 0x00000
#define AMDGPU_MM_INDEX_HI 0x00018
#define AMDGPU_MM_DATA 0x00004
#define AMDGPU_PCIE_DATA_REG 0x0003c
#define AMDGPU_PCIE_INDEX 0x00030
#define AMDGPU_PCIE_INDEX2 0x00038
#define AMDGPU_PCIE_DATA 0x00034
#define AMDGPU_PCIE_DATA2 0x0003c
// Message bus related to psp
#define AMDGPU_MP0_SMN_C2PMSG_33 0x58184
@@ -66,6 +70,7 @@ class AMDGPUDevice;
#define AMDGPU_MP0_SMN_C2PMSG_70 0x58218
#define AMDGPU_MP0_SMN_C2PMSG_71 0x5821c
#define AMDGPU_MP0_SMN_C2PMSG_81 0x58244
#define AMDGPU_MP1_SMN_C2PMSG_90 0x58a68
// Device specific invalidation engines used during initialization
#define VEGA10_INV_ENG17_ACK1 0x0a318
@@ -105,6 +110,8 @@ class AMDGPUNbio
* Driver initialization sequence helper variables.
*/
uint64_t mm_index_reg = 0;
uint32_t pcie_index_reg = 0;
uint32_t pcie_index2_reg = 0;
std::unordered_map<uint32_t, uint32_t> triggered_reads;
/*
@@ -115,6 +122,12 @@ class AMDGPUNbio
Addr psp_ring_listen_addr = 0;
int psp_ring_size = 0;
int psp_ring_value = 0;
/*
* Hold values of other registers not explicitly modelled by other blocks.
*/
using GPURegMap = std::unordered_map<uint64_t, uint32_t>;
GPURegMap regs;
};
} // namespace gem5

View File

@@ -37,6 +37,7 @@
#include "base/trace.hh"
#include "debug/AMDGPUDevice.hh"
#include "dev/amdgpu/amdgpu_defines.hh"
#include "dev/amdgpu/amdgpu_device.hh"
#include "mem/packet_access.hh"
namespace gem5
@@ -51,6 +52,35 @@ AMDGPUVM::AMDGPUVM()
for (int i = 0; i < AMDGPU_VM_COUNT; ++i) {
memset(&vmContexts[0], 0, sizeof(AMDGPUVMContext));
}
for (int i = 0; i < NUM_MMIO_RANGES; ++i) {
mmioRanges[i] = AddrRange();
}
}
void
AMDGPUVM::setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range)
{
mmioRanges[mmio_aperture] = range;
}
AddrRange
AMDGPUVM::getMMIORange(mmio_range_t mmio_aperture)
{
return mmioRanges[mmio_aperture];
}
const AddrRange&
AMDGPUVM::getMMIOAperture(Addr offset)
{
    // Scan every registered aperture and return the first one that
    // contains the requested offset.
    for (const auto &range : mmioRanges) {
        if (range.contains(offset)) {
            return range;
        }
    }

    // No aperture claimed the offset; fall back to the NBIO range, which
    // acts as the catch-all aperture.
    return mmioRanges[NBIO_MMIO_RANGE];
}
Addr

View File

@@ -99,9 +99,23 @@ static constexpr int AMDGPU_USER_PAGE_SIZE = 4096;
namespace gem5
{
typedef enum : int
{
NBIO_MMIO_RANGE,
MMHUB_MMIO_RANGE,
GFX_MMIO_RANGE,
GRBM_MMIO_RANGE,
IH_MMIO_RANGE,
NUM_MMIO_RANGES
} mmio_range_t;
class AMDGPUDevice;
class AMDGPUVM : public Serializable
{
private:
AMDGPUDevice *gpuDevice;
typedef struct GEM5_PACKED
{
// Page table addresses: from (Base + Start) to (End)
@@ -160,9 +174,13 @@ class AMDGPUVM : public Serializable
*/
std::vector<VegaISA::GpuTLB *> gpu_tlbs;
std::array<AddrRange, NUM_MMIO_RANGES> mmioRanges;
public:
AMDGPUVM();
void setGPUDevice(AMDGPUDevice *gpu_device) { gpuDevice = gpu_device; }
/**
* Return base address of GART table in framebuffer.
*/
@@ -172,6 +190,12 @@ class AMDGPUVM : public Serializable
*/
Addr gartSize();
bool
inGARTRange(Addr paddr)
{
return ((paddr >= gartBase()) && (paddr <= (gartBase() + gartSize())));
}
/**
* Copy of GART table. Typically resides in device memory, however we use
* a copy in gem5 to simplify the interface.
@@ -226,38 +250,11 @@ class AMDGPUVM : public Serializable
Addr getSysAddrRangeLow () { return vmContext0.sysAddrL; }
Addr getSysAddrRangeHigh () { return vmContext0.sysAddrH; }
Addr
getMmioAperture(Addr addr)
{
// Aperture ranges:
// NBIO 0x0 - 0x4280
// IH 0x4280 - 0x4980
// SDMA0 0x4980 - 0x5180
// SDMA1 0x5180 - 0x5980
// GRBM 0x8000 - 0xD000
// GFX 0x28000 - 0x3F000
// MMHUB 0x68000 - 0x6a120
void setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range);
const AddrRange& getMMIOAperture(Addr addr);
AddrRange getMMIORange(mmio_range_t mmio_aperture);
if (IH_BASE <= addr && addr < IH_BASE + IH_SIZE)
return IH_BASE;
else if (SDMA0_BASE <= addr && addr < SDMA0_BASE + SDMA_SIZE)
return SDMA0_BASE;
else if (SDMA1_BASE <= addr && addr < SDMA1_BASE + SDMA_SIZE)
return SDMA1_BASE;
else if (GRBM_BASE <= addr && addr < GRBM_BASE + GRBM_SIZE)
return GRBM_BASE;
else if (GFX_BASE <= addr && addr < GFX_BASE + GFX_SIZE)
return GFX_BASE;
else if (MMHUB_BASE <= addr && addr < MMHUB_BASE + MMHUB_SIZE)
return MMHUB_BASE;
else {
warn_once("Accessing unsupported MMIO aperture! Assuming NBIO\n");
return NBIO_BASE;
}
}
// Gettig mapped aperture base addresses
// Getting mapped aperture base addresses
Addr
getFrameAperture(Addr addr)
{

View File

@@ -75,7 +75,8 @@ void
AMDGPUInterruptHandler::prepareInterruptCookie(ContextID cntxt_id,
uint32_t ring_id,
uint32_t client_id,
uint32_t source_id)
uint32_t source_id,
unsigned node_id)
{
assert(client_id == SOC15_IH_CLIENTID_RLC ||
client_id == SOC15_IH_CLIENTID_SDMA0 ||
@@ -112,6 +113,7 @@ AMDGPUInterruptHandler::prepareInterruptCookie(ContextID cntxt_id,
cookie->clientId = client_id;
cookie->sourceId = source_id;
cookie->ringId = ring_id;
cookie->nodeId = node_id;
cookie->source_data_dw1 = cntxt_id;
interruptQueue.push(cookie);
}

View File

@@ -101,7 +101,8 @@ typedef struct
uint32_t reserved2 : 15;
uint32_t timestamp_src : 1;
uint32_t pasid : 16;
uint32_t reserved3 : 15;
uint32_t nodeId : 8;
uint32_t reserved3 : 7;
uint32_t pasid_src : 1;
uint32_t source_data_dw1;
uint32_t source_data_dw2;
@@ -171,7 +172,7 @@ class AMDGPUInterruptHandler : public DmaDevice
void setGPUDevice(AMDGPUDevice *gpu_device) { gpuDevice = gpu_device; }
void prepareInterruptCookie(ContextID cntxtId, uint32_t ring_id,
uint32_t client_id, uint32_t source_id);
uint32_t client_id, uint32_t source_id, unsigned node_id);
void submitInterruptCookie();
void submitWritePointer();
void intrPost();

View File

@@ -36,34 +36,34 @@
namespace gem5
{
#define mmCP_RB0_BASE 0x1040
#define mmCP_RB0_CNTL 0x1041
#define mmCP_RB_WPTR_POLL_ADDR_LO 0x1046
#define mmCP_RB_WPTR_POLL_ADDR_HI 0x1047
#define mmCP_RB_VMID 0x1051
#define mmCP_RB0_RPTR_ADDR 0x1043
#define mmCP_RB0_RPTR_ADDR_HI 0x1044
#define mmCP_RB0_WPTR 0x1054
#define mmCP_RB0_WPTR_HI 0x1055
#define mmCP_RB_DOORBELL_CONTROL 0x1059
#define mmCP_RB_DOORBELL_RANGE_LOWER 0x105a
#define mmCP_RB_DOORBELL_RANGE_UPPER 0x105b
#define mmCP_RB0_BASE_HI 0x10b1
#define mmCP_RB0_BASE 0x040
#define mmCP_RB0_CNTL 0x041
#define mmCP_RB_WPTR_POLL_ADDR_LO 0x046
#define mmCP_RB_WPTR_POLL_ADDR_HI 0x047
#define mmCP_RB_VMID 0x051
#define mmCP_RB0_RPTR_ADDR 0x043
#define mmCP_RB0_RPTR_ADDR_HI 0x044
#define mmCP_RB0_WPTR 0x054
#define mmCP_RB0_WPTR_HI 0x055
#define mmCP_RB_DOORBELL_CONTROL 0x059
#define mmCP_RB_DOORBELL_RANGE_LOWER 0x05a
#define mmCP_RB_DOORBELL_RANGE_UPPER 0x05b
#define mmCP_RB0_BASE_HI 0x0b1
#define mmCP_HQD_ACTIVE 0x1247
#define mmCP_HQD_VMID 0x1248
#define mmCP_HQD_PQ_BASE 0x124d
#define mmCP_HQD_PQ_BASE_HI 0x124e
#define mmCP_HQD_PQ_DOORBELL_CONTROL 0x1254
#define mmCP_HQD_PQ_RPTR 0x124f
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR 0x1250
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI 0x1251
#define mmCP_HQD_PQ_WPTR_POLL_ADDR 0x1252
#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI 0x1253
#define mmCP_HQD_PQ_CONTROL 0x1256
#define mmCP_HQD_IB_CONTROL 0x125a
#define mmCP_HQD_PQ_WPTR_LO 0x127b
#define mmCP_HQD_PQ_WPTR_HI 0x127c
#define mmCP_HQD_ACTIVE 0x247
#define mmCP_HQD_VMID 0x248
#define mmCP_HQD_PQ_BASE 0x24d
#define mmCP_HQD_PQ_BASE_HI 0x24e
#define mmCP_HQD_PQ_DOORBELL_CONTROL 0x254
#define mmCP_HQD_PQ_RPTR 0x24f
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR 0x250
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI 0x251
#define mmCP_HQD_PQ_WPTR_POLL_ADDR 0x252
#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI 0x253
#define mmCP_HQD_PQ_CONTROL 0x256
#define mmCP_HQD_IB_CONTROL 0x25a
#define mmCP_HQD_PQ_WPTR_LO 0x27b
#define mmCP_HQD_PQ_WPTR_HI 0x27c
} // namespace gem5

View File

@@ -49,7 +49,7 @@ namespace gem5
{
PM4PacketProcessor::PM4PacketProcessor(const PM4PacketProcessorParams &p)
: DmaVirtDevice(p)
: DmaVirtDevice(p), _ipId(p.ip_id), _mmioRange(p.mmio_range)
{
memset(&kiq, 0, sizeof(QueueDesc));
memset(&pq, 0, sizeof(QueueDesc));
@@ -144,7 +144,7 @@ PM4PacketProcessor::newQueue(QueueDesc *mqd, Addr offset,
QueueType qt;
qt = mqd->aql ? QueueType::ComputeAQL
: QueueType::Compute;
gpuDevice->setDoorbellType(offset, qt);
gpuDevice->setDoorbellType(offset, qt, getIpId());
DPRINTF(PM4PacketProcessor, "New PM4 queue %d, base: %p offset: %p, me: "
"%d, pipe %d queue: %d size: %d\n", id, q->base(), q->offset(),
@@ -227,9 +227,11 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header)
} break;
case IT_WRITE_DATA: {
dmaBuffer = new PM4WriteData();
DPRINTF(PM4PacketProcessor, "PM4 writeData header: %x, count: %d\n",
header.ordinal, header.count);
cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint64_t &)
{ writeData(q, (PM4WriteData *)dmaBuffer); });
{ writeData(q, (PM4WriteData *)dmaBuffer, header); });
dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4WriteData), cb,
dmaBuffer);
} break;
@@ -350,21 +352,46 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header)
}
void
PM4PacketProcessor::writeData(PM4Queue *q, PM4WriteData *pkt)
PM4PacketProcessor::writeData(PM4Queue *q, PM4WriteData *pkt, PM4Header header)
{
q->incRptr(sizeof(PM4WriteData));
Addr addr = getGARTAddr(pkt->destAddr);
DPRINTF(PM4PacketProcessor, "PM4 write addr: %p data: %p.\n", addr,
pkt->data);
auto cb = new DmaVirtCallback<uint32_t>(
[ = ](const uint32_t &) { writeDataDone(q, pkt, addr); });
//TODO: the specs indicate that pkt->data holds the number of dword that
//need to be written.
dmaWriteVirt(addr, sizeof(uint32_t), cb, &pkt->data);
DPRINTF(PM4PacketProcessor, "PM4 write addr: %p data: %p destSel: %d "
"addrIncr: %d resume: %d writeConfirm: %d cachePolicy: %d\n",
pkt->destAddr, pkt->data, pkt->destSel, pkt->addrIncr,
pkt->resume, pkt->writeConfirm, pkt->cachePolicy);
if (!pkt->writeConfirm)
if (pkt->destSel == 5) {
// Memory address destination
Addr addr = getGARTAddr(pkt->destAddr);
// This is a variable length packet. The size of the packet is in
// the header.count field and is set as Number Of Dwords - 1. This
// packet is 4 bytes minimum meaning the count is minimum 3. To
// get the number of dwords of data subtract two from the count.
unsigned size = (header.count - 2) * sizeof(uint32_t);
DPRINTF(PM4PacketProcessor, "Writing %d bytes to %p\n", size, addr);
auto cb = new DmaVirtCallback<uint32_t>(
[ = ](const uint32_t &) { writeDataDone(q, pkt, addr); });
dmaWriteVirt(addr, size, cb, &pkt->data);
if (!pkt->writeConfirm) {
decodeNext(q);
}
} else if (pkt->destSel == 0) {
// Register dword address destination
Addr byte_addr = pkt->destAddr << 2;
gpuDevice->setRegVal(byte_addr, pkt->data);
// setRegVal is instant on the simulated device so we ignore write
// confirm.
delete pkt;
decodeNext(q);
} else {
fatal("Unknown PM4 writeData destination %d\n", pkt->destSel);
}
}
void
@@ -373,8 +400,9 @@ PM4PacketProcessor::writeDataDone(PM4Queue *q, PM4WriteData *pkt, Addr addr)
DPRINTF(PM4PacketProcessor, "PM4 write completed to %p, %p.\n", addr,
pkt->data);
if (pkt->writeConfirm)
if (pkt->writeConfirm) {
decodeNext(q);
}
delete pkt;
}
@@ -493,7 +521,7 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
// Register doorbell with GPU device
gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng);
gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC);
gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC, getIpId());
gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2);
}
@@ -537,7 +565,8 @@ PM4PacketProcessor::releaseMemDone(PM4Queue *q, PM4ReleaseMem *pkt, Addr addr)
ringId = (q->queue() << 4) | (q->me() << 2) | q->pipe();
}
gpuDevice->getIH()->prepareInterruptCookie(pkt->intCtxId, ringId,
SOC15_IH_CLIENTID_GRBM_CP, CP_EOP);
SOC15_IH_CLIENTID_GRBM_CP, CP_EOP,
0);
gpuDevice->getIH()->submitInterruptCookie();
}
@@ -745,9 +774,14 @@ PM4PacketProcessor::setUconfigReg(PM4Queue *q, PM4SetUconfigReg *pkt)
{
q->incRptr(sizeof(PM4SetUconfigReg));
DPRINTF(PM4PacketProcessor, "SetUconfig offset %x data %x\n",
pkt->offset, pkt->data);
// SET_UCONFIG_REG_START and pkt->offset are dword addresses
uint32_t reg_addr = (PACKET3_SET_UCONFIG_REG_START + pkt->offset) * 4;
// Additional CPs respond to addresses 0x40000 apart.
reg_addr += 0x40000 * getIpId();
gpuDevice->setRegVal(reg_addr, pkt->data);
decodeNext(q);
@@ -822,7 +856,7 @@ PM4PacketProcessor::writeMMIO(PacketPtr pkt, Addr mmio_offset)
break;
case mmCP_HQD_PQ_DOORBELL_CONTROL:
setHqdPqDoorbellCtrl(pkt->getLE<uint32_t>());
gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute);
gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute, getIpId());
break;
case mmCP_HQD_PQ_RPTR:
setHqdPqPtr(pkt->getLE<uint32_t>());
@@ -884,7 +918,7 @@ PM4PacketProcessor::writeMMIO(PacketPtr pkt, Addr mmio_offset)
break;
case mmCP_RB_DOORBELL_CONTROL:
setRbDoorbellCntrl(pkt->getLE<uint32_t>());
gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx);
gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx, getIpId());
break;
case mmCP_RB_DOORBELL_RANGE_LOWER:
setRbDoorbellRangeLo(pkt->getLE<uint32_t>());

View File

@@ -63,6 +63,10 @@ class PM4PacketProcessor : public DmaVirtDevice
std::unordered_map<uint16_t, PM4Queue *> queues;
/* A map of PM4 queues based on doorbell offset */
std::unordered_map<uint32_t, PM4Queue *> queuesMap;
int _ipId;
AddrRange _mmioRange;
public:
PM4PacketProcessor(const PM4PacketProcessorParams &p);
@@ -136,7 +140,7 @@ class PM4PacketProcessor : public DmaVirtDevice
void decodeHeader(PM4Queue *q, PM4Header header);
/* Methods that implement PM4 packets */
void writeData(PM4Queue *q, PM4WriteData *pkt);
void writeData(PM4Queue *q, PM4WriteData *pkt, PM4Header header);
void writeDataDone(PM4Queue *q, PM4WriteData *pkt, Addr addr);
void mapQueues(PM4Queue *q, PM4MapQueues *pkt);
void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt);
@@ -188,6 +192,9 @@ class PM4PacketProcessor : public DmaVirtDevice
void setRbDoorbellCntrl(uint32_t data);
void setRbDoorbellRangeLo(uint32_t data);
void setRbDoorbellRangeHi(uint32_t data);
int getIpId() const { return _ipId; }
AddrRange getMMIORange() const { return _mmioRange; }
};
} // namespace gem5

View File

@@ -81,9 +81,9 @@ SDMAEngine::setGPUDevice(AMDGPUDevice *gpu_device)
}
int
SDMAEngine::getIHClientId()
SDMAEngine::getIHClientId(int _id)
{
switch (id) {
switch (_id) {
case 0:
return SOC15_IH_CLIENTID_SDMA0;
case 1:
@@ -627,10 +627,14 @@ SDMAEngine::writeReadData(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer)
// lastly we write read data to the destination address
if (gpuDevice->getVM().inMMHUB(pkt->dest)) {
Addr mmhubAddr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
Addr mmhub_addr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
fatal_if(gpuDevice->getVM().inGARTRange(mmhub_addr),
"SDMA write to GART not implemented");
auto cb = new EventFunctionWrapper(
[ = ]{ writeDone(q, pkt, dmaBuffer); }, name());
gpuDevice->getMemMgr()->writeRequest(mmhubAddr, (uint8_t *)dmaBuffer,
gpuDevice->getMemMgr()->writeRequest(mmhub_addr, (uint8_t *)dmaBuffer,
bufferSize, 0, cb);
} else {
if (q->priv()) {
@@ -663,9 +667,11 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt)
// count represents the number of bytes - 1 to be copied
pkt->count++;
if (q->priv()) {
DPRINTF(SDMAEngine, "Getting GART addr for %lx\n", pkt->source);
pkt->source = getGARTAddr(pkt->source);
DPRINTF(SDMAEngine, "GART addr %lx\n", pkt->source);
if (!gpuDevice->getVM().inMMHUB(pkt->source)) {
DPRINTF(SDMAEngine, "Getting GART addr for %lx\n", pkt->source);
pkt->source = getGARTAddr(pkt->source);
DPRINTF(SDMAEngine, "GART addr %lx\n", pkt->source);
}
}
// Read data from the source first, then call the copyReadData method
@@ -742,6 +748,19 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
[ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); });
dmaWriteVirt(pkt->dest, pkt->count, cb, (void *)dmaBuffer);
}
// For destinations in the GART table, gem5 uses a mapping tables instead
// of functionally going to device memory, so we need to update that copy.
if (gpuDevice->getVM().inGARTRange(device_addr)) {
// GART entries are always 8 bytes.
assert((pkt->count % 8) == 0);
for (int i = 0; i < pkt->count/8; ++i) {
Addr gart_addr = device_addr + i*8 - gpuDevice->getVM().gartBase();
DPRINTF(SDMAEngine, "Shadow copying to GART table %lx -> %lx\n",
gart_addr, dmaBuffer64[i]);
gpuDevice->getVM().gartTable[gart_addr] = dmaBuffer64[i];
}
}
}
/* Completion of a copy packet. */
@@ -809,8 +828,12 @@ SDMAEngine::trap(SDMAQueue *q, sdmaTrap *pkt)
uint32_t ring_id = (q->queueType() == SDMAPage) ? 3 : 0;
int node_id = 0;
int local_id = getId();
gpuDevice->getIH()->prepareInterruptCookie(pkt->intrContext, ring_id,
getIHClientId(), TRAP_ID);
getIHClientId(local_id),
TRAP_ID, 2*node_id);
gpuDevice->getIH()->submitInterruptCookie();
delete pkt;
@@ -836,8 +859,7 @@ SDMAEngine::srbmWrite(SDMAQueue *q, sdmaSRBMWriteHeader *header,
DPRINTF(SDMAEngine, "SRBM write to %#x with data %#x\n",
reg_addr, pkt->data);
warn_once("SRBM write not performed, no SRBM model. This needs to be fixed"
" if correct system simulation is relying on SRBM registers.");
gpuDevice->setRegVal(reg_addr, pkt->data);
delete header;
delete pkt;
@@ -967,10 +989,14 @@ SDMAEngine::ptePde(SDMAQueue *q, sdmaPtePde *pkt)
// Writing generated data to the destination address.
if (gpuDevice->getVM().inMMHUB(pkt->dest)) {
Addr mmhubAddr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
Addr mmhub_addr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
fatal_if(gpuDevice->getVM().inGARTRange(mmhub_addr),
"SDMA write to GART not implemented");
auto cb = new EventFunctionWrapper(
[ = ]{ ptePdeDone(q, pkt, dmaBuffer); }, name());
gpuDevice->getMemMgr()->writeRequest(mmhubAddr, (uint8_t *)dmaBuffer,
gpuDevice->getMemMgr()->writeRequest(mmhub_addr, (uint8_t *)dmaBuffer,
sizeof(uint64_t) * pkt->count, 0,
cb);
} else {

View File

@@ -172,7 +172,7 @@ class SDMAEngine : public DmaVirtDevice
/**
* Returns the client id for the Interrupt Handler.
*/
int getIHClientId();
int getIHClientId(int _id);
/**
* Methods for translation.