From f36be791aa8c25cd1531ca9e84f85dcdf2acfb31 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Tue, 13 Feb 2024 16:27:55 -0600
Subject: [PATCH 1/9] arch-vega: Expand FLAT subDecode range in main decoder

The main decoder for GPU instructions looks at the first 9 bits of a
dword to determine either the instruction or a subDecode table with more
information for specific instruction types. For flat instructions the
first 9 bits currently consist of 6 fixed encoding bits, a reserved bit,
and the first two bits of the opcode. Hence to support all opcodes there
are four indirections to the flat subDecode table. In MI300 the reserved
bit is part of a field to determine memory scope and therefore may be
non-zero.

This commit adds four additional calls to the subDecode table for the
cases where the scope bit is 1. See page 468 (PDF page 478) below:

https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/
    instruction-set-architectures/
    amd-instinct-mi300-cdna3-instruction-set-architecture.pdf

Change-Id: Ic3c786f0ca00a758cbe87f42c5e3470576f73a32
---
 src/arch/amdgpu/vega/gpu_decoder.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc
index 940840719b..2220d820b1 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -500,10 +500,10 @@ namespace VegaISA
         &Decoder::subDecode_OP_FLAT,
         &Decoder::subDecode_OP_FLAT,
         &Decoder::subDecode_OP_FLAT,
-        &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
-        &Decoder::decode_invalid,
+        &Decoder::subDecode_OP_FLAT,
+        &Decoder::subDecode_OP_FLAT,
+        &Decoder::subDecode_OP_FLAT,
+        &Decoder::subDecode_OP_FLAT,
         &Decoder::subDecode_OP_MUBUF,
         &Decoder::subDecode_OP_MUBUF,
         &Decoder::subDecode_OP_MUBUF,

From 9ab004cccca501b56cd39be1cefc677fd12b7a4c Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Tue, 13 Feb 2024 16:34:05 -0600
Subject: [PATCH 2/9] arch-vega: Implement V_LSHL_ADD_U64

This is a new instruction in MI300 and operates similarly to
V_LSHL_ADD_U32 but on 64-bit values.

Change-Id: Ia4ac65160bdad748fccdcb28286ba03157cc4046
---
 src/arch/amdgpu/vega/gpu_decoder.cc        |  8 +++-
 src/arch/amdgpu/vega/gpu_decoder.hh        |  1 +
 src/arch/amdgpu/vega/insts/instructions.hh | 36 ++++++++++++++++
 src/arch/amdgpu/vega/insts/vop3.cc         | 48 ++++++++++++++++++++++
 4 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc
index 2220d820b1..406ada6c52 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.cc
+++ b/src/arch/amdgpu/vega/gpu_decoder.cc
@@ -1091,7 +1091,7 @@ namespace VegaISA
         &Decoder::decode_OPU_VOP3__V_MAD_I16,
         &Decoder::decode_OPU_VOP3__V_FMA_F16,
         &Decoder::decode_OPU_VOP3__V_DIV_FIXUP_F16,
-        &Decoder::decode_invalid,
+        &Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
@@ -7054,6 +7054,12 @@ namespace VegaISA
         return new Inst_VOP3__V_DIV_FIXUP_F16(&iFmt->iFmt_VOP3A);
     }
 
+    GPUStaticInst*
+    Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst iFmt)
+    {
+        return new Inst_VOP3__V_LSHL_ADD_U64(&iFmt->iFmt_VOP3A);
+    }
+
     GPUStaticInst*
     Decoder::decode_OPU_VOP3__V_INTERP_P1_F32(MachInst iFmt)
     {
diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh
index 48084a6913..d3b39fd945 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.hh
+++ b/src/arch/amdgpu/vega/gpu_decoder.hh
@@ -470,6 +470,7 @@ namespace VegaISA
         GPUStaticInst* decode_OPU_VOP3__V_MAD_I16(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_FMA_F16(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_DIV_FIXUP_F16(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_INTERP_P1_F32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_INTERP_P2_F32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_INTERP_MOV_F32(MachInst);
diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh
index db03548a3d..4c96a3e34b 100644
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -30158,6 +30158,42 @@ namespace VegaISA
         void execute(GPUDynInstPtr) override;
     }; // Inst_VOP3__V_DIV_FIXUP_F16
 
+    class Inst_VOP3__V_LSHL_ADD_U64 : public Inst_VOP3A
+    {
+      public:
+        Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A*);
+        ~Inst_VOP3__V_LSHL_ADD_U64();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 3; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: //src_0
+                return 8;
+              case 1: //src_1
+                return 4;
+              case 2: //src_2
+                return 8;
+              case 3: //vdst
+                return 8;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP3__V_LSHL_ADD_U64
+
     class Inst_VOP3__V_CVT_PKACCUM_U8_F32 : public Inst_VOP3A
     {
       public:
diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc
index 8f6794c9c2..f78f64bc91 100644
--- a/src/arch/amdgpu/vega/insts/vop3.cc
+++ b/src/arch/amdgpu/vega/insts/vop3.cc
@@ -7630,6 +7630,54 @@ namespace VegaISA
     {
         panicUnimplemented();
     } // execute
+    // --- Inst_VOP3__V_LSHL_ADD_U64 class methods ---
+
+    Inst_VOP3__V_LSHL_ADD_U64::Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A *iFmt)
+        : Inst_VOP3A(iFmt, "v_lshl_add_u64", false)
+    {
+        setFlag(ALU);
+    } // Inst_VOP3__V_LSHL_ADD_U64
+
+    Inst_VOP3__V_LSHL_ADD_U64::~Inst_VOP3__V_LSHL_ADD_U64()
+    {
+    } // ~Inst_VOP3__V_LSHL_ADD_U64
+
+    // --- description from .arch file ---
+    // D.u64 = (S0.u64 << S1.u[2:0]) + S2.u64.
+    void
+    Inst_VOP3__V_LSHL_ADD_U64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
+        ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
+        VecOperandU64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+        src2.readSrc();
+
+        /**
+         * input modifiers are supported by FP operations only
+         */
+        assert(!(instData.ABS & 0x1));
+        assert(!(instData.ABS & 0x2));
+        assert(!(instData.ABS & 0x4));
+        assert(!(extData.NEG & 0x1));
+        assert(!(extData.NEG & 0x2));
+        assert(!(extData.NEG & 0x4));
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                int shift_amount = bits(src1[lane], 2, 0);
+                shift_amount = shift_amount > 4 ? 0 : shift_amount;
+                vdst[lane] = (src0[lane] << shift_amount)
+                           + src2[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
     // --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods ---
 
     Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32(

From c045c6854007dc6784105a3dcbd990e4d7b7d36d Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Tue, 13 Feb 2024 16:39:50 -0600
Subject: [PATCH 3/9] dev-amdgpu: Add node_id to interrupt handler

The ROCm 6.0 driver adds a node_id field to interrupts which must match
before passing on the interrupt to be cleared by the cookie from gem5's
interrupt handler implementation. Add this field and enable for gfx942.

The usage of the field can be seen in event_interrupt_isr_v9_4_3 at
https://github.com/ROCm/ROCK-Kernel-Driver/blob/roc-6.0.x/drivers/
    gpu/drm/amd/amdkfd/kfd_int_process_v9.c#L449

Change-Id: Iae8b8f0386a5ad2852b4a3c69f2c161d965c4922
---
 src/dev/amdgpu/interrupt_handler.cc    |  4 +++-
 src/dev/amdgpu/interrupt_handler.hh    |  5 +++--
 src/dev/amdgpu/pm4_packet_processor.cc |  3 ++-
 src/dev/amdgpu/sdma_engine.cc          | 10 +++++++---
 src/dev/amdgpu/sdma_engine.hh          |  2 +-
 5 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/dev/amdgpu/interrupt_handler.cc b/src/dev/amdgpu/interrupt_handler.cc
index 6f277a1618..cb99ba7a39 100644
--- a/src/dev/amdgpu/interrupt_handler.cc
+++ b/src/dev/amdgpu/interrupt_handler.cc
@@ -75,7 +75,8 @@ void
 AMDGPUInterruptHandler::prepareInterruptCookie(ContextID cntxt_id,
                                                 uint32_t ring_id,
                                                 uint32_t client_id,
-                                                uint32_t source_id)
+                                                uint32_t source_id,
+                                                unsigned node_id)
 {
     assert(client_id == SOC15_IH_CLIENTID_RLC ||
            client_id == SOC15_IH_CLIENTID_SDMA0 ||
@@ -112,6 +113,7 @@ AMDGPUInterruptHandler::prepareInterruptCookie(ContextID cntxt_id,
     cookie->clientId = client_id;
     cookie->sourceId = source_id;
     cookie->ringId = ring_id;
+    cookie->nodeId = node_id;
     cookie->source_data_dw1 = cntxt_id;
     interruptQueue.push(cookie);
 }
diff --git a/src/dev/amdgpu/interrupt_handler.hh b/src/dev/amdgpu/interrupt_handler.hh
index 9b80e081cc..a895eabafc 100644
--- a/src/dev/amdgpu/interrupt_handler.hh
+++ b/src/dev/amdgpu/interrupt_handler.hh
@@ -101,7 +101,8 @@ typedef struct
     uint32_t reserved2 : 15;
     uint32_t timestamp_src : 1;
     uint32_t pasid : 16;
-    uint32_t reserved3 : 15;
+    uint32_t nodeId : 8;
+    uint32_t reserved3 : 7;
     uint32_t pasid_src : 1;
     uint32_t source_data_dw1;
     uint32_t source_data_dw2;
@@ -171,7 +172,7 @@ class AMDGPUInterruptHandler : public DmaDevice
 
     void setGPUDevice(AMDGPUDevice *gpu_device) { gpuDevice = gpu_device; }
     void prepareInterruptCookie(ContextID cntxtId, uint32_t ring_id,
-        uint32_t client_id, uint32_t source_id);
+        uint32_t client_id, uint32_t source_id, unsigned node_id);
     void submitInterruptCookie();
     void submitWritePointer();
     void intrPost();
diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc
index 5f270a0c70..b7952f0698 100644
--- a/src/dev/amdgpu/pm4_packet_processor.cc
+++ b/src/dev/amdgpu/pm4_packet_processor.cc
@@ -537,7 +537,8 @@ PM4PacketProcessor::releaseMemDone(PM4Queue *q, PM4ReleaseMem *pkt, Addr addr)
             ringId = (q->queue() << 4) | (q->me() << 2) | q->pipe();
         }
         gpuDevice->getIH()->prepareInterruptCookie(pkt->intCtxId, ringId,
-                                            SOC15_IH_CLIENTID_GRBM_CP, CP_EOP);
+                                            SOC15_IH_CLIENTID_GRBM_CP, CP_EOP,
+                                            2 * getIpId());
         gpuDevice->getIH()->submitInterruptCookie();
     }
 
diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc
index 4015e83eaf..34ad027234 100644
--- a/src/dev/amdgpu/sdma_engine.cc
+++ b/src/dev/amdgpu/sdma_engine.cc
@@ -81,9 +81,9 @@ SDMAEngine::setGPUDevice(AMDGPUDevice *gpu_device)
 }
 
 int
-SDMAEngine::getIHClientId()
+SDMAEngine::getIHClientId(int _id)
 {
-    switch (id) {
+    switch (_id) {
       case 0:
         return SOC15_IH_CLIENTID_SDMA0;
       case 1:
@@ -809,8 +809,12 @@ SDMAEngine::trap(SDMAQueue *q, sdmaTrap *pkt)
 
     uint32_t ring_id = (q->queueType() == SDMAPage) ? 3 : 0;
 
+    int node_id = 0;
+    int local_id = getId();
+
     gpuDevice->getIH()->prepareInterruptCookie(pkt->intrContext, ring_id,
-                                               getIHClientId(), TRAP_ID);
+                                               getIHClientId(local_id),
+                                               TRAP_ID, 2*node_id);
     gpuDevice->getIH()->submitInterruptCookie();
 
     delete pkt;
diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh
index d8ab31bbde..9407b97d73 100644
--- a/src/dev/amdgpu/sdma_engine.hh
+++ b/src/dev/amdgpu/sdma_engine.hh
@@ -172,7 +172,7 @@ class SDMAEngine : public DmaVirtDevice
     /**
      * Returns the client id for the Interrupt Handler.
      */
-    int getIHClientId();
+    int getIHClientId(int _id);
 
     /**
      * Methods for translation.

From 998709d4fcf4bb2c60c2c98e5b5a001a730e7dcc Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 7 Feb 2024 13:27:30 -0600
Subject: [PATCH 4/9] dev-amdgpu: Improve PM4 write data packet

The write data packet can write multiple dwords but currently always
assumes there is one dword, which can cause some write data to be
missed. This case is not common, but the number of dwords is implicitly
defined in the PM4 header.

This changeset passes the PM4 header to write data so that the correct
number of dwords can be determined. For now we assume no page crossing
when writing multiple dwords as the driver should be checking for that.

Change-Id: I0e8c3cbc28873779f468c2a11fdcf177210a22b7
---
 src/dev/amdgpu/pm4_packet_processor.cc | 54 +++++++++++++++++++-------
 src/dev/amdgpu/pm4_packet_processor.hh |  2 +-
 2 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc
index b7952f0698..c8baa5eab4 100644
--- a/src/dev/amdgpu/pm4_packet_processor.cc
+++ b/src/dev/amdgpu/pm4_packet_processor.cc
@@ -227,9 +227,11 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header)
         } break;
       case IT_WRITE_DATA: {
         dmaBuffer = new PM4WriteData();
+        DPRINTF(PM4PacketProcessor, "PM4 writeData header: %x, count: %d\n",
+                header.ordinal, header.count);
         cb = new DmaVirtCallback<uint64_t>(
             [ = ] (const uint64_t &)
-                { writeData(q, (PM4WriteData *)dmaBuffer); });
+                { writeData(q, (PM4WriteData *)dmaBuffer, header); });
         dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4WriteData), cb,
                     dmaBuffer);
         } break;
@@ -350,21 +352,46 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header)
 }
 
 void
-PM4PacketProcessor::writeData(PM4Queue *q, PM4WriteData *pkt)
+PM4PacketProcessor::writeData(PM4Queue *q, PM4WriteData *pkt, PM4Header header)
 {
     q->incRptr(sizeof(PM4WriteData));
 
-    Addr addr = getGARTAddr(pkt->destAddr);
-    DPRINTF(PM4PacketProcessor, "PM4 write addr: %p data: %p.\n", addr,
-            pkt->data);
-    auto cb = new DmaVirtCallback<uint32_t>(
-        [ = ](const uint32_t &) { writeDataDone(q, pkt, addr); });
-    //TODO: the specs indicate that pkt->data holds the number of dword that
-    //need to be written.
-    dmaWriteVirt(addr, sizeof(uint32_t), cb, &pkt->data);
+    DPRINTF(PM4PacketProcessor, "PM4 write addr: %p data: %p destSel: %d "
+            "addrIncr: %d resume: %d writeConfirm: %d cachePolicy: %d\n",
+            pkt->destAddr, pkt->data, pkt->destSel, pkt->addrIncr,
+            pkt->resume, pkt->writeConfirm, pkt->cachePolicy);
 
-    if (!pkt->writeConfirm)
+    if (pkt->destSel == 5) {
+        // Memory address destination
+        Addr addr = getGARTAddr(pkt->destAddr);
+
+        // This is a variable length packet. The size of the packet is in
+        // the header.count field and is set as Number Of Dwords - 1. This
+        // packet is 4 dwords minimum meaning the count is minimum 3. To
+        // get the number of dwords of data subtract two from the count.
+        unsigned size = (header.count - 2) * sizeof(uint32_t);
+
+        DPRINTF(PM4PacketProcessor, "Writing %d bytes to %p\n", size, addr);
+        auto cb = new DmaVirtCallback<uint32_t>(
+            [ = ](const uint32_t &) { writeDataDone(q, pkt, addr); });
+        dmaWriteVirt(addr, size, cb, &pkt->data);
+
+        if (!pkt->writeConfirm) {
+            decodeNext(q);
+        }
+    } else if (pkt->destSel == 0) {
+        // Register dword address destination
+        Addr byte_addr = pkt->destAddr << 2;
+
+        gpuDevice->setRegVal(byte_addr, pkt->data);
+
+        // setRegVal is instant on the simulated device so we ignore write
+        // confirm.
+        delete pkt;
         decodeNext(q);
+    } else {
+        fatal("Unknown PM4 writeData destination %d\n", pkt->destSel);
+    }
 }
 
 void
@@ -373,8 +400,9 @@ PM4PacketProcessor::writeDataDone(PM4Queue *q, PM4WriteData *pkt, Addr addr)
     DPRINTF(PM4PacketProcessor, "PM4 write completed to %p, %p.\n", addr,
             pkt->data);
 
-    if (pkt->writeConfirm)
+    if (pkt->writeConfirm) {
         decodeNext(q);
+    }
 
     delete pkt;
 }
@@ -538,7 +566,7 @@ PM4PacketProcessor::releaseMemDone(PM4Queue *q, PM4ReleaseMem *pkt, Addr addr)
         }
         gpuDevice->getIH()->prepareInterruptCookie(pkt->intCtxId, ringId,
                                             SOC15_IH_CLIENTID_GRBM_CP, CP_EOP,
-                                            2 * getIpId());
+                                            0);
         gpuDevice->getIH()->submitInterruptCookie();
     }
 
diff --git a/src/dev/amdgpu/pm4_packet_processor.hh b/src/dev/amdgpu/pm4_packet_processor.hh
index 3fb055148c..4782e70829 100644
--- a/src/dev/amdgpu/pm4_packet_processor.hh
+++ b/src/dev/amdgpu/pm4_packet_processor.hh
@@ -136,7 +136,7 @@ class PM4PacketProcessor : public DmaVirtDevice
     void decodeHeader(PM4Queue *q, PM4Header header);
 
     /* Methods that implement PM4 packets */
-    void writeData(PM4Queue *q, PM4WriteData *pkt);
+    void writeData(PM4Queue *q, PM4WriteData *pkt, PM4Header header);
     void writeDataDone(PM4Queue *q, PM4WriteData *pkt, Addr addr);
     void mapQueues(PM4Queue *q, PM4MapQueues *pkt);
     void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt);

From 009cec56e0e7a082ed684e98c5600babc2d2283e Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 7 Feb 2024 13:29:44 -0600
Subject: [PATCH 5/9] dev-amdgpu: Check for SDMA copies to GART range

The SDMA engine can potentially be used to write to the GART address
range. Since gem5 has a shadow copy of the GART table to avoid sending
functional reads to device memory, the GART table must be updated when
copying to the GART range.

This changeset adds a check in the VM for GART range and implements the
SDMA copy packet writing to the GART range. A fatal is added to write
and ptePde, which are the only other two ways to write to memory, as
using these packets to update the GART table has not been observed.

Change-Id: I1e62dfd9179cc9e987659e68414209fd77bba2bd
---
 src/dev/amdgpu/amdgpu_vm.hh   |  6 ++++++
 src/dev/amdgpu/sdma_engine.cc | 37 ++++++++++++++++++++++++++++-------
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/src/dev/amdgpu/amdgpu_vm.hh b/src/dev/amdgpu/amdgpu_vm.hh
index f35a735111..5af666f379 100644
--- a/src/dev/amdgpu/amdgpu_vm.hh
+++ b/src/dev/amdgpu/amdgpu_vm.hh
@@ -172,6 +172,12 @@ class AMDGPUVM : public Serializable
      */
     Addr gartSize();
 
+    bool
+    inGARTRange(Addr paddr)
+    {
+        return ((paddr >= gartBase()) && (paddr <= (gartBase() + gartSize())));
+    }
+
     /**
      * Copy of GART table. Typically resides in device memory, however we use
      * a copy in gem5 to simplify the interface.
diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc
index 34ad027234..94bcdf9cb9 100644
--- a/src/dev/amdgpu/sdma_engine.cc
+++ b/src/dev/amdgpu/sdma_engine.cc
@@ -627,10 +627,14 @@ SDMAEngine::writeReadData(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer)
 
     // lastly we write read data to the destination address
     if (gpuDevice->getVM().inMMHUB(pkt->dest)) {
-        Addr mmhubAddr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
+        Addr mmhub_addr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
+
+        fatal_if(gpuDevice->getVM().inGARTRange(mmhub_addr),
+                "SDMA write to GART not implemented");
+
         auto cb = new EventFunctionWrapper(
             [ = ]{ writeDone(q, pkt, dmaBuffer); }, name());
-        gpuDevice->getMemMgr()->writeRequest(mmhubAddr, (uint8_t *)dmaBuffer,
+        gpuDevice->getMemMgr()->writeRequest(mmhub_addr, (uint8_t *)dmaBuffer,
                                            bufferSize, 0, cb);
     } else {
         if (q->priv()) {
@@ -663,9 +667,11 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt)
     // count represents the number of bytes - 1 to be copied
     pkt->count++;
     if (q->priv()) {
-        DPRINTF(SDMAEngine, "Getting GART addr for %lx\n", pkt->source);
-        pkt->source = getGARTAddr(pkt->source);
-        DPRINTF(SDMAEngine, "GART addr %lx\n", pkt->source);
+        if (!gpuDevice->getVM().inMMHUB(pkt->source)) {
+            DPRINTF(SDMAEngine, "Getting GART addr for %lx\n", pkt->source);
+            pkt->source = getGARTAddr(pkt->source);
+            DPRINTF(SDMAEngine, "GART addr %lx\n", pkt->source);
+        }
     }
 
     // Read data from the source first, then call the copyReadData method
@@ -742,6 +748,19 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
             [ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); });
         dmaWriteVirt(pkt->dest, pkt->count, cb, (void *)dmaBuffer);
     }
+
+    // For destinations in the GART table, gem5 uses a mapping table instead
+    // of functionally going to device memory, so we need to update that copy.
+    if (gpuDevice->getVM().inGARTRange(device_addr)) {
+        // GART entries are always 8 bytes.
+        assert((pkt->count % 8) == 0);
+        for (int i = 0; i < pkt->count/8; ++i) {
+            Addr gart_addr = device_addr + i*8 - gpuDevice->getVM().gartBase();
+            DPRINTF(SDMAEngine, "Shadow copying to GART table %lx -> %lx\n",
+                    gart_addr, dmaBuffer64[i]);
+            gpuDevice->getVM().gartTable[gart_addr] = dmaBuffer64[i];
+        }
+    }
 }
 
 /* Completion of a copy packet. */
@@ -971,10 +990,14 @@ SDMAEngine::ptePde(SDMAQueue *q, sdmaPtePde *pkt)
 
     // Writing generated data to the destination address.
     if (gpuDevice->getVM().inMMHUB(pkt->dest)) {
-        Addr mmhubAddr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
+        Addr mmhub_addr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
+
+        fatal_if(gpuDevice->getVM().inGARTRange(mmhub_addr),
+                "SDMA write to GART not implemented");
+
         auto cb = new EventFunctionWrapper(
             [ = ]{ ptePdeDone(q, pkt, dmaBuffer); }, name());
-        gpuDevice->getMemMgr()->writeRequest(mmhubAddr, (uint8_t *)dmaBuffer,
+        gpuDevice->getMemMgr()->writeRequest(mmhub_addr, (uint8_t *)dmaBuffer,
                                              sizeof(uint64_t) * pkt->count, 0,
                                              cb);
     } else {

From 6bbde8fbb885abae5d7f3ed630d19a9b982dd302 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Thu, 8 Feb 2024 12:26:27 -0600
Subject: [PATCH 6/9] dev-amdgpu: Rework handling of unknown registers

The top level AMDGPUDevice currently reads/writes all unknown registers
to/from a map containing the previously written value. This is intended
as a way to handle registers that are not part of the model but the
driver requires for functionality. Since this is at the top level, it
can mask changes to register values which do not go through the same
interface. For example, reading an MMIO, changing via PM4 queue, and
reading again returns the stale cached value.

This commit removes the usage of the regs map in AMDGPUDevice,
implements some important MMIOs that were previously handled by it, and
moves the unknown register handling to the NBIO aperture only. To reduce
the number of additional MMIOs to implement, the display manager in
vega10 is now disabled.

Change-Id: Iff0a599dd82d663c7e710b79c6ef6d0ad1fc44a2
---
 configs/example/gpufs/vega10.py |  2 +-
 src/dev/amdgpu/amdgpu_device.cc | 75 +++++++++++----------------------
 src/dev/amdgpu/amdgpu_device.hh |  7 +--
 src/dev/amdgpu/amdgpu_gfx.cc    | 13 ++++++
 src/dev/amdgpu/amdgpu_gfx.hh    | 11 ++++-
 src/dev/amdgpu/amdgpu_nbio.cc   | 41 ++++++++++++++----
 src/dev/amdgpu/amdgpu_nbio.hh   | 14 +++++-
 7 files changed, 96 insertions(+), 67 deletions(-)

diff --git a/configs/example/gpufs/vega10.py b/configs/example/gpufs/vega10.py
index ae74efd39b..9c3116d415 100644
--- a/configs/example/gpufs/vega10.py
+++ b/configs/example/gpufs/vega10.py
@@ -52,7 +52,7 @@ if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
     echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
     /sbin/m5 exit
 fi
-modprobe -v amdgpu ip_block_mask=0xff ppfeaturemask=0 dpm=0 audio=0
+modprobe -v amdgpu ip_block_mask=0xdf ppfeaturemask=0 dpm=0 audio=0
 echo "Running {} {}"
 echo "{}" | base64 -d > myapp
 chmod +x myapp
diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc
index 48f450c2b2..4b684aa221 100644
--- a/src/dev/amdgpu/amdgpu_device.cc
+++ b/src/dev/amdgpu/amdgpu_device.cc
@@ -130,6 +130,7 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
     pm4PktProc->setGPUDevice(this);
     cp->hsaPacketProc().setGPUDevice(this);
     cp->setGPUDevice(this);
+    nbio.setGPUDevice(this);
 
     // Address aperture for device memory. We tell this to the driver and
     // could possibly be anything, but these are the values used by hardware.
@@ -163,8 +164,6 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
 
     gpuvm.setMMHUBBase(mmhubBase);
     gpuvm.setMMHUBTop(mmhubTop);
-
-    nbio.setGPUDevice(this);
 }
 
 void
@@ -365,13 +364,6 @@ AMDGPUDevice::readMMIO(PacketPtr pkt, Addr offset)
     DPRINTF(AMDGPUDevice, "Read MMIO %#lx\n", offset);
     mmioReader.readFromTrace(pkt, MMIO_BAR, offset);
 
-    if (regs.find(offset) != regs.end()) {
-        uint64_t value = regs[offset];
-        DPRINTF(AMDGPUDevice, "Reading what kernel wrote before: %#x\n",
-                value);
-        pkt->setUintX(value, ByteOrder::little);
-    }
-
     switch (aperture) {
       case NBIO_BASE:
         nbio.readMMIO(pkt, aperture_offset);
@@ -610,26 +602,39 @@ AMDGPUDevice::processPendingDoorbells(uint32_t offset)
     }
 }
 
-bool
-AMDGPUDevice::haveRegVal(uint32_t addr)
-{
-    return regs.count(addr);
-}
-
 uint32_t
-AMDGPUDevice::getRegVal(uint32_t addr)
+AMDGPUDevice::getRegVal(uint64_t addr)
 {
+    // This is somewhat of a guess based on amdgpu_device_mm_access
+    // in amdgpu_device.c in the ROCk driver. If bit 32 is 1 then
+    // assume VRAM and use full address, otherwise assume register
+    // address and only use lower 31 bits.
+    Addr fixup_addr = bits(addr, 31, 31) ? addr : addr & 0x7fffffff;
+
+    uint32_t pkt_data = 0;
+    RequestPtr request = std::make_shared<Request>(fixup_addr,
+            sizeof(uint32_t), 0 /* flags */, vramRequestorId());
+    PacketPtr pkt = Packet::createRead(request);
+    pkt->dataStatic((uint8_t *)&pkt_data);
+    readMMIO(pkt, addr);
     DPRINTF(AMDGPUDevice, "Getting register 0x%lx = %x\n",
-            addr, regs[addr]);
-    return regs[addr];
+            fixup_addr, pkt->getLE<uint32_t>());
+
+    return pkt->getLE<uint32_t>();
 }
 
 void
-AMDGPUDevice::setRegVal(uint32_t addr, uint32_t value)
+AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value)
 {
     DPRINTF(AMDGPUDevice, "Setting register 0x%lx to %x\n",
             addr, value);
-    regs[addr] = value;
+
+    uint32_t pkt_data = value;
+    RequestPtr request = std::make_shared<Request>(addr,
+            sizeof(uint32_t), 0 /* flags */, vramRequestorId());
+    PacketPtr pkt = Packet::createWrite(request);
+    pkt->dataStatic((uint8_t *)&pkt_data);
+    writeMMIO(pkt, addr);
 }
 
 void
@@ -675,20 +680,16 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
     // Serialize the PciDevice base class
     PciDevice::serialize(cp);
 
-    uint64_t regs_size = regs.size();
     uint64_t doorbells_size = doorbells.size();
     uint64_t sdma_engs_size = sdmaEngs.size();
     uint64_t used_vmid_map_size = usedVMIDs.size();
 
-    SERIALIZE_SCALAR(regs_size);
     SERIALIZE_SCALAR(doorbells_size);
     SERIALIZE_SCALAR(sdma_engs_size);
     // Save the number of vmids used
     SERIALIZE_SCALAR(used_vmid_map_size);
 
     // Make a c-style array of the regs to serialize
-    uint32_t reg_addrs[regs_size];
-    uint64_t reg_values[regs_size];
     uint32_t doorbells_offset[doorbells_size];
     QueueType doorbells_queues[doorbells_size];
     uint32_t sdma_engs_offset[sdma_engs_size];
@@ -698,13 +699,6 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
     std::vector<int> used_vmid_sets;
 
     int idx = 0;
-    for (auto & it : regs) {
-        reg_addrs[idx] = it.first;
-        reg_values[idx] = it.second;
-        ++idx;
-    }
-
-    idx = 0;
     for (auto & it : doorbells) {
         doorbells_offset[idx] = it.first;
         doorbells_queues[idx] = it.second;
@@ -732,8 +726,6 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
     int* vmid_array = new int[num_queue_id];
     std::copy(used_vmid_sets.begin(), used_vmid_sets.end(), vmid_array);
 
-    SERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0]));
-    SERIALIZE_ARRAY(reg_values, sizeof(reg_values)/sizeof(reg_values[0]));
     SERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/
         sizeof(doorbells_offset[0]));
     SERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/
@@ -764,30 +756,15 @@ AMDGPUDevice::unserialize(CheckpointIn &cp)
     // Unserialize the PciDevice base class
     PciDevice::unserialize(cp);
 
-    uint64_t regs_size = 0;
     uint64_t doorbells_size = 0;
     uint64_t sdma_engs_size = 0;
     uint64_t used_vmid_map_size = 0;
 
-    UNSERIALIZE_SCALAR(regs_size);
     UNSERIALIZE_SCALAR(doorbells_size);
     UNSERIALIZE_SCALAR(sdma_engs_size);
     UNSERIALIZE_SCALAR(used_vmid_map_size);
 
 
-    if (regs_size > 0) {
-        uint32_t reg_addrs[regs_size];
-        uint64_t reg_values[regs_size];
-
-        UNSERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0]));
-        UNSERIALIZE_ARRAY(reg_values,
-                          sizeof(reg_values)/sizeof(reg_values[0]));
-
-        for (int idx = 0; idx < regs_size; ++idx) {
-            regs.insert(std::make_pair(reg_addrs[idx], reg_values[idx]));
-        }
-    }
-
     if (doorbells_size > 0) {
         uint32_t doorbells_offset[doorbells_size];
         QueueType doorbells_queues[doorbells_size];
@@ -798,8 +775,6 @@ AMDGPUDevice::unserialize(CheckpointIn &cp)
                 sizeof(doorbells_queues[0]));
 
         for (int idx = 0; idx < doorbells_size; ++idx) {
-            regs.insert(std::make_pair(doorbells_offset[idx],
-                      doorbells_queues[idx]));
             doorbells[doorbells_offset[idx]] = doorbells_queues[idx];
         }
     }
diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh
index b6b6e2a81a..fface5fb3e 100644
--- a/src/dev/amdgpu/amdgpu_device.hh
+++ b/src/dev/amdgpu/amdgpu_device.hh
@@ -87,8 +87,6 @@ class AMDGPUDevice : public PciDevice
     /**
      * Structures to hold registers, doorbells, and some frame memory
      */
-    using GPURegMap = std::unordered_map<uint32_t, uint64_t>;
-    GPURegMap regs;
     std::unordered_map<uint32_t, QueueType> doorbells;
     std::unordered_map<uint32_t, PacketPtr> pendingDoorbellPkts;
 
@@ -195,9 +193,8 @@ class AMDGPUDevice : public PciDevice
      * Register value getter/setter. Used by other GPU blocks to change
      * values from incoming driver/user packets.
      */
-    bool haveRegVal(uint32_t addr);
-    uint32_t getRegVal(uint32_t addr);
-    void setRegVal(uint32_t addr, uint32_t value);
+    uint32_t getRegVal(uint64_t addr);
+    void setRegVal(uint64_t addr, uint32_t value);
 
     /**
      * Methods related to translations and system/device memory.
diff --git a/src/dev/amdgpu/amdgpu_gfx.cc b/src/dev/amdgpu/amdgpu_gfx.cc
index 3d5b274b86..60fabaf31d 100644
--- a/src/dev/amdgpu/amdgpu_gfx.cc
+++ b/src/dev/amdgpu/amdgpu_gfx.cc
@@ -37,6 +37,13 @@
 namespace gem5
 {
 
+AMDGPUGfx::AMDGPUGfx()
+{
+    for (int i = 0; i < SCRATCH_REGS; ++i) {
+        scratchRegs[i] = 0;
+    }
+}
+
 void
 AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset)
 {
@@ -47,6 +54,9 @@ AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset)
       case AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB:
         pkt->setLE<uint32_t>(captured_clock_count >> 32);
         break;
+      case AMDGPU_MM_SCRATCH_REG0:
+        pkt->setLE<uint32_t>(scratchRegs[0]);
+        break;
       default:
         break;
     }
@@ -65,6 +75,9 @@ AMDGPUGfx::writeMMIO(PacketPtr pkt, Addr offset)
           captured_clock_count = curTick() / sim_clock::as_int::ns;
         }
         break;
+      case AMDGPU_MM_SCRATCH_REG0:
+        scratchRegs[0] = pkt->getLE<uint32_t>();
+        break;
       default:
         break;
     }
diff --git a/src/dev/amdgpu/amdgpu_gfx.hh b/src/dev/amdgpu/amdgpu_gfx.hh
index c32b8624cf..9fb1d82553 100644
--- a/src/dev/amdgpu/amdgpu_gfx.hh
+++ b/src/dev/amdgpu/amdgpu_gfx.hh
@@ -52,13 +52,16 @@
 #define AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB                 0x13094
 #define AMDGPU_MM_RLC_CAPTURE_GPU_CLOCK_COUNT             0x13098
 
+// Scratch registers used for GPU post
+#define AMDGPU_MM_SCRATCH_REG0                            0x08100
+
 namespace gem5
 {
 
 class AMDGPUGfx
 {
   public:
-    AMDGPUGfx() { }
+    AMDGPUGfx();
 
     void readMMIO(PacketPtr pkt, Addr offset);
     void writeMMIO(PacketPtr pkt, Addr offset);
@@ -68,6 +71,12 @@ class AMDGPUGfx
      * GPU clock count at the time capture MMIO is received.
      */
     uint64_t captured_clock_count = 1;
+
+    /*
+     * Scratch registers.
+     */
+    static constexpr int SCRATCH_REGS = 8;
+    std::array<uint32_t, SCRATCH_REGS> scratchRegs;
 };
 
 } // namespace gem5
diff --git a/src/dev/amdgpu/amdgpu_nbio.cc b/src/dev/amdgpu/amdgpu_nbio.cc
index 07027c3765..89b1682631 100644
--- a/src/dev/amdgpu/amdgpu_nbio.cc
+++ b/src/dev/amdgpu/amdgpu_nbio.cc
@@ -54,13 +54,21 @@ void
 AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset)
 {
     switch (offset) {
+      case AMDGPU_PCIE_DATA:
+        {
+          uint32_t value = gpuDevice->getRegVal(pcie_index_reg);
+          DPRINTF(AMDGPUDevice, "Read PCIe index %lx data %x\n",
+                  pcie_index_reg, value);
+          pkt->setLE<uint32_t>(value);
+        }
+        break;
       // This is a PCIe status register. At some point during driver init
       // the driver checks that interrupts are enabled. This is only
       // checked once, so if the MMIO trace does not exactly line up with
       // what the driver is doing in gem5, this may still have the first
       // bit zero causing driver to fail. Therefore, we always set this
       // bit to one as there is no harm to do so.
-      case AMDGPU_PCIE_DATA_REG:
+      case AMDGPU_PCIE_DATA2:
         {
           uint32_t value = pkt->getLE<uint32_t>() | 0x1;
           DPRINTF(AMDGPUDevice, "Marking interrupts enabled: %#lx\n", value);
@@ -68,7 +76,6 @@ AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset)
         }
         break;
       case AMDGPU_MM_DATA:
-        //pkt->setLE<uint32_t>(regs[mm_index_reg]);
         pkt->setLE<uint32_t>(gpuDevice->getRegVal(mm_index_reg));
         break;
       case VEGA10_INV_ENG17_ACK1:
@@ -89,17 +96,17 @@ AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset)
       case AMDGPU_MP0_SMN_C2PMSG_35:
         pkt->setLE<uint32_t>(0x80000000);
         break;
+      case AMDGPU_MP1_SMN_C2PMSG_90:
+        pkt->setLE<uint32_t>(0x1);
+        break;
       default:
         if (triggered_reads.count(offset)) {
             DPRINTF(AMDGPUDevice, "Found triggered read for %#x\n", offset);
             pkt->setLE<uint32_t>(triggered_reads[offset]);
-        } else if (gpuDevice->haveRegVal(offset)) {
-            uint32_t reg_val = gpuDevice->getRegVal(offset);
-
-            DPRINTF(AMDGPUDevice, "Reading value of %#lx from regs: %#lx\n",
-                    offset, reg_val);
-
-            pkt->setLE<uint32_t>(reg_val);
+        } else if (regs.count(offset)) {
+            DPRINTF(AMDGPUDevice, "Returning value of unknown MMIO offset "
+                    "%x: %x\n", offset, regs[offset]);
+            pkt->setLE<uint32_t>(regs[offset]);
         } else {
             DPRINTF(AMDGPUDevice, "NBIO Unknown MMIO %#x (%#x)\n", offset,
                     pkt->getAddr());
@@ -123,6 +130,14 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset)
         DPRINTF(AMDGPUDevice, "MM write to reg %#lx data %#lx\n",
                 mm_index_reg, pkt->getLE<uint32_t>());
         gpuDevice->setRegVal(AMDGPU_MM_DATA, pkt->getLE<uint32_t>());
+    } else if (offset == AMDGPU_PCIE_INDEX) {
+        assert(pkt->getSize() == 4);
+        pcie_index_reg = insertBits(pcie_index_reg, 31, 0,
+                                    pkt->getLE<uint32_t>());
+    } else if (offset == AMDGPU_PCIE_INDEX2) {
+        assert(pkt->getSize() == 4);
+        pcie_index_reg = insertBits(pcie_index_reg, 63, 32,
+                                    pkt->getLE<uint32_t>());
     } else if (offset == AMDGPU_MP0_SMN_C2PMSG_35) {
         // See psp_v3_1_bootloader_load_sos in amdgpu driver code.
         if (pkt->getLE<uint32_t>() == 0x10000) {
@@ -144,6 +159,14 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset)
     } else if (offset == AMDGPU_MP0_SMN_C2PMSG_71) {
         // PSP ring size
         psp_ring_size = pkt->getLE<uint32_t>();
+    } else {
+        // Fallback to a map of register values. This was previously in the
+        // AMDGPUDevice, however that short-circuited some reads from other
+        // IP blocks. Since this is an end point IP block it is safer to use
+        // here.
+        regs[offset] = pkt->getLE<uint32_t>();
+        DPRINTF(AMDGPUDevice, "Writing value of unknown MMIO offset "
+                "%x: %x\n", offset, regs[offset]);
     }
 }
 
diff --git a/src/dev/amdgpu/amdgpu_nbio.hh b/src/dev/amdgpu/amdgpu_nbio.hh
index dc95443916..0d839d0e22 100644
--- a/src/dev/amdgpu/amdgpu_nbio.hh
+++ b/src/dev/amdgpu/amdgpu_nbio.hh
@@ -56,7 +56,11 @@ class AMDGPUDevice;
 #define AMDGPU_MM_INDEX                                   0x00000
 #define AMDGPU_MM_INDEX_HI                                0x00018
 #define AMDGPU_MM_DATA                                    0x00004
-#define AMDGPU_PCIE_DATA_REG                              0x0003c
+
+#define AMDGPU_PCIE_INDEX                                 0x00030
+#define AMDGPU_PCIE_INDEX2                                0x00038
+#define AMDGPU_PCIE_DATA                                  0x00034
+#define AMDGPU_PCIE_DATA2                                 0x0003c
 
 // Message bus related to psp
 #define AMDGPU_MP0_SMN_C2PMSG_33                          0x58184
@@ -66,6 +70,7 @@ class AMDGPUDevice;
 #define AMDGPU_MP0_SMN_C2PMSG_70                          0x58218
 #define AMDGPU_MP0_SMN_C2PMSG_71                          0x5821c
 #define AMDGPU_MP0_SMN_C2PMSG_81                          0x58244
+#define AMDGPU_MP1_SMN_C2PMSG_90                          0x58a68
 
 // Device specific invalidation engines used during initialization
 #define VEGA10_INV_ENG17_ACK1                             0x0a318
@@ -105,6 +110,7 @@ class AMDGPUNbio
      * Driver initialization sequence helper variables.
      */
     uint64_t mm_index_reg = 0;
+    uint64_t pcie_index_reg = 0;
     std::unordered_map<uint32_t, uint32_t> triggered_reads;
 
     /*
@@ -115,6 +121,12 @@ class AMDGPUNbio
     Addr psp_ring_listen_addr = 0;
     int psp_ring_size = 0;
     int psp_ring_value = 0;
+
+    /*
+     * Hold values of other registers not explicitly modelled by other blocks.
+     */
+    using GPURegMap = std::unordered_map<uint64_t, uint32_t>;
+    GPURegMap regs;
 };
 
 } // namespace gem5

From 047c19478023f5c467cc692e1ac717a9104a8ea7 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Tue, 13 Feb 2024 16:45:12 -0600
Subject: [PATCH 7/9] dev-amdgpu: Implement SRBM write

The SRBM write packets were previously not required. This commit
implements SRBM writes to set a register by using the new setRegVal
interface. SRBM writes seem to be used for SRIOV enabled devices.

Change-Id: I202653d339e882e8de59d69a995f65332b2dfb8c
---
 src/dev/amdgpu/sdma_engine.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc
index 94bcdf9cb9..070c04fe64 100644
--- a/src/dev/amdgpu/sdma_engine.cc
+++ b/src/dev/amdgpu/sdma_engine.cc
@@ -859,8 +859,7 @@ SDMAEngine::srbmWrite(SDMAQueue *q, sdmaSRBMWriteHeader *header,
     DPRINTF(SDMAEngine, "SRBM write to %#x with data %#x\n",
             reg_addr, pkt->data);
 
-    warn_once("SRBM write not performed, no SRBM model. This needs to be fixed"
-              " if correct system simulation is relying on SRBM registers.");
+    gpuDevice->setRegVal(reg_addr, pkt->data);
 
     delete header;
     delete pkt;

From 39153cd234c0dacd15351680df699fcd45d3fc01 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Tue, 13 Feb 2024 17:38:20 -0600
Subject: [PATCH 8/9] dev-amdgpu: Implement PCIe indirect read/write

PCIe can read/write to any 32-bit address using the PCI index/index2
registers as an address and then reading/writing the corresponding
data/data2 register.

This commit adds this functionality and removes one magic value being
written to support GPU POST. This feature is disabled for Vega10 which
relies on an MMIO trace for too many values to implement in the MMIO
interface.

Change-Id: Iacfdd1294a7652fc3e60304b57df536d318c847b
---
 src/dev/amdgpu/amdgpu_nbio.cc | 49 ++++++++++++++++++++++++++---------
 src/dev/amdgpu/amdgpu_nbio.hh |  3 ++-
 2 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/src/dev/amdgpu/amdgpu_nbio.cc b/src/dev/amdgpu/amdgpu_nbio.cc
index 89b1682631..ec44f16250 100644
--- a/src/dev/amdgpu/amdgpu_nbio.cc
+++ b/src/dev/amdgpu/amdgpu_nbio.cc
@@ -53,7 +53,21 @@ AMDGPUNbio::setGPUDevice(AMDGPUDevice *gpu_device)
 void
 AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset)
 {
+    // For Vega10 we rely on the golden values in an MMIO trace. Return
+    // immediately as to not clobber those values.
+    if (gpuDevice->getGfxVersion() == GfxVersion::gfx900) {
+        if (offset == AMDGPU_PCIE_DATA || offset == AMDGPU_PCIE_DATA2) {
+            return;
+        }
+    }
+
     switch (offset) {
+      // PCIE_DATA, PCIE_DATA2, PCIE_INDEX, and PCIE_INDEX2 handle "indirect
+      // register" reads/writes from the driver. This provides a way to read
+      // any register by providing a 32-bit address to one of the two INDEX
+      // registers and then reading the corresponding DATA register. See:
+      // https://github.com/ROCm/ROCK-Kernel-Driver/blob/roc-6.0.x/drivers/
+      //     gpu/drm/amd/amdgpu/amdgpu_device.c#L459
       case AMDGPU_PCIE_DATA:
         {
           uint32_t value = gpuDevice->getRegVal(pcie_index_reg);
@@ -62,19 +76,20 @@ AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset)
           pkt->setLE<uint32_t>(value);
         }
         break;
-      // This is a PCIe status register. At some point during driver init
-      // the driver checks that interrupts are enabled. This is only
-      // checked once, so if the MMIO trace does not exactly line up with
-      // what the driver is doing in gem5, this may still have the first
-      // bit zero causing driver to fail. Therefore, we always set this
-      // bit to one as there is no harm to do so.
       case AMDGPU_PCIE_DATA2:
         {
-          uint32_t value = pkt->getLE<uint32_t>() | 0x1;
-          DPRINTF(AMDGPUDevice, "Marking interrupts enabled: %#lx\n", value);
+          uint32_t value = gpuDevice->getRegVal(pcie_index2_reg);
+          DPRINTF(AMDGPUDevice, "Read PCIe index2 %lx data2 %x\n",
+                  pcie_index2_reg, value);
           pkt->setLE<uint32_t>(value);
         }
         break;
+      case AMDGPU_PCIE_INDEX:
+        pkt->setLE<uint32_t>(pcie_index_reg);
+        break;
+      case AMDGPU_PCIE_INDEX2:
+        pkt->setLE<uint32_t>(pcie_index2_reg);
+        break;
       case AMDGPU_MM_DATA:
         pkt->setLE<uint32_t>(gpuDevice->getRegVal(mm_index_reg));
         break;
@@ -130,14 +145,24 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset)
         DPRINTF(AMDGPUDevice, "MM write to reg %#lx data %#lx\n",
                 mm_index_reg, pkt->getLE<uint32_t>());
         gpuDevice->setRegVal(AMDGPU_MM_DATA, pkt->getLE<uint32_t>());
+    // PCIE_DATA, PCIE_DATA2, PCIE_INDEX, and PCIE_INDEX2 handle "indirect
+    // register" reads/writes from the driver. This provides a way to read
+    // any register by providing a 32-bit address to one of the two INDEX
+    // registers and then reading the corresponding DATA register. See:
+    // https://github.com/ROCm/ROCK-Kernel-Driver/blob/roc-6.0.x/drivers/
+    //     gpu/drm/amd/amdgpu/amdgpu_device.c#L459
     } else if (offset == AMDGPU_PCIE_INDEX) {
         assert(pkt->getSize() == 4);
-        pcie_index_reg = insertBits(pcie_index_reg, 31, 0,
-                                    pkt->getLE<uint32_t>());
+        pcie_index_reg = pkt->getLE<uint32_t>();
+    } else if (offset == AMDGPU_PCIE_DATA) {
+        assert(pkt->getSize() == 4);
+        gpuDevice->setRegVal(pcie_index_reg, pkt->getLE<uint32_t>());
     } else if (offset == AMDGPU_PCIE_INDEX2) {
         assert(pkt->getSize() == 4);
-        pcie_index_reg = insertBits(pcie_index_reg, 63, 32,
-                                    pkt->getLE<uint32_t>());
+        pcie_index2_reg = pkt->getLE<uint32_t>();
+    } else if (offset == AMDGPU_PCIE_DATA2) {
+        assert(pkt->getSize() == 4);
+        gpuDevice->setRegVal(pcie_index2_reg, pkt->getLE<uint32_t>());
     } else if (offset == AMDGPU_MP0_SMN_C2PMSG_35) {
         // See psp_v3_1_bootloader_load_sos in amdgpu driver code.
         if (pkt->getLE<uint32_t>() == 0x10000) {
diff --git a/src/dev/amdgpu/amdgpu_nbio.hh b/src/dev/amdgpu/amdgpu_nbio.hh
index 0d839d0e22..87afb02c41 100644
--- a/src/dev/amdgpu/amdgpu_nbio.hh
+++ b/src/dev/amdgpu/amdgpu_nbio.hh
@@ -110,7 +110,8 @@ class AMDGPUNbio
      * Driver initialization sequence helper variables.
      */
     uint64_t mm_index_reg = 0;
-    uint64_t pcie_index_reg = 0;
+    uint32_t pcie_index_reg = 0;
+    uint32_t pcie_index2_reg = 0;
     std::unordered_map<uint32_t, uint32_t> triggered_reads;
 
     /*

From 823b5a6eb87e45f2cb54d3b1c736dad11e4e70e4 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Tue, 13 Feb 2024 17:43:23 -0600
Subject: [PATCH 9/9] dev-amdgpu: Support multiple CPs and MMIO AddrRanges

Currently gem5 assumes that there is only one command processor (CP)
which contains the PM4 packet processor. Some GPU devices have multiple
CPs which the driver tests individually during POST if they are used or
not. Therefore, these additional CPs need to be supported.

This commit allows for multiple PM4 packet processors which represent
multiple CPs. Each of these processors will have its own independent
MMIO address range. To more easily support ranges, the MMIO addresses
now use AddrRange to index a PM4 packet processor instead of the
hard-coded constexpr MMIO start and size pairs.

By default only one PM4 packet processor is created, meaning the
functionality of the simulation is unchanged for devices currently
supported in gem5.

Change-Id: I977f4fd3a169ef4a78671a4fb58c8ea0e19bf52c
---
 configs/example/gpufs/system/system.py |  18 ++-
 src/dev/amdgpu/AMDGPU.py               |   6 +-
 src/dev/amdgpu/amdgpu_defines.hh       |  37 ++----
 src/dev/amdgpu/amdgpu_device.cc        | 160 +++++++++++++++++--------
 src/dev/amdgpu/amdgpu_device.hh        |  16 ++-
 src/dev/amdgpu/amdgpu_vm.cc            |  30 +++++
 src/dev/amdgpu/amdgpu_vm.hh            |  53 ++++----
 src/dev/amdgpu/pm4_mmio.hh             |  54 ++++-----
 src/dev/amdgpu/pm4_packet_processor.cc |  15 ++-
 src/dev/amdgpu/pm4_packet_processor.hh |   7 ++
 10 files changed, 245 insertions(+), 151 deletions(-)

diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py
index 671d4efdc9..1f89bd935b 100644
--- a/configs/example/gpufs/system/system.py
+++ b/configs/example/gpufs/system/system.py
@@ -188,9 +188,15 @@ def makeGpuFSSystem(args):
 
     system.pc.south_bridge.gpu.sdmas = sdma_engines
 
-    # Setup PM4 packet processor
-    pm4_pkt_proc = PM4PacketProcessor()
-    system.pc.south_bridge.gpu.pm4_pkt_proc = pm4_pkt_proc
+    # Setup PM4 packet processors
+    pm4_procs = []
+    pm4_procs.append(
+        PM4PacketProcessor(
+            ip_id=0, mmio_range=AddrRange(start=0xC000, end=0xD000)
+        )
+    )
+
+    system.pc.south_bridge.gpu.pm4_pkt_procs = pm4_procs
 
     # GPU data path
     gpu_mem_mgr = AMDGPUMemoryManager()
@@ -207,7 +213,8 @@ def makeGpuFSSystem(args):
     for sdma in sdma_engines:
         system._dma_ports.append(sdma)
     system._dma_ports.append(device_ih)
-    system._dma_ports.append(pm4_pkt_proc)
+    for pm4_proc in pm4_procs:
+        system._dma_ports.append(pm4_proc)
     system._dma_ports.append(system_hub)
     system._dma_ports.append(gpu_mem_mgr)
     system._dma_ports.append(hsapp_pt_walker)
@@ -221,7 +228,8 @@ def makeGpuFSSystem(args):
     for sdma in sdma_engines:
         sdma.pio = system.iobus.mem_side_ports
     device_ih.pio = system.iobus.mem_side_ports
-    pm4_pkt_proc.pio = system.iobus.mem_side_ports
+    for pm4_proc in pm4_procs:
+        pm4_proc.pio = system.iobus.mem_side_ports
     system_hub.pio = system.iobus.mem_side_ports
 
     # Full system needs special TLBs for SQC, Scalar, and vector data ports
diff --git a/src/dev/amdgpu/AMDGPU.py b/src/dev/amdgpu/AMDGPU.py
index 0370f09e01..0e0f597927 100644
--- a/src/dev/amdgpu/AMDGPU.py
+++ b/src/dev/amdgpu/AMDGPU.py
@@ -95,7 +95,7 @@ class AMDGPUDevice(PciDevice):
     # The config script should not create a new cp here but rather assign the
     # same cp that is assigned to the Shader SimObject.
     cp = Param.GPUCommandProcessor(NULL, "Command Processor")
-    pm4_pkt_proc = Param.PM4PacketProcessor("PM4 Packet Processor")
+    pm4_pkt_procs = VectorParam.PM4PacketProcessor("PM4 Packet Processor")
     memory_manager = Param.AMDGPUMemoryManager("GPU Memory Manager")
     memories = VectorParam.AbstractMemory([], "All memories in the device")
     device_ih = Param.AMDGPUInterruptHandler("GPU Interrupt handler")
@@ -118,6 +118,10 @@ class PM4PacketProcessor(DmaVirtDevice):
     cxx_header = "dev/amdgpu/pm4_packet_processor.hh"
     cxx_class = "gem5::PM4PacketProcessor"
 
+    # Default to 0 as the common case is one PM4 packet processor
+    ip_id = Param.Int(0, "Instance ID of this PM4 processor")
+    mmio_range = Param.AddrRange("Range of MMIO addresses")
+
 
 class AMDGPUMemoryManager(ClockedObject):
     type = "AMDGPUMemoryManager"
diff --git a/src/dev/amdgpu/amdgpu_defines.hh b/src/dev/amdgpu/amdgpu_defines.hh
index bc6377fbbc..883501b84d 100644
--- a/src/dev/amdgpu/amdgpu_defines.hh
+++ b/src/dev/amdgpu/amdgpu_defines.hh
@@ -49,6 +49,16 @@ enum QueueType
     RLC
 };
 
+/*
+ * Hold information about doorbells including queue type and the IP
+ * block ID if the IP can have multiple instances.
+ */
+typedef struct
+{
+    QueueType qtype;
+    int ip_id;
+} DoorbellInfo;
+
 // AMD GPUs support 16 different virtual address spaces
 static constexpr int AMDGPU_VM_COUNT = 16;
 
@@ -61,36 +71,11 @@ constexpr int MMIO_BAR = 5;
 constexpr uint32_t VGA_ROM_DEFAULT = 0xc0000;
 constexpr uint32_t ROM_SIZE = 0x20000;        // 128kB
 
-/* SDMA base, size, mmio offset shift. */
-static constexpr uint32_t SDMA0_BASE  = 0x4980;
-static constexpr uint32_t SDMA1_BASE  = 0x5180;
-static constexpr uint32_t SDMA_SIZE  = 0x800;
-static constexpr uint32_t SDMA_OFFSET_SHIFT  = 2;
-
-/* Interrupt handler base, size, mmio offset shift. */
-static constexpr uint32_t IH_BASE = 0x4280;
-static constexpr uint32_t IH_SIZE = 0x700;
+/* Most MMIOs use DWORD addresses and thus need to be shifted. */
 static constexpr uint32_t IH_OFFSET_SHIFT = 2;
-
-/* Graphics register bus manager base, size, mmio offset shift. */
-static constexpr uint32_t GRBM_BASE  = 0x8000;
-static constexpr uint32_t GRBM_SIZE  = 0x5000;
 static constexpr uint32_t GRBM_OFFSET_SHIFT  = 2;
-
-/* GFX base, size, mmio offset shift. */
-static constexpr uint32_t GFX_BASE  = 0x28000;
-static constexpr uint32_t GFX_SIZE  = 0x17000;
-static constexpr uint32_t GFX_OFFSET_SHIFT  = 2;
-
-/* MMHUB base, size, mmio offset shift. */
-static constexpr uint32_t MMHUB_BASE = 0x68000;
-static constexpr uint32_t MMHUB_SIZE = 0x2120;
 static constexpr uint32_t MMHUB_OFFSET_SHIFT = 2;
 
-/* NBIO base and size. */
-static constexpr uint32_t NBIO_BASE = 0x0;
-static constexpr uint32_t NBIO_SIZE = 0x4280;
-
 } // namespace gem5
 
 #endif // __DEV_AMDGPU_AMDGPU_DEFINES_HH__
diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc
index 4b684aa221..5ddd7756ba 100644
--- a/src/dev/amdgpu/amdgpu_device.cc
+++ b/src/dev/amdgpu/amdgpu_device.cc
@@ -54,8 +54,7 @@ namespace gem5
 
 AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
     : PciDevice(p), gpuMemMgr(p.memory_manager), deviceIH(p.device_ih),
-      pm4PktProc(p.pm4_pkt_proc), cp(p.cp),
-      checkpoint_before_mmios(p.checkpoint_before_mmios),
+      cp(p.cp), checkpoint_before_mmios(p.checkpoint_before_mmios),
       init_interrupt_count(0), _lastVMID(0),
       deviceMem(name() + ".deviceMem", p.memories, false, "", false)
 {
@@ -81,6 +80,16 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
         romRange = RangeSize(VGA_ROM_DEFAULT, ROM_SIZE);
     }
 
+    if (p.device_name == "Vega10") {
+        gfx_version = GfxVersion::gfx900;
+    } else if (p.device_name == "MI100") {
+        gfx_version = GfxVersion::gfx908;
+    } else if (p.device_name == "MI200") {
+        gfx_version = GfxVersion::gfx90a;
+    } else {
+        panic("Unknown GPU device %s\n", p.device_name);
+    }
+
     if (p.trace_file != "") {
         mmioReader.readMMIOTrace(p.trace_file);
     }
@@ -126,8 +135,22 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
         panic("Unknown GPU device %s\n", p.device_name);
     }
 
+    // Setup PM4 packet processors and sanity check IDs
+    std::set<int> pm4_ids;
+    for (auto& pm4 : p.pm4_pkt_procs) {
+        pm4->setGPUDevice(this);
+        fatal_if(pm4_ids.count(pm4->getIpId()),
+                "Two PM4s with same IP IDs is not allowed");
+        pm4_ids.insert(pm4->getIpId());
+        pm4PktProcs.insert({pm4->getIpId(), pm4});
+
+        pm4Ranges.insert({pm4->getMMIORange(), pm4});
+    }
+
+    // There should be at least one PM4 packet processor with ID 0
+    fatal_if(!pm4PktProcs.count(0), "No default PM4 processor found");
+
     deviceIH->setGPUDevice(this);
-    pm4PktProc->setGPUDevice(this);
     cp->hsaPacketProc().setGPUDevice(this);
     cp->setGPUDevice(this);
     nbio.setGPUDevice(this);
@@ -136,6 +159,23 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
     // could possibly be anything, but these are the values used by hardware.
     uint64_t mmhubBase = 0x8000ULL << 24;
     uint64_t mmhubTop = 0x83ffULL << 24;
+    uint64_t mem_size = 0x3ff0; // 16 GB of memory
+
+    gpuvm.setMMHUBBase(mmhubBase);
+    gpuvm.setMMHUBTop(mmhubTop);
+
+    // Map other MMIO apertures based on gfx version. This must be done before
+    // any calls to get/setRegVal.
+    // NBIO               0x0     - 0x4280
+    // IH                 0x4280  - 0x4980
+    // GRBM               0x8000  - 0xC000
+    // GFX                0x28000 - 0x3F000
+    // MMHUB              0x68000 - 0x6a120
+    gpuvm.setMMIOAperture(NBIO_MMIO_RANGE, AddrRange(0x0, 0x4280));
+    gpuvm.setMMIOAperture(IH_MMIO_RANGE,   AddrRange(0x4280, 0x4980));
+    gpuvm.setMMIOAperture(GRBM_MMIO_RANGE, AddrRange(0x8000, 0xC000));
+    gpuvm.setMMIOAperture(GFX_MMIO_RANGE,  AddrRange(0x28000, 0x3F000));
+    gpuvm.setMMIOAperture(MMHUB_MMIO_RANGE,  AddrRange(0x68000, 0x6A120));
 
     // These are hardcoded register values to return what the driver expects
     setRegVal(AMDGPU_MP0_SMN_C2PMSG_33, 0x80000000);
@@ -145,25 +185,19 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
     if (p.device_name == "Vega10") {
         setRegVal(VEGA10_FB_LOCATION_BASE, mmhubBase >> 24);
         setRegVal(VEGA10_FB_LOCATION_TOP, mmhubTop >> 24);
-        gfx_version = GfxVersion::gfx900;
     } else if (p.device_name == "MI100") {
         setRegVal(MI100_FB_LOCATION_BASE, mmhubBase >> 24);
         setRegVal(MI100_FB_LOCATION_TOP, mmhubTop >> 24);
-        setRegVal(MI100_MEM_SIZE_REG, 0x3ff0); // 16GB of memory
-        gfx_version = GfxVersion::gfx908;
+        setRegVal(MI100_MEM_SIZE_REG, mem_size);
     } else if (p.device_name == "MI200") {
         // This device can have either 64GB or 128GB of device memory.
         // This limits to 16GB for simulation.
         setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
         setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
-        setRegVal(MI200_MEM_SIZE_REG, 0x3ff0);
-        gfx_version = GfxVersion::gfx90a;
+        setRegVal(MI200_MEM_SIZE_REG, mem_size);
     } else {
         panic("Unknown GPU device %s\n", p.device_name);
     }
-
-    gpuvm.setMMHUBBase(mmhubBase);
-    gpuvm.setMMHUBTop(mmhubTop);
 }
 
 void
@@ -356,29 +390,28 @@ AMDGPUDevice::readDoorbell(PacketPtr pkt, Addr offset)
 void
 AMDGPUDevice::readMMIO(PacketPtr pkt, Addr offset)
 {
-    Addr aperture = gpuvm.getMmioAperture(offset);
-    Addr aperture_offset = offset - aperture;
+    AddrRange aperture = gpuvm.getMMIOAperture(offset);
+    Addr aperture_offset = offset - aperture.start();
 
     // By default read from MMIO trace. Overwrite the packet for a select
     // few more dynamic MMIOs.
     DPRINTF(AMDGPUDevice, "Read MMIO %#lx\n", offset);
     mmioReader.readFromTrace(pkt, MMIO_BAR, offset);
 
-    switch (aperture) {
-      case NBIO_BASE:
+    if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "NBIO base\n");
         nbio.readMMIO(pkt, aperture_offset);
-        break;
-      case GRBM_BASE:
+    } else if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "GRBM base\n");
         gpuvm.readMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
-        break;
-      case GFX_BASE:
+    } else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "GFX base\n");
         gfx.readMMIO(pkt, aperture_offset);
-        break;
-      case MMHUB_BASE:
+    } else if (aperture == gpuvm.getMMIORange(MMHUB_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "MMHUB base\n");
         gpuvm.readMMIO(pkt, aperture_offset >> MMHUB_OFFSET_SHIFT);
-        break;
-      default:
-        break;
+    } else {
+        DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for read %#x\n", offset);
     }
 }
 
@@ -422,17 +455,22 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
     DPRINTF(AMDGPUDevice, "Wrote doorbell %#lx\n", offset);
 
     if (doorbells.find(offset) != doorbells.end()) {
-        QueueType q_type = doorbells[offset];
+        QueueType q_type = doorbells[offset].qtype;
+        int ip_id = doorbells[offset].ip_id;
         DPRINTF(AMDGPUDevice, "Doorbell offset %p queue: %d\n",
                               offset, q_type);
         switch (q_type) {
           case Compute:
-            pm4PktProc->process(pm4PktProc->getQueue(offset),
-                                pkt->getLE<uint64_t>());
+            assert(pm4PktProcs.count(ip_id));
+            pm4PktProcs[ip_id]->process(
+                pm4PktProcs[ip_id]->getQueue(offset),
+                pkt->getLE<uint64_t>());
           break;
           case Gfx:
-            pm4PktProc->process(pm4PktProc->getQueue(offset, true),
-                                pkt->getLE<uint64_t>());
+            assert(pm4PktProcs.count(ip_id));
+            pm4PktProcs[ip_id]->process(
+                pm4PktProcs[ip_id]->getQueue(offset, true),
+                pkt->getLE<uint64_t>());
           break;
           case SDMAGfx: {
             SDMAEngine *sdmaEng = getSDMAEngine(offset);
@@ -443,9 +481,11 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
             sdmaEng->processPage(pkt->getLE<uint64_t>());
           } break;
           case ComputeAQL: {
+            assert(pm4PktProcs.count(ip_id));
             cp->hsaPacketProc().hwScheduler()->write(offset,
                 pkt->getLE<uint64_t>() + 1);
-            pm4PktProc->updateReadIndex(offset, pkt->getLE<uint64_t>() + 1);
+            pm4PktProcs[ip_id]->updateReadIndex(offset,
+                pkt->getLE<uint64_t>() + 1);
           } break;
           case InterruptHandler:
             deviceIH->updateRptr(pkt->getLE<uint32_t>());
@@ -475,12 +515,12 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
 void
 AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset)
 {
-    Addr aperture = gpuvm.getMmioAperture(offset);
-    Addr aperture_offset = offset - aperture;
+    AddrRange aperture = gpuvm.getMMIOAperture(offset);
+    Addr aperture_offset = offset - aperture.start();
 
     DPRINTF(AMDGPUDevice, "Wrote MMIO %#lx\n", offset);
 
-    // Check SDMA functions first, then fallback to switch statement
+    // Check SDMA functions first, then fallback to MMIO ranges.
     for (int idx = 0; idx < sdmaIds.size(); ++idx) {
         if (sdmaMmios[idx].contains(offset)) {
             Addr sdma_offset = (offset - sdmaMmios[idx].start()) >> 2;
@@ -498,26 +538,31 @@ AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset)
         }
     }
 
-    switch (aperture) {
-      /* Write a general register to the graphics register bus manager. */
-      case GRBM_BASE:
+    // Check PM4s next, returning to avoid duplicate writes.
+    for (auto& [range, pm4_proc] : pm4Ranges) {
+        if (range.contains(offset)) {
+            // PM4 MMIOs are offset based on the MMIO range start
+            Addr ip_offset = offset - range.start();
+            pm4_proc->writeMMIO(pkt, ip_offset >> GRBM_OFFSET_SHIFT);
+
+            return;
+        }
+    }
+
+    if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "GRBM base\n");
         gpuvm.writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
-        pm4PktProc->writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
-        break;
-      /* Write a register to the interrupt handler. */
-      case IH_BASE:
+    } else if (aperture == gpuvm.getMMIORange(IH_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "IH base\n");
         deviceIH->writeMMIO(pkt, aperture_offset >> IH_OFFSET_SHIFT);
-        break;
-      /* Write an IO space register */
-      case NBIO_BASE:
+    } else if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "NBIO base\n");
         nbio.writeMMIO(pkt, aperture_offset);
-        break;
-      case GFX_BASE:
+    } else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) {
+        DPRINTF(AMDGPUDevice, "GFX base\n");
         gfx.writeMMIO(pkt, aperture_offset);
-        break;
-      default:
-        DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for %#x\n", offset);
-        break;
+    } else {
+        DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for write %#x\n", offset);
     }
 }
 
@@ -638,10 +683,11 @@ AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value)
 }
 
 void
-AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt)
+AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt, int ip_id)
 {
     DPRINTF(AMDGPUDevice, "Setting doorbell type for %x\n", offset);
-    doorbells[offset] = qt;
+    doorbells[offset].qtype = qt;
+    doorbells[offset].ip_id = ip_id;
 }
 
 void
@@ -692,6 +738,7 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
     // Make a c-style array of the regs to serialize
     uint32_t doorbells_offset[doorbells_size];
     QueueType doorbells_queues[doorbells_size];
+    int doorbells_ip_ids[doorbells_size];
     uint32_t sdma_engs_offset[sdma_engs_size];
     int sdma_engs[sdma_engs_size];
     int used_vmids[used_vmid_map_size];
@@ -701,7 +748,8 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
     int idx = 0;
     for (auto & it : doorbells) {
         doorbells_offset[idx] = it.first;
-        doorbells_queues[idx] = it.second;
+        doorbells_queues[idx] = it.second.qtype;
+        doorbells_ip_ids[idx] = it.second.ip_id;
         ++idx;
     }
 
@@ -730,6 +778,8 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
         sizeof(doorbells_offset[0]));
     SERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/
         sizeof(doorbells_queues[0]));
+    SERIALIZE_ARRAY(doorbells_ip_ids, sizeof(doorbells_ip_ids)/
+        sizeof(doorbells_ip_ids[0]));
     SERIALIZE_ARRAY(sdma_engs_offset, sizeof(sdma_engs_offset)/
         sizeof(sdma_engs_offset[0]));
     SERIALIZE_ARRAY(sdma_engs, sizeof(sdma_engs)/sizeof(sdma_engs[0]));
@@ -768,14 +818,18 @@ AMDGPUDevice::unserialize(CheckpointIn &cp)
     if (doorbells_size > 0) {
         uint32_t doorbells_offset[doorbells_size];
         QueueType doorbells_queues[doorbells_size];
+        int doorbells_ip_ids[doorbells_size];
 
         UNSERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/
                 sizeof(doorbells_offset[0]));
         UNSERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/
                 sizeof(doorbells_queues[0]));
+        UNSERIALIZE_ARRAY(doorbells_ip_ids, sizeof(doorbells_ip_ids)/
+                sizeof(doorbells_ip_ids[0]));
 
         for (int idx = 0; idx < doorbells_size; ++idx) {
-            doorbells[doorbells_offset[idx]] = doorbells_queues[idx];
+            doorbells[doorbells_offset[idx]].qtype = doorbells_queues[idx];
+            doorbells[doorbells_offset[idx]].ip_id = doorbells_ip_ids[idx];
         }
     }
 
diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh
index fface5fb3e..33b6a9f3e7 100644
--- a/src/dev/amdgpu/amdgpu_device.hh
+++ b/src/dev/amdgpu/amdgpu_device.hh
@@ -87,7 +87,7 @@ class AMDGPUDevice : public PciDevice
     /**
      * Structures to hold registers, doorbells, and some frame memory
      */
-    std::unordered_map<uint32_t, QueueType> doorbells;
+    std::unordered_map<uint32_t, DoorbellInfo> doorbells;
     std::unordered_map<uint32_t, PacketPtr> pendingDoorbellPkts;
 
     /**
@@ -113,9 +113,19 @@ class AMDGPUDevice : public PciDevice
     AMDGPUMemoryManager *gpuMemMgr;
     AMDGPUInterruptHandler *deviceIH;
     AMDGPUVM gpuvm;
-    PM4PacketProcessor *pm4PktProc;
     GPUCommandProcessor *cp;
 
+    // Hash functor keying an AddrRange by its start address alone.
+    struct AddrRangeHasher
+    {
+        std::size_t
+        operator()(const AddrRange& range) const
+        { return static_cast<std::size_t>(range.start()); }
+    };
+    std::unordered_map<int, PM4PacketProcessor *> pm4PktProcs;
+    std::unordered_map<AddrRange, PM4PacketProcessor *,
+                       AddrRangeHasher> pm4Ranges;
+
     // SDMAs mapped by doorbell offset
     std::unordered_map<uint32_t, SDMAEngine *> sdmaEngs;
     // SDMAs mapped by ID
@@ -185,7 +195,7 @@ class AMDGPUDevice : public PciDevice
     /**
      * Set handles to GPU blocks.
      */
-    void setDoorbellType(uint32_t offset, QueueType qt);
+    void setDoorbellType(uint32_t offset, QueueType qt, int ip_id = 0);
     void processPendingDoorbells(uint32_t offset);
     void setSDMAEngine(Addr offset, SDMAEngine *eng);
 
diff --git a/src/dev/amdgpu/amdgpu_vm.cc b/src/dev/amdgpu/amdgpu_vm.cc
index 5a13ac9ba0..0eea590c5a 100644
--- a/src/dev/amdgpu/amdgpu_vm.cc
+++ b/src/dev/amdgpu/amdgpu_vm.cc
@@ -37,6 +37,7 @@
 #include "base/trace.hh"
 #include "debug/AMDGPUDevice.hh"
 #include "dev/amdgpu/amdgpu_defines.hh"
+#include "dev/amdgpu/amdgpu_device.hh"
 #include "mem/packet_access.hh"
 
 namespace gem5
@@ -51,6 +52,35 @@ AMDGPUVM::AMDGPUVM()
     for (int i = 0; i < AMDGPU_VM_COUNT; ++i) {
         memset(&vmContexts[0], 0, sizeof(AMDGPUVMContext));
     }
+
+    for (int i = 0; i < NUM_MMIO_RANGES; ++i) {
+        mmioRanges[i] = AddrRange();
+    }
+}
+
+void
+AMDGPUVM::setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range)
+{
+    mmioRanges[mmio_aperture] = range;
+}
+
+AddrRange
+AMDGPUVM::getMMIORange(mmio_range_t mmio_aperture)
+{
+    return mmioRanges[mmio_aperture];
+}
+
+// Return the first mmioRanges entry containing offset; NBIO's if none does.
+const AddrRange&
+AMDGPUVM::getMMIOAperture(Addr offset)
+{
+    for (const AddrRange& candidate : mmioRanges) {
+        if (candidate.contains(offset))
+            return candidate;
+    }
+
+    // No registered aperture holds this offset: fall back to NBIO.
+    return mmioRanges[NBIO_MMIO_RANGE];
 }
 
 Addr
diff --git a/src/dev/amdgpu/amdgpu_vm.hh b/src/dev/amdgpu/amdgpu_vm.hh
index 5af666f379..857ef724da 100644
--- a/src/dev/amdgpu/amdgpu_vm.hh
+++ b/src/dev/amdgpu/amdgpu_vm.hh
@@ -99,9 +99,23 @@ static constexpr int AMDGPU_USER_PAGE_SIZE = 4096;
 namespace gem5
 {
 
+typedef enum : int // values are used as indices into AMDGPUVM::mmioRanges
+{
+    NBIO_MMIO_RANGE,  // also the fallback when getMMIOAperture finds no match
+    MMHUB_MMIO_RANGE,
+    GFX_MMIO_RANGE,
+    GRBM_MMIO_RANGE,
+    IH_MMIO_RANGE,
+    NUM_MMIO_RANGES   // entry count; sizes the mmioRanges array
+} mmio_range_t;
+
+class AMDGPUDevice;
+
 class AMDGPUVM : public Serializable
 {
   private:
+    AMDGPUDevice *gpuDevice = nullptr; // assigned later via setGPUDevice()
+
     typedef struct GEM5_PACKED
     {
         // Page table addresses: from (Base + Start) to (End)
@@ -160,9 +174,13 @@ class AMDGPUVM : public Serializable
      */
     std::vector<VegaISA::GpuTLB *> gpu_tlbs;
 
+    std::array<AddrRange, NUM_MMIO_RANGES> mmioRanges;
+
   public:
     AMDGPUVM();
 
+    void setGPUDevice(AMDGPUDevice *gpu_device) { gpuDevice = gpu_device; }
+
     /**
      * Return base address of GART table in framebuffer.
      */
@@ -232,38 +250,11 @@ class AMDGPUVM : public Serializable
     Addr getSysAddrRangeLow () { return vmContext0.sysAddrL; }
     Addr getSysAddrRangeHigh () { return vmContext0.sysAddrH; }
 
-    Addr
-    getMmioAperture(Addr addr)
-    {
-        // Aperture ranges:
-        // NBIO               0x0     - 0x4280
-        // IH                 0x4280  - 0x4980
-        // SDMA0              0x4980  - 0x5180
-        // SDMA1              0x5180  - 0x5980
-        // GRBM               0x8000  - 0xD000
-        // GFX                0x28000 - 0x3F000
-        // MMHUB              0x68000 - 0x6a120
+    void setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range);
+    const AddrRange& getMMIOAperture(Addr addr);
+    AddrRange getMMIORange(mmio_range_t mmio_aperture);
 
-        if (IH_BASE <= addr && addr < IH_BASE + IH_SIZE)
-            return IH_BASE;
-        else if (SDMA0_BASE <= addr && addr < SDMA0_BASE + SDMA_SIZE)
-            return SDMA0_BASE;
-        else if (SDMA1_BASE <= addr && addr < SDMA1_BASE + SDMA_SIZE)
-            return SDMA1_BASE;
-        else if (GRBM_BASE <= addr && addr < GRBM_BASE + GRBM_SIZE)
-            return GRBM_BASE;
-        else if (GFX_BASE <= addr && addr < GFX_BASE + GFX_SIZE)
-            return GFX_BASE;
-        else if (MMHUB_BASE <= addr && addr < MMHUB_BASE + MMHUB_SIZE)
-            return MMHUB_BASE;
-        else {
-            warn_once("Accessing unsupported MMIO aperture! Assuming NBIO\n");
-            return NBIO_BASE;
-        }
-
-    }
-
-    // Gettig mapped aperture base addresses
+    // Getting mapped aperture base addresses
     Addr
     getFrameAperture(Addr addr)
     {
diff --git a/src/dev/amdgpu/pm4_mmio.hh b/src/dev/amdgpu/pm4_mmio.hh
index 3801223175..e9e504c3cd 100644
--- a/src/dev/amdgpu/pm4_mmio.hh
+++ b/src/dev/amdgpu/pm4_mmio.hh
@@ -36,34 +36,34 @@
 namespace gem5
 {
 
-#define mmCP_RB0_BASE                                                 0x1040
-#define mmCP_RB0_CNTL                                                 0x1041
-#define mmCP_RB_WPTR_POLL_ADDR_LO                                     0x1046
-#define mmCP_RB_WPTR_POLL_ADDR_HI                                     0x1047
-#define mmCP_RB_VMID                                                  0x1051
-#define mmCP_RB0_RPTR_ADDR                                            0x1043
-#define mmCP_RB0_RPTR_ADDR_HI                                         0x1044
-#define mmCP_RB0_WPTR                                                 0x1054
-#define mmCP_RB0_WPTR_HI                                              0x1055
-#define mmCP_RB_DOORBELL_CONTROL                                      0x1059
-#define mmCP_RB_DOORBELL_RANGE_LOWER                                  0x105a
-#define mmCP_RB_DOORBELL_RANGE_UPPER                                  0x105b
-#define mmCP_RB0_BASE_HI                                              0x10b1
+#define mmCP_RB0_BASE                                                 0x040
+#define mmCP_RB0_CNTL                                                 0x041
+#define mmCP_RB_WPTR_POLL_ADDR_LO                                     0x046
+#define mmCP_RB_WPTR_POLL_ADDR_HI                                     0x047
+#define mmCP_RB_VMID                                                  0x051
+#define mmCP_RB0_RPTR_ADDR                                            0x043
+#define mmCP_RB0_RPTR_ADDR_HI                                         0x044
+#define mmCP_RB0_WPTR                                                 0x054
+#define mmCP_RB0_WPTR_HI                                              0x055
+#define mmCP_RB_DOORBELL_CONTROL                                      0x059
+#define mmCP_RB_DOORBELL_RANGE_LOWER                                  0x05a
+#define mmCP_RB_DOORBELL_RANGE_UPPER                                  0x05b
+#define mmCP_RB0_BASE_HI                                              0x0b1
 
-#define mmCP_HQD_ACTIVE                                               0x1247
-#define mmCP_HQD_VMID                                                 0x1248
-#define mmCP_HQD_PQ_BASE                                              0x124d
-#define mmCP_HQD_PQ_BASE_HI                                           0x124e
-#define mmCP_HQD_PQ_DOORBELL_CONTROL                                  0x1254
-#define mmCP_HQD_PQ_RPTR                                              0x124f
-#define mmCP_HQD_PQ_RPTR_REPORT_ADDR                                  0x1250
-#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI                               0x1251
-#define mmCP_HQD_PQ_WPTR_POLL_ADDR                                    0x1252
-#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI                                 0x1253
-#define mmCP_HQD_PQ_CONTROL                                           0x1256
-#define mmCP_HQD_IB_CONTROL                                           0x125a
-#define mmCP_HQD_PQ_WPTR_LO                                           0x127b
-#define mmCP_HQD_PQ_WPTR_HI                                           0x127c
+#define mmCP_HQD_ACTIVE                                               0x247
+#define mmCP_HQD_VMID                                                 0x248
+#define mmCP_HQD_PQ_BASE                                              0x24d
+#define mmCP_HQD_PQ_BASE_HI                                           0x24e
+#define mmCP_HQD_PQ_DOORBELL_CONTROL                                  0x254
+#define mmCP_HQD_PQ_RPTR                                              0x24f
+#define mmCP_HQD_PQ_RPTR_REPORT_ADDR                                  0x250
+#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI                               0x251
+#define mmCP_HQD_PQ_WPTR_POLL_ADDR                                    0x252
+#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI                                 0x253
+#define mmCP_HQD_PQ_CONTROL                                           0x256
+#define mmCP_HQD_IB_CONTROL                                           0x25a
+#define mmCP_HQD_PQ_WPTR_LO                                           0x27b
+#define mmCP_HQD_PQ_WPTR_HI                                           0x27c
 
 } // namespace gem5
 
diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc
index c8baa5eab4..62e817aa98 100644
--- a/src/dev/amdgpu/pm4_packet_processor.cc
+++ b/src/dev/amdgpu/pm4_packet_processor.cc
@@ -49,7 +49,7 @@ namespace gem5
 {
 
 PM4PacketProcessor::PM4PacketProcessor(const PM4PacketProcessorParams &p)
-    : DmaVirtDevice(p)
+    : DmaVirtDevice(p), _ipId(p.ip_id), _mmioRange(p.mmio_range)
 {
     memset(&kiq, 0, sizeof(QueueDesc));
     memset(&pq, 0, sizeof(QueueDesc));
@@ -144,7 +144,7 @@ PM4PacketProcessor::newQueue(QueueDesc *mqd, Addr offset,
     QueueType qt;
     qt = mqd->aql ? QueueType::ComputeAQL
                   : QueueType::Compute;
-    gpuDevice->setDoorbellType(offset, qt);
+    gpuDevice->setDoorbellType(offset, qt, getIpId());
 
     DPRINTF(PM4PacketProcessor, "New PM4 queue %d, base: %p offset: %p, me: "
             "%d, pipe %d queue: %d size: %d\n", id, q->base(), q->offset(),
@@ -521,7 +521,7 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
 
     // Register doorbell with GPU device
     gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng);
-    gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC);
+    gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC, getIpId());
 
     gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2);
 }
@@ -774,9 +774,14 @@ PM4PacketProcessor::setUconfigReg(PM4Queue *q, PM4SetUconfigReg *pkt)
 {
     q->incRptr(sizeof(PM4SetUconfigReg));
 
+    DPRINTF(PM4PacketProcessor, "SetUconfig offset %x data %x\n",
+            pkt->offset, pkt->data);
+
     // SET_UCONFIG_REG_START and pkt->offset are dword addresses
     uint32_t reg_addr = (PACKET3_SET_UCONFIG_REG_START + pkt->offset) * 4;
 
+    // Additional CPs respond to addresses 0x40000 apart.
+    reg_addr += 0x40000 * getIpId();
     gpuDevice->setRegVal(reg_addr, pkt->data);
 
     decodeNext(q);
@@ -851,7 +856,7 @@ PM4PacketProcessor::writeMMIO(PacketPtr pkt, Addr mmio_offset)
         break;
       case mmCP_HQD_PQ_DOORBELL_CONTROL:
         setHqdPqDoorbellCtrl(pkt->getLE<uint32_t>());
-        gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute);
+        gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute, getIpId());
         break;
       case mmCP_HQD_PQ_RPTR:
         setHqdPqPtr(pkt->getLE<uint32_t>());
@@ -913,7 +918,7 @@ PM4PacketProcessor::writeMMIO(PacketPtr pkt, Addr mmio_offset)
         break;
       case mmCP_RB_DOORBELL_CONTROL:
         setRbDoorbellCntrl(pkt->getLE<uint32_t>());
-        gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx);
+        gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx, getIpId());
         break;
       case mmCP_RB_DOORBELL_RANGE_LOWER:
         setRbDoorbellRangeLo(pkt->getLE<uint32_t>());
diff --git a/src/dev/amdgpu/pm4_packet_processor.hh b/src/dev/amdgpu/pm4_packet_processor.hh
index 4782e70829..82c3c2716f 100644
--- a/src/dev/amdgpu/pm4_packet_processor.hh
+++ b/src/dev/amdgpu/pm4_packet_processor.hh
@@ -63,6 +63,10 @@ class PM4PacketProcessor : public DmaVirtDevice
     std::unordered_map<uint16_t, PM4Queue *> queues;
     /* A map of PM4 queues based on doorbell offset */
     std::unordered_map<uint32_t, PM4Queue *> queuesMap;
+
+    int _ipId;
+    AddrRange _mmioRange;
+
   public:
     PM4PacketProcessor(const PM4PacketProcessorParams &p);
 
@@ -188,6 +192,9 @@ class PM4PacketProcessor : public DmaVirtDevice
     void setRbDoorbellCntrl(uint32_t data);
     void setRbDoorbellRangeLo(uint32_t data);
     void setRbDoorbellRangeHi(uint32_t data);
+
+    int getIpId() const { return _ipId; }
+    AddrRange getMMIORange() const { return _mmioRange; }
 };
 
 } // namespace gem5