From 0faa9510f95c06047709eb3dac1bb89242d0a61f Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 24 Apr 2024 10:42:22 -0700
Subject: [PATCH 1/6] arch-vega,gpu-compute: Fix misc ubsan runtime errors

Three main fixes:
 - Remove the lambda in initDynOperandInfo. UBSAN reports errors and exits
   due to things not being captured properly. After a few failed attempts
   playing with the capture list, just move the lambda to a new method.
 - Invalid data type size for some thread mask instructions. This might
   actually have caused silent bugs when the thread id was > 31.
 - Alignment issues with the operands.

Change-Id: I0297e10df0f0ab9730b6f1bd132602cd36b5e7ac
---
 src/arch/amdgpu/vega/insts/sop2.cc |  3 +-
 src/arch/amdgpu/vega/insts/vop3.cc |  4 +-
 src/arch/amdgpu/vega/operand.hh    |  4 +-
 src/gpu-compute/gpu_static_inst.cc | 82 ++++++++++++++++--------------
 src/gpu-compute/gpu_static_inst.hh |  3 ++
 5 files changed, 54 insertions(+), 42 deletions(-)
diff --git a/src/arch/amdgpu/vega/insts/sop2.cc b/src/arch/amdgpu/vega/insts/sop2.cc
index 93618b2124..a2965763f7 100644
--- a/src/arch/amdgpu/vega/insts/sop2.cc
+++ b/src/arch/amdgpu/vega/insts/sop2.cc
@@ -1224,7 +1224,8 @@ namespace VegaISA
         src0.read();
         src1.read();
 
-        sdst = src0.rawData() * src1.rawData();
+        ScalarRegI64 tmp = src0.rawData() * src1.rawData();
+        sdst = tmp & mask(32);
 
         sdst.write();
     } // execute
diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc
index f78f64bc91..59d72ac9ed 100644
--- a/src/arch/amdgpu/vega/insts/vop3.cc
+++ b/src/arch/amdgpu/vega/insts/vop3.cc
@@ -8583,7 +8583,7 @@ namespace VegaISA
 
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
-                threadMask = ((1LL << lane) - 1LL);
+                threadMask = ((1ULL << lane) - 1ULL);
                 vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) +
                              src1[lane];
             }
@@ -8633,7 +8633,7 @@ namespace VegaISA
 
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
-                threadMask = ((1LL << lane) - 1LL);
+                threadMask = ((1ULL << lane) - 1ULL);
                 vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) +
                              src1[lane];
             }
diff --git a/src/arch/amdgpu/vega/operand.hh b/src/arch/amdgpu/vega/operand.hh
index 698161d918..d4a7436c75 100644
--- a/src/arch/amdgpu/vega/operand.hh
+++ b/src/arch/amdgpu/vega/operand.hh
@@ -490,7 +490,7 @@ namespace VegaISA
         typename std::enable_if<Condition, void>::type
         setBit(int bit, int bit_val)
         {
-            DataType &sgpr = *((DataType*)srfData.data());
+            GEM5_ALIGNED(8) DataType &sgpr = *((DataType*)srfData.data());
             replaceBits(sgpr, bit, bit_val);
         }
 
@@ -739,7 +739,7 @@ namespace VegaISA
          * of a register is 1 dword. this class will take care to do the
          * proper packing/unpacking of sub-dword operands.
          */
-        std::array<ScalarRegU32, NumDwords> srfData;
+        GEM5_ALIGNED(8) std::array<ScalarRegU32, NumDwords> srfData;
     };
 
     // typedefs for the various sizes/types of scalar operands
diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc
index 063e87eee1..e2dd9f54f2 100644
--- a/src/gpu-compute/gpu_static_inst.cc
+++ b/src/gpu-compute/gpu_static_inst.cc
@@ -54,55 +54,63 @@ GPUStaticInst::disassemble()
     return disassembly;
 }
 
+
+void
+GPUStaticInst::generateVirtToPhysMap(Wavefront *wf, ComputeUnit *cu,
+                                     OperandInfo& op,
+                                     std::vector<OperandInfo>& opVec,
+                                     OpType opType)
+{
+    std::vector<int> virt_idxs;
+    std::vector<int> phys_idxs;
+
+    int num_dwords = op.sizeInDWords();
+    int virt_idx = op.registerIndex(wf->reservedScalarRegs);
+
+    int phys_idx = -1;
+    for (int i = 0; i < num_dwords; i++) {
+        if (opType == OpType::SRC_VEC || opType == OpType::DST_VEC) {
+            phys_idx = cu->registerManager->mapVgpr(wf, virt_idx + i);
+        } else {
+            assert(opType == OpType::SRC_SCALAR ||
+                   opType == OpType::DST_SCALAR);
+            phys_idx = cu->registerManager->mapSgpr(wf, virt_idx + i);
+        }
+        virt_idxs.push_back(virt_idx + i);
+        phys_idxs.push_back(phys_idx);
+    }
+    DPRINTF(GPUInst, "%s adding %s %s (%d->%d) operand that uses "
+            "%d registers.\n", disassemble(),
+            (opType == OpType::SRC_VEC || opType == OpType::DST_VEC) ?
+            "vector" : "scalar",
+            (opType == OpType::SRC_VEC || opType == OpType::SRC_SCALAR) ?
+            "src" : "dst", virt_idxs[0], phys_idxs[0], num_dwords);
+
+    op.setVirtToPhysMapping(virt_idxs, phys_idxs);
+
+    opVec.emplace_back(op);
+}
+
 void
 GPUStaticInst::initDynOperandInfo(Wavefront *wf, ComputeUnit *cu)
 {
-    // Lambda function, as this is only ever used here
-    auto generateVirtToPhysMap = [&](OperandInfo& op,
-                                     std::vector<OperandInfo>& opVec,
-                                     MapRegFn mapFn, OpType opType)
-    {
-        std::vector<int> virt_idxs;
-        std::vector<int> phys_idxs;
-
-        int num_dwords = op.sizeInDWords();
-        int virt_idx = op.registerIndex(wf->reservedScalarRegs);
-
-        int phys_idx = -1;
-        for (int i = 0; i < num_dwords; i++){
-            phys_idx = (cu->registerManager->*mapFn)(wf, virt_idx + i);
-            virt_idxs.push_back(virt_idx + i);
-            phys_idxs.push_back(phys_idx);
-        }
-        DPRINTF(GPUInst, "%s adding %s %s (%d->%d) operand that uses "
-                "%d registers.\n", disassemble(),
-                (opType == OpType::SRC_VEC || opType == OpType::DST_VEC) ?
-                "vector" : "scalar",
-                (opType == OpType::SRC_VEC || opType == OpType::SRC_SCALAR) ?
-                "src" : "dst", virt_idxs[0], phys_idxs[0], num_dwords);
-
-        op.setVirtToPhysMapping(virt_idxs, phys_idxs);
-
-        opVec.emplace_back(op);
-    };
-
     for (auto& srcOp : srcOps) {
         if (srcOp.isVectorReg()) {
-            generateVirtToPhysMap(srcOp, srcVecRegOps,
-                            &RegisterManager::mapVgpr, OpType::SRC_VEC);
+            generateVirtToPhysMap(wf, cu, srcOp, srcVecRegOps,
+                                  OpType::SRC_VEC);
         } else if (srcOp.isScalarReg()) {
-            generateVirtToPhysMap(srcOp, srcScalarRegOps,
-                            &RegisterManager::mapSgpr, OpType::SRC_SCALAR);
+            generateVirtToPhysMap(wf, cu, srcOp, srcScalarRegOps,
+                                  OpType::SRC_SCALAR);
         }
     }
 
     for (auto& dstOp : dstOps) {
         if (dstOp.isVectorReg()) {
-            generateVirtToPhysMap(dstOp, dstVecRegOps,
-                            &RegisterManager::mapVgpr, OpType::DST_VEC);
+            generateVirtToPhysMap(wf, cu, dstOp, dstVecRegOps,
+                                  OpType::DST_VEC);
         } else if (dstOp.isScalarReg()) {
-            generateVirtToPhysMap(dstOp, dstScalarRegOps,
-                            &RegisterManager::mapSgpr, OpType::DST_SCALAR);
+            generateVirtToPhysMap(wf, cu, dstOp, dstScalarRegOps,
+                                  OpType::DST_SCALAR);
         }
     }
 }
diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh
index 156f0e529d..6132ab2d29 100644
--- a/src/gpu-compute/gpu_static_inst.hh
+++ b/src/gpu-compute/gpu_static_inst.hh
@@ -321,6 +321,9 @@ class GPUStaticInst : public GPUStaticInstFlags
     int _ipdInstNum;
 
     std::bitset<Num_Flags> _flags;
+
+    void generateVirtToPhysMap(Wavefront *wf, ComputeUnit *cu, OperandInfo& op,
+                               std::vector<OperandInfo>& opVec, OpType opType);
 };
 
 class KernelLaunchStaticInst : public GPUStaticInst

From 386fb3d1cc0836a512f1f00a190f6e58b7d15d13 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 24 Apr 2024 17:55:22 -0700
Subject: [PATCH 2/6] configs: Fix HSA packet processor address

The address has one too many zeros and is therefore placed in a memory
region usually used for system memory. As a result this causes failure
when trying to run a simulation with a huge amount of memory.

Change the address to be within the C000'0000h - FFFF'FFFFh X86 I/O hole
as was intended.

Change-Id: I5d03ac19ea3b2c01a8c431073c12fa1868b3df24
---
 configs/example/gpufs/system/system.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py
index b3943843e3..7c596f0ccf 100644
--- a/configs/example/gpufs/system/system.py
+++ b/configs/example/gpufs/system/system.py
@@ -108,7 +108,7 @@ def makeGpuFSSystem(args):
     system.cpu.append(shader)
 
     # This arbitrary address is something in the X86 I/O hole
-    hsapp_gpu_map_paddr = 0xE00000000
+    hsapp_gpu_map_paddr = 0xE0000000
     hsapp_pt_walker = VegaPagetableWalker()
     gpu_hsapp = HSAPacketProcessor(
         pioAddr=hsapp_gpu_map_paddr,

From 2703fb56991f034bd7c633ae3ec367dcc7af7073 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Thu, 2 May 2024 11:39:48 -0700
Subject: [PATCH 3/6] gpu-compute: Fix valgrind memleak complaints

Fixes several memory leaks, mostly of small and medium severity. Fixes
mismatched new/new[] and delete/delete[] calls.

Change-Id: Iedafc409389bd94e45f330bc587d6d72d1971219
---
 src/dev/amdgpu/amdgpu_device.cc        |  8 +++++++-
 src/dev/amdgpu/interrupt_handler.cc    |  8 ++++++--
 src/dev/amdgpu/interrupt_handler.hh    |  6 ++++--
 src/dev/amdgpu/pm4_packet_processor.cc | 13 +++++++++++--
 src/dev/amdgpu/sdma_engine.cc          |  2 +-
 src/gpu-compute/compute_unit.cc        |  4 ++--
 6 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc
index f5bf0192bc..6bb5f9c2c5 100644
--- a/src/dev/amdgpu/amdgpu_device.cc
+++ b/src/dev/amdgpu/amdgpu_device.cc
@@ -453,6 +453,8 @@ AMDGPUDevice::writeFrame(PacketPtr pkt, Addr offset)
 
     auto system = cp->shader()->gpuCmdProc.system();
     system->getDeviceMemory(writePkt)->access(writePkt);
+
+    delete writePkt;
 }
 
 void
@@ -671,7 +673,10 @@ AMDGPUDevice::getRegVal(uint64_t addr)
     DPRINTF(AMDGPUDevice, "Getting register 0x%lx = %x\n",
             fixup_addr, pkt->getLE<uint32_t>());
 
-    return pkt->getLE<uint32_t>();
+    pkt_data = pkt->getLE<uint32_t>();
+    delete pkt;
+
+    return pkt_data;
 }
 
 void
@@ -686,6 +691,7 @@ AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value)
     PacketPtr pkt = Packet::createWrite(request);
     pkt->dataStatic((uint8_t *)&pkt_data);
     writeMMIO(pkt, addr);
+    delete pkt;
 }
 
 void
diff --git a/src/dev/amdgpu/interrupt_handler.cc b/src/dev/amdgpu/interrupt_handler.cc
index cb99ba7a39..4ad2527108 100644
--- a/src/dev/amdgpu/interrupt_handler.cc
+++ b/src/dev/amdgpu/interrupt_handler.cc
@@ -130,6 +130,10 @@ AMDGPUInterruptHandler::DmaEvent::process()
     } else {
         fatal("Interrupt Handler DMA event returned bad value: %d\n", data);
     }
+
+    if (dataPtr) {
+        delete [] dataPtr;
+    }
 }
 
 void
@@ -140,7 +144,7 @@ AMDGPUInterruptHandler::submitWritePointer()
     Addr paddr = regs.WptrAddr;
     std::memcpy(dataPtr, &regs.IH_Wptr, sizeof(uint32_t));
 
-    dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 2);
+    dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 2, dataPtr);
     dmaWrite(paddr, sizeof(uint32_t), dmaEvent, dataPtr);
 }
 
@@ -157,7 +161,7 @@ AMDGPUInterruptHandler::submitInterruptCookie()
 
     DPRINTF(AMDGPUDevice, "InterruptHandler rptr: 0x%x wptr: 0x%x\n",
                           regs.IH_Rptr, regs.IH_Wptr);
-    dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 1);
+    dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 1, dataPtr);
     dmaWrite(paddr, cookieSize, dmaEvent, dataPtr);
 
     interruptQueue.pop();
diff --git a/src/dev/amdgpu/interrupt_handler.hh b/src/dev/amdgpu/interrupt_handler.hh
index a895eabafc..b7ac4b29ee 100644
--- a/src/dev/amdgpu/interrupt_handler.hh
+++ b/src/dev/amdgpu/interrupt_handler.hh
@@ -136,10 +136,12 @@ class AMDGPUInterruptHandler : public DmaDevice
       private:
         AMDGPUInterruptHandler *deviceIh;
         uint32_t data;
+        uint8_t *dataPtr;
 
       public:
-        DmaEvent(AMDGPUInterruptHandler *deviceIh, uint32_t data)
-            : Event(), deviceIh(deviceIh), data(data)
+        DmaEvent(AMDGPUInterruptHandler *deviceIh, uint32_t data,
+                 uint8_t* _dataPtr)
+            : Event(), deviceIh(deviceIh), data(data), dataPtr(_dataPtr)
         {
             setFlags(Event::AutoDelete);
         }
diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc
index 62e817aa98..a921942678 100644
--- a/src/dev/amdgpu/pm4_packet_processor.cc
+++ b/src/dev/amdgpu/pm4_packet_processor.cc
@@ -456,8 +456,6 @@ PM4PacketProcessor::mapQueues(PM4Queue *q, PM4MapQueues *pkt)
     } else {
         panic("Unknown engine for MQD: %d\n", pkt->engineSel);
     }
-
-    decodeNext(q);
 }
 
 void
@@ -494,6 +492,9 @@ PM4PacketProcessor::processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
             "hqdAQL %d.\n", mqd->base, mqd->mqdBase, mqd->aql);
 
     gpuDevice->processPendingDoorbells(offset);
+
+    delete pkt;
+    decodeNext(q);
 }
 
 void
@@ -524,6 +525,9 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
     gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC, getIpId());
 
     gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2);
+
+    delete pkt;
+    decodeNext(q);
 }
 
 void
@@ -656,6 +660,7 @@ PM4PacketProcessor::unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt)
                 dmaWriteVirt(addr, sizeof(QueueDesc), cb, mqd);
                 queues.erase(id);
                 hsa_pp.unsetDeviceQueueDesc(id, 8);
+                delete mqd;
             }
         }
         gpuDevice->deallocateAllQueues();
@@ -754,6 +759,7 @@ PM4PacketProcessor::indirectBuffer(PM4Queue *q, PM4IndirectBuf *pkt)
     q->ibBase(pkt->ibBase);
     q->wptr(pkt->ibSize * sizeof(uint32_t));
 
+    delete pkt;
     decodeNext(q);
 }
 
@@ -766,6 +772,7 @@ PM4PacketProcessor::switchBuffer(PM4Queue *q, PM4SwitchBuf *pkt)
     DPRINTF(PM4PacketProcessor, "PM4 switching buffer, rptr: %p.\n",
             q->wptr());
 
+    delete pkt;
     decodeNext(q);
 }
 
@@ -784,6 +791,7 @@ PM4PacketProcessor::setUconfigReg(PM4Queue *q, PM4SetUconfigReg *pkt)
     reg_addr += 0x40000 * getIpId();
     gpuDevice->setRegVal(reg_addr, pkt->data);
 
+    delete pkt;
     decodeNext(q);
 }
 
@@ -800,6 +808,7 @@ PM4PacketProcessor::waitRegMem(PM4Queue *q, PM4WaitRegMem *pkt)
     DPRINTF(PM4PacketProcessor, "    Mask: %lx\n", pkt->mask);
     DPRINTF(PM4PacketProcessor, "    Poll Interval: %lx\n", pkt->pollInterval);
 
+    delete pkt;
     decodeNext(q);
 }
 
diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc
index 070c04fe64..dcf0acac1a 100644
--- a/src/dev/amdgpu/sdma_engine.cc
+++ b/src/dev/amdgpu/sdma_engine.cc
@@ -1132,7 +1132,7 @@ SDMAEngine::constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data)
 {
     DPRINTF(SDMAEngine, "ConstFill to %lx done\n", pkt->addr);
 
-    delete fill_data;
+    delete [] fill_data;
     delete pkt;
     decodeNext(q);
 }
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index daad5e9b40..f12293500d 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -1735,7 +1735,7 @@ ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
         SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
         compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
     } else if (!(sendTimingReq(pkt))) {
-        retries.push_back(std::make_pair(pkt, gpuDynInst));
+        retries.emplace_back(pkt, gpuDynInst);
 
         if (gpuDynInst) {
             DPRINTF(GPUPort,
@@ -1772,7 +1772,7 @@ ComputeUnit::ScalarDataPort::MemReqEvent::process()
         SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort);
         compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
     } else if (!(scalarDataPort.sendTimingReq(pkt))) {
-        scalarDataPort.retries.push_back(pkt);
+        scalarDataPort.retries.emplace_back(pkt);
 
         DPRINTF(GPUPort,
                 "CU%d: WF[%d][%d]: addr %#x data req failed!\n",

From 8249d6d1cd767949ac908d593c34901eab2b4116 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Thu, 2 May 2024 15:58:47 -0700
Subject: [PATCH 4/6] arch-vega: Remove FP asserts in VOP3 lane manip insts

The VOP3 instruction encoding generally states that ABS/NEG modifiers in
the instruction encoding are only valid on floating point data types.
This is currently coded in gem5 to mean floating point *instructions*.
For untyped instructions like V_CNDMASK_B32, we don't actually know what
the data type is. We must trust that the compiler did not attempt to
apply these bits to non-FP data types.

This commit simply removes the asserts. The ABS/NEG modifiers are
therefore ignored which is consistent with the ISA documentation.
This is done on the lane manipulation instructions V_CNDMASK_B32,
V_READLANE_B32, and V_WRITELANE_B32 which are typically used to mask off
or move data between registers. Other bitwise instructions (e.g.,
V_OR_B32) keep the asserts as bitwise operations on FP types are
generally illegal in languages like C++.

Change-Id: I478c5272ba96383a063b2828de21d60948b25c8f
---
 src/arch/amdgpu/vega/insts/vop3.cc | 30 ------------------------------
 1 file changed, 30 deletions(-)

diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc
index 59d72ac9ed..18446d2e2b 100644
--- a/src/arch/amdgpu/vega/insts/vop3.cc
+++ b/src/arch/amdgpu/vega/insts/vop3.cc
@@ -66,16 +66,6 @@ namespace VegaISA
         src1.readSrc();
         vcc.read();
 
-        /**
-         * input modifiers are supported by FP operations only
-         */
-        assert(!(instData.ABS & 0x1));
-        assert(!(instData.ABS & 0x2));
-        assert(!(instData.ABS & 0x4));
-        assert(!(extData.NEG & 0x1));
-        assert(!(extData.NEG & 0x2));
-        assert(!(extData.NEG & 0x4));
-
         for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
             if (wf->execMask(lane)) {
                 vdst[lane] = bits(vcc.rawData(), lane)
@@ -8440,16 +8430,6 @@ namespace VegaISA
         src0.readSrc();
         src1.read();
 
-        /**
-         * input modifiers are supported by FP operations only
-         */
-        assert(!(instData.ABS & 0x1));
-        assert(!(instData.ABS & 0x2));
-        assert(!(instData.ABS & 0x4));
-        assert(!(extData.NEG & 0x1));
-        assert(!(extData.NEG & 0x2));
-        assert(!(extData.NEG & 0x4));
-
         sdst = src0[src1.rawData() & 0x3f];
 
         sdst.write();
@@ -8484,16 +8464,6 @@ namespace VegaISA
         src1.read();
         vdst.read();
 
-        /**
-         * input modifiers are supported by FP operations only
-         */
-        assert(!(instData.ABS & 0x1));
-        assert(!(instData.ABS & 0x2));
-        assert(!(instData.ABS & 0x4));
-        assert(!(extData.NEG & 0x1));
-        assert(!(extData.NEG & 0x2));
-        assert(!(extData.NEG & 0x4));
-
         vdst[src1.rawData() & 0x3f] = src0.rawData();
 
         vdst.write();

From 29f63f630b3db1435569dd2157a6198dc4155084 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Fri, 3 May 2024 10:06:34 -0700
Subject: [PATCH 5/6] dev-amdgpu: Correct missing GART warning

SDMA ptePde packets are generating a warning that a GART address is
missing, causing a wrong address to be clobbered by the operation.

This commit fixes this by converting the GART address when the queue is
running in privileged mode, which is the only mode allowed to use GART
addresses. This removes the warnings and writes to the correct memory
region.

Change-Id: I64acac308db2431c5996b876bf4cda704f51cf25
---
 src/dev/amdgpu/sdma_engine.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc
index dcf0acac1a..735be554b4 100644
--- a/src/dev/amdgpu/sdma_engine.cc
+++ b/src/dev/amdgpu/sdma_engine.cc
@@ -1000,6 +1000,9 @@ SDMAEngine::ptePde(SDMAQueue *q, sdmaPtePde *pkt)
                                              sizeof(uint64_t) * pkt->count, 0,
                                              cb);
     } else {
+        if (q->priv()) {
+            pkt->dest = getGARTAddr(pkt->dest);
+        }
         auto cb = new DmaVirtCallback<uint64_t>(
             [ = ] (const uint64_t &) { ptePdeDone(q, pkt, dmaBuffer); });
         dmaWriteVirt(pkt->dest, sizeof(uint64_t) * pkt->count, cb,

From 3490d5bf189eb2600a35572341a8073fdbd0d333 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Wed, 24 Apr 2024 10:42:58 -0700
Subject: [PATCH 6/6] gpu-compute: Add DebugFlag for LDS

This prints what values are read/written to LDS and the previous value
on write. This is useful for debugging problems with LDS instructions.

Change-Id: I30063327bec1a1a808914a018467d5d78d5d58b4
---
 src/gpu-compute/SConscript   |  3 ++-
 src/gpu-compute/lds_state.hh | 50 ++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript
index e4536ba2a5..23e3377f50 100644
--- a/src/gpu-compute/SConscript
+++ b/src/gpu-compute/SConscript
@@ -84,6 +84,7 @@ DebugFlag('GPUExec')
 DebugFlag('GPUFetch')
 DebugFlag('GPUInst')
 DebugFlag('GPUKernelInfo')
+DebugFlag('GPULDS')
 DebugFlag('GPUMem')
 DebugFlag('GPUPort')
 DebugFlag('GPUPrefetch')
@@ -106,4 +107,4 @@ DebugFlag('WavefrontStack')
 CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
                         'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync',
                         'GPUTLB', 'GPUVRF', 'GPURFC', 'GPUWgLatency',
-                        'GPUKernelInfo', 'GPUInitAbi'])
+                        'GPUKernelInfo', 'GPUInitAbi', 'GPULDS'])
diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh
index 3228b7822c..d336d35079 100644
--- a/src/gpu-compute/lds_state.hh
+++ b/src/gpu-compute/lds_state.hh
@@ -39,6 +39,7 @@
 #include <utility>
 #include <vector>
 
+#include "debug/GPULDS.hh"
 #include "gpu-compute/misc.hh"
 #include "mem/port.hh"
 #include "params/LdsState.hh"
@@ -75,10 +76,30 @@ class LdsChunk
          * chunk allocated to this WG we return 0.
          */
         if (index >= chunk.size()) {
+            DPRINTF(GPULDS, "LDS[%d][%d]: Read 0 beyond size (%ld)\n",
+                    dispatchId, wgId, chunk.size());
             return (T)0;
         }
 
         T *p0 = (T *) (&(chunk.at(index)));
+
+        if (sizeof(T) <= 4) {
+            [[maybe_unused]] uint32_t int_val =
+                *reinterpret_cast<uint32_t*>(p0);
+            DPRINTF(GPULDS, "LDS[%d][%d]: Read %08x from index %d\n",
+                    dispatchId, wgId, int_val, index);
+        } else if (sizeof(T) <= 8) {
+            [[maybe_unused]] uint64_t int_val =
+                *reinterpret_cast<uint64_t*>(p0);
+            DPRINTF(GPULDS, "LDS[%d][%d]: Read %016lx from index %d\n",
+                    dispatchId, wgId, int_val, index);
+        } else if (sizeof(T) <= 16) {
+            [[maybe_unused]] uint64_t *int_vals =
+                reinterpret_cast<uint64_t*>(p0);
+            DPRINTF(GPULDS, "LDS[%d][%d]: Read %016lx%016lx from index %d\n",
+                    dispatchId, wgId, int_vals[1], int_vals[0], index);
+        }
+
         return *p0;
     }
 
@@ -94,10 +115,33 @@ class LdsChunk
          * chunk allocated to this WG are dropped.
          */
         if (index >= chunk.size()) {
+            DPRINTF(GPULDS, "LDS[%d][%d]: Ignoring write beyond size (%ld)\n",
+                    dispatchId, wgId, chunk.size());
             return;
         }
 
         T *p0 = (T *) (&(chunk.at(index)));
+
+        if (sizeof(T) <= 4) {
+            [[maybe_unused]] uint32_t prev_val =
+                *reinterpret_cast<uint32_t*>(p0);
+            DPRINTF(GPULDS, "LDS[%d][%d]: Write %08lx to index %d (was "
+                    "%08lx)\n", dispatchId, wgId, value, index, prev_val);
+        } else if (sizeof(T) <= 8) {
+            [[maybe_unused]] uint64_t prev_val =
+                *reinterpret_cast<uint64_t*>(p0);
+            DPRINTF(GPULDS, "LDS[%d][%d]: Write %016lx to index %d (was "
+                    "%016lx)\n", dispatchId, wgId, value, index, prev_val);
+        } else if (sizeof(T) <= 16) {
+            [[maybe_unused]] uint64_t *prev_vals =
+                reinterpret_cast<uint64_t*>(p0);
+            [[maybe_unused]] const uint64_t *next_vals =
+                reinterpret_cast<const uint64_t*>(&value);
+            DPRINTF(GPULDS, "LDS[%d][%d]: Write %016lx%016lx to index %d "
+                    "(was %016lx%016lx)\n", dispatchId, wgId, next_vals[1],
+                    next_vals[0], index, prev_vals[1], prev_vals[0]);
+        }
+
         *p0 = value;
     }
 
@@ -131,6 +175,9 @@ class LdsChunk
         return chunk.size();
     }
 
+    uint32_t dispatchId;
+    uint32_t wgId;
+
   protected:
     // the actual data store for this slice of the LDS
     std::vector<uint8_t> chunk;
@@ -402,6 +449,9 @@ class LdsState: public ClockedObject
             // make an entry for this workgroup
             refCounter[dispatchId][wgId] = 0;
 
+            chunkMap[dispatchId][wgId].dispatchId = dispatchId;
+            chunkMap[dispatchId][wgId].wgId = wgId;
+
             return &chunkMap[dispatchId][wgId];
         }
     }