From 0d0b68266cf17df1e37f828ce4c2fcc2b365bd83 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 26 Jul 2024 10:38:27 -0700 Subject: [PATCH 1/3] dev-amdgpu: Fix bad free in SDMA The SDMA engine copies data in chunks. It currently uses the pointer returned from new[] and manipulates it using pointer arithmetic. This modified pointer is then passed to the completion function which deletes the pointer. Since it is not the original pointer allocated by new[] this triggers issues in ASAN. Change-Id: I03ccf026633285e75005509445c62fcbda8eb978 --- src/dev/amdgpu/sdma_engine.cc | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index e1e123df48..582f128f82 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -653,7 +653,7 @@ SDMAEngine::writeDone(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer) { DPRINTF(SDMAEngine, "Write packet completed to %p, %d dwords\n", pkt->dest, pkt->count); - delete []dmaBuffer; + delete [] dmaBuffer; delete pkt; decodeNext(q); } @@ -686,6 +686,7 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt) // Copy the minimum page size at a time in case the physical addresses // are not contiguous. ChunkGenerator gen(pkt->source, pkt->count, AMDGPU_MMHUB_PAGE_SIZE); + uint8_t *buffer_ptr = dmaBuffer; for (; !gen.done(); gen.next()) { Addr chunk_addr = getDeviceAddress(gen.addr()); assert(chunk_addr); @@ -693,10 +694,10 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt) DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n", gen.size(), gen.addr(), chunk_addr); - gpuDevice->getMemMgr()->readRequest(chunk_addr, dmaBuffer, + gpuDevice->getMemMgr()->readRequest(chunk_addr, buffer_ptr, gen.size(), 0, gen.last() ? 
cb : nullptr); - dmaBuffer += gen.size(); + buffer_ptr += gen.size(); } } else { auto cb = new DmaVirtCallback( @@ -731,6 +732,7 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer) // Copy the minimum page size at a time in case the physical addresses // are not contiguous. ChunkGenerator gen(pkt->dest, pkt->count, AMDGPU_MMHUB_PAGE_SIZE); + uint8_t *buffer_ptr = dmaBuffer; for (; !gen.done(); gen.next()) { Addr chunk_addr = getDeviceAddress(gen.addr()); assert(chunk_addr); @@ -738,11 +740,11 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer) DPRINTF(SDMAEngine, "Copying chunk of %d bytes to %#lx (%#lx)\n", gen.size(), gen.addr(), chunk_addr); - gpuDevice->getMemMgr()->writeRequest(chunk_addr, dmaBuffer, + gpuDevice->getMemMgr()->writeRequest(chunk_addr, buffer_ptr, gen.size(), 0, gen.last() ? cb : nullptr); - dmaBuffer += gen.size(); + buffer_ptr += gen.size(); } } else { auto cb = new DmaVirtCallback( @@ -770,7 +772,7 @@ SDMAEngine::copyDone(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer) { DPRINTF(SDMAEngine, "Copy completed to %p, %d dwords\n", pkt->dest, pkt->count); - delete []dmaBuffer; + delete [] dmaBuffer; delete pkt; decodeNext(q); } @@ -1018,7 +1020,7 @@ SDMAEngine::ptePdeDone(SDMAQueue *q, sdmaPtePde *pkt, uint64_t *dmaBuffer) DPRINTF(SDMAEngine, "PtePde packet completed to %p, %d 2dwords\n", pkt->dest, pkt->count); - delete []dmaBuffer; + delete [] dmaBuffer; delete pkt; decodeNext(q); } @@ -1108,6 +1110,7 @@ SDMAEngine::constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header) // Copy the minimum page size at a time in case the physical addresses // are not contiguous. 
ChunkGenerator gen(pkt->addr, fill_bytes, AMDGPU_MMHUB_PAGE_SIZE); + uint8_t *fill_data_ptr = fill_data; for (; !gen.done(); gen.next()) { Addr chunk_addr = getDeviceAddress(gen.addr()); assert(chunk_addr); @@ -1115,10 +1118,10 @@ SDMAEngine::constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header) DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n", gen.size(), gen.addr(), chunk_addr); - gpuDevice->getMemMgr()->writeRequest(chunk_addr, fill_data, + gpuDevice->getMemMgr()->writeRequest(chunk_addr, fill_data_ptr, gen.size(), 0, gen.last() ? cb : nullptr); - fill_data += gen.size(); + fill_data_ptr += gen.size(); } } else { DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to host at %lx\n", From db0d5f19cf09ac81b1528f40b31e9b5614f57a59 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 7 Aug 2024 12:49:00 -0700 Subject: [PATCH 2/3] dev-amdgpu: Add cleanup events for SDMA SDMA packets which use dmaVirtWrites call their completion event before the write takes place in the Ruby protocol. This causes a use-after-free issue, corrupting random memory locations and leading to random errors. This commit adds a cleanup event for each packet that uses DMA and sets the cleanup latency as 10000 ticks. In atomic mode, the writes complete exactly 2000 ticks after the completion event is called and therefore a fixed latency can be used. This is not tested with timing mode, which does not work with GPUFS at the moment, so a warning is added to give an idea where to look in case the same issue occurs once timing mode is supported. 
Change-Id: I9ee2689f2becc46bb7794b18b31205f1606109d8 --- src/dev/amdgpu/sdma_engine.cc | 61 +++++++++++++++++++++++++++++++++-- src/dev/amdgpu/sdma_engine.hh | 3 ++ 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index 582f128f82..6955837a09 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -38,6 +38,7 @@ #include "dev/amdgpu/interrupt_handler.hh" #include "dev/amdgpu/sdma_commands.hh" #include "dev/amdgpu/sdma_mmio.hh" +#include "gpu-compute/gpu_command_processor.hh" #include "mem/packet.hh" #include "mem/packet_access.hh" #include "params/SDMAEngine.hh" @@ -653,11 +654,29 @@ SDMAEngine::writeDone(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer) { DPRINTF(SDMAEngine, "Write packet completed to %p, %d dwords\n", pkt->dest, pkt->count); - delete [] dmaBuffer; + + auto cleanup_cb = new EventFunctionWrapper( + [ = ]{ writeCleanup(dmaBuffer); }, name()); + + auto system_ptr = gpuDevice->CP()->system(); + if (!system_ptr->isAtomicMode()) { + warn_once("SDMA cleanup assumes 2000 tick timing for completion." + " This has not been tested in timing mode\n"); + } + + // Only 2000 ticks should be necessary, but add additional padding. + schedule(cleanup_cb, curTick() + 10000); + delete pkt; decodeNext(q); } +void +SDMAEngine::writeCleanup(uint32_t *dmaBuffer) +{ + delete [] dmaBuffer; +} + /* Implements a copy packet. 
*/ void SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt) @@ -747,6 +766,7 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer) buffer_ptr += gen.size(); } } else { + DPRINTF(SDMAEngine, "Copying to host address %#lx\n", pkt->dest); auto cb = new DmaVirtCallback( [ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); }); dmaWriteVirt(pkt->dest, pkt->count, cb, (void *)dmaBuffer); @@ -772,11 +792,29 @@ SDMAEngine::copyDone(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer) { DPRINTF(SDMAEngine, "Copy completed to %p, %d dwords\n", pkt->dest, pkt->count); - delete [] dmaBuffer; + + auto cleanup_cb = new EventFunctionWrapper( + [ = ]{ copyCleanup(dmaBuffer); }, name()); + + auto system_ptr = gpuDevice->CP()->system(); + if (!system_ptr->isAtomicMode()) { + warn_once("SDMA cleanup assumes 2000 tick timing for completion." + " This has not been tested in timing mode\n"); + } + + // Only 2000 ticks should be necessary, but add additional padding. + schedule(cleanup_cb, curTick() + 10000); + delete pkt; decodeNext(q); } +void +SDMAEngine::copyCleanup(uint8_t *dmaBuffer) +{ + delete [] dmaBuffer; +} + /* Implements an indirect buffer packet. */ void SDMAEngine::indirectBuffer(SDMAQueue *q, sdmaIndirectBuffer *pkt) @@ -1020,11 +1058,28 @@ SDMAEngine::ptePdeDone(SDMAQueue *q, sdmaPtePde *pkt, uint64_t *dmaBuffer) DPRINTF(SDMAEngine, "PtePde packet completed to %p, %d 2dwords\n", pkt->dest, pkt->count); - delete [] dmaBuffer; + auto cleanup_cb = new EventFunctionWrapper( + [ = ]{ ptePdeCleanup(dmaBuffer); }, name()); + + auto system_ptr = gpuDevice->CP()->system(); + if (!system_ptr->isAtomicMode()) { + warn_once("SDMA cleanup assumes 2000 tick timing for completion." + " This has not been tested in timing mode\n"); + } + + // Only 2000 ticks should be necessary, but add additional padding. 
+ schedule(cleanup_cb, curTick() + 10000); + delete pkt; decodeNext(q); } +void +SDMAEngine::ptePdeCleanup(uint64_t *dmaBuffer) +{ + delete [] dmaBuffer; +} + void SDMAEngine::atomic(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt) { diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh index 9407b97d73..d5fe646fac 100644 --- a/src/dev/amdgpu/sdma_engine.hh +++ b/src/dev/amdgpu/sdma_engine.hh @@ -227,9 +227,11 @@ class SDMAEngine : public DmaVirtDevice void write(SDMAQueue *q, sdmaWrite *pkt); void writeReadData(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer); void writeDone(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer); + void writeCleanup(uint32_t *dmaBuffer); void copy(SDMAQueue *q, sdmaCopy *pkt); void copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer); void copyDone(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer); + void copyCleanup(uint8_t *dmaBuffer); void indirectBuffer(SDMAQueue *q, sdmaIndirectBuffer *pkt); void fence(SDMAQueue *q, sdmaFence *pkt); void fenceDone(SDMAQueue *q, sdmaFence *pkt); @@ -243,6 +245,7 @@ class SDMAEngine : public DmaVirtDevice bool pollRegMemFunc(uint32_t value, uint32_t reference, uint32_t func); void ptePde(SDMAQueue *q, sdmaPtePde *pkt); void ptePdeDone(SDMAQueue *q, sdmaPtePde *pkt, uint64_t *dmaBuffer); + void ptePdeCleanup(uint64_t *dmaBuffer); void atomic(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt); void atomicData(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt, uint64_t *dmaBuffer); From 84fedecafef8529bc48361c8a86445b5cf66d72c Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 7 Aug 2024 12:52:48 -0700 Subject: [PATCH 3/3] gpu-compute: Update Requests for invalidations The SQC and TCC invalidations share a Request pointer which they both modify. This can cause some problems, so use a different request pointer for each invalidate. The setContext call is also removed as the value being assigned to it is uninitialized. 
Change-Id: I82ea7aa44a4f4515c1560993caa26cc6a89355af --- src/gpu-compute/compute_unit.cc | 4 ---- src/gpu-compute/shader.cc | 14 +++++++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 807fd21d4d..598864f9f2 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -409,8 +409,6 @@ ComputeUnit::doInvalidate(RequestPtr req, int kernId){ // kern_id will be used in inv responses gpuDynInst->kern_id = kernId; - // update contextId field - req->setContext(gpuDynInst->wfDynId); injectGlobalMemFence(gpuDynInst, true, req); } @@ -438,8 +436,6 @@ ComputeUnit::doSQCInvalidate(RequestPtr req, int kernId){ // kern_id will be used in inv responses gpuDynInst->kern_id = kernId; - // update contextId field - req->setContext(gpuDynInst->wfDynId); gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar); scalarMemoryPipe.injectScalarMemFence(gpuDynInst, true, req); diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index 13b03b0a34..b7108efdf9 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -214,19 +214,23 @@ Shader::prepareInvalidate(HSAQueueEntry *task) { for (int i_cu = 0; i_cu < n_cu; ++i_cu) { // create a request to hold INV info; the request's fields will // be updated in cu before use - auto req = std::make_shared(0, 0, 0, - cuList[i_cu]->requestorId(), - 0, -1); + auto tcc_req = std::make_shared(0, 0, 0, + cuList[i_cu]->requestorId(), + 0, -1); _dispatcher.updateInvCounter(kernId, +1); // all necessary INV flags are all set now, call cu to execute - cuList[i_cu]->doInvalidate(req, task->dispatchId()); + cuList[i_cu]->doInvalidate(tcc_req, task->dispatchId()); // A set of CUs share a single SQC cache. 
Send a single invalidate // request to each SQC + auto sqc_req = std::make_shared(0, 0, 0, + cuList[i_cu]->requestorId(), + 0, -1); + if ((i_cu % n_cu_per_sqc) == 0) { - cuList[i_cu]->doSQCInvalidate(req, task->dispatchId()); + cuList[i_cu]->doSQCInvalidate(sqc_req, task->dispatchId()); } // I don't like this. This is intrusive coding.