From 0d0b68266cf17df1e37f828ce4c2fcc2b365bd83 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 26 Jul 2024 10:38:27 -0700 Subject: [PATCH 1/3] dev-amdgpu: Fix bad free in SDMA The SDMA engine copies data in chunks. It currently uses the pointer returned from new[] and manipulates it using pointer arithmetic. This modified pointer is then passed to the completion function which deletes the pointer. Since it is not the original pointer allocated by new[] this triggers issues in ASAN. Change-Id: I03ccf026633285e75005509445c62fcbda8eb978 --- src/dev/amdgpu/sdma_engine.cc | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index e1e123df48..582f128f82 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -653,7 +653,7 @@ SDMAEngine::writeDone(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer) { DPRINTF(SDMAEngine, "Write packet completed to %p, %d dwords\n", pkt->dest, pkt->count); - delete []dmaBuffer; + delete [] dmaBuffer; delete pkt; decodeNext(q); } @@ -686,6 +686,7 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt) // Copy the minimum page size at a time in case the physical addresses // are not contiguous. ChunkGenerator gen(pkt->source, pkt->count, AMDGPU_MMHUB_PAGE_SIZE); + uint8_t *buffer_ptr = dmaBuffer; for (; !gen.done(); gen.next()) { Addr chunk_addr = getDeviceAddress(gen.addr()); assert(chunk_addr); @@ -693,10 +694,10 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt) DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n", gen.size(), gen.addr(), chunk_addr); - gpuDevice->getMemMgr()->readRequest(chunk_addr, dmaBuffer, + gpuDevice->getMemMgr()->readRequest(chunk_addr, buffer_ptr, gen.size(), 0, gen.last() ? 
cb : nullptr); - dmaBuffer += gen.size(); + buffer_ptr += gen.size(); } } else { auto cb = new DmaVirtCallback( @@ -731,6 +732,7 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer) // Copy the minimum page size at a time in case the physical addresses // are not contiguous. ChunkGenerator gen(pkt->dest, pkt->count, AMDGPU_MMHUB_PAGE_SIZE); + uint8_t *buffer_ptr = dmaBuffer; for (; !gen.done(); gen.next()) { Addr chunk_addr = getDeviceAddress(gen.addr()); assert(chunk_addr); @@ -738,11 +740,11 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer) DPRINTF(SDMAEngine, "Copying chunk of %d bytes to %#lx (%#lx)\n", gen.size(), gen.addr(), chunk_addr); - gpuDevice->getMemMgr()->writeRequest(chunk_addr, dmaBuffer, + gpuDevice->getMemMgr()->writeRequest(chunk_addr, buffer_ptr, gen.size(), 0, gen.last() ? cb : nullptr); - dmaBuffer += gen.size(); + buffer_ptr += gen.size(); } } else { auto cb = new DmaVirtCallback( @@ -770,7 +772,7 @@ SDMAEngine::copyDone(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer) { DPRINTF(SDMAEngine, "Copy completed to %p, %d dwords\n", pkt->dest, pkt->count); - delete []dmaBuffer; + delete [] dmaBuffer; delete pkt; decodeNext(q); } @@ -1018,7 +1020,7 @@ SDMAEngine::ptePdeDone(SDMAQueue *q, sdmaPtePde *pkt, uint64_t *dmaBuffer) DPRINTF(SDMAEngine, "PtePde packet completed to %p, %d 2dwords\n", pkt->dest, pkt->count); - delete []dmaBuffer; + delete [] dmaBuffer; delete pkt; decodeNext(q); } @@ -1108,6 +1110,7 @@ SDMAEngine::constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header) // Copy the minimum page size at a time in case the physical addresses // are not contiguous. 
ChunkGenerator gen(pkt->addr, fill_bytes, AMDGPU_MMHUB_PAGE_SIZE); + uint8_t *fill_data_ptr = fill_data; for (; !gen.done(); gen.next()) { Addr chunk_addr = getDeviceAddress(gen.addr()); assert(chunk_addr); @@ -1115,10 +1118,10 @@ SDMAEngine::constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header) DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n", gen.size(), gen.addr(), chunk_addr); - gpuDevice->getMemMgr()->writeRequest(chunk_addr, fill_data, + gpuDevice->getMemMgr()->writeRequest(chunk_addr, fill_data_ptr, gen.size(), 0, gen.last() ? cb : nullptr); - fill_data += gen.size(); + fill_data_ptr += gen.size(); } } else { DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to host at %lx\n", From db0d5f19cf09ac81b1528f40b31e9b5614f57a59 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 7 Aug 2024 12:49:00 -0700 Subject: [PATCH 2/3] dev-amdgpu: Add cleanup events for SDMA SDMA packets which use dmaVirtWrites call their completion event before the write takes place in the Ruby protocol. This causes a use-after-free issue, corrupting random memory locations and leading to random errors. This commit adds a cleanup event for each packet that uses DMA and sets the cleanup latency as 10000 ticks. In atomic mode, the writes complete exactly 2000 ticks after the completion event is called and therefore a fixed latency can be used. This is not tested with timing mode, which does not work with GPUFS at the moment, so a warning is added to give an idea where to look in case the same issue occurs once timing mode is supported. 
Change-Id: I9ee2689f2becc46bb7794b18b31205f1606109d8 --- src/dev/amdgpu/sdma_engine.cc | 61 +++++++++++++++++++++++++++++++++-- src/dev/amdgpu/sdma_engine.hh | 3 ++ 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index 582f128f82..6955837a09 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -38,6 +38,7 @@ #include "dev/amdgpu/interrupt_handler.hh" #include "dev/amdgpu/sdma_commands.hh" #include "dev/amdgpu/sdma_mmio.hh" +#include "gpu-compute/gpu_command_processor.hh" #include "mem/packet.hh" #include "mem/packet_access.hh" #include "params/SDMAEngine.hh" @@ -653,11 +654,29 @@ SDMAEngine::writeDone(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer) { DPRINTF(SDMAEngine, "Write packet completed to %p, %d dwords\n", pkt->dest, pkt->count); - delete [] dmaBuffer; + + auto cleanup_cb = new EventFunctionWrapper( + [ = ]{ writeCleanup(dmaBuffer); }, name()); + + auto system_ptr = gpuDevice->CP()->system(); + if (!system_ptr->isAtomicMode()) { + warn_once("SDMA cleanup assumes 2000 tick timing for completion." + " This has not been tested in timing mode\n"); + } + + // Only 2000 ticks should be necessary, but add additional padding. + schedule(cleanup_cb, curTick() + 10000); + delete pkt; decodeNext(q); } +void +SDMAEngine::writeCleanup(uint32_t *dmaBuffer) +{ + delete [] dmaBuffer; +} + /* Implements a copy packet. 
*/ void SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt) @@ -747,6 +766,7 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer) buffer_ptr += gen.size(); } } else { + DPRINTF(SDMAEngine, "Copying to host address %#lx\n", pkt->dest); auto cb = new DmaVirtCallback( [ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); }); dmaWriteVirt(pkt->dest, pkt->count, cb, (void *)dmaBuffer); @@ -772,11 +792,29 @@ SDMAEngine::copyDone(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer) { DPRINTF(SDMAEngine, "Copy completed to %p, %d dwords\n", pkt->dest, pkt->count); - delete [] dmaBuffer; + + auto cleanup_cb = new EventFunctionWrapper( + [ = ]{ copyCleanup(dmaBuffer); }, name()); + + auto system_ptr = gpuDevice->CP()->system(); + if (!system_ptr->isAtomicMode()) { + warn_once("SDMA cleanup assumes 2000 tick timing for completion." + " This has not been tested in timing mode\n"); + } + + // Only 2000 ticks should be necessary, but add additional padding. + schedule(cleanup_cb, curTick() + 10000); + delete pkt; decodeNext(q); } +void +SDMAEngine::copyCleanup(uint8_t *dmaBuffer) +{ + delete [] dmaBuffer; +} + /* Implements an indirect buffer packet. */ void SDMAEngine::indirectBuffer(SDMAQueue *q, sdmaIndirectBuffer *pkt) @@ -1020,11 +1058,28 @@ SDMAEngine::ptePdeDone(SDMAQueue *q, sdmaPtePde *pkt, uint64_t *dmaBuffer) DPRINTF(SDMAEngine, "PtePde packet completed to %p, %d 2dwords\n", pkt->dest, pkt->count); - delete [] dmaBuffer; + auto cleanup_cb = new EventFunctionWrapper( + [ = ]{ ptePdeCleanup(dmaBuffer); }, name()); + + auto system_ptr = gpuDevice->CP()->system(); + if (!system_ptr->isAtomicMode()) { + warn_once("SDMA cleanup assumes 2000 tick timing for completion." + " This has not been tested in timing mode\n"); + } + + // Only 2000 ticks should be necessary, but add additional padding. 
+ schedule(cleanup_cb, curTick() + 10000); + delete pkt; decodeNext(q); } +void +SDMAEngine::ptePdeCleanup(uint64_t *dmaBuffer) +{ + delete [] dmaBuffer; +} + void SDMAEngine::atomic(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt) { diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh index 9407b97d73..d5fe646fac 100644 --- a/src/dev/amdgpu/sdma_engine.hh +++ b/src/dev/amdgpu/sdma_engine.hh @@ -227,9 +227,11 @@ class SDMAEngine : public DmaVirtDevice void write(SDMAQueue *q, sdmaWrite *pkt); void writeReadData(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer); void writeDone(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer); + void writeCleanup(uint32_t *dmaBuffer); void copy(SDMAQueue *q, sdmaCopy *pkt); void copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer); void copyDone(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer); + void copyCleanup(uint8_t *dmaBuffer); void indirectBuffer(SDMAQueue *q, sdmaIndirectBuffer *pkt); void fence(SDMAQueue *q, sdmaFence *pkt); void fenceDone(SDMAQueue *q, sdmaFence *pkt); @@ -243,6 +245,7 @@ class SDMAEngine : public DmaVirtDevice bool pollRegMemFunc(uint32_t value, uint32_t reference, uint32_t func); void ptePde(SDMAQueue *q, sdmaPtePde *pkt); void ptePdeDone(SDMAQueue *q, sdmaPtePde *pkt, uint64_t *dmaBuffer); + void ptePdeCleanup(uint64_t *dmaBuffer); void atomic(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt); void atomicData(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt, uint64_t *dmaBuffer); From 84fedecafef8529bc48361c8a86445b5cf66d72c Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 7 Aug 2024 12:52:48 -0700 Subject: [PATCH 3/3] gpu-compute: Update Requests for invalidations The SQC and TCC invalidations share a Request pointer which they both modify. This can cause some problems, so use a different request pointer for each invalidate. The setContext call is also removed as the value being assigned to it is uninitialized. 
Change-Id: I82ea7aa44a4f4515c1560993caa26cc6a89355af --- src/gpu-compute/compute_unit.cc | 4 ---- src/gpu-compute/shader.cc | 14 +++++++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 807fd21d4d..598864f9f2 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -409,8 +409,6 @@ ComputeUnit::doInvalidate(RequestPtr req, int kernId){ // kern_id will be used in inv responses gpuDynInst->kern_id = kernId; - // update contextId field - req->setContext(gpuDynInst->wfDynId); injectGlobalMemFence(gpuDynInst, true, req); } @@ -438,8 +436,6 @@ ComputeUnit::doSQCInvalidate(RequestPtr req, int kernId){ // kern_id will be used in inv responses gpuDynInst->kern_id = kernId; - // update contextId field - req->setContext(gpuDynInst->wfDynId); gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar); scalarMemoryPipe.injectScalarMemFence(gpuDynInst, true, req); diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index 13b03b0a34..b7108efdf9 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -214,19 +214,23 @@ Shader::prepareInvalidate(HSAQueueEntry *task) { for (int i_cu = 0; i_cu < n_cu; ++i_cu) { // create a request to hold INV info; the request's fields will // be updated in cu before use - auto req = std::make_shared(0, 0, 0, - cuList[i_cu]->requestorId(), - 0, -1); + auto tcc_req = std::make_shared(0, 0, 0, + cuList[i_cu]->requestorId(), + 0, -1); _dispatcher.updateInvCounter(kernId, +1); // all necessary INV flags are all set now, call cu to execute - cuList[i_cu]->doInvalidate(req, task->dispatchId()); + cuList[i_cu]->doInvalidate(tcc_req, task->dispatchId()); // A set of CUs share a single SQC cache. 
Send a single invalidate // request to each SQC + auto sqc_req = std::make_shared(0, 0, 0, + cuList[i_cu]->requestorId(), + 0, -1); + if ((i_cu % n_cu_per_sqc) == 0) { - cuList[i_cu]->doSQCInvalidate(req, task->dispatchId()); + cuList[i_cu]->doSQCInvalidate(sqc_req, task->dispatchId()); } // I don't like this. This is intrusive coding.