dev-amdgpu: Add cleanup events for SDMA

SDMA packets which use dmaVirtWrites call their completion event before
the write takes place in the Ruby protocol. This causes a use-after-free
issue, corrupting random memory locations and leading to random errors. This
commit adds a cleanup event for each packet that uses DMA and sets the
cleanup latency as 10000 ticks. In atomic mode, the writes complete
exactly 2000 ticks after the completion event is called and therefore a
fixed latency can be used. This is not tested with timing mode, which
does not work with GPUFS at the moment, so a warning is added to give an
idea where to look in case the same issue occurs once timing mode is
supported.

Change-Id: I9ee2689f2becc46bb7794b18b31205f1606109d8
This commit is contained in:
Matthew Poremba
2024-08-07 12:49:00 -07:00
parent 0d0b68266c
commit db0d5f19cf
2 changed files with 61 additions and 3 deletions

View File

@@ -38,6 +38,7 @@
#include "dev/amdgpu/interrupt_handler.hh"
#include "dev/amdgpu/sdma_commands.hh"
#include "dev/amdgpu/sdma_mmio.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "mem/packet.hh"
#include "mem/packet_access.hh"
#include "params/SDMAEngine.hh"
@@ -653,11 +654,29 @@ SDMAEngine::writeDone(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer)
{
DPRINTF(SDMAEngine, "Write packet completed to %p, %d dwords\n",
pkt->dest, pkt->count);
delete [] dmaBuffer;
auto cleanup_cb = new EventFunctionWrapper(
[ = ]{ writeCleanup(dmaBuffer); }, name());
auto system_ptr = gpuDevice->CP()->system();
if (!system_ptr->isAtomicMode()) {
warn_once("SDMA cleanup assumes 2000 tick timing for completion."
" This has not been tested in timing mode\n");
}
// Only 2000 ticks should be necessary, but add additional padding.
schedule(cleanup_cb, curTick() + 10000);
delete pkt;
decodeNext(q);
}
void
SDMAEngine::writeCleanup(uint32_t *dmaBuffer)
{
    // Deferred free of the DMA staging buffer, scheduled from writeDone()
    // 10000 ticks after the completion callback so the Ruby write has
    // drained before the memory is released (avoids the use-after-free
    // this commit fixes).
    delete [] dmaBuffer;
}
/* Implements a copy packet. */
void
SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt)
@@ -747,6 +766,7 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
buffer_ptr += gen.size();
}
} else {
DPRINTF(SDMAEngine, "Copying to host address %#lx\n", pkt->dest);
auto cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); });
dmaWriteVirt(pkt->dest, pkt->count, cb, (void *)dmaBuffer);
@@ -772,11 +792,29 @@ SDMAEngine::copyDone(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
{
DPRINTF(SDMAEngine, "Copy completed to %p, %d dwords\n",
pkt->dest, pkt->count);
delete [] dmaBuffer;
auto cleanup_cb = new EventFunctionWrapper(
[ = ]{ copyCleanup(dmaBuffer); }, name());
auto system_ptr = gpuDevice->CP()->system();
if (!system_ptr->isAtomicMode()) {
warn_once("SDMA cleanup assumes 2000 tick timing for completion."
" This has not been tested in timing mode\n");
}
// Only 2000 ticks should be necessary, but add additional padding.
schedule(cleanup_cb, curTick() + 10000);
delete pkt;
decodeNext(q);
}
void
SDMAEngine::copyCleanup(uint8_t *dmaBuffer)
{
    // Deferred free of the copy packet's DMA staging buffer, scheduled from
    // copyDone() 10000 ticks after completion so outstanding Ruby writes
    // have finished before the buffer is released.
    delete [] dmaBuffer;
}
/* Implements an indirect buffer packet. */
void
SDMAEngine::indirectBuffer(SDMAQueue *q, sdmaIndirectBuffer *pkt)
@@ -1020,11 +1058,28 @@ SDMAEngine::ptePdeDone(SDMAQueue *q, sdmaPtePde *pkt, uint64_t *dmaBuffer)
DPRINTF(SDMAEngine, "PtePde packet completed to %p, %d 2dwords\n",
pkt->dest, pkt->count);
delete [] dmaBuffer;
auto cleanup_cb = new EventFunctionWrapper(
[ = ]{ ptePdeCleanup(dmaBuffer); }, name());
auto system_ptr = gpuDevice->CP()->system();
if (!system_ptr->isAtomicMode()) {
warn_once("SDMA cleanup assumes 2000 tick timing for completion."
" This has not been tested in timing mode\n");
}
// Only 2000 ticks should be necessary, but add additional padding.
schedule(cleanup_cb, curTick() + 10000);
delete pkt;
decodeNext(q);
}
void
SDMAEngine::ptePdeCleanup(uint64_t *dmaBuffer)
{
    // Deferred free of the PTE/PDE packet's DMA buffer, scheduled from
    // ptePdeDone() 10000 ticks after completion so in-flight writes have
    // drained before the memory is reclaimed.
    delete [] dmaBuffer;
}
void
SDMAEngine::atomic(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt)
{

View File

@@ -227,9 +227,11 @@ class SDMAEngine : public DmaVirtDevice
void write(SDMAQueue *q, sdmaWrite *pkt);
void writeReadData(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer);
void writeDone(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer);
void writeCleanup(uint32_t *dmaBuffer);
void copy(SDMAQueue *q, sdmaCopy *pkt);
void copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer);
void copyDone(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer);
void copyCleanup(uint8_t *dmaBuffer);
void indirectBuffer(SDMAQueue *q, sdmaIndirectBuffer *pkt);
void fence(SDMAQueue *q, sdmaFence *pkt);
void fenceDone(SDMAQueue *q, sdmaFence *pkt);
@@ -243,6 +245,7 @@ class SDMAEngine : public DmaVirtDevice
bool pollRegMemFunc(uint32_t value, uint32_t reference, uint32_t func);
void ptePde(SDMAQueue *q, sdmaPtePde *pkt);
void ptePdeDone(SDMAQueue *q, sdmaPtePde *pkt, uint64_t *dmaBuffer);
void ptePdeCleanup(uint64_t *dmaBuffer);
void atomic(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt);
void atomicData(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
uint64_t *dmaBuffer);