dev-amdgpu: Add cleanup events for SDMA
SDMA packets which use dmaVirtWrites call their completion event before the write takes place in the Ruby protocol. This causes a use-after-free issue, corrupting random memory locations and leading to random errors. This commit adds a cleanup event for each packet that uses DMA and sets the cleanup latency to 10000 ticks. In atomic mode, the writes complete exactly 2000 ticks after the completion event is called, and therefore a fixed latency can be used. This is not tested in timing mode, which does not work with GPUFS at the moment, so a warning is added to give an idea of where to look in case the same issue occurs once timing mode is supported. Change-Id: I9ee2689f2becc46bb7794b18b31205f1606109d8
This commit is contained in:
@@ -38,6 +38,7 @@
|
||||
#include "dev/amdgpu/interrupt_handler.hh"
|
||||
#include "dev/amdgpu/sdma_commands.hh"
|
||||
#include "dev/amdgpu/sdma_mmio.hh"
|
||||
#include "gpu-compute/gpu_command_processor.hh"
|
||||
#include "mem/packet.hh"
|
||||
#include "mem/packet_access.hh"
|
||||
#include "params/SDMAEngine.hh"
|
||||
@@ -653,11 +654,29 @@ SDMAEngine::writeDone(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer)
|
||||
{
    DPRINTF(SDMAEngine, "Write packet completed to %p, %d dwords\n",
            pkt->dest, pkt->count);

    // Do NOT delete dmaBuffer here: in the Ruby protocol this completion
    // callback can run before the write actually lands in memory, so
    // freeing the buffer now is a use-after-free (and freeing it both here
    // and in writeCleanup would be a double free). Instead, defer the
    // free to a cleanup event scheduled below.
    auto cleanup_cb = new EventFunctionWrapper(
        [ = ]{ writeCleanup(dmaBuffer); }, name());

    auto system_ptr = gpuDevice->CP()->system();
    if (!system_ptr->isAtomicMode()) {
        warn_once("SDMA cleanup assumes 2000 tick timing for completion."
                  " This has not been tested in timing mode\n");
    }

    // Only 2000 ticks should be necessary, but add additional padding.
    schedule(cleanup_cb, curTick() + 10000);

    delete pkt;
    decodeNext(q);
}
|
||||
|
||||
/**
 * Deferred cleanup event callback for write packets. Frees the DMA buffer
 * after the fixed post-completion delay scheduled by writeDone(), by which
 * point the write is assumed to have landed in memory (see the warn_once
 * in writeDone() about timing mode).
 */
void
SDMAEngine::writeCleanup(uint32_t *dmaBuffer)
{
    delete [] dmaBuffer;
}
|
||||
|
||||
/* Implements a copy packet. */
|
||||
void
|
||||
SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt)
|
||||
@@ -747,6 +766,7 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
|
||||
buffer_ptr += gen.size();
|
||||
}
|
||||
} else {
|
||||
DPRINTF(SDMAEngine, "Copying to host address %#lx\n", pkt->dest);
|
||||
auto cb = new DmaVirtCallback<uint64_t>(
|
||||
[ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); });
|
||||
dmaWriteVirt(pkt->dest, pkt->count, cb, (void *)dmaBuffer);
|
||||
@@ -772,11 +792,29 @@ SDMAEngine::copyDone(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
|
||||
{
    DPRINTF(SDMAEngine, "Copy completed to %p, %d dwords\n",
            pkt->dest, pkt->count);

    // Do NOT delete dmaBuffer here: in the Ruby protocol this completion
    // callback can run before the write actually lands in memory, so
    // freeing the buffer now is a use-after-free (and freeing it both here
    // and in copyCleanup would be a double free). Instead, defer the free
    // to a cleanup event scheduled below.
    auto cleanup_cb = new EventFunctionWrapper(
        [ = ]{ copyCleanup(dmaBuffer); }, name());

    auto system_ptr = gpuDevice->CP()->system();
    if (!system_ptr->isAtomicMode()) {
        warn_once("SDMA cleanup assumes 2000 tick timing for completion."
                  " This has not been tested in timing mode\n");
    }

    // Only 2000 ticks should be necessary, but add additional padding.
    schedule(cleanup_cb, curTick() + 10000);

    delete pkt;
    decodeNext(q);
}
|
||||
|
||||
/**
 * Deferred cleanup event callback for copy packets. Frees the DMA buffer
 * after the fixed post-completion delay scheduled by copyDone(), by which
 * point the write is assumed to have landed in memory (see the warn_once
 * in copyDone() about timing mode).
 */
void
SDMAEngine::copyCleanup(uint8_t *dmaBuffer)
{
    delete [] dmaBuffer;
}
|
||||
|
||||
/* Implements an indirect buffer packet. */
|
||||
void
|
||||
SDMAEngine::indirectBuffer(SDMAQueue *q, sdmaIndirectBuffer *pkt)
|
||||
@@ -1020,11 +1058,28 @@ SDMAEngine::ptePdeDone(SDMAQueue *q, sdmaPtePde *pkt, uint64_t *dmaBuffer)
|
||||
DPRINTF(SDMAEngine, "PtePde packet completed to %p, %d 2dwords\n",
|
||||
pkt->dest, pkt->count);
|
||||
|
||||
delete [] dmaBuffer;
|
||||
auto cleanup_cb = new EventFunctionWrapper(
|
||||
[ = ]{ ptePdeCleanup(dmaBuffer); }, name());
|
||||
|
||||
auto system_ptr = gpuDevice->CP()->system();
|
||||
if (!system_ptr->isAtomicMode()) {
|
||||
warn_once("SDMA cleanup assumes 2000 tick timing for completion."
|
||||
" This has not been tested in timing mode\n");
|
||||
}
|
||||
|
||||
// Only 2000 ticks should be necessary, but add additional padding.
|
||||
schedule(cleanup_cb, curTick() + 10000);
|
||||
|
||||
delete pkt;
|
||||
decodeNext(q);
|
||||
}
|
||||
|
||||
/**
 * Deferred cleanup event callback for PTE/PDE packets. Frees the DMA
 * buffer after the fixed post-completion delay scheduled by ptePdeDone(),
 * by which point the write is assumed to have landed in memory (see the
 * warn_once in ptePdeDone() about timing mode).
 */
void
SDMAEngine::ptePdeCleanup(uint64_t *dmaBuffer)
{
    delete [] dmaBuffer;
}
|
||||
|
||||
void
|
||||
SDMAEngine::atomic(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt)
|
||||
{
|
||||
|
||||
@@ -227,9 +227,11 @@ class SDMAEngine : public DmaVirtDevice
|
||||
    void write(SDMAQueue *q, sdmaWrite *pkt);
    void writeReadData(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer);
    void writeDone(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer);
    // Deferred event callback that frees a write packet's DMA buffer.
    void writeCleanup(uint32_t *dmaBuffer);
    void copy(SDMAQueue *q, sdmaCopy *pkt);
    void copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer);
    void copyDone(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer);
    // Deferred event callback that frees a copy packet's DMA buffer.
    void copyCleanup(uint8_t *dmaBuffer);
    void indirectBuffer(SDMAQueue *q, sdmaIndirectBuffer *pkt);
    void fence(SDMAQueue *q, sdmaFence *pkt);
    void fenceDone(SDMAQueue *q, sdmaFence *pkt);
|
||||
@@ -243,6 +245,7 @@ class SDMAEngine : public DmaVirtDevice
|
||||
    bool pollRegMemFunc(uint32_t value, uint32_t reference, uint32_t func);
    void ptePde(SDMAQueue *q, sdmaPtePde *pkt);
    void ptePdeDone(SDMAQueue *q, sdmaPtePde *pkt, uint64_t *dmaBuffer);
    // Deferred event callback that frees a PTE/PDE packet's DMA buffer.
    void ptePdeCleanup(uint64_t *dmaBuffer);
    void atomic(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt);
    void atomicData(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
                    uint64_t *dmaBuffer);
|
||||
|
||||
Reference in New Issue
Block a user