dev-amdgpu: Fix issues found by address sanitizer (#1430)
These commits primarily fix the SDMA engine, which was (1) using pointer arithmetic on a variable returned by new and then attempting to free the modified pointer, and (2) using a buffer after it was freed because the DMA device called the completion event before Ruby had actually completed. Some minor fixes are included: stop using an uninitialized value as the packet context, and stop using the same request pointer for two separate packets for GPU invalidations.
This commit is contained in:
@@ -38,6 +38,7 @@
|
||||
#include "dev/amdgpu/interrupt_handler.hh"
|
||||
#include "dev/amdgpu/sdma_commands.hh"
|
||||
#include "dev/amdgpu/sdma_mmio.hh"
|
||||
#include "gpu-compute/gpu_command_processor.hh"
|
||||
#include "mem/packet.hh"
|
||||
#include "mem/packet_access.hh"
|
||||
#include "params/SDMAEngine.hh"
|
||||
@@ -653,11 +654,29 @@ SDMAEngine::writeDone(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer)
|
||||
{
|
||||
DPRINTF(SDMAEngine, "Write packet completed to %p, %d dwords\n",
|
||||
pkt->dest, pkt->count);
|
||||
delete []dmaBuffer;
|
||||
|
||||
auto cleanup_cb = new EventFunctionWrapper(
|
||||
[ = ]{ writeCleanup(dmaBuffer); }, name());
|
||||
|
||||
auto system_ptr = gpuDevice->CP()->system();
|
||||
if (!system_ptr->isAtomicMode()) {
|
||||
warn_once("SDMA cleanup assumes 2000 tick timing for completion."
|
||||
" This has not been tested in timing mode\n");
|
||||
}
|
||||
|
||||
// Only 2000 ticks should be necessary, but add additional padding.
|
||||
schedule(cleanup_cb, curTick() + 10000);
|
||||
|
||||
delete pkt;
|
||||
decodeNext(q);
|
||||
}
|
||||
|
||||
// Delayed-cleanup callback for SDMA write packets: frees the DMA staging
// buffer only after the scheduled cleanup event fires, i.e. once the memory
// system is assumed to be done reading it (fixes the ASan use-after-free
// reported in writeDone, which previously deleted the buffer immediately).
void
|
||||
SDMAEngine::writeCleanup(uint32_t *dmaBuffer)
|
||||
{
|
||||
delete [] dmaBuffer;
|
||||
}
|
||||
|
||||
/* Implements a copy packet. */
|
||||
void
|
||||
SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt)
|
||||
@@ -686,6 +705,7 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt)
|
||||
// Copy the minimum page size at a time in case the physical addresses
|
||||
// are not contiguous.
|
||||
ChunkGenerator gen(pkt->source, pkt->count, AMDGPU_MMHUB_PAGE_SIZE);
|
||||
uint8_t *buffer_ptr = dmaBuffer;
|
||||
for (; !gen.done(); gen.next()) {
|
||||
Addr chunk_addr = getDeviceAddress(gen.addr());
|
||||
assert(chunk_addr);
|
||||
@@ -693,10 +713,10 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt)
|
||||
DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n",
|
||||
gen.size(), gen.addr(), chunk_addr);
|
||||
|
||||
gpuDevice->getMemMgr()->readRequest(chunk_addr, dmaBuffer,
|
||||
gpuDevice->getMemMgr()->readRequest(chunk_addr, buffer_ptr,
|
||||
gen.size(), 0,
|
||||
gen.last() ? cb : nullptr);
|
||||
dmaBuffer += gen.size();
|
||||
buffer_ptr += gen.size();
|
||||
}
|
||||
} else {
|
||||
auto cb = new DmaVirtCallback<uint64_t>(
|
||||
@@ -731,6 +751,7 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
|
||||
// Copy the minimum page size at a time in case the physical addresses
|
||||
// are not contiguous.
|
||||
ChunkGenerator gen(pkt->dest, pkt->count, AMDGPU_MMHUB_PAGE_SIZE);
|
||||
uint8_t *buffer_ptr = dmaBuffer;
|
||||
for (; !gen.done(); gen.next()) {
|
||||
Addr chunk_addr = getDeviceAddress(gen.addr());
|
||||
assert(chunk_addr);
|
||||
@@ -738,13 +759,14 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
|
||||
DPRINTF(SDMAEngine, "Copying chunk of %d bytes to %#lx (%#lx)\n",
|
||||
gen.size(), gen.addr(), chunk_addr);
|
||||
|
||||
gpuDevice->getMemMgr()->writeRequest(chunk_addr, dmaBuffer,
|
||||
gpuDevice->getMemMgr()->writeRequest(chunk_addr, buffer_ptr,
|
||||
gen.size(), 0,
|
||||
gen.last() ? cb : nullptr);
|
||||
|
||||
dmaBuffer += gen.size();
|
||||
buffer_ptr += gen.size();
|
||||
}
|
||||
} else {
|
||||
DPRINTF(SDMAEngine, "Copying to host address %#lx\n", pkt->dest);
|
||||
auto cb = new DmaVirtCallback<uint64_t>(
|
||||
[ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); });
|
||||
dmaWriteVirt(pkt->dest, pkt->count, cb, (void *)dmaBuffer);
|
||||
@@ -770,11 +792,29 @@ SDMAEngine::copyDone(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
|
||||
{
|
||||
DPRINTF(SDMAEngine, "Copy completed to %p, %d dwords\n",
|
||||
pkt->dest, pkt->count);
|
||||
delete []dmaBuffer;
|
||||
|
||||
auto cleanup_cb = new EventFunctionWrapper(
|
||||
[ = ]{ copyCleanup(dmaBuffer); }, name());
|
||||
|
||||
auto system_ptr = gpuDevice->CP()->system();
|
||||
if (!system_ptr->isAtomicMode()) {
|
||||
warn_once("SDMA cleanup assumes 2000 tick timing for completion."
|
||||
" This has not been tested in timing mode\n");
|
||||
}
|
||||
|
||||
// Only 2000 ticks should be necessary, but add additional padding.
|
||||
schedule(cleanup_cb, curTick() + 10000);
|
||||
|
||||
delete pkt;
|
||||
decodeNext(q);
|
||||
}
|
||||
|
||||
// Delayed-cleanup callback for SDMA copy packets: frees the DMA staging
// buffer only after the scheduled cleanup event fires, rather than at
// copyDone time, so in-flight requests cannot touch freed memory
// (the use-after-free found by AddressSanitizer).
void
|
||||
SDMAEngine::copyCleanup(uint8_t *dmaBuffer)
|
||||
{
|
||||
delete [] dmaBuffer;
|
||||
}
|
||||
|
||||
/* Implements an indirect buffer packet. */
|
||||
void
|
||||
SDMAEngine::indirectBuffer(SDMAQueue *q, sdmaIndirectBuffer *pkt)
|
||||
@@ -1018,11 +1058,28 @@ SDMAEngine::ptePdeDone(SDMAQueue *q, sdmaPtePde *pkt, uint64_t *dmaBuffer)
|
||||
DPRINTF(SDMAEngine, "PtePde packet completed to %p, %d 2dwords\n",
|
||||
pkt->dest, pkt->count);
|
||||
|
||||
delete []dmaBuffer;
|
||||
auto cleanup_cb = new EventFunctionWrapper(
|
||||
[ = ]{ ptePdeCleanup(dmaBuffer); }, name());
|
||||
|
||||
auto system_ptr = gpuDevice->CP()->system();
|
||||
if (!system_ptr->isAtomicMode()) {
|
||||
warn_once("SDMA cleanup assumes 2000 tick timing for completion."
|
||||
" This has not been tested in timing mode\n");
|
||||
}
|
||||
|
||||
// Only 2000 ticks should be necessary, but add additional padding.
|
||||
schedule(cleanup_cb, curTick() + 10000);
|
||||
|
||||
delete pkt;
|
||||
decodeNext(q);
|
||||
}
|
||||
|
||||
// Delayed-cleanup callback for SDMA PTE/PDE packets: frees the DMA staging
// buffer only after the scheduled cleanup event fires, instead of deleting
// it synchronously in ptePdeDone before the memory system has finished
// with it (the use-after-free found by AddressSanitizer).
void
|
||||
SDMAEngine::ptePdeCleanup(uint64_t *dmaBuffer)
|
||||
{
|
||||
delete [] dmaBuffer;
|
||||
}
|
||||
|
||||
void
|
||||
SDMAEngine::atomic(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt)
|
||||
{
|
||||
@@ -1108,6 +1165,7 @@ SDMAEngine::constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header)
|
||||
// Copy the minimum page size at a time in case the physical addresses
|
||||
// are not contiguous.
|
||||
ChunkGenerator gen(pkt->addr, fill_bytes, AMDGPU_MMHUB_PAGE_SIZE);
|
||||
uint8_t *fill_data_ptr = fill_data;
|
||||
for (; !gen.done(); gen.next()) {
|
||||
Addr chunk_addr = getDeviceAddress(gen.addr());
|
||||
assert(chunk_addr);
|
||||
@@ -1115,10 +1173,10 @@ SDMAEngine::constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header)
|
||||
DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n",
|
||||
gen.size(), gen.addr(), chunk_addr);
|
||||
|
||||
gpuDevice->getMemMgr()->writeRequest(chunk_addr, fill_data,
|
||||
gpuDevice->getMemMgr()->writeRequest(chunk_addr, fill_data_ptr,
|
||||
gen.size(), 0,
|
||||
gen.last() ? cb : nullptr);
|
||||
fill_data += gen.size();
|
||||
fill_data_ptr += gen.size();
|
||||
}
|
||||
} else {
|
||||
DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to host at %lx\n",
|
||||
|
||||
@@ -227,9 +227,11 @@ class SDMAEngine : public DmaVirtDevice
|
||||
void write(SDMAQueue *q, sdmaWrite *pkt);
|
||||
void writeReadData(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer);
|
||||
void writeDone(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer);
|
||||
void writeCleanup(uint32_t *dmaBuffer);
|
||||
void copy(SDMAQueue *q, sdmaCopy *pkt);
|
||||
void copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer);
|
||||
void copyDone(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer);
|
||||
void copyCleanup(uint8_t *dmaBuffer);
|
||||
void indirectBuffer(SDMAQueue *q, sdmaIndirectBuffer *pkt);
|
||||
void fence(SDMAQueue *q, sdmaFence *pkt);
|
||||
void fenceDone(SDMAQueue *q, sdmaFence *pkt);
|
||||
@@ -243,6 +245,7 @@ class SDMAEngine : public DmaVirtDevice
|
||||
bool pollRegMemFunc(uint32_t value, uint32_t reference, uint32_t func);
|
||||
void ptePde(SDMAQueue *q, sdmaPtePde *pkt);
|
||||
void ptePdeDone(SDMAQueue *q, sdmaPtePde *pkt, uint64_t *dmaBuffer);
|
||||
void ptePdeCleanup(uint64_t *dmaBuffer);
|
||||
void atomic(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt);
|
||||
void atomicData(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
|
||||
uint64_t *dmaBuffer);
|
||||
|
||||
@@ -409,8 +409,6 @@ ComputeUnit::doInvalidate(RequestPtr req, int kernId){
|
||||
|
||||
// kern_id will be used in inv responses
|
||||
gpuDynInst->kern_id = kernId;
|
||||
// update contextId field
|
||||
req->setContext(gpuDynInst->wfDynId);
|
||||
|
||||
injectGlobalMemFence(gpuDynInst, true, req);
|
||||
}
|
||||
@@ -438,8 +436,6 @@ ComputeUnit::doSQCInvalidate(RequestPtr req, int kernId){
|
||||
|
||||
// kern_id will be used in inv responses
|
||||
gpuDynInst->kern_id = kernId;
|
||||
// update contextId field
|
||||
req->setContext(gpuDynInst->wfDynId);
|
||||
|
||||
gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar);
|
||||
scalarMemoryPipe.injectScalarMemFence(gpuDynInst, true, req);
|
||||
|
||||
@@ -214,19 +214,23 @@ Shader::prepareInvalidate(HSAQueueEntry *task) {
|
||||
for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
|
||||
// create a request to hold INV info; the request's fields will
|
||||
// be updated in cu before use
|
||||
auto req = std::make_shared<Request>(0, 0, 0,
|
||||
cuList[i_cu]->requestorId(),
|
||||
0, -1);
|
||||
auto tcc_req = std::make_shared<Request>(0, 0, 0,
|
||||
cuList[i_cu]->requestorId(),
|
||||
0, -1);
|
||||
|
||||
_dispatcher.updateInvCounter(kernId, +1);
|
||||
// all necessary INV flags are all set now, call cu to execute
|
||||
cuList[i_cu]->doInvalidate(req, task->dispatchId());
|
||||
cuList[i_cu]->doInvalidate(tcc_req, task->dispatchId());
|
||||
|
||||
|
||||
// A set of CUs share a single SQC cache. Send a single invalidate
|
||||
// request to each SQC
|
||||
auto sqc_req = std::make_shared<Request>(0, 0, 0,
|
||||
cuList[i_cu]->requestorId(),
|
||||
0, -1);
|
||||
|
||||
if ((i_cu % n_cu_per_sqc) == 0) {
|
||||
cuList[i_cu]->doSQCInvalidate(req, task->dispatchId());
|
||||
cuList[i_cu]->doSQCInvalidate(sqc_req, task->dispatchId());
|
||||
}
|
||||
|
||||
// I don't like this. This is intrusive coding.
|
||||
|
||||
Reference in New Issue
Block a user