diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc
index e1e123df48..6955837a09 100644
--- a/src/dev/amdgpu/sdma_engine.cc
+++ b/src/dev/amdgpu/sdma_engine.cc
@@ -38,6 +38,7 @@
 #include "dev/amdgpu/interrupt_handler.hh"
 #include "dev/amdgpu/sdma_commands.hh"
 #include "dev/amdgpu/sdma_mmio.hh"
+#include "gpu-compute/gpu_command_processor.hh"
 #include "mem/packet.hh"
 #include "mem/packet_access.hh"
 #include "params/SDMAEngine.hh"
@@ -653,11 +654,29 @@ SDMAEngine::writeDone(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer)
 {
     DPRINTF(SDMAEngine, "Write packet completed to %p, %d dwords\n",
             pkt->dest, pkt->count);
-    delete []dmaBuffer;
+
+    auto cleanup_cb = new EventFunctionWrapper(
+        [ = ]{ writeCleanup(dmaBuffer); }, name());
+
+    auto system_ptr = gpuDevice->CP()->system();
+    if (!system_ptr->isAtomicMode()) {
+        warn_once("SDMA cleanup assumes 2000 tick timing for completion."
+                  " This has not been tested in timing mode\n");
+    }
+
+    // Only 2000 ticks should be necessary, but add additional padding.
+    schedule(cleanup_cb, curTick() + 10000);
+
     delete pkt;
     decodeNext(q);
 }
 
+void
+SDMAEngine::writeCleanup(uint32_t *dmaBuffer)
+{
+    delete [] dmaBuffer;
+}
+
 /* Implements a copy packet. */
 void
 SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt)
@@ -686,6 +705,7 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt)
         // Copy the minimum page size at a time in case the physical addresses
         // are not contiguous.
         ChunkGenerator gen(pkt->source, pkt->count, AMDGPU_MMHUB_PAGE_SIZE);
+        uint8_t *buffer_ptr = dmaBuffer;
         for (; !gen.done(); gen.next()) {
             Addr chunk_addr = getDeviceAddress(gen.addr());
             assert(chunk_addr);
@@ -693,10 +713,10 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt)
             DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n",
                     gen.size(), gen.addr(), chunk_addr);
 
-            gpuDevice->getMemMgr()->readRequest(chunk_addr, dmaBuffer,
+            gpuDevice->getMemMgr()->readRequest(chunk_addr, buffer_ptr,
                                                 gen.size(), 0, gen.last() ?
                                                 cb : nullptr);
-            dmaBuffer += gen.size();
+            buffer_ptr += gen.size();
         }
     } else {
         auto cb = new DmaVirtCallback<uint64_t>(
@@ -731,6 +751,7 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
         // Copy the minimum page size at a time in case the physical addresses
         // are not contiguous.
         ChunkGenerator gen(pkt->dest, pkt->count, AMDGPU_MMHUB_PAGE_SIZE);
+        uint8_t *buffer_ptr = dmaBuffer;
         for (; !gen.done(); gen.next()) {
             Addr chunk_addr = getDeviceAddress(gen.addr());
             assert(chunk_addr);
@@ -738,13 +759,14 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
             DPRINTF(SDMAEngine, "Copying chunk of %d bytes to %#lx (%#lx)\n",
                     gen.size(), gen.addr(), chunk_addr);
 
-            gpuDevice->getMemMgr()->writeRequest(chunk_addr, dmaBuffer,
+            gpuDevice->getMemMgr()->writeRequest(chunk_addr, buffer_ptr,
                                                  gen.size(), 0, gen.last() ?
                                                  cb : nullptr);
-            dmaBuffer += gen.size();
+            buffer_ptr += gen.size();
         }
     } else {
+        DPRINTF(SDMAEngine, "Copying to host address %#lx\n", pkt->dest);
         auto cb = new DmaVirtCallback<uint64_t>(
             [ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); });
         dmaWriteVirt(pkt->dest, pkt->count, cb, (void *)dmaBuffer);
@@ -770,11 +792,29 @@ SDMAEngine::copyDone(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
 {
     DPRINTF(SDMAEngine, "Copy completed to %p, %d dwords\n",
             pkt->dest, pkt->count);
-    delete []dmaBuffer;
+
+    auto cleanup_cb = new EventFunctionWrapper(
+        [ = ]{ copyCleanup(dmaBuffer); }, name());
+
+    auto system_ptr = gpuDevice->CP()->system();
+    if (!system_ptr->isAtomicMode()) {
+        warn_once("SDMA cleanup assumes 2000 tick timing for completion."
+                  " This has not been tested in timing mode\n");
+    }
+
+    // Only 2000 ticks should be necessary, but add additional padding.
+    schedule(cleanup_cb, curTick() + 10000);
+
     delete pkt;
     decodeNext(q);
 }
 
+void
+SDMAEngine::copyCleanup(uint8_t *dmaBuffer)
+{
+    delete [] dmaBuffer;
+}
+
 /* Implements an indirect buffer packet. */
 void
 SDMAEngine::indirectBuffer(SDMAQueue *q, sdmaIndirectBuffer *pkt)
@@ -1018,11 +1058,28 @@ SDMAEngine::ptePdeDone(SDMAQueue *q, sdmaPtePde *pkt, uint64_t *dmaBuffer)
 
     DPRINTF(SDMAEngine, "PtePde packet completed to %p, %d 2dwords\n",
             pkt->dest, pkt->count);
-    delete []dmaBuffer;
+    auto cleanup_cb = new EventFunctionWrapper(
+        [ = ]{ ptePdeCleanup(dmaBuffer); }, name());
+
+    auto system_ptr = gpuDevice->CP()->system();
+    if (!system_ptr->isAtomicMode()) {
+        warn_once("SDMA cleanup assumes 2000 tick timing for completion."
+                  " This has not been tested in timing mode\n");
+    }
+
+    // Only 2000 ticks should be necessary, but add additional padding.
+    schedule(cleanup_cb, curTick() + 10000);
+
     delete pkt;
     decodeNext(q);
 }
 
+void
+SDMAEngine::ptePdeCleanup(uint64_t *dmaBuffer)
+{
+    delete [] dmaBuffer;
+}
+
 void
 SDMAEngine::atomic(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt)
 {
@@ -1108,6 +1165,7 @@ SDMAEngine::constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header)
         // Copy the minimum page size at a time in case the physical addresses
         // are not contiguous.
         ChunkGenerator gen(pkt->addr, fill_bytes, AMDGPU_MMHUB_PAGE_SIZE);
+        uint8_t *fill_data_ptr = fill_data;
         for (; !gen.done(); gen.next()) {
             Addr chunk_addr = getDeviceAddress(gen.addr());
             assert(chunk_addr);
@@ -1115,10 +1173,10 @@ SDMAEngine::constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header)
             DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n",
                     gen.size(), gen.addr(), chunk_addr);
 
-            gpuDevice->getMemMgr()->writeRequest(chunk_addr, fill_data,
+            gpuDevice->getMemMgr()->writeRequest(chunk_addr, fill_data_ptr,
                                                  gen.size(), 0, gen.last() ?
                                                  cb : nullptr);
-            fill_data += gen.size();
+            fill_data_ptr += gen.size();
         }
     } else {
         DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to host at %lx\n",
diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh
index 9407b97d73..d5fe646fac 100644
--- a/src/dev/amdgpu/sdma_engine.hh
+++ b/src/dev/amdgpu/sdma_engine.hh
@@ -227,9 +227,11 @@ class SDMAEngine : public DmaVirtDevice
     void write(SDMAQueue *q, sdmaWrite *pkt);
     void writeReadData(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer);
     void writeDone(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer);
+    void writeCleanup(uint32_t *dmaBuffer);
     void copy(SDMAQueue *q, sdmaCopy *pkt);
     void copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer);
     void copyDone(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer);
+    void copyCleanup(uint8_t *dmaBuffer);
     void indirectBuffer(SDMAQueue *q, sdmaIndirectBuffer *pkt);
     void fence(SDMAQueue *q, sdmaFence *pkt);
     void fenceDone(SDMAQueue *q, sdmaFence *pkt);
@@ -243,6 +245,7 @@ class SDMAEngine : public DmaVirtDevice
     bool pollRegMemFunc(uint32_t value, uint32_t reference, uint32_t func);
     void ptePde(SDMAQueue *q, sdmaPtePde *pkt);
     void ptePdeDone(SDMAQueue *q, sdmaPtePde *pkt, uint64_t *dmaBuffer);
+    void ptePdeCleanup(uint64_t *dmaBuffer);
     void atomic(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt);
     void atomicData(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
                     uint64_t *dmaBuffer);
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 807fd21d4d..598864f9f2 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -409,8 +409,6 @@ ComputeUnit::doInvalidate(RequestPtr req, int kernId){
     // kern_id will be used in inv responses
     gpuDynInst->kern_id = kernId;
 
-    // update contextId field
-    req->setContext(gpuDynInst->wfDynId);
     injectGlobalMemFence(gpuDynInst, true, req);
 }
 
@@ -438,8 +436,6 @@ ComputeUnit::doSQCInvalidate(RequestPtr req, int kernId){
     // kern_id will be used in inv responses
     gpuDynInst->kern_id = kernId;
 
-    // update contextId field
-    req->setContext(gpuDynInst->wfDynId);
     gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar);
 
     scalarMemoryPipe.injectScalarMemFence(gpuDynInst, true, req);
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
index 13b03b0a34..b7108efdf9 100644
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -214,19 +214,23 @@ Shader::prepareInvalidate(HSAQueueEntry *task) {
     for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
         // create a request to hold INV info; the request's fields will
         // be updated in cu before use
-        auto req = std::make_shared<Request>(0, 0, 0,
-                                             cuList[i_cu]->requestorId(),
-                                             0, -1);
+        auto tcc_req = std::make_shared<Request>(0, 0, 0,
+                                                 cuList[i_cu]->requestorId(),
+                                                 0, -1);
 
         _dispatcher.updateInvCounter(kernId, +1);
         // all necessary INV flags are all set now, call cu to execute
-        cuList[i_cu]->doInvalidate(req, task->dispatchId());
+        cuList[i_cu]->doInvalidate(tcc_req, task->dispatchId());
 
         // A set of CUs share a single SQC cache. Send a single invalidate
         // request to each SQC
+        auto sqc_req = std::make_shared<Request>(0, 0, 0,
+                                                 cuList[i_cu]->requestorId(),
+                                                 0, -1);
+
         if ((i_cu % n_cu_per_sqc) == 0) {
-            cuList[i_cu]->doSQCInvalidate(req, task->dispatchId());
+            cuList[i_cu]->doSQCInvalidate(sqc_req, task->dispatchId());
         }
 
         // I don't like this. This is intrusive coding.
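
Note on the deferred buffer frees in sdma_engine.cc: writeDone(), copyDone(),
and ptePdeDone() previously ran delete []dmaBuffer as soon as the last DMA
completion callback fired, but in timing mode the memory system can still be
reading the buffer at that point, hence the warn_once and the cleanup event
scheduled 10000 ticks out (padding over the expected 2000). The
EventFunctionWrapper, schedule(), and curTick() names above are gem5's real
event API. The standalone sketch below only illustrates the same deferred-free
pattern with a toy tick-ordered queue; ToyEventQueue and its members are
hypothetical stand-ins for illustration, not gem5 code.

#include <cstdint>
#include <functional>
#include <iostream>
#include <map>

// Hypothetical stand-in for gem5's event machinery: callbacks live in a
// tick-ordered multimap and run in order.
using Tick = uint64_t;

class ToyEventQueue
{
    std::multimap<Tick, std::function<void()>> events;
    Tick now = 0;

  public:
    Tick curTick() const { return now; }

    void schedule(std::function<void()> cb, Tick when)
    { events.emplace(when, std::move(cb)); }

    // Drain all scheduled events in tick order.
    void run()
    {
        while (!events.empty()) {
            auto it = events.begin();
            now = it->first;
            auto cb = std::move(it->second);
            events.erase(it);
            cb();
        }
    }
};

int main()
{
    ToyEventQueue eq;
    uint32_t *dmaBuffer = new uint32_t[16];
    dmaBuffer[0] = 42;

    // A completion-side consumer that still reads the buffer after "done".
    eq.schedule([dmaBuffer] {
        std::cout << "late reader sees " << dmaBuffer[0] << "\n";
    }, 2000);

    // Instead of delete [] dmaBuffer at tick 0, defer the free past the
    // last tick at which the buffer can still be read (with padding).
    eq.schedule([dmaBuffer] {
        delete [] dmaBuffer;
        std::cout << "buffer freed\n";
    }, 10000);

    eq.run();
    return 0;
}

The key property is simply that the free is ordered after every event that
can still dereference the buffer.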
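Note on the shader.cc and compute_unit.cc changes: the old code reused one
shared Request for both the TCC and SQC invalidates, and each CU-side path
also rewrote the request's contextId (the setContext() calls removed above),
so two in-flight invalidates could clobber each other's fields; presumably
that is what the split into tcc_req and sqc_req avoids. The sketch below shows
the hazard with a hypothetical ToyRequest type standing in for gem5's Request;
its fields and the issue*() helpers are made up for illustration.

#include <cstdint>
#include <iostream>
#include <memory>

// Hypothetical stand-in for gem5's Request: both invalidate paths fill in
// fields of the request object before issuing it.
struct ToyRequest
{
    uint64_t paddr = 0;
    int context = 0;
};

// Each path updates "its" request in place before use, as the CUs do.
void issueTccInv(const std::shared_ptr<ToyRequest> &req, int ctx)
{ req->paddr = 0x1000; req->context = ctx; }

void issueSqcInv(const std::shared_ptr<ToyRequest> &req, int ctx)
{ req->paddr = 0x2000; req->context = ctx; }

int main()
{
    // One shared request for both paths: the second issue silently rewrites
    // the fields the first in-flight invalidate may still depend on.
    auto shared_req = std::make_shared<ToyRequest>();
    issueTccInv(shared_req, 1);
    issueSqcInv(shared_req, 2);
    std::cout << "shared request paddr now 0x" << std::hex
              << shared_req->paddr << std::dec << "\n";

    // One request per path (tcc_req / sqc_req, as in the shader.cc hunk)
    // keeps the two invalidates independent.
    auto tcc_req = std::make_shared<ToyRequest>();
    auto sqc_req = std::make_shared<ToyRequest>();
    issueTccInv(tcc_req, 1);
    issueSqcInv(sqc_req, 2);
    std::cout << "tcc paddr 0x" << std::hex << tcc_req->paddr
              << ", sqc paddr 0x" << sqc_req->paddr << std::dec << "\n";
    return 0;
}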