From 7b16b17e619490685edb875cd9678fc2fb1943e9 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 18 Oct 2022 13:01:33 -0700 Subject: [PATCH] dev-amdgpu: Chunkify SDMA copies that use device memory The current implementation of SDMA copy calls the GPU memory manager's read/write method one time passing a physical address as the source/destination. This implicitly assumes the physical addresses are contiguous which is generally not true for large allocations. This results in reading from/writing to the wrong address. This changeset fixes the problem by copying large copies in chunks of the minimum possible page size on the GPU (4kB). Each page is translated seperately to ensure the correct physical address. The final copy "done" callback is only used for the last transfer. The transfers should complete in order so the copy command will not complete until all chunks have been copied. Tested and verified on an application with a large allocation (~5GB). Change-Id: I27018a963da7133f5e49dec13b0475c3637c8765 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/64752 Reviewed-by: Matt Sinclair Maintainer: Matt Sinclair Tested-by: kokoro --- src/dev/amdgpu/sdma_engine.cc | 37 +++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index cafef6cc02..1cd6ff2ed0 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -589,8 +589,22 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt) DPRINTF(SDMAEngine, "Copying from device address %#lx\n", device_addr); auto cb = new EventFunctionWrapper( [ = ]{ copyReadData(q, pkt, dmaBuffer); }, name()); - gpuDevice->getMemMgr()->readRequest(device_addr, dmaBuffer, pkt->count, - 0, cb); + + // Copy the minimum page size at a time in case the physical addresses + // are not contiguous. + ChunkGenerator gen(pkt->source, pkt->count, AMDGPU_MMHUB_PAGE_SIZE); + for (; !gen.done(); gen.next()) { + Addr chunk_addr = getDeviceAddress(gen.addr()); + assert(chunk_addr); + + DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n", + gen.size(), gen.addr(), chunk_addr); + + gpuDevice->getMemMgr()->readRequest(chunk_addr, dmaBuffer, + gen.size(), 0, + gen.last() ? cb : nullptr); + dmaBuffer += gen.size(); + } } else { auto cb = new DmaVirtCallback( [ = ] (const uint64_t &) { copyReadData(q, pkt, dmaBuffer); }); @@ -620,8 +634,23 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer) DPRINTF(SDMAEngine, "Copying to device address %#lx\n", device_addr); auto cb = new EventFunctionWrapper( [ = ]{ copyDone(q, pkt, dmaBuffer); }, name()); - gpuDevice->getMemMgr()->writeRequest(device_addr, dmaBuffer, - pkt->count, 0, cb); + + // Copy the minimum page size at a time in case the physical addresses + // are not contiguous. + ChunkGenerator gen(pkt->dest, pkt->count, AMDGPU_MMHUB_PAGE_SIZE); + for (; !gen.done(); gen.next()) { + Addr chunk_addr = getDeviceAddress(gen.addr()); + assert(chunk_addr); + + DPRINTF(SDMAEngine, "Copying chunk of %d bytes to %#lx (%#lx)\n", + gen.size(), gen.addr(), chunk_addr); + + gpuDevice->getMemMgr()->writeRequest(chunk_addr, dmaBuffer, + gen.size(), 0, + gen.last() ? cb : nullptr); + + dmaBuffer += gen.size(); + } } else { auto cb = new DmaVirtCallback( [ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); });