dev-amdgpu: Chunkify SDMA copies that use device memory
The current implementation of SDMA copy calls the GPU memory manager's read/write method one time passing a physical address as the source/destination. This implicitly assumes the physical addresses are contiguous, which is generally not true for large allocations. This results in reading from/writing to the wrong address. This changeset fixes the problem by performing large copies in chunks of the minimum possible page size on the GPU (4kB). Each page is translated separately to ensure the correct physical address. The final copy "done" callback is only used for the last transfer. The transfers should complete in order, so the copy command will not complete until all chunks have been copied. Tested and verified on an application with a large allocation (~5GB). Change-Id: I27018a963da7133f5e49dec13b0475c3637c8765 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/64752 Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
@@ -589,8 +589,22 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt)
|
||||
DPRINTF(SDMAEngine, "Copying from device address %#lx\n", device_addr);
|
||||
auto cb = new EventFunctionWrapper(
|
||||
[ = ]{ copyReadData(q, pkt, dmaBuffer); }, name());
|
||||
gpuDevice->getMemMgr()->readRequest(device_addr, dmaBuffer, pkt->count,
|
||||
0, cb);
|
||||
|
||||
// Copy the minimum page size at a time in case the physical addresses
|
||||
// are not contiguous.
|
||||
ChunkGenerator gen(pkt->source, pkt->count, AMDGPU_MMHUB_PAGE_SIZE);
|
||||
for (; !gen.done(); gen.next()) {
|
||||
Addr chunk_addr = getDeviceAddress(gen.addr());
|
||||
assert(chunk_addr);
|
||||
|
||||
DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n",
|
||||
gen.size(), gen.addr(), chunk_addr);
|
||||
|
||||
gpuDevice->getMemMgr()->readRequest(chunk_addr, dmaBuffer,
|
||||
gen.size(), 0,
|
||||
gen.last() ? cb : nullptr);
|
||||
dmaBuffer += gen.size();
|
||||
}
|
||||
} else {
|
||||
auto cb = new DmaVirtCallback<uint64_t>(
|
||||
[ = ] (const uint64_t &) { copyReadData(q, pkt, dmaBuffer); });
|
||||
@@ -620,8 +634,23 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
|
||||
DPRINTF(SDMAEngine, "Copying to device address %#lx\n", device_addr);
|
||||
auto cb = new EventFunctionWrapper(
|
||||
[ = ]{ copyDone(q, pkt, dmaBuffer); }, name());
|
||||
gpuDevice->getMemMgr()->writeRequest(device_addr, dmaBuffer,
|
||||
pkt->count, 0, cb);
|
||||
|
||||
// Copy the minimum page size at a time in case the physical addresses
|
||||
// are not contiguous.
|
||||
ChunkGenerator gen(pkt->dest, pkt->count, AMDGPU_MMHUB_PAGE_SIZE);
|
||||
for (; !gen.done(); gen.next()) {
|
||||
Addr chunk_addr = getDeviceAddress(gen.addr());
|
||||
assert(chunk_addr);
|
||||
|
||||
DPRINTF(SDMAEngine, "Copying chunk of %d bytes to %#lx (%#lx)\n",
|
||||
gen.size(), gen.addr(), chunk_addr);
|
||||
|
||||
gpuDevice->getMemMgr()->writeRequest(chunk_addr, dmaBuffer,
|
||||
gen.size(), 0,
|
||||
gen.last() ? cb : nullptr);
|
||||
|
||||
dmaBuffer += gen.size();
|
||||
}
|
||||
} else {
|
||||
auto cb = new DmaVirtCallback<uint64_t>(
|
||||
[ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); });
|
||||
|
||||
Reference in New Issue
Block a user