gpu-compute: Chunkify AMDKernelCode read from device

The AMDKernelCode object can span potentially span two pages. Currently the copy loop from device memory only translates once at the base address. This changeset translates one cache line at a time before copying and has the ancillary benefit for cleaning up this code a bit. Change-Id: I602bc12d8f8c5d3a3e57ab3f42f7dd3df58dc144 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/65251 Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com> Reviewed-by: Jason Lowe-Power <power.jg@gmail.com> Maintainer: Jason Lowe-Power <power.jg@gmail.com>
2022-11-02 14:57:51 -07:00
parent 553096ee53
commit f6dc5c6aa4
1 changed files with 21 additions and 9 deletions
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -118,6 +118,7 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
 {
    static int dynamic_task_id = 0;
    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
+    assert(!(disp_pkt->kernel_object & (system()->cacheLineSize() - 1)));

    /**
     * we need to read a pointer in the application's address
@@ -150,6 +151,10 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                is_system_page);
    }

+    DPRINTF(GPUCommandProc, "kernobj vaddr %#lx paddr %#lx size %d s:%d\n",
+            disp_pkt->kernel_object, phys_addr, sizeof(AMDKernelCode),
+            is_system_page);
+
    /**
     * The kernel_object is a pointer to the machine code, whose entry
     * point is an 'amd_kernel_code_t' type, which is included in the
@@ -167,20 +172,27 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
    } else {
        assert(FullSystem);
        DPRINTF(GPUCommandProc, "kernel_object in device, using device mem\n");
-        // Read from GPU memory manager
-        uint8_t raw_akc[sizeof(AMDKernelCode)];
-        for (int i = 0; i < sizeof(AMDKernelCode) / sizeof(uint8_t); ++i) {
-            Addr mmhubAddr = phys_addr + i*sizeof(uint8_t);
+
+        // Read from GPU memory manager one cache line at a time to prevent
+        // rare cases where the AKC spans two memory pages.
+        ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
+                           system()->cacheLineSize());
+        for (; !gen.done(); gen.next()) {
+            Addr chunk_addr = gen.addr();
+            int vmid = 1;
+            unsigned dummy;
+            walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
+                                    chunk_addr, dummy, BaseMMU::Mode::Read,
+                                    is_system_page);
+
            Request::Flags flags = Request::PHYSICAL;
-            RequestPtr request = std::make_shared<Request>(
-                mmhubAddr, sizeof(uint8_t), flags, walker->getDevRequestor());
+            RequestPtr request = std::make_shared<Request>(chunk_addr,
+                system()->cacheLineSize(), flags, walker->getDevRequestor());
            Packet *readPkt = new Packet(request, MemCmd::ReadReq);
-            readPkt->allocate();
+            readPkt->dataStatic((uint8_t *)&akc + gen.complete());
            system()->getDeviceMemory(readPkt)->access(readPkt);
-            raw_akc[i] = readPkt->getLE<uint8_t>();
            delete readPkt;
        }
-        memcpy(&akc, &raw_akc, sizeof(AMDKernelCode));
    }

    DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "