gpu-compute: Support cache line sizes >64B in GPUFS (#939)

This change fixes two issues:
1) The --cacheline_size option set the system cache line size but not the Ruby cache line size, and the mismatch caused assertion failures.
2) The submitDispatchPkt() function accesses the kernel object in chunks, with the chunk size equal to the cache line size. For cache line sizes >64B (e.g. 128B), the kernel object is not guaranteed to be aligned to a cache line, so a chunk could be partially contained in two separate device memories, causing the memory access to fail.

Change-Id: I8e45146901943e9c2750d32162c0f35c851e09e1
Co-authored-by: Michael Boyer <Michael.Boyer@amd.com>
2024-03-20 11:09:25 -07:00
parent 2b67d0eba6
commit ba2f5615ba
2 changed files with 11 additions and 3 deletions
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -118,7 +118,10 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                       Addr host_pkt_addr)
 {
    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
-    assert(!(disp_pkt->kernel_object & (system()->cacheLineSize() - 1)));
+    // The kernel object should be aligned to a 64B boundary, but not
+    // necessarily a cache line boundary.
+    unsigned akc_alignment_granularity = 64;
+    assert(!(disp_pkt->kernel_object & (akc_alignment_granularity - 1)));

    /**
     * Need to use a raw pointer for DmaVirtDevice API. This is deleted
@@ -201,7 +204,7 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
            // Read from GPU memory manager one cache line at a time to prevent
            // rare cases where the AKC spans two memory pages.
            ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
-                               system()->cacheLineSize());
+                               akc_alignment_granularity);
            for (; !gen.done(); gen.next()) {
                Addr chunk_addr = gen.addr();
                int vmid = 1;
@@ -212,10 +215,13 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,

                Request::Flags flags = Request::PHYSICAL;
                RequestPtr request = std::make_shared<Request>(chunk_addr,
-                    system()->cacheLineSize(), flags,
+                    akc_alignment_granularity, flags,
                    walker->getDevRequestor());
                Packet *readPkt = new Packet(request, MemCmd::ReadReq);
                readPkt->dataStatic((uint8_t *)akc + gen.complete());
+                // If the request spans two device memories, the device memory
+                // returned will be null.
+                assert(system()->getDeviceMemory(readPkt) != nullptr);
                system()->getDeviceMemory(readPkt)->access(readPkt);
                delete readPkt;
            }