From ba2f5615bae872efa4d94bb3400e42e6b568921d Mon Sep 17 00:00:00 2001
From: Michael Boyer <mboyer@amd.com>
Date: Wed, 20 Mar 2024 11:09:25 -0700
Subject: [PATCH] gpu-compute: Support cache line sizes >64B in GPUFS (#939)

This change fixes two issues:

1) The --cacheline_size option was setting the system cache line size
but not the Ruby cache line size, and the mismatch was causing assertion
failures. Disjoint_VIPER now sets block_size_bytes from the option.

2) The submitDispatchPkt() function accessed the kernel object in
chunks, with the chunk size equal to the cache line size. For cache line
sizes >64B (e.g. 128B), the kernel object is not guaranteed to be
aligned to a cache line, so a chunk could be partially contained in two
separate device memories, causing the memory access to fail. The kernel
object is now read in 64B chunks, matching its guaranteed alignment.

Change-Id: I8e45146901943e9c2750d32162c0f35c851e09e1

Co-authored-by: Michael Boyer <Michael.Boyer@amd.com>
---
 configs/example/gpufs/Disjoint_VIPER.py  |  2 ++
 src/gpu-compute/gpu_command_processor.cc | 12 +++++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)
diff --git a/configs/example/gpufs/Disjoint_VIPER.py b/configs/example/gpufs/Disjoint_VIPER.py
index 28f0768c2a..0fd258e0fd 100644
--- a/configs/example/gpufs/Disjoint_VIPER.py
+++ b/configs/example/gpufs/Disjoint_VIPER.py
@@ -58,6 +58,8 @@ class Disjoint_VIPER(RubySystem):
             self.network_cpu = DisjointSimple(self)
             self.network_gpu = DisjointSimple(self)
 
+        self.block_size_bytes = options.cacheline_size
+
         # Construct CPU controllers
         cpu_dir_nodes = construct_dirs(options, system, self, self.network_cpu)
         (cp_sequencers, cp_cntrl_nodes) = construct_corepairs(
diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc
index 02b1bb174a..3737f8a6ff 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -118,7 +118,10 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                        Addr host_pkt_addr)
 {
     _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
-    assert(!(disp_pkt->kernel_object & (system()->cacheLineSize() - 1)));
+    // The kernel object should be aligned to a 64B boundary, but not
+    // necessarily a cache line boundary.
+    unsigned akc_alignment_granularity = 64;
+    assert(!(disp_pkt->kernel_object & (akc_alignment_granularity - 1)));
 
     /**
      * Need to use a raw pointer for DmaVirtDevice API. This is deleted
@@ -201,7 +204,7 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
             // Read from GPU memory manager one cache line at a time to prevent
             // rare cases where the AKC spans two memory pages.
             ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
-                               system()->cacheLineSize());
+                               akc_alignment_granularity);
             for (; !gen.done(); gen.next()) {
                 Addr chunk_addr = gen.addr();
                 int vmid = 1;
@@ -212,10 +215,13 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
 
                 Request::Flags flags = Request::PHYSICAL;
                 RequestPtr request = std::make_shared<Request>(chunk_addr,
-                    system()->cacheLineSize(), flags,
+                    akc_alignment_granularity, flags,
                     walker->getDevRequestor());
                 Packet *readPkt = new Packet(request, MemCmd::ReadReq);
                 readPkt->dataStatic((uint8_t *)akc + gen.complete());
+                // If the request spans two device memories, the device memory
+                // returned will be null.
+                assert(system()->getDeviceMemory(readPkt) != nullptr);
                 system()->getDeviceMemory(readPkt)->access(readPkt);
                 delete readPkt;
             }