From 96a86780ee5746675add3d167b7928ace8c697d6 Mon Sep 17 00:00:00 2001
From: Matt Sinclair <mattdsinclair@gmail.com>
Date: Fri, 8 Oct 2021 00:51:39 -0500
Subject: [PATCH] dev-hsa,gpu-compute: fix bug with gfx8 VAs for HSA Queues

GFX7 (not supported in gem5) and GFX8 have a bug with how virtual
addresses are calculated for their HSA queues.  The ROCr component of
ROCm solves this problem by doubling the HSA queue size that is
requested, then mapping all virtual addresses in the second half of the
queue to the same virtual addresses as the first half of the queue.
This commit fixes gem5's support to mimic this behavior.

Note that this change does not affect Vega's HSA queue support, because
according to the ROCm documentation, Vega does not have the same problem
as GCN3.

Change-Id: I133cf1acc3a00a0baded0c4c3c2a25f39effdb51
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/51371
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
---
 src/dev/hsa/hsa_packet_processor.cc   |  7 ++--
 src/dev/hsa/hsa_packet_processor.hh   | 50 +++++++++++++++++++++------
 src/dev/hsa/hw_scheduler.cc           |  5 +--
 src/dev/hsa/hw_scheduler.hh           |  4 ++-
 src/gpu-compute/gpu_compute_driver.cc |  2 +-
 5 files changed, 51 insertions(+), 17 deletions(-)
diff --git a/src/dev/hsa/hsa_packet_processor.cc b/src/dev/hsa/hsa_packet_processor.cc
index 0427def4aa..22124b1a27 100644
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -44,6 +44,7 @@
 #include "dev/dma_device.hh"
 #include "dev/hsa/hsa_packet.hh"
 #include "dev/hsa/hw_scheduler.hh"
+#include "enums/GfxVersion.hh"
 #include "gpu-compute/gpu_command_processor.hh"
 #include "mem/packet_access.hh"
 #include "mem/page_table.hh"
@@ -100,13 +101,15 @@ void
 HSAPacketProcessor::setDeviceQueueDesc(uint64_t hostReadIndexPointer,
                                        uint64_t basePointer,
                                        uint64_t queue_id,
-                                       uint32_t size, int doorbellSize)
+                                       uint32_t size, int doorbellSize,
+                                       GfxVersion gfxVersion)
 {
     DPRINTF(HSAPacketProcessor,
              "%s:base = %p, qID = %d, ze = %d\n", __FUNCTION__,
              (void *)basePointer, queue_id, size);
     hwSchdlr->registerNewQueue(hostReadIndexPointer,
-                               basePointer, queue_id, size, doorbellSize);
+                               basePointer, queue_id, size, doorbellSize,
+                               gfxVersion);
 }
 
 AddrRangeList
diff --git a/src/dev/hsa/hsa_packet_processor.hh b/src/dev/hsa/hsa_packet_processor.hh
index 9545006cb5..aabe24e1a9 100644
--- a/src/dev/hsa/hsa_packet_processor.hh
+++ b/src/dev/hsa/hsa_packet_processor.hh
@@ -39,9 +39,11 @@
 #include <vector>
 
 #include "base/types.hh"
+#include "debug/HSAPacketProcessor.hh"
 #include "dev/dma_virt_device.hh"
 #include "dev/hsa/hsa.h"
 #include "dev/hsa/hsa_queue.hh"
+#include "enums/GfxVersion.hh"
 #include "params/HSAPacketProcessor.hh"
 #include "sim/eventq.hh"
 
@@ -84,14 +86,16 @@ class HSAQueueDescriptor
         uint64_t     hostReadIndexPtr;
         bool         stalledOnDmaBufAvailability;
         bool         dmaInProgress;
+        GfxVersion   gfxVersion;
 
         HSAQueueDescriptor(uint64_t base_ptr, uint64_t db_ptr,
-                           uint64_t hri_ptr, uint32_t size)
+                           uint64_t hri_ptr, uint32_t size,
+                           GfxVersion gfxVersion)
           : basePointer(base_ptr), doorbellPointer(db_ptr),
             writeIndex(0), readIndex(0),
             numElts(size / AQL_PACKET_SIZE), hostReadIndexPtr(hri_ptr),
             stalledOnDmaBufAvailability(false),
-            dmaInProgress(false)
+            dmaInProgress(false), gfxVersion(gfxVersion)
         {  }
         uint64_t spaceRemaining() { return numElts - (writeIndex - readIndex); }
         uint64_t spaceUsed() { return writeIndex - readIndex; }
@@ -102,15 +106,38 @@ class HSAQueueDescriptor
 
         uint64_t ptr(uint64_t ix)
         {
-            /**
-             * Sometimes queues report that their size is 512k, which would
-             * indicate numElts of 0x2000. However, they only have 256k
-             * mapped which means any index over 0x1000 will fail an
-             * address translation.
+            /*
+             * Based on ROCm Documentation:
+             * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
+                     10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
+                     rocr/src/core/runtime/amd_aql_queue.cpp#L99
+             * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
+                     10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
+                     rocr/src/core/runtime/amd_aql_queue.cpp#L624
+             *
+             * GFX7 and GFX8 will allocate twice as much space for their HSA
+             * queues as they actually access (using mod operations to map the
+             * virtual addresses from the upper half of the queue to the same
+             * virtual addresses as the lower half).  Thus, we need to check if
+             * the ISA is GFX8 and mod the address by half of the queue size if
+             * so.
              */
-            assert(ix % numElts < 0x1000);
-            return basePointer +
-                ((ix % numElts) * objSize());
+            uint64_t retAddr = 0ll;
+            if ((gfxVersion == GfxVersion::gfx801) ||
+                (gfxVersion == GfxVersion::gfx803)) {
+              retAddr = basePointer + ((ix % (numElts/2)) * objSize());
+              DPRINTF(HSAPacketProcessor, "ptr() gfx8: base: 0x%x, "
+                      "index: 0x%x, numElts: 0x%x, numElts/2: 0x%x, "
+                      "objSize: 0x%x, retAddr: 0x%x\n", basePointer, ix,
+                      numElts, numElts/2, objSize(), retAddr);
+            } else {
+              retAddr = basePointer + ((ix % numElts) * objSize());
+              DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, "
+                      "index: 0x%x, numElts: 0x%x, objSize: 0x%x, "
+                      "retAddr: 0x%x\n", basePointer, ix, numElts, objSize(),
+                      retAddr);
+            }
+            return retAddr;
         }
 };
 
@@ -325,7 +352,8 @@ class HSAPacketProcessor: public DmaVirtDevice
     void setDeviceQueueDesc(uint64_t hostReadIndexPointer,
                             uint64_t basePointer,
                             uint64_t queue_id,
-                            uint32_t size, int doorbellSize);
+                            uint32_t size, int doorbellSize,
+                            GfxVersion gfxVersion);
     void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize);
     void setDevice(GPUCommandProcessor * dev);
     void updateReadIndex(int, uint32_t);
diff --git a/src/dev/hsa/hw_scheduler.cc b/src/dev/hsa/hw_scheduler.cc
index 48f4e694eb..f42dedef1c 100644
--- a/src/dev/hsa/hw_scheduler.cc
+++ b/src/dev/hsa/hw_scheduler.cc
@@ -87,7 +87,8 @@ void
 HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer,
                               uint64_t basePointer,
                               uint64_t queue_id,
-                              uint32_t size, int doorbellSize)
+                              uint32_t size, int doorbellSize,
+                              GfxVersion gfxVersion)
 {
     assert(queue_id < MAX_ACTIVE_QUEUES);
     // Map queue ID to doorbell.
@@ -108,7 +109,7 @@ HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer,
 
     HSAQueueDescriptor* q_desc =
        new HSAQueueDescriptor(basePointer, db_offset,
-                              hostReadIndexPointer, size);
+                              hostReadIndexPointer, size, gfxVersion);
     AQLRingBuffer* aql_buf =
         new AQLRingBuffer(NUM_DMA_BUFS, hsaPP->name());
     QCntxt q_cntxt(q_desc, aql_buf);
diff --git a/src/dev/hsa/hw_scheduler.hh b/src/dev/hsa/hw_scheduler.hh
index b50273d4ce..8c043ec8ef 100644
--- a/src/dev/hsa/hw_scheduler.hh
+++ b/src/dev/hsa/hw_scheduler.hh
@@ -39,6 +39,7 @@
 
 #include "base/types.hh"
 #include "dev/hsa/hsa_packet_processor.hh"
+#include "enums/GfxVersion.hh"
 #include "sim/eventq.hh"
 
 // We allocate one PIO page for doorbells and each
@@ -59,7 +60,8 @@ class HWScheduler
     void registerNewQueue(uint64_t hostReadIndexPointer,
                           uint64_t basePointer,
                           uint64_t queue_id,
-                          uint32_t size, int doorbellSize);
+                          uint32_t size, int doorbellSize,
+                          GfxVersion gfxVersion);
     void unregisterQueue(uint64_t queue_id, int doorbellSize);
     void wakeup();
     void schedWakeup();
diff --git a/src/gpu-compute/gpu_compute_driver.cc b/src/gpu-compute/gpu_compute_driver.cc
index 3adfadd0ea..dd8e140ad8 100644
--- a/src/gpu-compute/gpu_compute_driver.cc
+++ b/src/gpu-compute/gpu_compute_driver.cc
@@ -173,7 +173,7 @@ GPUComputeDriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf)
     auto &hsa_pp = device->hsaPacketProc();
     hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
                               args->ring_base_address, args->queue_id,
-                              args->ring_size, doorbellSize());
+                              args->ring_size, doorbellSize(), gfxVersion);
     args.copyOut(mem_proxy);
 }