dev-hsa,gpu-compute: fix bug with gfx8 VAs for HSA Queues

GFX7 (not supported in gem5) and GFX8 have a bug with how virtual addresses are calculated for their HSA queues. The ROCr component of ROCm solves this problem by doubling the HSA queue size that is requested, then mapping all virtual addresses in the second half of the queue to the same virtual addresses as the first half of the queue. This commit fixes gem5's support to mimic this behavior. Note that this change does not affect Vega's HSA queue support, because according to the ROCm documentation, Vega does not have the same problem as GCN3. Change-Id: I133cf1acc3a00a0baded0c4c3c2a25f39effdb51 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/51371 Maintainer: Matt Sinclair <mattdsinclair@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com> Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
2021-10-08 00:51:39 -05:00
parent ac63b7e294
commit 96a86780ee
5 changed files with 51 additions and 17 deletions
--- a/src/dev/hsa/hsa_packet_processor.cc
+++ b/src/dev/hsa/hsa_packet_processor.cc
@@ -44,6 +44,7 @@
 #include "dev/dma_device.hh"
 #include "dev/hsa/hsa_packet.hh"
 #include "dev/hsa/hw_scheduler.hh"
+#include "enums/GfxVersion.hh"
 #include "gpu-compute/gpu_command_processor.hh"
 #include "mem/packet_access.hh"
 #include "mem/page_table.hh"
@@ -100,13 +101,15 @@ void
 HSAPacketProcessor::setDeviceQueueDesc(uint64_t hostReadIndexPointer,
                                       uint64_t basePointer,
                                       uint64_t queue_id,
-                                       uint32_t size, int doorbellSize)
+                                       uint32_t size, int doorbellSize,
+                                       GfxVersion gfxVersion)
 {
    DPRINTF(HSAPacketProcessor,
             "%s:base = %p, qID = %d, ze = %d\n", __FUNCTION__,
             (void *)basePointer, queue_id, size);
    hwSchdlr->registerNewQueue(hostReadIndexPointer,
-                               basePointer, queue_id, size, doorbellSize);
+                               basePointer, queue_id, size, doorbellSize,
+                               gfxVersion);
 }

 AddrRangeList
--- a/src/dev/hsa/hsa_packet_processor.hh
+++ b/src/dev/hsa/hsa_packet_processor.hh
@@ -39,9 +39,11 @@
 #include <vector>

 #include "base/types.hh"
+#include "debug/HSAPacketProcessor.hh"
 #include "dev/dma_virt_device.hh"
 #include "dev/hsa/hsa.h"
 #include "dev/hsa/hsa_queue.hh"
+#include "enums/GfxVersion.hh"
 #include "params/HSAPacketProcessor.hh"
 #include "sim/eventq.hh"

@@ -84,14 +86,16 @@ class HSAQueueDescriptor
        uint64_t     hostReadIndexPtr;
        bool         stalledOnDmaBufAvailability;
        bool         dmaInProgress;
+        GfxVersion   gfxVersion;

        HSAQueueDescriptor(uint64_t base_ptr, uint64_t db_ptr,
-                           uint64_t hri_ptr, uint32_t size)
+                           uint64_t hri_ptr, uint32_t size,
+                           GfxVersion gfxVersion)
          : basePointer(base_ptr), doorbellPointer(db_ptr),
            writeIndex(0), readIndex(0),
            numElts(size / AQL_PACKET_SIZE), hostReadIndexPtr(hri_ptr),
            stalledOnDmaBufAvailability(false),
-            dmaInProgress(false)
+            dmaInProgress(false), gfxVersion(gfxVersion)
        {  }
        uint64_t spaceRemaining() { return numElts - (writeIndex - readIndex); }
        uint64_t spaceUsed() { return writeIndex - readIndex; }
@@ -102,15 +106,38 @@ class HSAQueueDescriptor

        uint64_t ptr(uint64_t ix)
        {
-            /**
-             * Sometimes queues report that their size is 512k, which would
-             * indicate numElts of 0x2000. However, they only have 256k
-             * mapped which means any index over 0x1000 will fail an
-             * address translation.
+            /*
+             * Based on ROCm Documentation:
+             * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
+                     10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
+                     rocr/src/core/runtime/amd_aql_queue.cpp#L99
+             * - https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/
+                     10ca0a99bbd0252f5bf6f08d1503e59f1129df4a/ROCm_Libraries/
+                     rocr/src/core/runtime/amd_aql_queue.cpp#L624
+             *
+             * GFX7 and GFX8 will allocate twice as much space for their HSA
+             * queues as they actually access (using mod operations to map the
+             * virtual addresses from the upper half of the queue to the same
+             * virtual addresses as the lower half).  Thus, we need to check if
+             * the ISA is GFX8 and mod the address by half of the queue size if
+             * so.
             */
-            assert(ix % numElts < 0x1000);
-            return basePointer +
-                ((ix % numElts) * objSize());
+            uint64_t retAddr = 0ll;
+            if ((gfxVersion == GfxVersion::gfx801) ||
+                (gfxVersion == GfxVersion::gfx803)) {
+              retAddr = basePointer + ((ix % (numElts/2)) * objSize());
+              DPRINTF(HSAPacketProcessor, "ptr() gfx8: base: 0x%x, "
+                      "index: 0x%x, numElts: 0x%x, numElts/2: 0x%x, "
+                      "objSize: 0x%x, retAddr: 0x%x\n", basePointer, ix,
+                      numElts, numElts/2, objSize(), retAddr);
+            } else {
+              retAddr = basePointer + ((ix % numElts) * objSize());
+              DPRINTF(HSAPacketProcessor, "ptr() gfx9: base: 0x%x, "
+                      "index: 0x%x, numElts: 0x%x, objSize: 0x%x, "
+                      "retAddr: 0x%x\n", basePointer, ix, numElts, objSize(),
+                      retAddr);
+            }
+            return retAddr;
        }
 };

@@ -325,7 +352,8 @@ class HSAPacketProcessor: public DmaVirtDevice
    void setDeviceQueueDesc(uint64_t hostReadIndexPointer,
                            uint64_t basePointer,
                            uint64_t queue_id,
-                            uint32_t size, int doorbellSize);
+                            uint32_t size, int doorbellSize,
+                            GfxVersion gfxVersion);
    void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize);
    void setDevice(GPUCommandProcessor * dev);
    void updateReadIndex(int, uint32_t);
--- a/src/dev/hsa/hw_scheduler.cc
+++ b/src/dev/hsa/hw_scheduler.cc
@@ -87,7 +87,8 @@ void
 HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer,
                              uint64_t basePointer,
                              uint64_t queue_id,
-                              uint32_t size, int doorbellSize)
+                              uint32_t size, int doorbellSize,
+                              GfxVersion gfxVersion)
 {
    assert(queue_id < MAX_ACTIVE_QUEUES);
    // Map queue ID to doorbell.
@@ -108,7 +109,7 @@ HWScheduler::registerNewQueue(uint64_t hostReadIndexPointer,

    HSAQueueDescriptor* q_desc =
       new HSAQueueDescriptor(basePointer, db_offset,
-                              hostReadIndexPointer, size);
+                              hostReadIndexPointer, size, gfxVersion);
    AQLRingBuffer* aql_buf =
        new AQLRingBuffer(NUM_DMA_BUFS, hsaPP->name());
    QCntxt q_cntxt(q_desc, aql_buf);
--- a/src/dev/hsa/hw_scheduler.hh
+++ b/src/dev/hsa/hw_scheduler.hh
@@ -39,6 +39,7 @@

 #include "base/types.hh"
 #include "dev/hsa/hsa_packet_processor.hh"
+#include "enums/GfxVersion.hh"
 #include "sim/eventq.hh"

 // We allocate one PIO page for doorbells and each
@@ -59,7 +60,8 @@ class HWScheduler
    void registerNewQueue(uint64_t hostReadIndexPointer,
                          uint64_t basePointer,
                          uint64_t queue_id,
-                          uint32_t size, int doorbellSize);
+                          uint32_t size, int doorbellSize,
+                          GfxVersion gfxVersion);
    void unregisterQueue(uint64_t queue_id, int doorbellSize);
    void wakeup();
    void schedWakeup();
--- a/src/gpu-compute/gpu_compute_driver.cc
+++ b/src/gpu-compute/gpu_compute_driver.cc
@@ -173,7 +173,7 @@ GPUComputeDriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf)
    auto &hsa_pp = device->hsaPacketProc();
    hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
                              args->ring_base_address, args->queue_id,
-                              args->ring_size, doorbellSize());
+                              args->ring_size, doorbellSize(), gfxVersion);
    args.copyOut(mem_proxy);
 }