From d05433b3f65af251e0e4b98266f50a5ab1b8a503 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Sat, 21 Oct 2023 12:36:35 -0500 Subject: [PATCH 1/4] gpu-compute,dev-hsa: Send vendor packet completion signal gem5 does not currently implement any vendor-specific HSA packets. Starting in ROCm 5.5, vendor packets appear to end with a completion signal. Not sending this completion causes gem5 to hang. Since these packets are not documented anywhere and need to be reverse engineered we send the completion signal, if non-zero, and finish the packet as is the current behavior. Testing: HIP examples working on most recent ROCm release (5.7.1). Change-Id: Id0841407bec564c84f590c943f0609b17e01e14c --- src/dev/hsa/hsa_packet.hh | 8 ++++++++ src/gpu-compute/gpu_command_processor.cc | 23 ++++++++++++++++------- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/dev/hsa/hsa_packet.hh b/src/dev/hsa/hsa_packet.hh index 8c7d694431..8eab8385a6 100644 --- a/src/dev/hsa/hsa_packet.hh +++ b/src/dev/hsa/hsa_packet.hh @@ -100,6 +100,14 @@ struct _hsa_barrier_or_packet_t uint64_t completion_signal; }; +struct _hsa_generic_vendor_pkt +{ + uint32_t padding[14]; + Addr completion_signal; +}; +// All HSA AQL packets are 64 bytes. Confirm that here. +static_assert(sizeof(_hsa_generic_vendor_pkt) == 64); + } // namespace gem5 #endif // __DEV_HSA_HSA_PACKET_HH__ diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index ecc5f1d98b..5bed04b9dd 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -473,18 +473,27 @@ GPUCommandProcessor::driver() */ /** - * TODO: For now we simply tell the HSAPP to finish the packet, - * however a future patch will update this method to provide - * the proper handling of any required vendor-specific packets. - * In the version of ROCm that is currently supported (1.6) - * the runtime will send packets that direct the CP to - * invalidate the GPUs caches. 
We do this automatically on - * each kernel launch in the CU, so this is safe for now. + * TODO: For now we simply tell the HSAPP to finish the packet and write a + * completion signal, if any. However, in the future proper handling may be + * required for vendor specific packets. + * + * In the version of ROCm that is currently supported the runtime will send + * packets that direct the CP to invalidate the GPU caches. We do this + * automatically on each kernel launch in the CU, so that situation is safe + * for now. */ void GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr) { + auto vendor_pkt = (_hsa_generic_vendor_pkt *)raw_pkt; + + if (vendor_pkt->completion_signal) { + sendCompletionSignal(vendor_pkt->completion_signal); + } + + warn("Ignoring vendor packet\n"); + hsaPP->finishPkt(raw_pkt, queue_id); } From 37da1c45f328e45fc1e07ea55197742bae007d7d Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 13 Oct 2023 14:59:56 -0500 Subject: [PATCH 2/4] dev-amdgpu: Better handling for queue remapping The amdgpu driver can, at *any* time, tell the device to unmap a queue to force the queue descriptor to be written back to main memory in the form of a memory queue descriptor (MQD). It will then immediately remap the queue and continue writing the doorbell to the queue. It is possible that the doorbell write occurs after the queue is unmapped but before it is remapped. In this situation, we need to check the updated value of the doorbell for the queue and write that to the queue after it is mapped. To handle this, a pending doorbell packet map is created to hold a packet to replay when the queue is mapped. Because PCI in gem5 implements only the atomic protocol port, we cannot use the original packet as it must respond in the same Tick. This patch fixes issues with the doorbell maps not being cleared on unmapping to ensure the doorbell is not found in writeDoorbell and placed in the pending doorbell map. 
This includes fixing the doorbell offset value in the doorbell to VMID map which is now multiplied by four as it is a dword address. This was tested using tensorflow 2.0's MNIST example which was seeing this issue consistently. With this patch it now makes progress and does issue pending doorbell writes. Change-Id: Ic6b401d3fe7fc46b7bcbf19a769cdea6814e7d1e --- src/dev/amdgpu/amdgpu_device.cc | 31 +++++++++++++++++++++++++- src/dev/amdgpu/amdgpu_device.hh | 2 ++ src/dev/amdgpu/pm4_packet_processor.cc | 10 ++++++++- 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 1b81c4d0b2..b25ffbf79f 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -466,7 +466,17 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset) panic("Write to unkown queue type!"); } } else { - warn("Unknown doorbell offset: %lx\n", offset); + warn("Unknown doorbell offset: %lx. Saving to pending doorbells.\n", + offset); + + // We have to ACK the PCI packet immediately, so create a copy of the + // packet here to send again. + RequestPtr pending_req(pkt->req); + PacketPtr pending_pkt = Packet::createWrite(pending_req); + uint8_t *pending_data = new uint8_t[pkt->getSize()]; + pending_pkt->dataDynamic(pending_data); + + pendingDoorbellPkts.emplace(offset, pending_pkt); } } @@ -589,6 +599,17 @@ AMDGPUDevice::write(PacketPtr pkt) return pioDelay; } +void +AMDGPUDevice::processPendingDoorbells(uint32_t offset) +{ + if (pendingDoorbellPkts.count(offset)) { + DPRINTF(AMDGPUDevice, "Sending pending doorbell %x\n", offset); + writeDoorbell(pendingDoorbellPkts[offset], offset); + delete pendingDoorbellPkts[offset]; + pendingDoorbellPkts.erase(offset); + } +} + bool AMDGPUDevice::haveRegVal(uint32_t addr) { @@ -812,6 +833,14 @@ AMDGPUDevice::deallocateAllQueues() for (auto& it : sdmaEngs) { it.second->deallocateRLCQueues(); } + + // "All" queues implicitly refers to all user queues. 
User queues begin at + // doorbell address 0x4000, so unmap any queue at or above that address. + for (auto [offset, vmid] : doorbellVMIDMap) { + if (offset >= 0x4000) { + doorbells.erase(offset); + } + } } void diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh index 7f69ec19f6..b6b6e2a81a 100644 --- a/src/dev/amdgpu/amdgpu_device.hh +++ b/src/dev/amdgpu/amdgpu_device.hh @@ -90,6 +90,7 @@ class AMDGPUDevice : public PciDevice using GPURegMap = std::unordered_map; GPURegMap regs; std::unordered_map doorbells; + std::unordered_map pendingDoorbellPkts; /** * VGA ROM methods @@ -187,6 +188,7 @@ class AMDGPUDevice : public PciDevice * Set handles to GPU blocks. */ void setDoorbellType(uint32_t offset, QueueType qt); + void processPendingDoorbells(uint32_t offset); void setSDMAEngine(Addr offset, SDMAEngine *eng); /** diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index fdb6f9d7ce..352af400b0 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -384,7 +384,10 @@ PM4PacketProcessor::mapQueues(PM4Queue *q, PM4MapQueues *pkt) "Mapping mqd from %p %p (vmid %d - last vmid %d).\n", addr, pkt->mqdAddr, pkt->vmid, gpuDevice->lastVMID()); - gpuDevice->mapDoorbellToVMID(pkt->doorbellOffset, + // The doorbellOffset is a dword address. We shift by two / multiply + // by four to get the byte address to match doorbell addresses in + // the GPU device. 
+ gpuDevice->mapDoorbellToVMID(pkt->doorbellOffset << 2, gpuDevice->lastVMID()); QueueDesc *mqd = new QueueDesc(); @@ -444,6 +447,8 @@ PM4PacketProcessor::processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, DPRINTF(PM4PacketProcessor, "PM4 mqd read completed, base %p, mqd %p, " "hqdAQL %d.\n", mqd->base, mqd->mqdBase, mqd->aql); + + gpuDevice->processPendingDoorbells(offset); } void @@ -472,6 +477,8 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, // Register doorbell with GPU device gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng); gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC); + + gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2); } void @@ -576,6 +583,7 @@ PM4PacketProcessor::unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt) gpuDevice->deallocatePasid(pkt->pasid); break; case 2: + panic("Unmapping queue selection 2 unimplemented\n"); break; case 3: { auto &hsa_pp = gpuDevice->CP()->hsaPacketProc(); From f07e0e7f5d38f56d147ac07f03757e0b89094e49 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 20 Oct 2023 16:30:43 -0500 Subject: [PATCH 3/4] gpu-compute: Read dispatch packet with timing DMA This fixes occasional readBlob fatals caused by the functional read of system memory, seen often with the KVM CPU. 
Change-Id: Ifccee666f62faa5b2fcf0a64a9d77c8cf95b3add --- src/gpu-compute/gpu_command_processor.cc | 155 ++++++++++++++--------- src/gpu-compute/gpu_command_processor.hh | 5 + 2 files changed, 99 insertions(+), 61 deletions(-) diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index 5bed04b9dd..05c9a95eed 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -116,28 +116,52 @@ void GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr) { - static int dynamic_task_id = 0; _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt; assert(!(disp_pkt->kernel_object & (system()->cacheLineSize() - 1))); /** - * we need to read a pointer in the application's address - * space to pull out the kernel code descriptor. + * Need to use a raw pointer for DmaVirtDevice API. This is deleted + * in the dispatchKernelObject method. */ - auto *tc = sys->threads[0]; - - TranslatingPortProxy fs_proxy(tc); - SETranslatingPortProxy se_proxy(tc); - PortProxy &virt_proxy = FullSystem ? fs_proxy : se_proxy; + AMDKernelCode *akc = new AMDKernelCode; /** - * In full system mode, the page table entry may point to a system page - * or a device page. System pages use the proxy as normal, but a device - * page needs to be read from device memory. Check what type it is here. + * The kernel_object is a pointer to the machine code, whose entry + * point is an 'amd_kernel_code_t' type, which is included in the + * kernel binary, and describes various aspects of the kernel. The + * desired entry is the 'kernel_code_entry_byte_offset' field, + * which provides the byte offset (positive or negative) from the + * address of the amd_kernel_code_t to the start of the machine + * instructions. + * + * For SE mode we can read from the port proxy. 
In FS mode, we may need + * to wait for the guest OS to setup translations, especially when using + * the KVM CPU, so it is preferred to read the code object using a timing + * DMA request. */ - bool is_system_page = true; - Addr phys_addr = disp_pkt->kernel_object; - if (FullSystem) { + if (!FullSystem) { + /** + * we need to read a pointer in the application's address + * space to pull out the kernel code descriptor. + */ + auto *tc = sys->threads[0]; + SETranslatingPortProxy virt_proxy(tc); + + DPRINTF(GPUCommandProc, "reading kernel_object using proxy\n"); + virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)akc, + sizeof(AMDKernelCode)); + + dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr); + } else { + /** + * In full system mode, the page table entry may point to a system + * page or a device page. System pages use the proxy as normal, but + * a device page needs to be read from device memory. Check what type + * it is here. + */ + bool is_system_page = true; + Addr phys_addr = disp_pkt->kernel_object; + /** * Full system currently only supports running on single VMID (one * virtual memory space), i.e., one application running on GPU at a @@ -149,61 +173,68 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid), phys_addr, tmp_bytes, BaseMMU::Mode::Read, is_system_page); - } - DPRINTF(GPUCommandProc, "kernobj vaddr %#lx paddr %#lx size %d s:%d\n", - disp_pkt->kernel_object, phys_addr, sizeof(AMDKernelCode), - is_system_page); + DPRINTF(GPUCommandProc, "kernel_object vaddr %#lx paddr %#lx size %d" + " s:%d\n", disp_pkt->kernel_object, phys_addr, + sizeof(AMDKernelCode), is_system_page); - /** - * The kernel_object is a pointer to the machine code, whose entry - * point is an 'amd_kernel_code_t' type, which is included in the - * kernel binary, and describes various aspects of the kernel. 
The - * desired entry is the 'kernel_code_entry_byte_offset' field, - * which provides the byte offset (positive or negative) from the - * address of the amd_kernel_code_t to the start of the machine - * instructions. - */ - AMDKernelCode akc; - if (is_system_page) { - DPRINTF(GPUCommandProc, "kernel_object in system, using proxy\n"); - virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc, - sizeof(AMDKernelCode)); - } else { - assert(FullSystem); - DPRINTF(GPUCommandProc, "kernel_object in device, using device mem\n"); + /** + * System objects use DMA device. Device objects need to use device + * memory. + */ + if (is_system_page) { + DPRINTF(GPUCommandProc, + "sending system DMA read for kernel_object\n"); - // Read from GPU memory manager one cache line at a time to prevent - // rare cases where the AKC spans two memory pages. - ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode), - system()->cacheLineSize()); - for (; !gen.done(); gen.next()) { - Addr chunk_addr = gen.addr(); - int vmid = 1; - unsigned dummy; - walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid), - chunk_addr, dummy, BaseMMU::Mode::Read, - is_system_page); + auto dma_callback = new DmaVirtCallback( + [=](const uint32_t&) { + dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr); + }); - Request::Flags flags = Request::PHYSICAL; - RequestPtr request = std::make_shared(chunk_addr, - system()->cacheLineSize(), flags, walker->getDevRequestor()); - Packet *readPkt = new Packet(request, MemCmd::ReadReq); - readPkt->dataStatic((uint8_t *)&akc + gen.complete()); - system()->getDeviceMemory(readPkt)->access(readPkt); - delete readPkt; + dmaReadVirt(disp_pkt->kernel_object, sizeof(AMDKernelCode), + dma_callback, (void *)akc); + } else { + DPRINTF(GPUCommandProc, + "kernel_object in device, using device mem\n"); + + // Read from GPU memory manager one cache line at a time to prevent + // rare cases where the AKC spans two memory pages. 
+ ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode), + system()->cacheLineSize()); + for (; !gen.done(); gen.next()) { + Addr chunk_addr = gen.addr(); + int vmid = 1; + unsigned dummy; + walker->startFunctional( + gpuDevice->getVM().getPageTableBase(vmid), chunk_addr, + dummy, BaseMMU::Mode::Read, is_system_page); + + Request::Flags flags = Request::PHYSICAL; + RequestPtr request = std::make_shared(chunk_addr, + system()->cacheLineSize(), flags, + walker->getDevRequestor()); + Packet *readPkt = new Packet(request, MemCmd::ReadReq); + readPkt->dataStatic((uint8_t *)akc + gen.complete()); + system()->getDeviceMemory(readPkt)->access(readPkt); + delete readPkt; + } + + dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr); } } +} + +void +GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt, + uint32_t queue_id, Addr host_pkt_addr) +{ + _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt; DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the " - "kernel object\n", akc.kernel_code_entry_byte_offset); - - DPRINTF(GPUCommandProc,"GPUCommandProc: Sending dispatch pkt to %lu\n", - (uint64_t)tc->cpuId()); - + "kernel object\n", akc->kernel_code_entry_byte_offset); Addr machine_code_addr = (Addr)disp_pkt->kernel_object - + akc.kernel_code_entry_byte_offset; + + akc->kernel_code_entry_byte_offset; DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n", machine_code_addr); @@ -219,7 +250,7 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, * APUs to implement asynchronous memcopy operations from 2 pointers in * host memory. I have no idea what BLIT stands for. * */ - if (akc.runtime_loader_kernel_symbol) { + if (akc->runtime_loader_kernel_symbol) { kernel_name = "Some kernel"; } else { kernel_name = "Blit kernel"; @@ -230,7 +261,7 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, GfxVersion gfxVersion = FullSystem ? 
gpuDevice->getGfxVersion() : driver()->getGfxVersion(); HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id, - dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr, + dynamic_task_id, raw_pkt, akc, host_pkt_addr, machine_code_addr, gfxVersion); DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), " @@ -252,6 +283,8 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, // The driver expects the start time to be in ns Tick start_ts = curTick() / sim_clock::as_int::ns; dispatchStartTime.insert({disp_pkt->completion_signal, start_ts}); + + delete akc; } void diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh index f6783834eb..85b2a44494 100644 --- a/src/gpu-compute/gpu_command_processor.hh +++ b/src/gpu-compute/gpu_command_processor.hh @@ -99,6 +99,8 @@ class GPUCommandProcessor : public DmaVirtDevice Addr host_pkt_addr); void attachDriver(GPUComputeDriver *driver); + void dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt, + uint32_t queue_id, Addr host_pkt_addr); void dispatchPkt(HSAQueueEntry *task); void signalWakeupEvent(uint32_t event_id); @@ -149,6 +151,9 @@ class GPUCommandProcessor : public DmaVirtDevice HSAPacketProcessor *hsaPP; TranslationGenPtr translate(Addr vaddr, Addr size) override; + // Running counter of dispatched tasks + int dynamic_task_id = 0; + // Keep track of start times for task dispatches. std::unordered_map dispatchStartTime; From e362310f3d72a45dea7fc6855e5cff3b675b47a1 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 27 Oct 2023 13:20:56 -0500 Subject: [PATCH 4/4] gpu-compute: Update GPR allocation counts GPR allocation is using fields in the AMD kernel code structure which are not backwards compatible and are not populated in more recent compiler versions. Use the granulated fields instead which is enforced to be backwards compatible. 
Change-Id: I718716226f5dbeb08369d5365d5e85b029027932 --- src/gpu-compute/hsa_queue_entry.hh | 58 ++++++++++++++---------------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh index 4083c1c85a..84ae139127 100644 --- a/src/gpu-compute/hsa_queue_entry.hh +++ b/src/gpu-compute/hsa_queue_entry.hh @@ -70,8 +70,6 @@ class HSAQueueEntry _gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x, (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y, (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}}, - numVgprs(akc->workitem_vgpr_count), - numSgprs(akc->wavefront_sgpr_count), _queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt), _hostDispPktAddr(host_pkt_addr), _completionSignal(((_hsa_dispatch_packet_t*)disp_pkt) @@ -88,40 +86,36 @@ class HSAQueueEntry _globalWgId(0), dispatchComplete(false) { - // Precompiled BLIT kernels actually violate the spec a bit - // and don't set many of the required akc fields. For these kernels, - // we need to rip register usage from the resource registers. - // - // We can't get an exact number of registers from the resource - // registers because they round, but we can get an upper bound on it. - // We determine the number of registers by solving for "vgprs_used" - // in the LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html + // Use the resource descriptors to determine number of GPRs. This will + // round up in some cases, however the exact number field in the AMD + // kernel code struct is not backwards compatible and that field is + // not populated in newer compiles. The resource descriptor dword must + // be backwards compatible, so use that always. + // LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html // #code-object-v3-kernel-descriptor + // // Currently, the only supported gfx version in gem5 that computes - // this differently is gfx90a. 
- if (!numVgprs) { - if (gfx_version == GfxVersion::gfx90a) { - numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8; - } else { - numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4; - } + // VGPR count differently is gfx90a. + if (gfx_version == GfxVersion::gfx90a) { + numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8; + } else { + numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4; } - if (!numSgprs || numSgprs == - std::numeric_limitswavefront_sgpr_count)>::max()) { - // Supported major generation numbers: 0 (BLIT kernels), 8, and 9 - uint16_t version = akc->amd_machine_version_major; - assert((version == 0) || (version == 8) || (version == 9)); - // SGPR allocation granularies: - // - GFX8: 8 - // - GFX9: 16 - // Source: https://llvm.org/docs/AMDGPUUsage.html - if ((version == 0) || (version == 8)) { - // We assume that BLIT kernels use the same granularity as GFX8 - numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8; - } else if (version == 9) { - numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2; - } + // SGPR allocation granularies: + // - GFX8: 8 + // - GFX9: 16 + // Source: https://llvm.org/docs/.html + if (gfx_version == GfxVersion::gfx801 || + gfx_version == GfxVersion::gfx803) { + numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8; + } else if (gfx_version == GfxVersion::gfx900 || + gfx_version == GfxVersion::gfx902 || + gfx_version == GfxVersion::gfx908 || + gfx_version == GfxVersion::gfx90a) { + numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2; + } else { + panic("Saw unknown gfx version setting up GPR counts\n"); } initialVgprState.reset();