gpu-compute,dev-hsa: ROCm 5.5+ support (#498)

Adds ROCm 5.5 support, including: vendor packet completion signals; a fix for the queue remapping race condition; backwards-compatible GPR allocation; and a fix for a transient readBlob fatal when reading the kernel descriptor.
2023-11-06 10:51:37 -08:00
parent e4cdd73a59 e362310f3d
commit 71973b386e
7 changed files with 190 additions and 102 deletions
--- a/src/dev/amdgpu/amdgpu_device.cc
+++ b/src/dev/amdgpu/amdgpu_device.cc
@@ -466,7 +466,17 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
            panic("Write to unkown queue type!");
        }
    } else {
-        warn("Unknown doorbell offset: %lx\n", offset);
+        warn("Unknown doorbell offset: %lx. Saving to pending doorbells.\n",
+             offset);
+
+        // We have to ACK the PCI packet immediately, so create a copy of the
+        // packet here to send again.
+        RequestPtr pending_req(pkt->req);
+        PacketPtr pending_pkt = Packet::createWrite(pending_req);
+        uint8_t *pending_data = new uint8_t[pkt->getSize()];
+        pending_pkt->dataDynamic(pending_data);
+
+        pendingDoorbellPkts.emplace(offset, pending_pkt);
    }
 }

@@ -589,6 +599,17 @@ AMDGPUDevice::write(PacketPtr pkt)
    return pioDelay;
 }

+void
+AMDGPUDevice::processPendingDoorbells(uint32_t offset)
+{
+    if (pendingDoorbellPkts.count(offset)) {
+        DPRINTF(AMDGPUDevice, "Sending pending doorbell %x\n", offset);
+        writeDoorbell(pendingDoorbellPkts[offset], offset);
+        delete pendingDoorbellPkts[offset];
+        pendingDoorbellPkts.erase(offset);
+    }
+}
+
 bool
 AMDGPUDevice::haveRegVal(uint32_t addr)
 {
@@ -812,6 +833,14 @@ AMDGPUDevice::deallocateAllQueues()
    for (auto& it : sdmaEngs) {
        it.second->deallocateRLCQueues();
    }
+
+    // "All" queues implicitly refers to all user queues. User queues begin at
+    // doorbell address 0x4000, so unmap any queue at or above that address.
+    for (auto [offset, vmid] : doorbellVMIDMap) {
+        if (offset >= 0x4000) {
+            doorbells.erase(offset);
+        }
+    }
 }

 void
--- a/src/dev/amdgpu/amdgpu_device.hh
+++ b/src/dev/amdgpu/amdgpu_device.hh
@@ -90,6 +90,7 @@ class AMDGPUDevice : public PciDevice
    using GPURegMap = std::unordered_map<uint32_t, uint64_t>;
    GPURegMap regs;
    std::unordered_map<uint32_t, QueueType> doorbells;
+    std::unordered_map<uint32_t, PacketPtr> pendingDoorbellPkts;

    /**
     * VGA ROM methods
@@ -187,6 +188,7 @@ class AMDGPUDevice : public PciDevice
     * Set handles to GPU blocks.
     */
    void setDoorbellType(uint32_t offset, QueueType qt);
+    void processPendingDoorbells(uint32_t offset);
    void setSDMAEngine(Addr offset, SDMAEngine *eng);

    /**
--- a/src/dev/amdgpu/pm4_packet_processor.cc
+++ b/src/dev/amdgpu/pm4_packet_processor.cc
@@ -384,7 +384,10 @@ PM4PacketProcessor::mapQueues(PM4Queue *q, PM4MapQueues *pkt)
                "Mapping mqd from %p %p (vmid %d - last vmid %d).\n",
                addr, pkt->mqdAddr, pkt->vmid, gpuDevice->lastVMID());

-        gpuDevice->mapDoorbellToVMID(pkt->doorbellOffset,
+        // The doorbellOffset is a dword address. We shift by two / multiply
+        // by four to get the byte address to match doorbell addresses in
+        // the GPU device.
+        gpuDevice->mapDoorbellToVMID(pkt->doorbellOffset << 2,
                                     gpuDevice->lastVMID());

        QueueDesc *mqd = new QueueDesc();
@@ -444,6 +447,8 @@ PM4PacketProcessor::processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,

    DPRINTF(PM4PacketProcessor, "PM4 mqd read completed, base %p, mqd %p, "
            "hqdAQL %d.\n", mqd->base, mqd->mqdBase, mqd->aql);
+
+    gpuDevice->processPendingDoorbells(offset);
 }

 void
@@ -472,6 +477,8 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
    // Register doorbell with GPU device
    gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng);
    gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC);
+
+    gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2);
 }

 void
@@ -576,6 +583,7 @@ PM4PacketProcessor::unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt)
        gpuDevice->deallocatePasid(pkt->pasid);
        break;
      case 2:
+        panic("Unmapping queue selection 2 unimplemented\n");
        break;
      case 3: {
        auto &hsa_pp = gpuDevice->CP()->hsaPacketProc();
--- a/src/dev/hsa/hsa_packet.hh
+++ b/src/dev/hsa/hsa_packet.hh
@@ -100,6 +100,14 @@ struct _hsa_barrier_or_packet_t
    uint64_t completion_signal;
 };

+struct _hsa_generic_vendor_pkt
+{
+    uint32_t padding[14];
+    Addr completion_signal;
+};
+// All HSA AQL packets are 64 bytes. Confirm that here.
+static_assert(sizeof(_hsa_generic_vendor_pkt) == 64);
+
 } // namespace gem5

 #endif // __DEV_HSA_HSA_PACKET_HH__
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -116,28 +116,52 @@ void
 GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                       Addr host_pkt_addr)
 {
-    static int dynamic_task_id = 0;
    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
    assert(!(disp_pkt->kernel_object & (system()->cacheLineSize() - 1)));

    /**
-     * we need to read a pointer in the application's address
-     * space to pull out the kernel code descriptor.
+     * Need to use a raw pointer for DmaVirtDevice API. This is deleted
+     * in the dispatchKernelObject method.
     */
-    auto *tc = sys->threads[0];
-
-    TranslatingPortProxy fs_proxy(tc);
-    SETranslatingPortProxy se_proxy(tc);
-    PortProxy &virt_proxy = FullSystem ? fs_proxy : se_proxy;
+    AMDKernelCode *akc = new AMDKernelCode;

    /**
-     * In full system mode, the page table entry may point to a system page
-     * or a device page. System pages use the proxy as normal, but a device
-     * page needs to be read from device memory. Check what type it is here.
+     * The kernel_object is a pointer to the machine code, whose entry
+     * point is an 'amd_kernel_code_t' type, which is included in the
+     * kernel binary, and describes various aspects of the kernel. The
+     * desired entry is the 'kernel_code_entry_byte_offset' field,
+     * which provides the byte offset (positive or negative) from the
+     * address of the amd_kernel_code_t to the start of the machine
+     * instructions.
+     *
+     * For SE mode we can read from the port proxy. In FS mode, we may need
+     * to wait for the guest OS to setup translations, especially when using
+     * the KVM CPU, so it is preferred to read the code object using a timing
+     * DMA request.
     */
-    bool is_system_page = true;
-    Addr phys_addr = disp_pkt->kernel_object;
-    if (FullSystem) {
+    if (!FullSystem) {
+        /**
+         * we need to read a pointer in the application's address
+         * space to pull out the kernel code descriptor.
+         */
+        auto *tc = sys->threads[0];
+        SETranslatingPortProxy virt_proxy(tc);
+
+        DPRINTF(GPUCommandProc, "reading kernel_object using proxy\n");
+        virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)akc,
+            sizeof(AMDKernelCode));
+
+        dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
+    } else {
+        /**
+         * In full system mode, the page table entry may point to a system
+         * page or a device page. System pages use the proxy as normal, but
+         * a device page needs to be read from device memory. Check what type
+         * it is here.
+         */
+        bool is_system_page = true;
+        Addr phys_addr = disp_pkt->kernel_object;
+
        /**
         * Full system currently only supports running on single VMID (one
         * virtual memory space), i.e., one application running on GPU at a
@@ -149,61 +173,68 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
        walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
                                phys_addr, tmp_bytes, BaseMMU::Mode::Read,
                                is_system_page);
-    }

-    DPRINTF(GPUCommandProc, "kernobj vaddr %#lx paddr %#lx size %d s:%d\n",
-            disp_pkt->kernel_object, phys_addr, sizeof(AMDKernelCode),
-            is_system_page);
+        DPRINTF(GPUCommandProc, "kernel_object vaddr %#lx paddr %#lx size %d"
+                " s:%d\n", disp_pkt->kernel_object, phys_addr,
+                sizeof(AMDKernelCode), is_system_page);

-    /**
-     * The kernel_object is a pointer to the machine code, whose entry
-     * point is an 'amd_kernel_code_t' type, which is included in the
-     * kernel binary, and describes various aspects of the kernel. The
-     * desired entry is the 'kernel_code_entry_byte_offset' field,
-     * which provides the byte offset (positive or negative) from the
-     * address of the amd_kernel_code_t to the start of the machine
-     * instructions.
-     */
-    AMDKernelCode akc;
-    if (is_system_page) {
-        DPRINTF(GPUCommandProc, "kernel_object in system, using proxy\n");
-        virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
-            sizeof(AMDKernelCode));
-    } else {
-        assert(FullSystem);
-        DPRINTF(GPUCommandProc, "kernel_object in device, using device mem\n");
+        /**
+         * System objects use DMA device. Device objects need to use device
+         * memory.
+         */
+        if (is_system_page) {
+            DPRINTF(GPUCommandProc,
+                    "sending system DMA read for kernel_object\n");

-        // Read from GPU memory manager one cache line at a time to prevent
-        // rare cases where the AKC spans two memory pages.
-        ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
-                           system()->cacheLineSize());
-        for (; !gen.done(); gen.next()) {
-            Addr chunk_addr = gen.addr();
-            int vmid = 1;
-            unsigned dummy;
-            walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
-                                    chunk_addr, dummy, BaseMMU::Mode::Read,
-                                    is_system_page);
+            auto dma_callback = new DmaVirtCallback<uint32_t>(
+              [=](const uint32_t&) {
+                dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
+              });

-            Request::Flags flags = Request::PHYSICAL;
-            RequestPtr request = std::make_shared<Request>(chunk_addr,
-                system()->cacheLineSize(), flags, walker->getDevRequestor());
-            Packet *readPkt = new Packet(request, MemCmd::ReadReq);
-            readPkt->dataStatic((uint8_t *)&akc + gen.complete());
-            system()->getDeviceMemory(readPkt)->access(readPkt);
-            delete readPkt;
+            dmaReadVirt(disp_pkt->kernel_object, sizeof(AMDKernelCode),
+                    dma_callback, (void *)akc);
+        } else {
+            DPRINTF(GPUCommandProc,
+                    "kernel_object in device, using device mem\n");
+
+            // Read from GPU memory manager one cache line at a time to prevent
+            // rare cases where the AKC spans two memory pages.
+            ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
+                               system()->cacheLineSize());
+            for (; !gen.done(); gen.next()) {
+                Addr chunk_addr = gen.addr();
+                int vmid = 1;
+                unsigned dummy;
+                walker->startFunctional(
+                    gpuDevice->getVM().getPageTableBase(vmid), chunk_addr,
+                    dummy, BaseMMU::Mode::Read, is_system_page);
+
+                Request::Flags flags = Request::PHYSICAL;
+                RequestPtr request = std::make_shared<Request>(chunk_addr,
+                    system()->cacheLineSize(), flags,
+                    walker->getDevRequestor());
+                Packet *readPkt = new Packet(request, MemCmd::ReadReq);
+                readPkt->dataStatic((uint8_t *)akc + gen.complete());
+                system()->getDeviceMemory(readPkt)->access(readPkt);
+                delete readPkt;
+            }
+
+            dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
        }
    }
+}
+
+void
+GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
+                                        uint32_t queue_id, Addr host_pkt_addr)
+{
+    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;

    DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
-        "kernel object\n", akc.kernel_code_entry_byte_offset);
-
-    DPRINTF(GPUCommandProc,"GPUCommandProc: Sending dispatch pkt to %lu\n",
-        (uint64_t)tc->cpuId());
-
+        "kernel object\n", akc->kernel_code_entry_byte_offset);

    Addr machine_code_addr = (Addr)disp_pkt->kernel_object
-        + akc.kernel_code_entry_byte_offset;
+        + akc->kernel_code_entry_byte_offset;

    DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
        machine_code_addr);
@@ -219,7 +250,7 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
     * APUs to implement asynchronous memcopy operations from 2 pointers in
     * host memory.  I have no idea what BLIT stands for.
     * */
-    if (akc.runtime_loader_kernel_symbol) {
+    if (akc->runtime_loader_kernel_symbol) {
        kernel_name = "Some kernel";
    } else {
        kernel_name = "Blit kernel";
@@ -230,7 +261,7 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
    GfxVersion gfxVersion = FullSystem ? gpuDevice->getGfxVersion()
                          : driver()->getGfxVersion();
    HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
-        dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr,
+        dynamic_task_id, raw_pkt, akc, host_pkt_addr, machine_code_addr,
        gfxVersion);

    DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
@@ -252,6 +283,8 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
    // The driver expects the start time to be in ns
    Tick start_ts = curTick() / sim_clock::as_int::ns;
    dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
+
+    delete akc;
 }

 void
@@ -473,18 +506,27 @@ GPUCommandProcessor::driver()
 */

 /**
- * TODO: For now we simply tell the HSAPP to finish the packet,
- *       however a future patch will update this method to provide
- *       the proper handling of any required vendor-specific packets.
- *       In the version of ROCm that is currently supported (1.6)
- *       the runtime will send packets that direct the CP to
- *       invalidate the GPUs caches. We do this automatically on
- *       each kernel launch in the CU, so this is safe for now.
+ * TODO: For now we simply tell the HSAPP to finish the packet and write a
+ * completion signal, if any. However, in the future proper handling may be
+ * required for vendor specific packets.
+ *
+ * In the version of ROCm that is currently supported the runtime will send
+ * packets that direct the CP to invalidate the GPU caches. We do this
+ * automatically on each kernel launch in the CU, so that situation is safe
+ * for now.
 */
 void
 GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
    Addr host_pkt_addr)
 {
+    auto vendor_pkt = (_hsa_generic_vendor_pkt *)raw_pkt;
+
+    if (vendor_pkt->completion_signal) {
+        sendCompletionSignal(vendor_pkt->completion_signal);
+    }
+
+    warn("Ignoring vendor packet\n");
+
    hsaPP->finishPkt(raw_pkt, queue_id);
 }

--- a/src/gpu-compute/gpu_command_processor.hh
+++ b/src/gpu-compute/gpu_command_processor.hh
@@ -99,6 +99,8 @@ class GPUCommandProcessor : public DmaVirtDevice
                         Addr host_pkt_addr);
    void attachDriver(GPUComputeDriver *driver);

+    void dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
+                              uint32_t queue_id, Addr host_pkt_addr);
    void dispatchPkt(HSAQueueEntry *task);
    void signalWakeupEvent(uint32_t event_id);

@@ -149,6 +151,9 @@ class GPUCommandProcessor : public DmaVirtDevice
    HSAPacketProcessor *hsaPP;
    TranslationGenPtr translate(Addr vaddr, Addr size) override;

+    // Running counter of dispatched tasks
+    int dynamic_task_id = 0;
+
    // Keep track of start times for task dispatches.
    std::unordered_map<Addr, Tick> dispatchStartTime;

--- a/src/gpu-compute/hsa_queue_entry.hh
+++ b/src/gpu-compute/hsa_queue_entry.hh
@@ -70,8 +70,6 @@ class HSAQueueEntry
          _gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x,
                    (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y,
                    (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}},
-          numVgprs(akc->workitem_vgpr_count),
-          numSgprs(akc->wavefront_sgpr_count),
          _queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt),
          _hostDispPktAddr(host_pkt_addr),
          _completionSignal(((_hsa_dispatch_packet_t*)disp_pkt)
@@ -88,40 +86,36 @@ class HSAQueueEntry
          _globalWgId(0), dispatchComplete(false)

    {
-        // Precompiled BLIT kernels actually violate the spec a bit
-        // and don't set many of the required akc fields.  For these kernels,
-        // we need to rip register usage from the resource registers.
-        //
-        // We can't get an exact number of registers from the resource
-        // registers because they round, but we can get an upper bound on it.
-        // We determine the number of registers by solving for "vgprs_used"
-        // in the LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
+        // Use the resource descriptors to determine number of GPRs. This will
+        // round up in some cases, however the exact number field in the AMD
+        // kernel code struct is not backwards compatible and that field is
+        // not populated in newer compiles. The resource descriptor dword must
+        // be backwards compatible, so use that always.
+        // LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html
        //     #code-object-v3-kernel-descriptor
+        //
        // Currently, the only supported gfx version in gem5 that computes
-        // this differently is gfx90a.
-        if (!numVgprs) {
-            if (gfx_version == GfxVersion::gfx90a) {
-                numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
-            } else {
-                numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
-            }
+        // VGPR count differently is gfx90a.
+        if (gfx_version == GfxVersion::gfx90a) {
+            numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;
+        } else {
+            numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;
        }

-        if (!numSgprs || numSgprs ==
-            std::numeric_limits<decltype(akc->wavefront_sgpr_count)>::max()) {
-            // Supported major generation numbers: 0 (BLIT kernels), 8, and 9
-            uint16_t version = akc->amd_machine_version_major;
-            assert((version == 0) || (version == 8) || (version == 9));
-            // SGPR allocation granularies:
-            // - GFX8: 8
-            // - GFX9: 16
-            // Source: https://llvm.org/docs/AMDGPUUsage.html
-            if ((version == 0) || (version == 8)) {
-                // We assume that BLIT kernels use the same granularity as GFX8
-                numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8;
-            } else if (version == 9) {
-                numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2;
-            }
+        // SGPR allocation granularities:
+        // - GFX8: 8
+        // - GFX9: 16
+        // Source: https://llvm.org/docs/AMDGPUUsage.html
+        if (gfx_version == GfxVersion::gfx801 ||
+                gfx_version == GfxVersion::gfx803) {
+            numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8;
+        } else if (gfx_version == GfxVersion::gfx900 ||
+                gfx_version == GfxVersion::gfx902 ||
+                gfx_version == GfxVersion::gfx908 ||
+                gfx_version == GfxVersion::gfx90a) {
+            numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2;
+        } else {
+            panic("Saw unknown gfx version setting up GPR counts\n");
        }

        initialVgprState.reset();