From f07e0e7f5d38f56d147ac07f03757e0b89094e49 Mon Sep 17 00:00:00 2001
From: Matthew Poremba <matthew.poremba@amd.com>
Date: Fri, 20 Oct 2023 16:30:43 -0500
Subject: [PATCH] gpu-compute: Read dispatch packet with timing DMA

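In full-system mode, read the kernel code object (AMDKernelCode) that the
dispatch packet points to with a timing DMA request instead of a
functional port proxy read. This fixes occasional readBlob fatals caused
by the functional read of system memory, seen often with the KVM CPU,
where the guest OS may not have set up translations yet.

SE mode still reads through the port proxy, and kernel objects in device
memory are still read from device memory. The dispatch logic that
follows the read moves into a new dispatchKernelObject method so that it
can be invoked from the DMA callback, and dynamic_task_id becomes a
class member so that the new method can use it.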

Change-Id: Ifccee666f62faa5b2fcf0a64a9d77c8cf95b3add
---
 src/gpu-compute/gpu_command_processor.cc | 155 ++++++++++++++---------
 src/gpu-compute/gpu_command_processor.hh |   5 +
 2 files changed, 99 insertions(+), 61 deletions(-)

diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc
index 5bed04b9dd..05c9a95eed 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -116,28 +116,52 @@ void
 GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                        Addr host_pkt_addr)
 {
-    static int dynamic_task_id = 0;
     _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
     assert(!(disp_pkt->kernel_object & (system()->cacheLineSize() - 1)));
 
     /**
-     * we need to read a pointer in the application's address
-     * space to pull out the kernel code descriptor.
+     * The DmaVirtDevice API needs a raw pointer to the destination buffer.
+     * The buffer is deleted in the dispatchKernelObject method.
      */
-    auto *tc = sys->threads[0];
-
-    TranslatingPortProxy fs_proxy(tc);
-    SETranslatingPortProxy se_proxy(tc);
-    PortProxy &virt_proxy = FullSystem ? fs_proxy : se_proxy;
+    AMDKernelCode *akc = new AMDKernelCode;
 
     /**
-     * In full system mode, the page table entry may point to a system page
-     * or a device page. System pages use the proxy as normal, but a device
-     * page needs to be read from device memory. Check what type it is here.
+     * The kernel_object is a pointer to the machine code, whose entry
+     * point is an 'amd_kernel_code_t' type, which is included in the
+     * kernel binary, and describes various aspects of the kernel. The
+     * desired entry is the 'kernel_code_entry_byte_offset' field,
+     * which provides the byte offset (positive or negative) from the
+     * address of the amd_kernel_code_t to the start of the machine
+     * instructions.
+     *
+     * For SE mode we can read from the port proxy. In FS mode, we may need
+     * to wait for the guest OS to setup translations, especially when using
+     * the KVM CPU, so it is preferred to read the code object using a timing
+     * DMA request.
      */
-    bool is_system_page = true;
-    Addr phys_addr = disp_pkt->kernel_object;
-    if (FullSystem) {
+    if (!FullSystem) {
+        /**
+         * we need to read a pointer in the application's address
+         * space to pull out the kernel code descriptor.
+         */
+        auto *tc = sys->threads[0];
+        SETranslatingPortProxy virt_proxy(tc);
+
+        DPRINTF(GPUCommandProc, "reading kernel_object using proxy\n");
+        virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)akc,
+            sizeof(AMDKernelCode));
+
+        dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
+    } else {
+        /**
+         * In full system mode, the page table entry may point to a system
+         * page or a device page. System pages are read with a timing DMA,
+         * but a device page needs to be read from device memory. Check what
+         * type it is here.
+         */
+        bool is_system_page = true;
+        Addr phys_addr = disp_pkt->kernel_object;
+
         /**
          * Full system currently only supports running on single VMID (one
          * virtual memory space), i.e., one application running on GPU at a
@@ -149,61 +173,68 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
         walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
                                 phys_addr, tmp_bytes, BaseMMU::Mode::Read,
                                 is_system_page);
-    }
 
-    DPRINTF(GPUCommandProc, "kernobj vaddr %#lx paddr %#lx size %d s:%d\n",
-            disp_pkt->kernel_object, phys_addr, sizeof(AMDKernelCode),
-            is_system_page);
+        DPRINTF(GPUCommandProc, "kernel_object vaddr %#lx paddr %#lx size %d"
+                " s:%d\n", disp_pkt->kernel_object, phys_addr,
+                sizeof(AMDKernelCode), is_system_page);
 
-    /**
-     * The kernel_object is a pointer to the machine code, whose entry
-     * point is an 'amd_kernel_code_t' type, which is included in the
-     * kernel binary, and describes various aspects of the kernel. The
-     * desired entry is the 'kernel_code_entry_byte_offset' field,
-     * which provides the byte offset (positive or negative) from the
-     * address of the amd_kernel_code_t to the start of the machine
-     * instructions.
-     */
-    AMDKernelCode akc;
-    if (is_system_page) {
-        DPRINTF(GPUCommandProc, "kernel_object in system, using proxy\n");
-        virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
-            sizeof(AMDKernelCode));
-    } else {
-        assert(FullSystem);
-        DPRINTF(GPUCommandProc, "kernel_object in device, using device mem\n");
+        /**
+         * Kernel objects in system memory are read with a timing DMA; those
+         * in device memory are read directly from device memory.
+         */
+        if (is_system_page) {
+            DPRINTF(GPUCommandProc,
+                    "sending system DMA read for kernel_object\n");
 
-        // Read from GPU memory manager one cache line at a time to prevent
-        // rare cases where the AKC spans two memory pages.
-        ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
-                           system()->cacheLineSize());
-        for (; !gen.done(); gen.next()) {
-            Addr chunk_addr = gen.addr();
-            int vmid = 1;
-            unsigned dummy;
-            walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
-                                    chunk_addr, dummy, BaseMMU::Mode::Read,
-                                    is_system_page);
+            auto dma_callback = new DmaVirtCallback<uint32_t>(
+              [=](const uint32_t&) {
+                dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
+              });
 
-            Request::Flags flags = Request::PHYSICAL;
-            RequestPtr request = std::make_shared<Request>(chunk_addr,
-                system()->cacheLineSize(), flags, walker->getDevRequestor());
-            Packet *readPkt = new Packet(request, MemCmd::ReadReq);
-            readPkt->dataStatic((uint8_t *)&akc + gen.complete());
-            system()->getDeviceMemory(readPkt)->access(readPkt);
-            delete readPkt;
+            dmaReadVirt(disp_pkt->kernel_object, sizeof(AMDKernelCode),
+                    dma_callback, (void *)akc);
+        } else {
+            DPRINTF(GPUCommandProc,
+                    "kernel_object in device, using device mem\n");
+
+            // Read from GPU memory manager one cache line at a time to handle
+            // rare cases where the AKC spans two memory pages.
+            ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
+                               system()->cacheLineSize());
+            for (; !gen.done(); gen.next()) {
+                Addr chunk_addr = gen.addr();
+                int vmid = 1;
+                unsigned dummy;
+                walker->startFunctional(
+                    gpuDevice->getVM().getPageTableBase(vmid), chunk_addr,
+                    dummy, BaseMMU::Mode::Read, is_system_page);
+
+                Request::Flags flags = Request::PHYSICAL;
+                RequestPtr request = std::make_shared<Request>(chunk_addr,
+                    system()->cacheLineSize(), flags,
+                    walker->getDevRequestor());
+                Packet *readPkt = new Packet(request, MemCmd::ReadReq);
+                readPkt->dataStatic((uint8_t *)akc + gen.complete());
+                system()->getDeviceMemory(readPkt)->access(readPkt);
+                delete readPkt;
+            }
+
+            dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
         }
     }
+}
+
+void
+GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
+                                        uint32_t queue_id, Addr host_pkt_addr)
+{
+    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
 
     DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
-        "kernel object\n", akc.kernel_code_entry_byte_offset);
-
-    DPRINTF(GPUCommandProc,"GPUCommandProc: Sending dispatch pkt to %lu\n",
-        (uint64_t)tc->cpuId());
-
+        "kernel object\n", akc->kernel_code_entry_byte_offset);
 
     Addr machine_code_addr = (Addr)disp_pkt->kernel_object
-        + akc.kernel_code_entry_byte_offset;
+        + akc->kernel_code_entry_byte_offset;
 
     DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
         machine_code_addr);
@@ -219,7 +250,7 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
      * APUs to implement asynchronous memcopy operations from 2 pointers in
      * host memory.  I have no idea what BLIT stands for.
      * */
-    if (akc.runtime_loader_kernel_symbol) {
+    if (akc->runtime_loader_kernel_symbol) {
         kernel_name = "Some kernel";
     } else {
         kernel_name = "Blit kernel";
@@ -230,7 +261,7 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
     GfxVersion gfxVersion = FullSystem ? gpuDevice->getGfxVersion()
                           : driver()->getGfxVersion();
     HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
-        dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr,
+        dynamic_task_id, raw_pkt, akc, host_pkt_addr, machine_code_addr,
         gfxVersion);
 
     DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
@@ -252,6 +283,8 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
     // The driver expects the start time to be in ns
     Tick start_ts = curTick() / sim_clock::as_int::ns;
     dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
+
+    delete akc;
 }
 
 void
diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh
index f6783834eb..85b2a44494 100644
--- a/src/gpu-compute/gpu_command_processor.hh
+++ b/src/gpu-compute/gpu_command_processor.hh
@@ -99,6 +99,8 @@ class GPUCommandProcessor : public DmaVirtDevice
                          Addr host_pkt_addr);
     void attachDriver(GPUComputeDriver *driver);
 
+    void dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
+                              uint32_t queue_id, Addr host_pkt_addr);
     void dispatchPkt(HSAQueueEntry *task);
     void signalWakeupEvent(uint32_t event_id);
 
@@ -149,6 +151,9 @@ class GPUCommandProcessor : public DmaVirtDevice
     HSAPacketProcessor *hsaPP;
     TranslationGenPtr translate(Addr vaddr, Addr size) override;
 
+    // Running counter of dispatched tasks
+    int dynamic_task_id = 0;
+
     // Keep track of start times for task dispatches.
     std::unordered_map<Addr, Tick> dispatchStartTime;