gpu-compute: Add support for skipping GPU kernels (#940)

gpu-compute: Add support for skipping GPU kernels

This commit adds two new command-line options:

  --skip-until-gpu-kernel N
      Skips (non-blit) GPU kernels until the target kernel is reached. Execution continues normally from there. Blit kernels are not skipped because they are responsible for copying the kernel code and metadata for the non-blit kernels. Note that skipping kernels can impact correctness; this feature is only useful if the kernel of interest has no data-dependent behavior, or its data-dependent behavior is not based on data generated by the skipped kernels.

  --exit-after-gpu-kernel N
      Ends the simulation after completing (non-blit) GPU kernel N.

This commit also renames two existing command-line options:

  --debug-at-gpu-kernel -> --debug-at-gpu-task
  --exit-at-gpu-kernel -> --exit-at-gpu-task

These were renamed because they count GPU tasks, which include both kernels launched by the application and blit kernels.

Change-Id: If250b3fd2db05c1222e369e9e3f779c4422074bc
2024-03-21 07:46:27 -07:00
parent ba2f5615ba
commit acd9d3ff94
8 changed files with 111 additions and 20 deletions
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -36,6 +36,7 @@
 #include "arch/amdgpu/vega/pagetable_walker.hh"
 #include "base/chunk_generator.hh"
 #include "debug/GPUCommandProc.hh"
+#include "debug/GPUDisp.hh"
 #include "debug/GPUInitAbi.hh"
 #include "debug/GPUKernelInfo.hh"
 #include "dev/amdgpu/amdgpu_device.hh"
@@ -48,6 +49,7 @@
 #include "sim/full_system.hh"
 #include "sim/process.hh"
 #include "sim/proxy_ptr.hh"
+#include "sim/sim_exit.hh"
 #include "sim/syscall_emul_buf.hh"

 namespace gem5
@@ -55,7 +57,8 @@ namespace gem5

 GPUCommandProcessor::GPUCommandProcessor(const Params &p)
    : DmaVirtDevice(p), dispatcher(*p.dispatcher), _driver(nullptr),
-      walker(p.walker), hsaPP(p.hsapp)
+      walker(p.walker), hsaPP(p.hsapp),
+      target_non_blit_kernel_id(p.target_non_blit_kernel_id)
 {
    assert(hsaPP);
    hsaPP->setDevice(this);
@@ -259,10 +262,13 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
     * APUs to implement asynchronous memcopy operations from 2 pointers in
     * host memory.  I have no idea what BLIT stands for.
     * */
+    bool is_blit_kernel;
    if (!disp_pkt->completion_signal) {
        kernel_name = "Some kernel";
+        is_blit_kernel = false;
    } else {
        kernel_name = "Blit kernel";
+        is_blit_kernel = true;
    }

    DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
@@ -273,6 +279,38 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
        dynamic_task_id, raw_pkt, akc, host_pkt_addr, machine_code_addr,
        gfxVersion);

+    // The driver expects the start time to be in ns
+    Tick start_ts = curTick() / sim_clock::as_int::ns;
+    dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
+
+    // Potentially skip a non-blit kernel
+    if (!is_blit_kernel && (non_blit_kernel_id < target_non_blit_kernel_id)) {
+        DPRINTF(GPUCommandProc, "Skipping non-blit kernel %i (Task ID: %i)\n",
+                non_blit_kernel_id, dynamic_task_id);
+
+        // Notify the HSA PP that this kernel is complete
+        hsaPacketProc().finishPkt(task->dispPktPtr(), task->queueId());
+        if (task->completionSignal()) {
+            DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion "
+                    "signal! Addr: %d\n", task->completionSignal());
+
+            sendCompletionSignal(task->completionSignal());
+        } else {
+            DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
+                "signal\n");
+        }
+
+        ++dynamic_task_id;
+        ++non_blit_kernel_id;
+
+        delete akc;
+
+        // Notify the run script that a kernel has been skipped
+        exitSimLoop("Skipping GPU Kernel");
+
+        return;
+    }
+
    DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
        "grid size (%dx%dx%d) kernarg addr: %#x, completion "
        "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
@@ -288,10 +326,7 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,

    initABI(task);
    ++dynamic_task_id;
-
-    // The driver expects the start time to be in ns
-    Tick start_ts = curTick() / sim_clock::as_int::ns;
-    dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
+    if (!is_blit_kernel) ++non_blit_kernel_id;

    delete akc;
 }