gpu-compute: Add support for skipping GPU kernels (#940)

gpu-compute: Add support for skipping GPU kernels This commit adds two new command-line options: --skip-until-gpu-kernel N Skips (non-blit) GPU kernels until the target kernel is reached. Execution continues normally from there. Blit kernels are not skipped because they are responsible for copying the kernel code and metadata for the non-blit kernels. Note that skipping kernels can impact correctness; this feature is only useful if the kernel of interest has no data-dependent behavior, or its data-dependent behavior is not based on data generated by the skipped kernels. --exit-after-gpu-kernel N Ends the simulation after completing (non-blit) GPU kernel N. This commit also renames two existing command-line options: --debug-at-gpu-kernel -> --debug-at-gpu-task --exit-at-gpu-kernel -> --exit-at-gpu-task These were renamed because they count GPU tasks, which include both kernels launched by the application as well as blit kernels. Change-Id: If250b3fd2db05c1222e369e9e3f779c4422074bc
2024-03-21 07:46:27 -07:00
parent ba2f5615ba
commit acd9d3ff94
8 changed files with 111 additions and 20 deletions
--- a/configs/example/gpufs/runfs.py
+++ b/configs/example/gpufs/runfs.py
@@ -140,17 +140,35 @@ def addRunFSOptions(parser):
    )

    parser.add_argument(
-        "--debug-at-gpu-kernel",
+        "--debug-at-gpu-task",
        type=int,
        default=-1,
-        help="Turn on debug flags starting with this kernel",
+        help="Turn on debug flags starting with this task (counting both blit"
+        " and non-blit kernels)",
    )

    parser.add_argument(
-        "--exit-at-gpu-kernel",
+        "--exit-at-gpu-task",
        type=int,
        default=-1,
-        help="Exit simulation after running this many kernels",
+        help="Exit simulation after running this many tasks (counting both "
+        "blit and non-blit kernels)",
+    )
+
+    parser.add_argument(
+        "--exit-after-gpu-kernel",
+        type=int,
+        default=-1,
+        help="Exit simulation after completing this (non-blit) kernel",
+    )
+
+    parser.add_argument(
+        "--skip-until-gpu-kernel",
+        type=int,
+        default=0,
+        help="Skip (non-blit) kernels until reaching this kernel. Note that "
+        "this can impact correctness (the skipped kernels are completely "
+        "skipped, not fast forwarded)",
    )

    parser.add_argument(
@@ -230,8 +248,9 @@ def runGpuFSSystem(args):

    print("Running the simulation")
    sim_ticks = args.abs_max_tick
-    kernels_launched = 0
-    if args.debug_at_gpu_kernel != -1:
+    kernels_completed = 0
+    tasks_completed = 0
+    if args.debug_at_gpu_task != -1:
        m5.trace.disable()

    exit_event = m5.simulate(sim_ticks)
@@ -249,16 +268,27 @@ def runGpuFSSystem(args):
            m5.checkpoint(args.checkpoint_dir)
            break
        elif "GPU Kernel Completed" in exit_event.getCause():
-            kernels_launched += 1
+            if kernels_completed == args.exit_after_gpu_kernel:
+                print(f"Exiting after GPU kernel {kernels_completed}")
+                break
+            kernels_completed += 1
+            tasks_completed += 1
+        elif "GPU Blit Kernel Completed" in exit_event.getCause():
+            tasks_completed += 1
+        elif "Skipping GPU Kernel" in exit_event.getCause():
+            print(f"Skipping GPU kernel {kernels_completed}")
+            kernels_completed += 1
+            tasks_completed += 1
        else:
            print(
                f"Unknown exit event: {exit_event.getCause()}. Continuing..."
            )

-        if kernels_launched == args.debug_at_gpu_kernel:
+        if tasks_completed == args.debug_at_gpu_task:
+            print(f"Enabling debug flags @ GPU task {tasks_completed}")
            m5.trace.enable()
-        if kernels_launched == args.exit_at_gpu_kernel:
-            print(f"Exiting @ GPU kernel {kernels_launched}")
+        if tasks_completed == args.exit_at_gpu_task:
+            print(f"Exiting @ GPU task {tasks_completed}")
            break

        exit_event = m5.simulate(sim_ticks - m5.curTick())
--- a/configs/example/gpufs/system/system.py
+++ b/configs/example/gpufs/system/system.py
@@ -115,11 +115,19 @@ def makeGpuFSSystem(args):
        numHWQueues=args.num_hw_queues,
        walker=hsapp_pt_walker,
    )
-    dispatcher_exit_events = True if args.exit_at_gpu_kernel > -1 else False
+    dispatcher_exit_events = False
+    if args.exit_at_gpu_task > -1:
+        dispatcher_exit_events = True
+    if args.exit_after_gpu_kernel > -1:
+        dispatcher_exit_events = True
    dispatcher = GPUDispatcher(kernel_exit_events=dispatcher_exit_events)
    cp_pt_walker = VegaPagetableWalker()
+    target_kernel = args.skip_until_gpu_kernel
    gpu_cmd_proc = GPUCommandProcessor(
-        hsapp=gpu_hsapp, dispatcher=dispatcher, walker=cp_pt_walker
+        hsapp=gpu_hsapp,
+        dispatcher=dispatcher,
+        walker=cp_pt_walker,
+        target_non_blit_kernel_id=target_kernel,
    )
    shader.dispatcher = dispatcher
    shader.gpu_cmd_proc = gpu_cmd_proc
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -360,6 +360,10 @@ class GPUCommandProcessor(DmaVirtDevice):
    walker = Param.VegaPagetableWalker(
        VegaPagetableWalker(), "Page table walker"
    )
+    target_non_blit_kernel_id = Param.Int(
+        0,
+        "Skip kernels until reaching this kernel (counting only non-blit kernels)",
+    )


 class StorageClassType(Enum):
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -324,7 +324,7 @@ GPUDispatcher::notifyWgCompl(Wavefront *wf)
        DPRINTF(GPUKernelInfo, "Completed kernel %d\n", kern_id);

        if (kernelExitEvents) {
-            shader->requestKernelExitEvent();
+            shader->requestKernelExitEvent(task->completionSignal());
        }
    }

--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -36,6 +36,7 @@
 #include "arch/amdgpu/vega/pagetable_walker.hh"
 #include "base/chunk_generator.hh"
 #include "debug/GPUCommandProc.hh"
+#include "debug/GPUDisp.hh"
 #include "debug/GPUInitAbi.hh"
 #include "debug/GPUKernelInfo.hh"
 #include "dev/amdgpu/amdgpu_device.hh"
@@ -48,6 +49,7 @@
 #include "sim/full_system.hh"
 #include "sim/process.hh"
 #include "sim/proxy_ptr.hh"
+#include "sim/sim_exit.hh"
 #include "sim/syscall_emul_buf.hh"

 namespace gem5
@@ -55,7 +57,8 @@ namespace gem5

 GPUCommandProcessor::GPUCommandProcessor(const Params &p)
    : DmaVirtDevice(p), dispatcher(*p.dispatcher), _driver(nullptr),
-      walker(p.walker), hsaPP(p.hsapp)
+      walker(p.walker), hsaPP(p.hsapp),
+      target_non_blit_kernel_id(p.target_non_blit_kernel_id)
 {
    assert(hsaPP);
    hsaPP->setDevice(this);
@@ -259,10 +262,13 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
     * APUs to implement asynchronous memcopy operations from 2 pointers in
     * host memory.  I have no idea what BLIT stands for.
     * */
+    bool is_blit_kernel;
    if (!disp_pkt->completion_signal) {
        kernel_name = "Some kernel";
+        is_blit_kernel = false;
    } else {
        kernel_name = "Blit kernel";
+        is_blit_kernel = true;
    }

    DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
@@ -273,6 +279,38 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
        dynamic_task_id, raw_pkt, akc, host_pkt_addr, machine_code_addr,
        gfxVersion);

+    // The driver expects the start time to be in ns
+    Tick start_ts = curTick() / sim_clock::as_int::ns;
+    dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
+
+    // Potentially skip a non-blit kernel
+    if (!is_blit_kernel && (non_blit_kernel_id < target_non_blit_kernel_id)) {
+        DPRINTF(GPUCommandProc, "Skipping non-blit kernel %i (Task ID: %i)\n",
+                non_blit_kernel_id, dynamic_task_id);
+
+        // Notify the HSA PP that this kernel is complete
+        hsaPacketProc().finishPkt(task->dispPktPtr(), task->queueId());
+        if (task->completionSignal()) {
+            DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion "
+                    "signal! Addr: %d\n", task->completionSignal());
+
+            sendCompletionSignal(task->completionSignal());
+        } else {
+            DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
+                "signal\n");
+        }
+
+        ++dynamic_task_id;
+        ++non_blit_kernel_id;
+
+        delete akc;
+
+        // Notify the run script that a kernel has been skipped
+        exitSimLoop("Skipping GPU Kernel");
+
+        return;
+    }
+
    DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
        "grid size (%dx%dx%d) kernarg addr: %#x, completion "
        "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
@@ -288,10 +326,7 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,

    initABI(task);
    ++dynamic_task_id;
-
-    // The driver expects the start time to be in ns
-    Tick start_ts = curTick() / sim_clock::as_int::ns;
-    dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
+    if (!is_blit_kernel) ++non_blit_kernel_id;

    delete akc;
 }
--- a/src/gpu-compute/gpu_command_processor.hh
+++ b/src/gpu-compute/gpu_command_processor.hh
@@ -155,6 +155,12 @@ class GPUCommandProcessor : public DmaVirtDevice
    // Running counter of dispatched tasks
    int dynamic_task_id = 0;

+    // Running counter of dispatched user (non-blit) kernels
+    int non_blit_kernel_id = 0;
+
+    // Skip all user (non-blit) kernels until reaching this kernel
+    int target_non_blit_kernel_id = 0;
+
    // Keep track of start times for task dispatches.
    std::unordered_map<Addr, Tick> dispatchStartTime;

--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -543,7 +543,11 @@ Shader::notifyCuSleep() {

        if (kernelExitRequested) {
            kernelExitRequested = false;
-            exitSimLoop("GPU Kernel Completed");
+            if (blitKernel) {
+                exitSimLoop("GPU Blit Kernel Completed");
+            } else {
+                exitSimLoop("GPU Kernel Completed");
+            }
        }
    }
 }
--- a/src/gpu-compute/shader.hh
+++ b/src/gpu-compute/shader.hh
@@ -101,6 +101,9 @@ class Shader : public ClockedObject
    // shader to complete before actually exiting so that stats are updated.
    bool kernelExitRequested = false;

+    // Set to true by the dispatcher if the current kernel is a blit kernel
+    bool blitKernel = false;
+
  public:
    typedef ShaderParams Params;
    enum hsail_mode_e {SIMT,VECTOR_SCALAR};
@@ -321,9 +324,10 @@ class Shader : public ClockedObject
    }

    void
-    requestKernelExitEvent()
+    requestKernelExitEvent(bool is_blit_kernel)
    {
        kernelExitRequested = true;
+        blitKernel = is_blit_kernel;
    }

  protected: