diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py
index 9dcc1187f3..fed155bc44 100644
--- a/configs/example/gpufs/runfs.py
+++ b/configs/example/gpufs/runfs.py
@@ -140,17 +140,35 @@ def addRunFSOptions(parser):
     )
 
     parser.add_argument(
-        "--debug-at-gpu-kernel",
+        "--debug-at-gpu-task",
         type=int,
         default=-1,
-        help="Turn on debug flags starting with this kernel",
+        help="Turn on debug flags starting with this task (counting both blit"
+        " and non-blit kernels)",
     )
 
     parser.add_argument(
-        "--exit-at-gpu-kernel",
+        "--exit-at-gpu-task",
         type=int,
         default=-1,
-        help="Exit simulation after running this many kernels",
+        help="Exit simulation after running this many tasks (counting both "
+        "blit and non-blit kernels)",
+    )
+
+    parser.add_argument(
+        "--exit-after-gpu-kernel",
+        type=int,
+        default=-1,
+        help="Exit simulation after completing this (non-blit) kernel",
+    )
+
+    parser.add_argument(
+        "--skip-until-gpu-kernel",
+        type=int,
+        default=0,
+        help="Skip (non-blit) kernels until reaching this kernel. Note that "
+        "this can impact correctness (the skipped kernels are completely "
+        "skipped, not fast forwarded)",
     )
 
     parser.add_argument(
@@ -230,8 +248,9 @@ def runGpuFSSystem(args):
 
     print("Running the simulation")
     sim_ticks = args.abs_max_tick
-    kernels_launched = 0
-    if args.debug_at_gpu_kernel != -1:
+    kernels_completed = 0
+    tasks_completed = 0
+    if args.debug_at_gpu_task != -1:
         m5.trace.disable()
 
     exit_event = m5.simulate(sim_ticks)
@@ -249,16 +268,27 @@ def runGpuFSSystem(args):
             m5.checkpoint(args.checkpoint_dir)
             break
         elif "GPU Kernel Completed" in exit_event.getCause():
-            kernels_launched += 1
+            if kernels_completed == args.exit_after_gpu_kernel:
+                print(f"Exiting after GPU kernel {kernels_completed}")
+                break
+            kernels_completed += 1
+            tasks_completed += 1
+        elif "GPU Blit Kernel Completed" in exit_event.getCause():
+            tasks_completed += 1
+        elif "Skipping GPU Kernel" in exit_event.getCause():
+            print(f"Skipping GPU kernel {kernels_completed}")
+            kernels_completed += 1
+            tasks_completed += 1
         else:
             print(
                 f"Unknown exit event: {exit_event.getCause()}. Continuing..."
             )
 
-        if kernels_launched == args.debug_at_gpu_kernel:
+        if tasks_completed == args.debug_at_gpu_task:
+            print(f"Enabling debug flags @ GPU task {tasks_completed}")
             m5.trace.enable()
-        if kernels_launched == args.exit_at_gpu_kernel:
-            print(f"Exiting @ GPU kernel {kernels_launched}")
+        if tasks_completed == args.exit_at_gpu_task:
+            print(f"Exiting @ GPU task {tasks_completed}")
             break
 
         exit_event = m5.simulate(sim_ticks - m5.curTick())
diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py
index 2803e10fb4..671d4efdc9 100644
--- a/configs/example/gpufs/system/system.py
+++ b/configs/example/gpufs/system/system.py
@@ -115,11 +115,19 @@ def makeGpuFSSystem(args):
         numHWQueues=args.num_hw_queues,
         walker=hsapp_pt_walker,
     )
-    dispatcher_exit_events = True if args.exit_at_gpu_kernel > -1 else False
+    dispatcher_exit_events = False
+    if args.exit_at_gpu_task > -1:
+        dispatcher_exit_events = True
+    if args.exit_after_gpu_kernel > -1:
+        dispatcher_exit_events = True
     dispatcher = GPUDispatcher(kernel_exit_events=dispatcher_exit_events)
     cp_pt_walker = VegaPagetableWalker()
+    target_kernel = args.skip_until_gpu_kernel
     gpu_cmd_proc = GPUCommandProcessor(
-        hsapp=gpu_hsapp, dispatcher=dispatcher, walker=cp_pt_walker
+        hsapp=gpu_hsapp,
+        dispatcher=dispatcher,
+        walker=cp_pt_walker,
+        target_non_blit_kernel_id=target_kernel,
     )
     shader.dispatcher = dispatcher
     shader.gpu_cmd_proc = gpu_cmd_proc
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index b9a13dc85b..41ff9e7893 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -360,6 +360,10 @@ class GPUCommandProcessor(DmaVirtDevice):
     walker = Param.VegaPagetableWalker(
         VegaPagetableWalker(), "Page table walker"
     )
+    target_non_blit_kernel_id = Param.Int(
+        0,
+        "Skip kernels until reaching this kernel (counting only non-blit kernels)",
+    )
 
 
 class StorageClassType(Enum):
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
index 8a72fd73f4..e3cb53a342 100644
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -324,7 +324,7 @@ GPUDispatcher::notifyWgCompl(Wavefront *wf)
         DPRINTF(GPUKernelInfo, "Completed kernel %d\n", kern_id);
 
         if (kernelExitEvents) {
-            shader->requestKernelExitEvent();
+            shader->requestKernelExitEvent(task->completionSignal());
         }
     }
 
diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc
index 3737f8a6ff..5093cc4ff2 100644
--- a/src/gpu-compute/gpu_command_processor.cc
+++ b/src/gpu-compute/gpu_command_processor.cc
@@ -36,6 +36,7 @@
 #include "arch/amdgpu/vega/pagetable_walker.hh"
 #include "base/chunk_generator.hh"
 #include "debug/GPUCommandProc.hh"
+#include "debug/GPUDisp.hh"
 #include "debug/GPUInitAbi.hh"
 #include "debug/GPUKernelInfo.hh"
 #include "dev/amdgpu/amdgpu_device.hh"
@@ -48,6 +49,7 @@
 #include "sim/full_system.hh"
 #include "sim/process.hh"
 #include "sim/proxy_ptr.hh"
+#include "sim/sim_exit.hh"
 #include "sim/syscall_emul_buf.hh"
 
 namespace gem5
@@ -55,7 +57,8 @@ namespace gem5
 
 GPUCommandProcessor::GPUCommandProcessor(const Params &p)
     : DmaVirtDevice(p), dispatcher(*p.dispatcher), _driver(nullptr),
-      walker(p.walker), hsaPP(p.hsapp)
+      walker(p.walker), hsaPP(p.hsapp),
+      target_non_blit_kernel_id(p.target_non_blit_kernel_id)
 {
     assert(hsaPP);
     hsaPP->setDevice(this);
@@ -259,10 +262,13 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
      * APUs to implement asynchronous memcopy operations from 2 pointers in
      * host memory.  I have no idea what BLIT stands for.
      * */
+    bool is_blit_kernel;
     if (!disp_pkt->completion_signal) {
         kernel_name = "Some kernel";
+        is_blit_kernel = false;
     } else {
         kernel_name = "Blit kernel";
+        is_blit_kernel = true;
     }
 
     DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
@@ -273,6 +279,38 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
         dynamic_task_id, raw_pkt, akc, host_pkt_addr, machine_code_addr,
         gfxVersion);
 
+    // The driver expects the start time to be in ns
+    Tick start_ts = curTick() / sim_clock::as_int::ns;
+    dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
+
+    // Potentially skip a non-blit kernel
+    if (!is_blit_kernel && (non_blit_kernel_id < target_non_blit_kernel_id)) {
+        DPRINTF(GPUCommandProc, "Skipping non-blit kernel %i (Task ID: %i)\n",
+                non_blit_kernel_id, dynamic_task_id);
+
+        // Notify the HSA PP that this kernel is complete
+        hsaPacketProc().finishPkt(task->dispPktPtr(), task->queueId());
+        if (task->completionSignal()) {
+            DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion "
+                    "signal! Addr: %d\n", task->completionSignal());
+
+            sendCompletionSignal(task->completionSignal());
+        } else {
+            DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
+                "signal\n");
+        }
+
+        ++dynamic_task_id;
+        ++non_blit_kernel_id;
+
+        delete akc;
+
+        // Notify the run script that a kernel has been skipped
+        exitSimLoop("Skipping GPU Kernel");
+
+        return;
+    }
+
     DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
         "grid size (%dx%dx%d) kernarg addr: %#x, completion "
         "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
@@ -288,10 +326,7 @@ GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
 
     initABI(task);
     ++dynamic_task_id;
-
-    // The driver expects the start time to be in ns
-    Tick start_ts = curTick() / sim_clock::as_int::ns;
-    dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
+    if (!is_blit_kernel) ++non_blit_kernel_id;
 
     delete akc;
 }
diff --git a/src/gpu-compute/gpu_command_processor.hh b/src/gpu-compute/gpu_command_processor.hh
index ac73c179d7..38b5257334 100644
--- a/src/gpu-compute/gpu_command_processor.hh
+++ b/src/gpu-compute/gpu_command_processor.hh
@@ -155,6 +155,12 @@ class GPUCommandProcessor : public DmaVirtDevice
     // Running counter of dispatched tasks
     int dynamic_task_id = 0;
 
+    // Running counter of dispatched user (non-blit) kernels
+    int non_blit_kernel_id = 0;
+
+    // Skip all user (non-blit) kernels until reaching this kernel
+    int target_non_blit_kernel_id = 0;
+
     // Keep track of start times for task dispatches.
     std::unordered_map<Addr, Tick> dispatchStartTime;
 
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
index b99950568e..6e3d556026 100644
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -543,7 +543,11 @@ Shader::notifyCuSleep() {
 
         if (kernelExitRequested) {
             kernelExitRequested = false;
-            exitSimLoop("GPU Kernel Completed");
+            if (blitKernel) {
+                exitSimLoop("GPU Blit Kernel Completed");
+            } else {
+                exitSimLoop("GPU Kernel Completed");
+            }
         }
     }
 }
diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh
index 89541a8ff4..0287ddc169 100644
--- a/src/gpu-compute/shader.hh
+++ b/src/gpu-compute/shader.hh
@@ -101,6 +101,9 @@ class Shader : public ClockedObject
     // shader to complete before actually exiting so that stats are updated.
     bool kernelExitRequested = false;
 
+    // Set to true by the dispatcher if the current kernel is a blit kernel
+    bool blitKernel = false;
+
   public:
     typedef ShaderParams Params;
     enum hsail_mode_e {SIMT,VECTOR_SCALAR};
@@ -321,9 +324,10 @@ class Shader : public ClockedObject
     }
 
     void
-    requestKernelExitEvent()
+    requestKernelExitEvent(bool is_blit_kernel)
     {
         kernelExitRequested = true;
+        blitKernel = is_blit_kernel;
     }
 
   protected: