diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
index 7b36bce591..babc938489 100644
--- a/src/gpu-compute/dispatcher.cc
+++ b/src/gpu-compute/dispatcher.cc
@@ -334,7 +334,7 @@ GPUDispatcher::notifyWgCompl(Wavefront *wf)
         DPRINTF(GPUKernelInfo, "Completed kernel %d\n", kern_id);
 
         if (kernelExitEvents) {
-            exitSimLoop("GPU Kernel Completed");
+            shader->requestKernelExitEvent();
         }
     }
 
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
index 73d2366b74..620d0152c1 100644
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -519,8 +519,14 @@ Shader::notifyCuSleep() {
     panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
              "Invalid activeCu size\n");
     _activeCus--;
-    if (!_activeCus)
+    if (!_activeCus) {
         stats.shaderActiveTicks += curTick() - _lastInactiveTick;
+
+        if (kernelExitRequested) {
+            kernelExitRequested = false;
+            exitSimLoop("GPU Kernel Completed");
+        }
+    }
 }
 
 /**
diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh
index 08dfd24b76..32ddf3d15b 100644
--- a/src/gpu-compute/shader.hh
+++ b/src/gpu-compute/shader.hh
@@ -97,6 +97,10 @@ class Shader : public ClockedObject
     // Last tick that all CUs attached to this shader were inactive
     Tick _lastInactiveTick;
 
+    // Set when a kernel-completion exit is requested; the exit is deferred
+    // until the last active CU sleeps so shaderActiveTicks is updated first.
+    bool kernelExitRequested = false;
+
   public:
     typedef ShaderParams Params;
     enum hsail_mode_e {SIMT,VECTOR_SCALAR};
@@ -314,6 +318,12 @@ class Shader : public ClockedObject
         stats.vectorInstDstOperand[num_operands]++;
     }
 
+    void
+    requestKernelExitEvent()
+    {
+        kernelExitRequested = true; // acted on later in notifyCuSleep()
+    }
+
   protected:
     struct ShaderStats : public statistics::Group
     {