From 440409d80708fb7ace1b3749ce478fb6b5ee68d6 Mon Sep 17 00:00:00 2001
From: Vishnu Ramadas <vramadas@outlook.com>
Date: Tue, 30 Jan 2024 14:45:12 -0600
Subject: [PATCH] gpu-compute: Add Icache invalidation at kernel start

Previously, only the data caches were invalidated at the start of each
kernel. This commit adds support for also invalidating the instruction
cache (SQC) at kernel launch time.

Change-Id: I32e50f63fa1442c2514d4dd8f9d7689759f503d3
---
 src/gpu-compute/compute_unit.cc | 33 ++++++++++++++++++++++++++++++---
 src/gpu-compute/compute_unit.hh |  1 +
 src/gpu-compute/shader.cc       |  5 +++++
 3 files changed, 36 insertions(+), 3 deletions(-)
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index f28a8e39c7..ba4c14c4f0 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -397,9 +397,9 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
 }
 
 /**
- * trigger invalidate operation in the cu
+ * trigger invalidate operation in the CU
  *
- * req: request initialized in shader, carrying the invlidate flags
+ * req: request initialized in shader, carrying the invalidate flags
  */
 void
 ComputeUnit::doInvalidate(RequestPtr req, int kernId){
@@ -425,6 +425,26 @@ ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
     injectGlobalMemFence(gpuDynInst, true);
 }
 
+/**
+ * Trigger an SQC invalidate operation in the CU.
+ * kernId: id recorded in the dynamic instruction's kern_id field
+ * req: request initialized in shader, carrying the invalidate flags
+ */
+void
+ComputeUnit::doSQCInvalidate(RequestPtr req, int kernId){
+    GPUDynInstPtr gpuDynInst
+        = std::make_shared<GPUDynInst>(this, nullptr,
+            new KernelLaunchStaticInst(), getAndIncSeqNum());
+
+    // kern_id will be used in inv responses
+    gpuDynInst->kern_id = kernId;
+    // stamp the request's context id with this instruction's wfDynId
+    req->setContext(gpuDynInst->wfDynId);
+    // mark as scalar, then inject the fence via the scalar memory pipeline
+    gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar);
+    scalarMemoryPipe.injectScalarMemFence(gpuDynInst, true, req);
+}
+
 // reseting SIMD register pools
 // I couldn't think of any other place and
 // I think it is needed in my implementation
@@ -1012,7 +1032,14 @@ ComputeUnit::DataPort::recvReqRetry()
 bool
 ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
 {
-    computeUnit->handleSQCReturn(pkt);
+    /** Wavefront-less responses are from the SQC invalidate issued at kernel
+     * start; they bypass handleSQCReturn, so release the packet here. */
+    if (safe_cast<SenderState*>(pkt->senderState)->wavefront != nullptr) {
+        computeUnit->handleSQCReturn(pkt);
+    } else {
+        delete pkt->senderState;
+        delete pkt;
+    }
 
     return true;
 }
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index 24324bb515..7e3f05d070 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -412,6 +412,7 @@ class ComputeUnit : public ClockedObject
 
     void doInvalidate(RequestPtr req, int kernId);
     void doFlush(GPUDynInstPtr gpuDynInst);
+    void doSQCInvalidate(RequestPtr req, int kernId);
 
     void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
     bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
index e13e7c9cf4..a83b413cf9 100644
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -221,6 +221,11 @@ Shader::prepareInvalidate(HSAQueueEntry *task) {
         // all necessary INV flags are all set now, call cu to execute
         cuList[i_cu]->doInvalidate(req, task->dispatchId());
 
+        // NOTE(review): assumes one SQC per 4 CUs; confirm or parameterize
+        if ((i_cu % 4) == 0) {
+            cuList[i_cu]->doSQCInvalidate(req, task->dispatchId());
+        }
+
         // I don't like this. This is intrusive coding.
         cuList[i_cu]->resetRegisterPool();
     }