From 23dc98ea725c51d8d817cb974c35bfeca047752a Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Thu, 25 Jan 2024 13:24:57 -0600 Subject: [PATCH 1/8] mem-ruby: Add SQC cache invalidation support to GPU VIPER This commit adds support for cache invalidation in GPU VIPER protocol's SQC cache. To support this, the commit also adds L1 cache invalidation framework in the Sequencer such that the Sequencer sends out an invalidation request for each line in the cache and declares completion once all lines are evicted. Change-Id: I2f52eacabb2412b16f467f994e985c378230f841 --- src/mem/ruby/protocol/GPU_VIPER-SQC.sm | 21 +++++- src/mem/ruby/protocol/RubySlicc_Types.sm | 3 + src/mem/ruby/system/Sequencer.cc | 94 +++++++++++++++++++++++- src/mem/ruby/system/Sequencer.hh | 8 ++ 4 files changed, 124 insertions(+), 2 deletions(-) diff --git a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm index bdc5d73f20..3086aab77a 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm @@ -60,6 +60,7 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") // Mem sys initiated Repl, desc="Replacing block from cache"; Data, desc="Received Data"; + Evict, desc="Evict cache line"; } enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { @@ -67,6 +68,7 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") DataArrayWrite, desc="Write the data array"; TagArrayRead, desc="Read the data array"; TagArrayWrite, desc="Write the data array"; + TagArrayFlash, desc="Flash clear the data array"; } @@ -242,7 +244,12 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { Entry cache_entry := getCacheEntry(in_msg.LineAddress); TBE tbe := TBEs.lookup(in_msg.LineAddress); - trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe); + DPRINTF(RubySlicc, "%s\n", in_msg); + if (in_msg.Type == RubyRequestType:REPLACEMENT) { + trigger(Event:Evict, 
in_msg.LineAddress, cache_entry, tbe); + } else { + trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe); + } } } } @@ -313,6 +320,11 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") APPEND_TRANSITION_COMMENT(cache_entry.DataBlk); } + action(inv_invDone, "inv", desc="local inv done") { + sequencer.invL1Callback(); + } + + action(w_writeCache, "w", desc="write data to cache") { peek(responseToSQC_in, ResponseMsg) { assert(is_valid(cache_entry)); @@ -350,6 +362,13 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") ic_invCache; } + transition({I, IV, V}, Evict, I) {TagArrayRead, TagArrayWrite} { + // since we're evicting something, don't bother classifying as hit/miss + ic_invCache; + inv_invDone; + p_popMandatoryQueue; + } + // if we got a response for a load where the line is in I, then // another request must have come in that replaced the line in question in // the cache. Thus, complete this request without allocating the line, but diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm index 2206effa29..cc56d3b1b4 100644 --- a/src/mem/ruby/protocol/RubySlicc_Types.sm +++ b/src/mem/ruby/protocol/RubySlicc_Types.sm @@ -157,6 +157,9 @@ structure (Sequencer, external = "yes") { void llscClearLocalMonitor(); void evictionCallback(Addr); + + void invL1Callback(); + void recordRequestType(SequencerRequestType); bool checkResourceAvailable(CacheResourceType, Addr); } diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 48054febef..0a37c64adf 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -85,6 +85,8 @@ Sequencer::Sequencer(const Params &p) m_runningGarnetStandalone = p.garnet_standalone; + m_num_pending_invs = 0; + m_cache_inv_pkt = nullptr; // These statistical variables are not for display. 
    // The profiler will collate these across different
@@ -348,6 +350,10 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType primary_type,
         return RequestStatus_Ready;
     }
 
+    if (pkt->cmd == MemCmd::MemSyncReq) {
+        return RequestStatus_Aliased;
+    }
+
     Addr line_addr = makeLineAddress(pkt->getAddr());
     // Check if there is any outstanding request for the same cache line.
     auto &seq_req_list = m_RequestTable[line_addr];
@@ -576,7 +582,8 @@ Sequencer::readCallback(Addr address, DataBlock& data,
             }
             if ((seq_req.m_type != RubyRequestType_LD) &&
                 (seq_req.m_type != RubyRequestType_Load_Linked) &&
-                (seq_req.m_type != RubyRequestType_IFETCH)) {
+                (seq_req.m_type != RubyRequestType_IFETCH) &&
+                (seq_req.m_type != RubyRequestType_REPLACEMENT)) {
                 // Write request: reissue request to the cache hierarchy
                 issueRequest(seq_req.pkt, seq_req.m_second_type);
                 break;
@@ -811,6 +818,86 @@ Sequencer::unaddressedCallback(Addr unaddressedReqId,
     }
 }
 
+void
+Sequencer::completeHitCallback(std::vector<PacketPtr> & mylist)
+{
+    for (auto& pkt : mylist) {
+        // When Ruby is in warmup or cooldown phase, the requests come
+        // from the cache recorder. They do not track which port to use
+        // and do not need to send the response back
+        if (!RubySystem::getWarmupEnabled()
+                && !RubySystem::getCooldownEnabled()) {
+            RubyPort::SenderState *ss =
+                safe_cast<RubyPort::SenderState *>(pkt->senderState);
+            MemResponsePort *port = ss->port;
+            assert(port != NULL);
+
+            pkt->senderState = ss->predecessor;
+
+            if (pkt->cmd != MemCmd::WriteReq) {
+                // for WriteReq, we keep the original senderState until
+                // writeCompleteCallback
+                delete ss;
+            }
+
+            port->hitCallback(pkt);
+            trySendRetries();
+        }
+    }
+
+    RubySystem *rs = m_ruby_system;
+    if (RubySystem::getWarmupEnabled()) {
+        rs->m_cache_recorder->enqueueNextFetchRequest();
+    } else if (RubySystem::getCooldownEnabled()) {
+        rs->m_cache_recorder->enqueueNextFlushRequest();
+    } else {
+        testDrainComplete();
+    }
+}
+
+void
+Sequencer::invL1Callback()
+{
+    // Since L1 invalidate is currently done with paddr = 0
+    assert(m_cache_inv_pkt && m_num_pending_invs > 0);
+
+    m_num_pending_invs--;
+
+    if (m_num_pending_invs == 0) {
+        std::vector<PacketPtr> pkt_list { m_cache_inv_pkt };
+        m_cache_inv_pkt = nullptr;
+        completeHitCallback(pkt_list);
+    }
+}
+
+void
+Sequencer::invL1()
+{
+    int size = m_dataCache_ptr->getNumBlocks();
+    DPRINTF(RubySequencer,
+            "There are %d Invalidations outstanding before Cache Walk\n",
+            m_num_pending_invs);
+    // Walk the cache
+    for (int i = 0; i < size; i++) {
+        Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
+        // Evict Read-only data
+        RubyRequestType request_type = RubyRequestType_REPLACEMENT;
+        std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
+            clockEdge(), addr, 0, 0,
+            request_type, RubyAccessMode_Supervisor,
+            nullptr);
+        DPRINTF(RubySequencer, "Evicting addr 0x%x\n", addr);
+        assert(m_mandatory_q_ptr != NULL);
+        Tick latency = cyclesToTicks(
+            m_controller->mandatoryQueueLatency(request_type));
+        m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
+        m_num_pending_invs++;
+    }
+    DPRINTF(RubySequencer,
+            "There are %d Invalidations outstanding after Cache Walk\n",
+            m_num_pending_invs);
+}
+ bool Sequencer::empty() const { @@ -915,6 +1002,11 @@ Sequencer::makeRequest(PacketPtr pkt) } } else if (pkt->isFlush()) { primary_type = secondary_type = RubyRequestType_FLUSH; + } else if (pkt->cmd == MemCmd::MemSyncReq) { + primary_type = secondary_type = RubyRequestType_REPLACEMENT; + assert(!m_cache_inv_pkt); + m_cache_inv_pkt = pkt; + invL1(); } else { panic("Unsupported ruby packet type\n"); } diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh index 8f736da6d5..3dc61ab4fa 100644 --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -141,6 +141,10 @@ class Sequencer : public RubyPort const Cycles forwardRequestTime = Cycles(0), const Cycles firstResponseTime = Cycles(0)); + void completeHitCallback(std::vector& list); + void invL1Callback(); + void invL1(); + RequestStatus makeRequest(PacketPtr pkt) override; virtual bool empty() const; int outstandingCount() const override { return m_outstanding_count; } @@ -243,6 +247,10 @@ class Sequencer : public RubyPort private: int m_max_outstanding_requests; + int m_num_pending_invs; + + PacketPtr m_cache_inv_pkt; + CacheMemory* m_dataCache_ptr; // The cache access latency for top-level caches (L0/L1). These are From 03838afce0650efab482247c1f847657639979aa Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Thu, 25 Jan 2024 13:37:31 -0600 Subject: [PATCH 2/8] gpu-compute: Add support for injecting scalar memory barrier This commit adds support for injecting a scalar memory barrier in the GPU. The barrier will primarily be used to invalidate the entire SQC cache. 
The commit also invalidates all buffers and decrements related counters upon completion of the invalidation request Change-Id: Ib8e270bbeb8229a4470d606c96876ba5c87335bf --- src/gpu-compute/compute_unit.cc | 22 ++++++++++ src/gpu-compute/compute_unit.hh | 35 ++++++++++++++++ src/gpu-compute/fetch_unit.cc | 30 +++++++++++++ src/gpu-compute/fetch_unit.hh | 1 + src/gpu-compute/scalar_memory_pipeline.cc | 51 +++++++++++++++++++++++ src/gpu-compute/scalar_memory_pipeline.hh | 4 ++ 6 files changed, 143 insertions(+) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 8259f0a950..f28a8e39c7 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -1046,6 +1046,28 @@ ComputeUnit::SQCPort::recvReqRetry() } } +const char* +ComputeUnit::SQCPort::MemReqEvent::description() const +{ + return "ComputeUnit SQC memory request event"; +} + +void +ComputeUnit::SQCPort::MemReqEvent::process() +{ + SenderState *sender_state = safe_cast(pkt->senderState); + [[maybe_unused]] ComputeUnit *compute_unit = sqcPort.computeUnit; + + if (pkt->req->systemReq()) { + assert(compute_unit->shader->systemHub); + SystemHubEvent *resp_event = new SystemHubEvent(pkt, &sqcPort); + compute_unit->shader->systemHub->sendRequest(pkt, resp_event); + } else if (!(sqcPort.sendTimingReq(pkt))) { + sqcPort.retries.push_back(std::pair + (pkt, sender_state->wavefront)); + } +} + void ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt) { diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index e6bc03da7d..24324bb515 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -680,6 +680,41 @@ class ComputeUnit : public ClockedObject kernId(_kernId){ } }; + class MemReqEvent : public Event + { + private: + SQCPort &sqcPort; + PacketPtr pkt; + + public: + MemReqEvent(SQCPort &_sqc_port, PacketPtr _pkt) + : Event(), sqcPort(_sqc_port), pkt(_pkt) + { + 
setFlags(Event::AutoDelete); + } + + void process(); + const char *description() const; + }; + + class SystemHubEvent : public Event + { + SQCPort *sqcPort; + PacketPtr reqPkt; + + public: + SystemHubEvent(PacketPtr pkt, SQCPort *_sqcPort) + : sqcPort(_sqcPort), reqPkt(pkt) + { + setFlags(Event::AutoDelete); + } + + void + process() + { + } + }; + std::deque> retries; protected: diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc index 4dadbd363d..19144d55e2 100644 --- a/src/gpu-compute/fetch_unit.cc +++ b/src/gpu-compute/fetch_unit.cc @@ -388,6 +388,29 @@ FetchUnit::FetchBufDesc::flushBuf() wavefront->wfDynId); } +void +FetchUnit::FetchBufDesc::invBuf() +{ + restartFromBranch = false; + /** + * free list may have some entries + * so we clear it here to avoid duplicates + */ + freeList.clear(); + bufferedPCs.clear(); + reservedPCs.clear(); + readPtr = bufStart; + + for (int i = 0; i < fetchDepth; ++i) { + freeList.push_back(bufStart + i * cacheLineSize); + } + + DPRINTF(GPUFetch, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch " + "buffer\n", wavefront->simdId, wavefront->wfSlotId, + wavefront->wfDynId); + +} + Addr FetchUnit::FetchBufDesc::nextFetchAddr() { @@ -471,6 +494,13 @@ FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr) void FetchUnit::FetchBufDesc::fetchDone(Addr vaddr) { + if (vaddr == 0) { + // S_ICACHE_INV fetch done + wavefront->decLGKMInstsIssued(); + invBuf(); + return; + } + assert(bufferedPCs.find(vaddr) == bufferedPCs.end()); DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n", wavefront->simdId, wavefront->wfSlotId, diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh index 0ba88c7d95..99c91b7299 100644 --- a/src/gpu-compute/fetch_unit.hh +++ b/src/gpu-compute/fetch_unit.hh @@ -104,6 +104,7 @@ class FetchUnit int reservedLines() const { return reservedPCs.size(); } bool hasFreeSpace() const { return !freeList.empty(); } void flushBuf(); + void invBuf(); Addr nextFetchAddr(); /** diff --git 
a/src/gpu-compute/scalar_memory_pipeline.cc b/src/gpu-compute/scalar_memory_pipeline.cc index de24f9448b..767e4e05a7 100644 --- a/src/gpu-compute/scalar_memory_pipeline.cc +++ b/src/gpu-compute/scalar_memory_pipeline.cc @@ -160,4 +160,55 @@ ScalarMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst) issuedRequests.push(gpuDynInst); } +void +ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst, + bool kernelMemSync, + RequestPtr req) +{ + assert(gpuDynInst->isScalar()); + + if (!req) { + req = std::make_shared( + 0, 0, 0, computeUnit.requestorId(), 0, gpuDynInst->wfDynId); + } else { + req->requestorId(computeUnit.requestorId()); + } + + req->setPaddr(0); + + PacketPtr pkt = nullptr; + + if (kernelMemSync) { + req->setCacheCoherenceFlags(Request::INV_L1); + req->setReqInstSeqNum(gpuDynInst->seqNum()); + req->setFlags(Request::KERNEL); + pkt = new Packet(req, MemCmd::MemSyncReq); + pkt->pushSenderState( + new ComputeUnit::SQCPort::SenderState( + gpuDynInst->wavefront(), nullptr)); + ComputeUnit::SQCPort::MemReqEvent *sqc_event = + new ComputeUnit::SQCPort::MemReqEvent + (computeUnit.sqcPort, pkt); + + computeUnit.schedule( + sqc_event, curTick() + computeUnit.scalar_req_tick_latency); + } else { + gpuDynInst->setRequestFlags(req); + + req->setReqInstSeqNum(gpuDynInst->seqNum()); + + pkt = new Packet(req, MemCmd::MemSyncReq); + pkt->pushSenderState( + new ComputeUnit::SQCPort::SenderState( + gpuDynInst->wavefront(), nullptr)); + + ComputeUnit::SQCPort::MemReqEvent *sqc_event = + new ComputeUnit::SQCPort::MemReqEvent + (computeUnit.sqcPort, pkt); + + computeUnit.schedule( + sqc_event, curTick() + computeUnit.scalar_req_tick_latency); + } +} + } // namespace gem5 diff --git a/src/gpu-compute/scalar_memory_pipeline.hh b/src/gpu-compute/scalar_memory_pipeline.hh index 5512c7c01f..e5dc7b4292 100644 --- a/src/gpu-compute/scalar_memory_pipeline.hh +++ b/src/gpu-compute/scalar_memory_pipeline.hh @@ -36,6 +36,7 @@ #include #include "gpu-compute/misc.hh" +#include 
"mem/request.hh" #include "params/ComputeUnit.hh" #include "sim/stats.hh" @@ -67,6 +68,9 @@ class ScalarMemPipeline void issueRequest(GPUDynInstPtr gpuDynInst); + void injectScalarMemFence( + GPUDynInstPtr gpuDynInst, bool kernelMemSync, RequestPtr req); + bool isGMLdRespFIFOWrRdy() const { From 440409d80708fb7ace1b3749ce478fb6b5ee68d6 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Tue, 30 Jan 2024 14:45:12 -0600 Subject: [PATCH 3/8] gpu-compute: Add Icache invalidation at kernel start Previously, the data caches were invalidated at the start of each kernel. This commit adds support for invalidating instruction cache at kernel launch time Change-Id: I32e50f63fa1442c2514d4dd8f9d7689759f503d3 --- src/gpu-compute/compute_unit.cc | 33 ++++++++++++++++++++++++++++++--- src/gpu-compute/compute_unit.hh | 1 + src/gpu-compute/shader.cc | 5 +++++ 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index f28a8e39c7..ba4c14c4f0 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -397,9 +397,9 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, } /** - * trigger invalidate operation in the cu + * trigger invalidate operation in the CU * - * req: request initialized in shader, carrying the invlidate flags + * req: request initialized in shader, carrying the invalidate flags */ void ComputeUnit::doInvalidate(RequestPtr req, int kernId){ @@ -425,6 +425,26 @@ ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) { injectGlobalMemFence(gpuDynInst, true); } +/** + * trigger SQCinvalidate operation in the CU + * + * req: request initialized in shader, carrying the invalidate flags + */ +void +ComputeUnit::doSQCInvalidate(RequestPtr req, int kernId){ + GPUDynInstPtr gpuDynInst + = std::make_shared(this, nullptr, + new KernelLaunchStaticInst(), getAndIncSeqNum()); + + // kern_id will be used in inv responses + gpuDynInst->kern_id = kernId; + // update 
contextId field + req->setContext(gpuDynInst->wfDynId); + + gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar); + scalarMemoryPipe.injectScalarMemFence(gpuDynInst, true, req); +} + // reseting SIMD register pools // I couldn't think of any other place and // I think it is needed in my implementation @@ -1012,7 +1032,14 @@ ComputeUnit::DataPort::recvReqRetry() bool ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt) { - computeUnit->handleSQCReturn(pkt); + SenderState *sender_state = safe_cast(pkt->senderState); + /** Process the response only if there is a wavefront associated with it. + * Otherwise, it is from SQC invalidate that was issued at kernel start + * and doesn't have a wavefront or instruction associated with it. + */ + if (sender_state->wavefront != nullptr) { + computeUnit->handleSQCReturn(pkt); + } return true; } diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index 24324bb515..7e3f05d070 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -412,6 +412,7 @@ class ComputeUnit : public ClockedObject void doInvalidate(RequestPtr req, int kernId); void doFlush(GPUDynInstPtr gpuDynInst); + void doSQCInvalidate(RequestPtr req, int kernId); void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg); bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg); diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index e13e7c9cf4..a83b413cf9 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -221,6 +221,11 @@ Shader::prepareInvalidate(HSAQueueEntry *task) { // all necessary INV flags are all set now, call cu to execute cuList[i_cu]->doInvalidate(req, task->dispatchId()); + + if ((i_cu % 4) == 0) { + cuList[i_cu]->doSQCInvalidate(req, task->dispatchId()); + } + // I don't like this. This is intrusive coding. 
cuList[i_cu]->resetRegisterPool(); } From 0e93e6142a7bce60c4f19b4ecc8085d90d781b66 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Tue, 6 Feb 2024 13:52:09 -0600 Subject: [PATCH 4/8] arch-vega, gpu-compute, mem-ruby: Remove extra empty lines Change-Id: I18770ec7e38c4a992a0ae6de95b0be49ab4426c2 --- src/gpu-compute/fetch_unit.cc | 1 - src/mem/ruby/protocol/GPU_VIPER-SQC.sm | 1 - 2 files changed, 2 deletions(-) diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc index 19144d55e2..66abfe1fb7 100644 --- a/src/gpu-compute/fetch_unit.cc +++ b/src/gpu-compute/fetch_unit.cc @@ -408,7 +408,6 @@ FetchUnit::FetchBufDesc::invBuf() DPRINTF(GPUFetch, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch " "buffer\n", wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId); - } Addr diff --git a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm index 3086aab77a..67c7753f09 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm @@ -324,7 +324,6 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") sequencer.invL1Callback(); } - action(w_writeCache, "w", desc="write data to cache") { peek(responseToSQC_in, ResponseMsg) { assert(is_valid(cache_entry)); From 7dae25e8818088f039cd3b96dc6b0f29dca44a6c Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Tue, 6 Feb 2024 15:30:58 -0600 Subject: [PATCH 5/8] configs, gpu-compute: Add parameter in shader for CUs per SQC Change-Id: If0ae0db1b6ccc08a92f169a271b137f69f410f7b --- configs/example/apu_se.py | 1 + configs/example/gpufs/system/amdgpu.py | 5 ++++- src/gpu-compute/GPU.py | 1 + src/gpu-compute/shader.cc | 5 ++++- src/gpu-compute/shader.hh | 2 ++ 5 files changed, 12 insertions(+), 2 deletions(-) diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index 9c023fe5c6..17e46268ef 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -434,6 +434,7 @@ print( # shader is the GPU shader = Shader( n_wf=args.wfs_per_simd, 
+ cu_per_sqc=args.cu_per_sqc, clk_domain=SrcClockDomain( clock=args.gpu_clock, voltage_domain=VoltageDomain(voltage=args.gpu_voltage), diff --git a/configs/example/gpufs/system/amdgpu.py b/configs/example/gpufs/system/amdgpu.py index 30e059d154..0813759e2a 100644 --- a/configs/example/gpufs/system/amdgpu.py +++ b/configs/example/gpufs/system/amdgpu.py @@ -33,7 +33,10 @@ from m5.objects import * def createGPU(system, args): shader = Shader( - n_wf=args.wfs_per_simd, timing=True, clk_domain=system.clk_domain + n_wf=args.wfs_per_simd, + cu_per_sqc=args.cu_per_sqc, + timing=True, + clk_domain=system.clk_domain, ) # VIPER GPU protocol implements release consistency at GPU side. So, diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index 78baa596a7..b9a13dc85b 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -294,6 +294,7 @@ class Shader(ClockedObject): dispatcher = Param.GPUDispatcher("GPU workgroup dispatcher") system_hub = Param.AMDGPUSystemHub(NULL, "GPU System Hub (FS Mode only)") n_wf = Param.Int(10, "Number of wavefront slots per SIMD") + cu_per_sqc = Param.Int(4, "Number of CUs that share an SQC") impl_kern_launch_acq = Param.Bool( True, """Insert acq packet into diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index a83b413cf9..b99950568e 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -64,6 +64,7 @@ Shader::Shader(const Params &p) : ClockedObject(p), impl_kern_end_rel(p.impl_kern_end_rel), coissue_return(1), trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf), + n_cu_per_sqc(p.cu_per_sqc), globalMemSize(p.globalmem), nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc), _dispatcher(*p.dispatcher), systemHub(p.system_hub), @@ -222,7 +223,9 @@ Shader::prepareInvalidate(HSAQueueEntry *task) { cuList[i_cu]->doInvalidate(req, task->dispatchId()); - if ((i_cu % 4) == 0) { + // A set of CUs share a single SQC cache. 
Send a single invalidate + // request to each SQC + if ((i_cu % n_cu_per_sqc) == 0) { cuList[i_cu]->doSQCInvalidate(req, task->dispatchId()); } diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index 32ddf3d15b..89541a8ff4 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ -237,6 +237,8 @@ class Shader : public ClockedObject int n_cu; // Number of wavefront slots per SIMD per CU int n_wf; + //Number of cu units per sqc in the shader + int n_cu_per_sqc; // The size of global memory int globalMemSize; From 690b2b9462a6bc6afa4273a1161c02bae12f32f2 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Tue, 6 Feb 2024 15:32:06 -0600 Subject: [PATCH 6/8] gpu-compute, mem-ruby: Add comments and reformat code Change-Id: Id2b3886dce347fdcfcad22009a42b92febc00a6c --- src/gpu-compute/fetch_unit.cc | 7 +++++- src/gpu-compute/scalar_memory_pipeline.cc | 26 +++++++++++------------ src/mem/ruby/system/Sequencer.cc | 5 +++++ 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc index 66abfe1fb7..3fe181b519 100644 --- a/src/gpu-compute/fetch_unit.cc +++ b/src/gpu-compute/fetch_unit.cc @@ -493,8 +493,13 @@ FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr) void FetchUnit::FetchBufDesc::fetchDone(Addr vaddr) { + // If the return vaddr is 0, then it belongs to an SQC invalidation + // request. This request calls incLGKMInstsIssued() function in its + // execution path. Since there is no valid memory return response + // associated with this instruction, decLGKMInstsIssued() is not + // executed. 
Do this here to decrement the counter and invalidate + // all buffers if (vaddr == 0) { - // S_ICACHE_INV fetch done wavefront->decLGKMInstsIssued(); invBuf(); return; diff --git a/src/gpu-compute/scalar_memory_pipeline.cc b/src/gpu-compute/scalar_memory_pipeline.cc index 767e4e05a7..54819e7d3f 100644 --- a/src/gpu-compute/scalar_memory_pipeline.cc +++ b/src/gpu-compute/scalar_memory_pipeline.cc @@ -174,10 +174,17 @@ ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst, req->requestorId(computeUnit.requestorId()); } + // When the SQC invalidate instruction is executed, it calls + // injectScalarMemFence. The instruction does not contain an address + // as one of its operands. Therefore, set the physical address of the + // invalidation request to 0 and handle it in the sequencer req->setPaddr(0); PacketPtr pkt = nullptr; + // If kernelMemSync is true, then the invalidation request is from + // kernel launch and is an implicit invalidation.If false, then it is + // due to an S_ICACHE_INV instruction if (kernelMemSync) { req->setCacheCoherenceFlags(Request::INV_L1); req->setReqInstSeqNum(gpuDynInst->seqNum()); @@ -186,12 +193,6 @@ ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst, pkt->pushSenderState( new ComputeUnit::SQCPort::SenderState( gpuDynInst->wavefront(), nullptr)); - ComputeUnit::SQCPort::MemReqEvent *sqc_event = - new ComputeUnit::SQCPort::MemReqEvent - (computeUnit.sqcPort, pkt); - - computeUnit.schedule( - sqc_event, curTick() + computeUnit.scalar_req_tick_latency); } else { gpuDynInst->setRequestFlags(req); @@ -201,14 +202,13 @@ ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst, pkt->pushSenderState( new ComputeUnit::SQCPort::SenderState( gpuDynInst->wavefront(), nullptr)); - - ComputeUnit::SQCPort::MemReqEvent *sqc_event = - new ComputeUnit::SQCPort::MemReqEvent - (computeUnit.sqcPort, pkt); - - computeUnit.schedule( - sqc_event, curTick() + computeUnit.scalar_req_tick_latency); } + + 
ComputeUnit::SQCPort::MemReqEvent *sqc_event = + new ComputeUnit::SQCPort::MemReqEvent + (computeUnit.sqcPort, pkt); + computeUnit.schedule( + sqc_event, curTick() + computeUnit.scalar_req_tick_latency); } } // namespace gem5 diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 0a37c64adf..4fef7090b6 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -350,6 +350,11 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType primary_type, return RequestStatus_Ready; } + // If command is MemSyncReq, it is used to invalidate the cache. + // As the cache invalidation requests are already issued in invL1(), + // there is no need to create a new request for the same here. + // Instead, return RequestStatus_Aliased, and make the sequencer skip + // an extra issueRequest if (pkt->cmd == MemCmd::MemSyncReq) { return RequestStatus_Aliased; } From 85680ea58e26314cf6187c7be6b0e171b1e79fc4 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Wed, 7 Feb 2024 12:22:06 -0600 Subject: [PATCH 7/8] gpu-compute: Remove unused and redundant functions In ComputeUnit, a previous commit added a SystemHubEvent event class to the SQCPort. This was found to be unnecessary during the review process and is removed in this commit. Similarly, invBuf() which was added in FetchUnit as part of an earlier commit was found to be redundant. 
This commit removes it Change-Id: I6ee8d344d29e7bfade49fb9549654b71e3c4b96f --- src/gpu-compute/compute_unit.cc | 8 +++--- src/gpu-compute/compute_unit.hh | 18 ------------- src/gpu-compute/fetch_unit.cc | 45 ++++++++++----------------------- src/gpu-compute/fetch_unit.hh | 3 +-- 4 files changed, 17 insertions(+), 57 deletions(-) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index ba4c14c4f0..e485aa6161 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -1085,11 +1085,9 @@ ComputeUnit::SQCPort::MemReqEvent::process() SenderState *sender_state = safe_cast(pkt->senderState); [[maybe_unused]] ComputeUnit *compute_unit = sqcPort.computeUnit; - if (pkt->req->systemReq()) { - assert(compute_unit->shader->systemHub); - SystemHubEvent *resp_event = new SystemHubEvent(pkt, &sqcPort); - compute_unit->shader->systemHub->sendRequest(pkt, resp_event); - } else if (!(sqcPort.sendTimingReq(pkt))) { + assert(!pkt->req->systemReq()); + + if (!(sqcPort.sendTimingReq(pkt))) { sqcPort.retries.push_back(std::pair (pkt, sender_state->wavefront)); } diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index 7e3f05d070..cfa145551f 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -698,24 +698,6 @@ class ComputeUnit : public ClockedObject const char *description() const; }; - class SystemHubEvent : public Event - { - SQCPort *sqcPort; - PacketPtr reqPkt; - - public: - SystemHubEvent(PacketPtr pkt, SQCPort *_sqcPort) - : sqcPort(_sqcPort), reqPkt(pkt) - { - setFlags(Event::AutoDelete); - } - - void - process() - { - } - }; - std::deque> retries; protected: diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc index 3fe181b519..20b89f6384 100644 --- a/src/gpu-compute/fetch_unit.cc +++ b/src/gpu-compute/fetch_unit.cc @@ -320,7 +320,7 @@ FetchUnit::processFetchReturn(PacketPtr pkt) 
assert(!fetchBuf.at(wavefront->wfSlotId).hasFetchDataToProcess()); wavefront->dropFetch = false; } else { - fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt->req->getVaddr()); + fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt); } wavefront->pendingFetch = false; @@ -388,28 +388,6 @@ FetchUnit::FetchBufDesc::flushBuf() wavefront->wfDynId); } -void -FetchUnit::FetchBufDesc::invBuf() -{ - restartFromBranch = false; - /** - * free list may have some entries - * so we clear it here to avoid duplicates - */ - freeList.clear(); - bufferedPCs.clear(); - reservedPCs.clear(); - readPtr = bufStart; - - for (int i = 0; i < fetchDepth; ++i) { - freeList.push_back(bufStart + i * cacheLineSize); - } - - DPRINTF(GPUFetch, "WF[%d][%d]: Id%d Fetch dropped, flushing fetch " - "buffer\n", wavefront->simdId, wavefront->wfSlotId, - wavefront->wfDynId); -} - Addr FetchUnit::FetchBufDesc::nextFetchAddr() { @@ -491,20 +469,23 @@ FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr) } void -FetchUnit::FetchBufDesc::fetchDone(Addr vaddr) +FetchUnit::FetchBufDesc::fetchDone(PacketPtr pkt) { - // If the return vaddr is 0, then it belongs to an SQC invalidation - // request. This request calls incLGKMInstsIssued() function in its - // execution path. Since there is no valid memory return response - // associated with this instruction, decLGKMInstsIssued() is not - // executed. Do this here to decrement the counter and invalidate - // all buffers - if (vaddr == 0) { + // If the return command is MemSyncResp, then it belongs to + // an SQC invalidation request. This request calls + // incLGKMInstsIssued() function in its execution path. + // Since there is no valid memory return response associated with + // this instruction, decLGKMInstsIssued() is not executed. 
Do this + // here to decrement the counter and invalidate all buffers + if (pkt->cmd == MemCmd::MemSyncResp) { wavefront->decLGKMInstsIssued(); - invBuf(); + flushBuf(); + restartFromBranch = false; return; } + Addr vaddr = pkt->req->getVaddr(); + assert(bufferedPCs.find(vaddr) == bufferedPCs.end()); DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n", wavefront->simdId, wavefront->wfSlotId, diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh index 99c91b7299..85bf2472ec 100644 --- a/src/gpu-compute/fetch_unit.hh +++ b/src/gpu-compute/fetch_unit.hh @@ -104,7 +104,6 @@ class FetchUnit int reservedLines() const { return reservedPCs.size(); } bool hasFreeSpace() const { return !freeList.empty(); } void flushBuf(); - void invBuf(); Addr nextFetchAddr(); /** @@ -139,7 +138,7 @@ class FetchUnit return is_reserved; } - void fetchDone(Addr vaddr); + void fetchDone(PacketPtr ptr); /** * checks if the buffer contains valid data. this essentially From 8054459df6583f42740a5a2fa5f978d5b0207530 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Fri, 9 Feb 2024 12:19:08 -0600 Subject: [PATCH 8/8] arch-vega: Add support for S_ICACHE_INV instruction Previously, the S_ICACHE_INV instruction was unimplemented and simulation panicked if it was encountered. 
This commit adds support for executing the instruction by injecting
a memory barrier in the scalar pipeline and invalidating the ICACHE
(or SQC)

Change-Id: I0fbd4e53f630a267971a23cea6f17d4fef403d15
---
 src/arch/amdgpu/vega/insts/sopp.cc | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/arch/amdgpu/vega/insts/sopp.cc b/src/arch/amdgpu/vega/insts/sopp.cc
index df5cdbf681..781113b204 100644
--- a/src/arch/amdgpu/vega/insts/sopp.cc
+++ b/src/arch/amdgpu/vega/insts/sopp.cc
@@ -669,6 +669,9 @@ namespace VegaISA
     Inst_SOPP__S_ICACHE_INV::Inst_SOPP__S_ICACHE_INV(InFmt_SOPP *iFmt)
         : Inst_SOPP(iFmt, "s_icache_inv")
     {
+        setFlag(MemBarrier);
+        setFlag(GPUStaticInst::MemSync);
+        setFlag(MemSync);
     } // Inst_SOPP__S_ICACHE_INV
 
     Inst_SOPP__S_ICACHE_INV::~Inst_SOPP__S_ICACHE_INV()
@@ -683,7 +686,26 @@ namespace VegaISA
     void
     Inst_SOPP__S_ICACHE_INV::execute(GPUDynInstPtr gpuDynInst)
     {
-        panicUnimplemented();
+        Wavefront *wf = gpuDynInst->wavefront();
+
+        if (gpuDynInst->exec_mask.none()) {
+            wf->decLGKMInstsIssued();
+            return;
+        }
+
+        gpuDynInst->execUnitId = wf->execUnitId;
+        gpuDynInst->latency.init(gpuDynInst->computeUnit());
+        gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
+
+        gpuDynInst->resetEntireStatusVector();
+        gpuDynInst->setStatusVector(0, 1);
+        RequestPtr req = std::make_shared<Request>(0, 0, 0,
+                                                   gpuDynInst->computeUnit()->
+                                                   requestorId(), 0,
+                                                   gpuDynInst->wfDynId);
+        gpuDynInst->setRequestFlags(req);
+        gpuDynInst->computeUnit()->scalarMemoryPipe.
+            injectScalarMemFence(gpuDynInst, false, req);
     } // execute
 
     // --- Inst_SOPP__S_INCPERFLEVEL class methods ---