From 0d3d456894b546bd37450abb7e42f65c2f070871 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Mon, 6 May 2024 07:35:38 -0700 Subject: [PATCH] gpu-compute: Invalidate Scalar cache when SQC invalidates (#1093) The scalar cache is not being invalidated which causes stale data to be left in the scalar cache between GPU kernels. This commit sends invalidates to the scalar cache when the SQC is invalidated. This is a sufficient baseline for simulation. Since the number of invalidates might be larger than the mandatory queue can hold and no flash invalidate mechanism exists in the VIPER protocol, the command line option for the mandatory queue size is removed, which is the same behavior as the SQC. Change-Id: I1723f224711b04caa4c88beccfa8fb73ccf56572 --- configs/ruby/GPU_VIPER.py | 11 +----- src/gpu-compute/compute_unit.cc | 11 ++++++ src/gpu-compute/scalar_memory_pipeline.cc | 44 +++++++++++++++++++---- 3 files changed, 50 insertions(+), 16 deletions(-) diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py index 0818b7f0eb..d0a0c61083 100644 --- a/configs/ruby/GPU_VIPER.py +++ b/configs/ruby/GPU_VIPER.py @@ -497,13 +497,6 @@ def define_options(parser): parser.add_argument( "--noL1", action="store_true", default=False, help="bypassL1" ) - parser.add_argument( - "--scalar-buffer-size", - type=int, - default=128, - help="Size of the mandatory queue in the GPU scalar " - "cache controller", - ) parser.add_argument( "--glc-atomic-latency", type=int, default=1, help="GLC Atomic Latency" ) @@ -841,9 +834,7 @@ def construct_scalars(options, system, ruby_system, network): scalar_cntrl.responseToSQC = MessageBuffer(ordered=True) scalar_cntrl.responseToSQC.in_port = network.out_port - scalar_cntrl.mandatoryQueue = MessageBuffer( - buffer_size=options.scalar_buffer_size - ) + scalar_cntrl.mandatoryQueue = MessageBuffer() return (scalar_sequencers, scalar_cntrl_nodes) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 
daad5e9b40..1589e4564e 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -974,6 +974,14 @@ ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt) bool ComputeUnit::ScalarDataPort::handleResponse(PacketPtr pkt) { + // From scalar cache invalidate that was issued at kernel start. + if (pkt->req->isKernel()) { + delete pkt->senderState; + delete pkt; + + return true; + } + assert(!pkt->req->isKernel()); // retrieve sender state @@ -1058,6 +1066,9 @@ ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt) */ if (sender_state->wavefront != nullptr) { computeUnit->handleSQCReturn(pkt); + } else { + delete pkt->senderState; + delete pkt; } return true; diff --git a/src/gpu-compute/scalar_memory_pipeline.cc b/src/gpu-compute/scalar_memory_pipeline.cc index 54819e7d3f..cd8dd30b00 100644 --- a/src/gpu-compute/scalar_memory_pipeline.cc +++ b/src/gpu-compute/scalar_memory_pipeline.cc @@ -180,7 +180,7 @@ ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst, // invalidation request to 0 and handle it in the sequencer req->setPaddr(0); - PacketPtr pkt = nullptr; + PacketPtr sqc_pkt = nullptr; // If kernelMemSync is true, then the invalidation request is from // kernel launch and is an implicit invalidation.If false, then it is @@ -189,8 +189,8 @@ ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst, req->setCacheCoherenceFlags(Request::INV_L1); req->setReqInstSeqNum(gpuDynInst->seqNum()); req->setFlags(Request::KERNEL); - pkt = new Packet(req, MemCmd::MemSyncReq); - pkt->pushSenderState( + sqc_pkt = new Packet(req, MemCmd::MemSyncReq); + sqc_pkt->pushSenderState( new ComputeUnit::SQCPort::SenderState( gpuDynInst->wavefront(), nullptr)); } else { @@ -198,17 +198,49 @@ ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst, req->setReqInstSeqNum(gpuDynInst->seqNum()); - pkt = new Packet(req, MemCmd::MemSyncReq); - pkt->pushSenderState( + sqc_pkt = new Packet(req, MemCmd::MemSyncReq); + sqc_pkt->pushSenderState( 
new ComputeUnit::SQCPort::SenderState( gpuDynInst->wavefront(), nullptr)); } ComputeUnit::SQCPort::MemReqEvent *sqc_event = new ComputeUnit::SQCPort::MemReqEvent - (computeUnit.sqcPort, pkt); + (computeUnit.sqcPort, sqc_pkt); computeUnit.schedule( sqc_event, curTick() + computeUnit.scalar_req_tick_latency); + + // When the SQC is invalidated, perform a scalar cache invalidate as well. + // The SQC and Scalar cache are implemented using the same SLICC SM, so this + // invalidate is identical to the SQC invalidate, however we need to make + // a new packet and request as they have different cache destinations. + PacketPtr scalar_pkt = nullptr; + RequestPtr scalar_req(req); + + if (kernelMemSync) { + scalar_req->setCacheCoherenceFlags(Request::INV_L1); + scalar_req->setReqInstSeqNum(gpuDynInst->seqNum()); + scalar_req->setFlags(Request::KERNEL); + scalar_pkt = new Packet(scalar_req, MemCmd::MemSyncReq); + scalar_pkt->pushSenderState( + new ComputeUnit::ScalarDataPort::SenderState( + gpuDynInst)); + } else { + gpuDynInst->setRequestFlags(scalar_req); + + scalar_req->setReqInstSeqNum(gpuDynInst->seqNum()); + + scalar_pkt = new Packet(scalar_req, MemCmd::MemSyncReq); + scalar_pkt->pushSenderState( + new ComputeUnit::ScalarDataPort::SenderState( + gpuDynInst)); + } + + ComputeUnit::ScalarDataPort::MemReqEvent *scalar_event = + new ComputeUnit::ScalarDataPort::MemReqEvent + (computeUnit.scalarDataPort, scalar_pkt); + computeUnit.schedule( + scalar_event, curTick() + computeUnit.scalar_req_tick_latency); } } // namespace gem5