gpu-compute: Invalidate Scalar cache when SQC invalidates (#1093)

The scalar cache is not being invalidated, which causes stale data to be
left in the scalar cache between GPU kernels. This commit sends
invalidates to the scalar cache when the SQC is invalidated. This is a
sufficient baseline for simulation.

Since the number of invalidates might be larger than the mandatory queue
can hold and no flash invalidate mechanism exists in the VIPER protocol,
the command line option for the mandatory queue size is removed, which
is the same behavior as the SQC.

Change-Id: I1723f224711b04caa4c88beccfa8fb73ccf56572
This commit is contained in:
Matthew Poremba
2024-05-06 07:35:38 -07:00
committed by GitHub
parent 36c1ea9c61
commit 0d3d456894
3 changed files with 50 additions and 16 deletions

View File

@@ -497,13 +497,6 @@ def define_options(parser):
parser.add_argument(
"--noL1", action="store_true", default=False, help="bypassL1"
)
parser.add_argument(
"--scalar-buffer-size",
type=int,
default=128,
help="Size of the mandatory queue in the GPU scalar "
"cache controller",
)
parser.add_argument(
"--glc-atomic-latency", type=int, default=1, help="GLC Atomic Latency"
)
@@ -841,9 +834,7 @@ def construct_scalars(options, system, ruby_system, network):
scalar_cntrl.responseToSQC = MessageBuffer(ordered=True)
scalar_cntrl.responseToSQC.in_port = network.out_port
scalar_cntrl.mandatoryQueue = MessageBuffer(
buffer_size=options.scalar_buffer_size
)
scalar_cntrl.mandatoryQueue = MessageBuffer()
return (scalar_sequencers, scalar_cntrl_nodes)

View File

@@ -974,6 +974,14 @@ ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
bool
ComputeUnit::ScalarDataPort::handleResponse(PacketPtr pkt)
{
// From scalar cache invalidate that was issued at kernel start.
if (pkt->req->isKernel()) {
delete pkt->senderState;
delete pkt;
return true;
}
assert(!pkt->req->isKernel());
// retrieve sender state
@@ -1058,6 +1066,9 @@ ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
*/
if (sender_state->wavefront != nullptr) {
computeUnit->handleSQCReturn(pkt);
} else {
delete pkt->senderState;
delete pkt;
}
return true;

View File

@@ -180,7 +180,7 @@ ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst,
// invalidation request to 0 and handle it in the sequencer
req->setPaddr(0);
PacketPtr pkt = nullptr;
PacketPtr sqc_pkt = nullptr;
// If kernelMemSync is true, then the invalidation request is from
// kernel launch and is an implicit invalidation. If false, then it is
@@ -189,8 +189,8 @@ ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst,
req->setCacheCoherenceFlags(Request::INV_L1);
req->setReqInstSeqNum(gpuDynInst->seqNum());
req->setFlags(Request::KERNEL);
pkt = new Packet(req, MemCmd::MemSyncReq);
pkt->pushSenderState(
sqc_pkt = new Packet(req, MemCmd::MemSyncReq);
sqc_pkt->pushSenderState(
new ComputeUnit::SQCPort::SenderState(
gpuDynInst->wavefront(), nullptr));
} else {
@@ -198,17 +198,49 @@ ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst,
req->setReqInstSeqNum(gpuDynInst->seqNum());
pkt = new Packet(req, MemCmd::MemSyncReq);
pkt->pushSenderState(
sqc_pkt = new Packet(req, MemCmd::MemSyncReq);
sqc_pkt->pushSenderState(
new ComputeUnit::SQCPort::SenderState(
gpuDynInst->wavefront(), nullptr));
}
ComputeUnit::SQCPort::MemReqEvent *sqc_event =
new ComputeUnit::SQCPort::MemReqEvent
(computeUnit.sqcPort, pkt);
(computeUnit.sqcPort, sqc_pkt);
computeUnit.schedule(
sqc_event, curTick() + computeUnit.scalar_req_tick_latency);
// When the SQC is invalidated, perform a scalar cache invalidate as well.
// The SQC and Scalar cache are implemented using the same SLICC SM, so this
// invalidate is identical to the SQC invalidate, however we need to make
// a new packet and request as they have different cache destinations.
PacketPtr scalar_pkt = nullptr;
RequestPtr scalar_req(req);
if (kernelMemSync) {
scalar_req->setCacheCoherenceFlags(Request::INV_L1);
scalar_req->setReqInstSeqNum(gpuDynInst->seqNum());
scalar_req->setFlags(Request::KERNEL);
scalar_pkt = new Packet(scalar_req, MemCmd::MemSyncReq);
scalar_pkt->pushSenderState(
new ComputeUnit::ScalarDataPort::SenderState(
gpuDynInst));
} else {
gpuDynInst->setRequestFlags(scalar_req);
scalar_req->setReqInstSeqNum(gpuDynInst->seqNum());
scalar_pkt = new Packet(scalar_req, MemCmd::MemSyncReq);
scalar_pkt->pushSenderState(
new ComputeUnit::ScalarDataPort::SenderState(
gpuDynInst));
}
ComputeUnit::ScalarDataPort::MemReqEvent *scalar_event =
new ComputeUnit::ScalarDataPort::MemReqEvent
(computeUnit.scalarDataPort, scalar_pkt);
computeUnit.schedule(
scalar_event, curTick() + computeUnit.scalar_req_tick_latency);
}
} // namespace gem5