From 833392e7b2cd32410405880909ccfb0004b65814 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 15 Mar 2024 17:38:23 -0500 Subject: [PATCH 1/2] mem-ruby,gpu-compute: Allow memory reqs without inst The GPUDynInst for sending memory requests through the CUs data port is required but only used for DPRINTFs. Relax this constraint so that the methods can be reused for requests such as probes generated by the GPU device. Change-Id: I16094e400968225596370b684d6471580888d98a --- src/gpu-compute/compute_unit.cc | 22 +++++++++++++--------- src/mem/ruby/system/GPUCoalescer.cc | 6 +++--- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index e485aa6161..90090a9288 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -1701,16 +1701,20 @@ ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt) } else if (!(sendTimingReq(pkt))) { retries.push_back(std::make_pair(pkt, gpuDynInst)); - DPRINTF(GPUPort, - "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", - compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, - id, pkt->req->getPaddr()); + if (gpuDynInst) { + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, id, pkt->req->getPaddr()); + } } else { - DPRINTF(GPUPort, - "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data " - "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, - gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id, - pkt->req->getPaddr()); + if (gpuDynInst) { + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data" + " req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id, + pkt->req->getPaddr()); + } } } diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index 90d6031c6e..5ee4105597 100644 --- 
a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -669,14 +669,14 @@ GPUCoalescer::getRequestType(PacketPtr pkt) RequestStatus GPUCoalescer::makeRequest(PacketPtr pkt) { - // all packets must have valid instruction sequence numbers - assert(pkt->req->hasInstSeqNum()); - if (pkt->cmd == MemCmd::MemSyncReq) { // issue mem_sync requests immediately to the cache system without // going through uncoalescedTable like normal LD/ST/Atomic requests issueMemSyncRequest(pkt); } else { + // all packets must have valid instruction sequence numbers + assert(pkt->req->hasInstSeqNum()); + // otherwise, this must be either read or write command assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush()); From 1d646694733585d1e39eb21c31fdc0824012c534 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 15 Mar 2024 17:40:42 -0500 Subject: [PATCH 2/2] mem,gpu-compute: Implement GPU TCC directed invalidate The GPU device currently supports large BAR which means that the driver can write directly to GPU memory over the PCI bus without using SDMA or PM4 packets. The gem5 PCI interface only provides an atomic interface for BAR reads/writes, which means the values cannot go through timing mode Ruby caches. This causes bugs as the TCC cache is allowed to keep clean data between kernels for performance reasons. If there is a BAR write directly to memory bypassing the cache, the value in the cache is stale and must be invalidated. In this commit a TCC invalidate is generated for all writes over PCI that go directly to GPU memory. This will also invalidate TCP along the way if necessary. This currently relies on the driver synchronization which only allows BAR writes in between kernels. Therefore, the cache should only be in I or V state. To handle a race condition between invalidates and launching the next kernel, the invalidates return a response and the GPU command processor will wait for all TCC invalidates to be complete before launching the next kernel. 
This fixes issues with stale data in nanoGPT and possibly PENNANT. Change-Id: I8e1290f842122682c271e5508a48037055bfbcdf --- src/dev/amdgpu/amdgpu_device.cc | 6 +++ src/gpu-compute/compute_unit.cc | 36 ++++++++++++++ src/gpu-compute/compute_unit.hh | 9 ++++ src/gpu-compute/gpu_command_processor.cc | 16 ++++++ src/gpu-compute/shader.cc | 23 +++++++++ src/gpu-compute/shader.hh | 12 +++++ src/mem/request.hh | 1 + src/mem/ruby/protocol/GPU_VIPER-TCC.sm | 22 +++++++++ src/mem/ruby/protocol/GPU_VIPER-TCP.sm | 54 +++++++++++++++++++-- src/mem/ruby/protocol/GPU_VIPER-msg.sm | 1 + src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm | 2 + src/mem/ruby/protocol/RubySlicc_Exports.sm | 1 + src/mem/ruby/system/VIPERCoalescer.cc | 51 +++++++++++++++++++ src/mem/ruby/system/VIPERCoalescer.hh | 5 ++ 14 files changed, 236 insertions(+), 3 deletions(-) diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 5ddd7756ba..f5bf0192bc 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -420,6 +420,12 @@ AMDGPUDevice::writeFrame(PacketPtr pkt, Addr offset) { DPRINTF(AMDGPUDevice, "Wrote framebuffer address %#lx\n", offset); + for (auto& cu: CP()->shader()->cuList) { + auto system = CP()->shader()->gpuCmdProc.system(); + Addr aligned_addr = offset & ~(system->cacheLineSize() - 1); + cu->sendInvL2(aligned_addr); + } + Addr aperture = gpuvm.getFrameAperture(offset); Addr aperture_offset = offset - aperture; diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 90090a9288..5daa82e576 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -864,6 +864,25 @@ ComputeUnit::DataPort::handleResponse(PacketPtr pkt) // - kernel end // - non-kernel mem sync + // Non-kernel mem sync not from an instruction + if (!gpuDynInst) { + // If there is no dynamic instruction, a CU must be present. 
+ ComputeUnit *cu = sender_state->computeUnit; + assert(cu != nullptr); + + if (pkt->req->isInvL2()) { + cu->shader->decNumOutstandingInvL2s(); + assert(cu->shader->getNumOutstandingInvL2s() >= 0); + } else { + panic("Unknown MemSyncResp not from an instruction"); + } + + // Cleanup and return, no other response events needed. + delete pkt->senderState; + delete pkt; + return true; + } + // Kernel Launch // wavefront was nullptr when launching kernel, so it is meaningless // here (simdId=-1, wfSlotId=-1) @@ -1403,6 +1422,23 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, } } +void +ComputeUnit::sendInvL2(Addr paddr) +{ + auto req = std::make_shared(paddr, 64, 0, vramRequestorId()); + req->setCacheCoherenceFlags(Request::GL2_CACHE_INV); + + auto pkt = new Packet(req, MemCmd::MemSyncReq); + pkt->pushSenderState( + new ComputeUnit::DataPort::SenderState(this, 0, nullptr)); + + EventFunctionWrapper *mem_req_event = memPort[0].createMemReqEvent(pkt); + + schedule(mem_req_event, curTick() + req_tick_latency); + + shader->incNumOutstandingInvL2s(); +} + void ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt) { diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index cfa145551f..6cdc22ea57 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -474,6 +474,8 @@ class ComputeUnit : public ClockedObject void handleSQCReturn(PacketPtr pkt); + void sendInvL2(Addr paddr); + protected: RequestorID _requestorId; @@ -527,6 +529,7 @@ class ComputeUnit : public ClockedObject struct SenderState : public Packet::SenderState { + ComputeUnit *computeUnit = nullptr; GPUDynInstPtr _gpuDynInst; PortID port_index; Packet::SenderState *saved; @@ -536,6 +539,12 @@ class ComputeUnit : public ClockedObject : _gpuDynInst(gpuDynInst), port_index(_port_index), saved(sender_state) { } + + SenderState(ComputeUnit *cu, PortID _port_index, + Packet::SenderState *sender_state=nullptr) + : computeUnit(cu), + 
port_index(_port_index), + saved(sender_state) { } }; class SystemHubEvent : public Event diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index 5093cc4ff2..2af54a262e 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -41,6 +41,7 @@ #include "debug/GPUKernelInfo.hh" #include "dev/amdgpu/amdgpu_device.hh" #include "gpu-compute/dispatcher.hh" +#include "gpu-compute/shader.hh" #include "mem/abstract_mem.hh" #include "mem/packet_access.hh" #include "mem/se_translating_port_proxy.hh" @@ -126,6 +127,21 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, unsigned akc_alignment_granularity = 64; assert(!(disp_pkt->kernel_object & (akc_alignment_granularity - 1))); + /** + * Make sure there is not a race condition with invalidates in the L2 + * cache. The full system driver may write directly to memory using + * large BAR while the L2 cache is allowed to keep data in the valid + * state between kernel launches. This is a rare event but is required + * for correctness. + */ + if (shader()->getNumOutstandingInvL2s() > 0) { + DPRINTF(GPUCommandProc, + "Deferring kernel launch due to outstanding L2 invalidates\n"); + shader()->addDeferredDispatch(raw_pkt, queue_id, host_pkt_addr); + + return; + } + /** * Need to use a raw pointer for DmaVirtDevice API. This is deleted * in the dispatchKernelObject method. 
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index 6e3d556026..437d590b70 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -552,6 +552,29 @@ Shader::notifyCuSleep() { } } +void +Shader::decNumOutstandingInvL2s() +{ + num_outstanding_invl2s--; + + if (num_outstanding_invl2s == 0 && !deferred_dispatches.empty()) { + for (auto &dispatch : deferred_dispatches) { + gpuCmdProc.submitDispatchPkt(std::get<0>(dispatch), + std::get<1>(dispatch), + std::get<2>(dispatch)); + } + deferred_dispatches.clear(); + } +} + +void +Shader::addDeferredDispatch(void *raw_pkt, uint32_t queue_id, + Addr host_pkt_addr) +{ + deferred_dispatches.push_back( + std::make_tuple(raw_pkt, queue_id, host_pkt_addr)); +} + /** * Forward the VRAM requestor ID needed for device memory from CP. */ diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index 0287ddc169..c68f4d15b6 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ -104,6 +104,11 @@ class Shader : public ClockedObject // Set to true by the dispatcher if the current kernel is a blit kernel bool blitKernel = false; + // Number of pending non-instruction invalidates outstanding. The shader + // should wait for these to be done to ensure correctness. 
+ int num_outstanding_invl2s = 0; + std::vector> deferred_dispatches; + public: typedef ShaderParams Params; enum hsail_mode_e {SIMT,VECTOR_SCALAR}; @@ -330,6 +335,13 @@ class Shader : public ClockedObject blitKernel = is_blit_kernel; } + void decNumOutstandingInvL2s(); + void incNumOutstandingInvL2s() { num_outstanding_invl2s++; }; + int getNumOutstandingInvL2s() const { return num_outstanding_invl2s; }; + + void addDeferredDispatch(void *raw_pkt, uint32_t queue_id, + Addr host_pkt_addr); + protected: struct ShaderStats : public statistics::Group { diff --git a/src/mem/request.hh b/src/mem/request.hh index 783e4212ab..80bd4c817a 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -1096,6 +1096,7 @@ class Request : public Extensible * setting extraFlags should be done via setCacheCoherenceFlags(). */ bool isInvL1() const { return _cacheCoherenceFlags.isSet(INV_L1); } + bool isInvL2() const { return _cacheCoherenceFlags.isSet(GL2_CACHE_INV); } bool isGL2CacheFlush() const diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index 5812eef577..f6ac25be36 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -72,6 +72,7 @@ machine(MachineType:TCC, "TCC Cache") L2_Repl, desc="L2 Replacement"; // Probes PrbInv, desc="Invalidating probe"; + InvCache, desc="Invalidating probe from TCP"; // Coming from Memory Controller WBAck, desc="writethrough ack from memory"; Bypass, desc="Bypass the entire L2 cache"; @@ -413,6 +414,8 @@ machine(MachineType:TCC, "TCC Cache") } } else if (in_msg.Type == CoherenceRequestType:WriteFlush) { trigger(Event:Flush, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:InvCache) { + trigger(Event:InvCache, in_msg.addr, cache_entry, tbe); } else { DPRINTF(RubySlicc, "%s\n", in_msg); error("Unexpected Response Message to Core"); @@ -429,6 +432,19 @@ machine(MachineType:TCC, "TCC Cache") unset_cache_entry(); } + 
action(ir_invL2Resp, "ir", desc="send L2 invalidate ack") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:InvL2Resp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + action(sd_sendData, "sd", desc="send Shared response") { peek(coreRequestNetwork_in, CPURequestMsg) { enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { @@ -1188,6 +1204,12 @@ machine(MachineType:TCC, "TCC Cache") i_invL2; } + transition({I, V}, InvCache, I) {TagArrayRead, TagArrayWrite} { + i_invL2; + ir_invL2Resp; + p_popRequestQueue; + } + transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} { pi_sendProbeResponseInv; pp_popProbeQueue; diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm index 97997a12b5..1ad935324c 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm @@ -75,6 +75,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") Evict, desc="Evict if clean(invL1 for Load Acquire)"; // Mem sys initiated Repl, desc="Replacing block from cache"; + InvL2, desc="Invalidate to L2"; + InvL2Resp, desc="Invalidate L2 completed"; // TCC initiated TCC_Ack, desc="TCC Ack to Core Request"; @@ -286,9 +288,12 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") in_msg.Type == CoherenceResponseType:NBSysWBAck) { trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe); DPRINTF(RubySlicc, "Issuing TCC_AckWB\n"); - } else { - error("Unexpected Response Message to Core"); - } + } else if (in_msg.Type == CoherenceResponseType:InvL2Resp) { + DPRINTF(RubySlicc, "Issuing InvL2Resp\n"); + trigger(Event:InvL2Resp, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } } } } @@ -333,6 +338,8 @@ 
machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") trigger(Event:Flush, in_msg.LineAddress, cache_entry, tbe); } else if (in_msg.Type == RubyRequestType:REPLACEMENT){ trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe); + } else if (in_msg.Type == RubyRequestType:InvL2){ + trigger(Event:InvL2, in_msg.LineAddress, cache_entry, tbe); } else { error("Unexpected Request Message from VIC"); } @@ -609,6 +616,31 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } } + action(il2_invL2, "il2", desc="Invalidate address in L2") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:InvCache; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + peek(mandatoryQueue_in, RubyRequest) { + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; + } + } + } + + action(i2r_invL2Resp, "i2r", desc="Invalidate L2 completed") { + if (use_seq_not_coal) { + DPRINTF(RubySlicc, "Sequencer does not define invTCCCallback!\n"); + assert(false); + } else { + coalescer.invTCCCallback(address); + } + } + action(wd_wtDone, "wd", desc="writethrough done") { if (use_seq_not_coal) { DPRINTF(RubySlicc, "Sequencer does not define writeCompleteCallback!\n"); @@ -830,6 +862,22 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") p_popMandatoryQueue; } + transition(I, InvL2) { + il2_invL2; + p_popMandatoryQueue; + } + + transition(V, InvL2, I) { + ic_invCache + il2_invL2; + p_popMandatoryQueue; + } + + transition(I, InvL2Resp) { + i2r_invL2Resp; + pr_popResponseQueue; + } + // if a line is in IV and a TCC_AckWB comes back, we must have had a WT // store followed by a load. Thus, complete the store without affecting // TBE or line state. 
diff --git a/src/mem/ruby/protocol/GPU_VIPER-msg.sm b/src/mem/ruby/protocol/GPU_VIPER-msg.sm index 9074a86b52..106433f2c5 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-msg.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-msg.sm @@ -62,6 +62,7 @@ structure (VIPERCoalescer, external = "yes") { Cycles, Cycles, Cycles, bool); void atomicCallback(Addr, MachineType, DataBlock); void invTCPCallback(Addr); + void invTCCCallback(Addr); void writeCompleteCallback(Addr, uint64_t); void evictionCallback(Addr); } diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm index b860ff1681..cb5a8c3a95 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm @@ -46,6 +46,7 @@ enumeration(CoherenceRequestType, desc="Coherence Request Types") { WriteThroughFifo, desc="WriteThrough with no data"; WriteThroughDummy, desc="WriteThrough with no data for atomic operation"; WriteFlush, desc="Release Flush"; + InvCache, desc="Invalidate Cache"; WrCancel, desc="want to cancel WB to Memory"; // should this be here? 
@@ -95,6 +96,7 @@ enumeration(CoherenceResponseType, desc="Coherence Response Types") { StaleNotif, desc="Notification of Stale WBAck, No data to writeback"; CPUCancelWB, desc="want to cancel WB to Memory"; MemData, desc="Data from Memory"; + InvL2Resp, desc="Invalidate L2 response"; // for regions PrivateAck, desc="Ack that r-buf received private notify"; diff --git a/src/mem/ruby/protocol/RubySlicc_Exports.sm b/src/mem/ruby/protocol/RubySlicc_Exports.sm index ca44fd3780..5a7324cb72 100644 --- a/src/mem/ruby/protocol/RubySlicc_Exports.sm +++ b/src/mem/ruby/protocol/RubySlicc_Exports.sm @@ -181,6 +181,7 @@ enumeration(RubyRequestType, desc="...", default="RubyRequestType_NULL") { COMMIT, desc="Commit version"; NULL, desc="Invalid request type"; FLUSH, desc="Flush request type"; + InvL2, desc="Invalidate L2"; Release, desc="Release operation"; Acquire, desc="Acquire opertion"; AcquireRelease, desc="Acquire and Release opertion"; diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc index 2adc41b578..47ceced3a7 100644 --- a/src/mem/ruby/system/VIPERCoalescer.cc +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -80,6 +80,7 @@ VIPERCoalescer::makeRequest(PacketPtr pkt) // VIPER does not expect MemSyncReq & Release since compute unit // does not specify an equivalent type of memory request. 
assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) || + (pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL2()) || pkt->cmd == MemCmd::ReadReq || pkt->cmd == MemCmd::WriteReq || pkt->cmd == MemCmd::FlushReq || @@ -106,6 +107,10 @@ VIPERCoalescer::makeRequest(PacketPtr pkt) invTCP(); } + if (pkt->req->isInvL2()) { + invTCC(pkt); + } + return RequestStatus_Issued; } @@ -306,5 +311,51 @@ VIPERCoalescer::invTCP() m_num_pending_invs); } +void +VIPERCoalescer::invTCCCallback(Addr addr) +{ + for (auto& pkt : m_pending_invl2s[addr]) { + RubyPort::SenderState *ss = + safe_cast(pkt->senderState); + MemResponsePort *port = ss->port; + assert(port != nullptr); + + // Now convert to MemSyncResp + pkt->makeResponse(); + + pkt->senderState = ss->predecessor; + delete ss; + port->hitCallback(pkt); + } + m_pending_invl2s.erase(addr); +} + +/* + * Send an invalidate to a specific address in the TCC. + */ +void +VIPERCoalescer::invTCC(PacketPtr pkt) +{ + assert(pkt); + assert(pkt->req); + + Addr addr = pkt->req->getPaddr(); + RubyRequestType request_type = RubyRequestType_InvL2; + + std::shared_ptr msg = std::make_shared( + clockEdge(), addr, 0, 0, + request_type, RubyAccessMode_Supervisor, + nullptr); + + DPRINTF(GPUCoalescer, "Sending L2 invalidate to 0x%x\n", addr); + + assert(m_mandatory_q_ptr); + Tick latency = cyclesToTicks( + m_controller->mandatoryQueueLatency(request_type)); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); + + m_pending_invl2s[addr].push_back(pkt); +} + } // namespace ruby } // namespace gem5 diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh index c7e21e946b..3054fc399c 100644 --- a/src/mem/ruby/system/VIPERCoalescer.hh +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -63,11 +63,13 @@ class VIPERCoalescer : public GPUCoalescer ~VIPERCoalescer(); void writeCompleteCallback(Addr address, uint64_t instSeqNum); void invTCPCallback(Addr address); + void invTCCCallback(Addr address); RequestStatus 
makeRequest(PacketPtr pkt) override; void issueRequest(CoalescedRequest* crequest) override; private: void invTCP(); + void invTCC(PacketPtr pkt); // make write-complete response packets from original write request packets void makeWriteCompletePkts(CoalescedRequest* crequest); @@ -79,6 +81,9 @@ class VIPERCoalescer : public GPUCoalescer // number of remaining cache lines to be invalidated in TCP int m_num_pending_invs; + // outstanding L2 invalidate packets + std::unordered_map> m_pending_invl2s; + // a map of instruction sequence number and corresponding pending // write-complete response packets. Each write-complete response // corresponds to a pending store request that is waiting for