diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 5ddd7756ba..f5bf0192bc 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -420,6 +420,12 @@ AMDGPUDevice::writeFrame(PacketPtr pkt, Addr offset) { DPRINTF(AMDGPUDevice, "Wrote framebuffer address %#lx\n", offset); + for (auto& cu: CP()->shader()->cuList) { + auto system = CP()->shader()->gpuCmdProc.system(); + Addr aligned_addr = offset & ~(system->cacheLineSize() - 1); + cu->sendInvL2(aligned_addr); + } + Addr aperture = gpuvm.getFrameAperture(offset); Addr aperture_offset = offset - aperture; diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index e485aa6161..5daa82e576 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -864,6 +864,25 @@ ComputeUnit::DataPort::handleResponse(PacketPtr pkt) // - kernel end // - non-kernel mem sync + // Non-kernel mem sync not from an instruction + if (!gpuDynInst) { + // If there is no dynamic instruction, a CU must be present. + ComputeUnit *cu = sender_state->computeUnit; + assert(cu != nullptr); + + if (pkt->req->isInvL2()) { + cu->shader->decNumOutstandingInvL2s(); + assert(cu->shader->getNumOutstandingInvL2s() >= 0); + } else { + panic("Unknown MemSyncResp not from an instruction"); + } + + // Cleanup and return, no other response events needed. 
+ delete pkt->senderState; + delete pkt; + return true; + } + // Kernel Launch // wavefront was nullptr when launching kernel, so it is meaningless // here (simdId=-1, wfSlotId=-1) @@ -1403,6 +1422,23 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, } } +void +ComputeUnit::sendInvL2(Addr paddr) +{ + auto req = std::make_shared<Request>(paddr, 64, 0, vramRequestorId()); + req->setCacheCoherenceFlags(Request::GL2_CACHE_INV); + + auto pkt = new Packet(req, MemCmd::MemSyncReq); + pkt->pushSenderState( + new ComputeUnit::DataPort::SenderState(this, 0, nullptr)); + + EventFunctionWrapper *mem_req_event = memPort[0].createMemReqEvent(pkt); + + schedule(mem_req_event, curTick() + req_tick_latency); + + shader->incNumOutstandingInvL2s(); +} + void ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt) { @@ -1701,16 +1737,20 @@ ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt) } else if (!(sendTimingReq(pkt))) { retries.push_back(std::make_pair(pkt, gpuDynInst)); - DPRINTF(GPUPort, - "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", - compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, - id, pkt->req->getPaddr()); + if (gpuDynInst) { + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, id, pkt->req->getPaddr()); + } } else { - DPRINTF(GPUPort, - "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data " - "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, - gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id, - pkt->req->getPaddr()); + if (gpuDynInst) { + DPRINTF(GPUPort, + "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data" + " req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id, + pkt->req->getPaddr()); + } } } diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index cfa145551f..6cdc22ea57 100644 --- a/src/gpu-compute/compute_unit.hh +++ 
b/src/gpu-compute/compute_unit.hh @@ -474,6 +474,8 @@ class ComputeUnit : public ClockedObject void handleSQCReturn(PacketPtr pkt); + void sendInvL2(Addr paddr); + protected: RequestorID _requestorId; @@ -527,6 +529,7 @@ class ComputeUnit : public ClockedObject struct SenderState : public Packet::SenderState { + ComputeUnit *computeUnit = nullptr; GPUDynInstPtr _gpuDynInst; PortID port_index; Packet::SenderState *saved; @@ -536,6 +539,12 @@ class ComputeUnit : public ClockedObject : _gpuDynInst(gpuDynInst), port_index(_port_index), saved(sender_state) { } + + SenderState(ComputeUnit *cu, PortID _port_index, + Packet::SenderState *sender_state=nullptr) + : computeUnit(cu), + port_index(_port_index), + saved(sender_state) { } }; class SystemHubEvent : public Event diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index 5093cc4ff2..2af54a262e 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -41,6 +41,7 @@ #include "debug/GPUKernelInfo.hh" #include "dev/amdgpu/amdgpu_device.hh" #include "gpu-compute/dispatcher.hh" +#include "gpu-compute/shader.hh" #include "mem/abstract_mem.hh" #include "mem/packet_access.hh" #include "mem/se_translating_port_proxy.hh" @@ -126,6 +127,21 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id, unsigned akc_alignment_granularity = 64; assert(!(disp_pkt->kernel_object & (akc_alignment_granularity - 1))); + /** + * Make sure there is not a race condition with invalidates in the L2 + * cache. The full system driver may write directly to memory using + * large BAR while the L2 cache is allowed to keep data in the valid + * state between kernel launches. This is a rare event but is required + * for correctness. 
+ */ + if (shader()->getNumOutstandingInvL2s() > 0) { + DPRINTF(GPUCommandProc, + "Deferring kernel launch due to outstanding L2 invalidates\n"); + shader()->addDeferredDispatch(raw_pkt, queue_id, host_pkt_addr); + + return; + } + /** * Need to use a raw pointer for DmaVirtDevice API. This is deleted * in the dispatchKernelObject method. diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index 6e3d556026..437d590b70 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -552,6 +552,29 @@ Shader::notifyCuSleep() { } } +void +Shader::decNumOutstandingInvL2s() +{ + num_outstanding_invl2s--; + + if (num_outstanding_invl2s == 0 && !deferred_dispatches.empty()) { + for (auto &dispatch : deferred_dispatches) { + gpuCmdProc.submitDispatchPkt(std::get<0>(dispatch), + std::get<1>(dispatch), + std::get<2>(dispatch)); + } + deferred_dispatches.clear(); + } +} + +void +Shader::addDeferredDispatch(void *raw_pkt, uint32_t queue_id, + Addr host_pkt_addr) +{ + deferred_dispatches.push_back( + std::make_tuple(raw_pkt, queue_id, host_pkt_addr)); +} + /** * Forward the VRAM requestor ID needed for device memory from CP. */ diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index 0287ddc169..c68f4d15b6 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ -104,6 +104,11 @@ class Shader : public ClockedObject // Set to true by the dispatcher if the current kernel is a blit kernel bool blitKernel = false; + // Number of pending non-instruction invalidates outstanding. The shader + // should wait for these to be done to ensure correctness. 
+ int num_outstanding_invl2s = 0; + std::vector<std::tuple<void *, uint32_t, Addr>> deferred_dispatches; + public: typedef ShaderParams Params; enum hsail_mode_e {SIMT,VECTOR_SCALAR}; @@ -330,6 +335,13 @@ class Shader : public ClockedObject blitKernel = is_blit_kernel; } + void decNumOutstandingInvL2s(); + void incNumOutstandingInvL2s() { num_outstanding_invl2s++; }; + int getNumOutstandingInvL2s() const { return num_outstanding_invl2s; }; + + void addDeferredDispatch(void *raw_pkt, uint32_t queue_id, + Addr host_pkt_addr); + protected: struct ShaderStats : public statistics::Group { diff --git a/src/mem/request.hh b/src/mem/request.hh index 783e4212ab..80bd4c817a 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -1096,6 +1096,7 @@ class Request : public Extensible<Request> * setting extraFlags should be done via setCacheCoherenceFlags(). */ bool isInvL1() const { return _cacheCoherenceFlags.isSet(INV_L1); } + bool isInvL2() const { return _cacheCoherenceFlags.isSet(GL2_CACHE_INV); } bool isGL2CacheFlush() const diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index 5812eef577..f6ac25be36 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -72,6 +72,7 @@ machine(MachineType:TCC, "TCC Cache") L2_Repl, desc="L2 Replacement"; // Probes PrbInv, desc="Invalidating probe"; + InvCache, desc="Invalidating probe from TCP"; // Coming from Memory Controller WBAck, desc="writethrough ack from memory"; Bypass, desc="Bypass the entire L2 cache"; @@ -413,6 +414,8 @@ machine(MachineType:TCC, "TCC Cache") } } else if (in_msg.Type == CoherenceRequestType:WriteFlush) { trigger(Event:Flush, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:InvCache) { + trigger(Event:InvCache, in_msg.addr, cache_entry, tbe); } else { DPRINTF(RubySlicc, "%s\n", in_msg); error("Unexpected Response Message to Core"); } @@ -429,6 +432,19 @@ machine(MachineType:TCC, "TCC Cache") unset_cache_entry(); } + 
action(ir_invL2Resp, "ir", desc="send L2 invalidate ack") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:InvL2Resp; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := MessageSizeType:Response_Control; + DPRINTF(RubySlicc, "%s\n", out_msg); + } + } + } + action(sd_sendData, "sd", desc="send Shared response") { peek(coreRequestNetwork_in, CPURequestMsg) { enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { @@ -1188,6 +1204,12 @@ machine(MachineType:TCC, "TCC Cache") i_invL2; } + transition({I, V}, InvCache, I) {TagArrayRead, TagArrayWrite} { + i_invL2; + ir_invL2Resp; + p_popRequestQueue; + } + transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} { pi_sendProbeResponseInv; pp_popProbeQueue; diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm index 97997a12b5..1ad935324c 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm @@ -75,6 +75,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") Evict, desc="Evict if clean(invL1 for Load Acquire)"; // Mem sys initiated Repl, desc="Replacing block from cache"; + InvL2, desc="Invalidate to L2"; + InvL2Resp, desc="Invalidate L2 completed"; // TCC initiated TCC_Ack, desc="TCC Ack to Core Request"; @@ -286,9 +288,12 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") in_msg.Type == CoherenceResponseType:NBSysWBAck) { trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe); DPRINTF(RubySlicc, "Issuing TCC_AckWB\n"); - } else { - error("Unexpected Response Message to Core"); - } + } else if (in_msg.Type == CoherenceResponseType:InvL2Resp) { + DPRINTF(RubySlicc, "Issuing InvL2Resp\n"); + trigger(Event:InvL2Resp, in_msg.addr, cache_entry, tbe); + } else { + error("Unexpected Response Message to Core"); + } } } } @@ -333,6 +338,8 @@ 
machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") trigger(Event:Flush, in_msg.LineAddress, cache_entry, tbe); } else if (in_msg.Type == RubyRequestType:REPLACEMENT){ trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe); + } else if (in_msg.Type == RubyRequestType:InvL2){ + trigger(Event:InvL2, in_msg.LineAddress, cache_entry, tbe); } else { error("Unexpected Request Message from VIC"); } @@ -609,6 +616,31 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } } + action(il2_invL2, "il2", desc="Invalidate address in L2") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:InvCache; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.InitialRequestTime := curCycle(); + peek(mandatoryQueue_in, RubyRequest) { + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; + } + } + } + + action(i2r_invL2Resp, "i2r", desc="Invalidate L2 completed") { + if (use_seq_not_coal) { + DPRINTF(RubySlicc, "Sequencer does not define invTCCCallback!\n"); + assert(false); + } else { + coalescer.invTCCCallback(address); + } + } + action(wd_wtDone, "wd", desc="writethrough done") { if (use_seq_not_coal) { DPRINTF(RubySlicc, "Sequencer does not define writeCompleteCallback!\n"); @@ -830,6 +862,22 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") p_popMandatoryQueue; } + transition(I, InvL2) { + il2_invL2; + p_popMandatoryQueue; + } + + transition(V, InvL2, I) { + ic_invCache; + il2_invL2; + p_popMandatoryQueue; + } + + transition(I, InvL2Resp) { + i2r_invL2Resp; + pr_popResponseQueue; + } + // if a line is in IV and a TCC_AckWB comes back, we must have had a WT // store followed by a load. Thus, complete the store without affecting // TBE or line state. 
diff --git a/src/mem/ruby/protocol/GPU_VIPER-msg.sm b/src/mem/ruby/protocol/GPU_VIPER-msg.sm index 9074a86b52..106433f2c5 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-msg.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-msg.sm @@ -62,6 +62,7 @@ structure (VIPERCoalescer, external = "yes") { Cycles, Cycles, Cycles, bool); void atomicCallback(Addr, MachineType, DataBlock); void invTCPCallback(Addr); + void invTCCCallback(Addr); void writeCompleteCallback(Addr, uint64_t); void evictionCallback(Addr); } diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm index b860ff1681..cb5a8c3a95 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm @@ -46,6 +46,7 @@ enumeration(CoherenceRequestType, desc="Coherence Request Types") { WriteThroughFifo, desc="WriteThrough with no data"; WriteThroughDummy, desc="WriteThrough with no data for atomic operation"; WriteFlush, desc="Release Flush"; + InvCache, desc="Invalidate Cache"; WrCancel, desc="want to cancel WB to Memory"; // should this be here? 
@@ -95,6 +96,7 @@ enumeration(CoherenceResponseType, desc="Coherence Response Types") { StaleNotif, desc="Notification of Stale WBAck, No data to writeback"; CPUCancelWB, desc="want to cancel WB to Memory"; MemData, desc="Data from Memory"; + InvL2Resp, desc="Invalidate L2 response"; // for regions PrivateAck, desc="Ack that r-buf received private notify"; diff --git a/src/mem/ruby/protocol/RubySlicc_Exports.sm b/src/mem/ruby/protocol/RubySlicc_Exports.sm index ca44fd3780..5a7324cb72 100644 --- a/src/mem/ruby/protocol/RubySlicc_Exports.sm +++ b/src/mem/ruby/protocol/RubySlicc_Exports.sm @@ -181,6 +181,7 @@ enumeration(RubyRequestType, desc="...", default="RubyRequestType_NULL") { COMMIT, desc="Commit version"; NULL, desc="Invalid request type"; FLUSH, desc="Flush request type"; + InvL2, desc="Invalidate L2"; Release, desc="Release operation"; Acquire, desc="Acquire opertion"; AcquireRelease, desc="Acquire and Release opertion"; diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index 90d6031c6e..5ee4105597 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -669,14 +669,14 @@ GPUCoalescer::getRequestType(PacketPtr pkt) RequestStatus GPUCoalescer::makeRequest(PacketPtr pkt) { - // all packets must have valid instruction sequence numbers - assert(pkt->req->hasInstSeqNum()); - if (pkt->cmd == MemCmd::MemSyncReq) { // issue mem_sync requests immediately to the cache system without // going through uncoalescedTable like normal LD/ST/Atomic requests issueMemSyncRequest(pkt); } else { + // all packets must have valid instruction sequence numbers + assert(pkt->req->hasInstSeqNum()); + // otherwise, this must be either read or write command assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush()); diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc index 2adc41b578..47ceced3a7 100644 --- a/src/mem/ruby/system/VIPERCoalescer.cc +++ 
b/src/mem/ruby/system/VIPERCoalescer.cc @@ -80,6 +80,7 @@ VIPERCoalescer::makeRequest(PacketPtr pkt) // VIPER does not expect MemSyncReq & Release since compute unit // does not specify an equivalent type of memory request. assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) || + (pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL2()) || pkt->cmd == MemCmd::ReadReq || pkt->cmd == MemCmd::WriteReq || pkt->cmd == MemCmd::FlushReq || @@ -106,6 +107,10 @@ VIPERCoalescer::makeRequest(PacketPtr pkt) invTCP(); } + if (pkt->req->isInvL2()) { + invTCC(pkt); + } + return RequestStatus_Issued; } @@ -306,5 +311,51 @@ VIPERCoalescer::invTCP() m_num_pending_invs); } +void +VIPERCoalescer::invTCCCallback(Addr addr) +{ + for (auto& pkt : m_pending_invl2s[addr]) { + RubyPort::SenderState *ss = + safe_cast<RubyPort::SenderState *>(pkt->senderState); + MemResponsePort *port = ss->port; + assert(port != nullptr); + + // Now convert to MemSyncResp + pkt->makeResponse(); + + pkt->senderState = ss->predecessor; + delete ss; + port->hitCallback(pkt); + } + m_pending_invl2s.erase(addr); +} + +/* + * Send an invalidate to a specific address in the TCC. 
+ */ +void +VIPERCoalescer::invTCC(PacketPtr pkt) +{ + assert(pkt); + assert(pkt->req); + + Addr addr = pkt->req->getPaddr(); + RubyRequestType request_type = RubyRequestType_InvL2; + + std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, 0, 0, + request_type, RubyAccessMode_Supervisor, + nullptr); + + DPRINTF(GPUCoalescer, "Sending L2 invalidate to 0x%x\n", addr); + + assert(m_mandatory_q_ptr); + Tick latency = cyclesToTicks( + m_controller->mandatoryQueueLatency(request_type)); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); + + m_pending_invl2s[addr].push_back(pkt); +} + } // namespace ruby } // namespace gem5 diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh index c7e21e946b..3054fc399c 100644 --- a/src/mem/ruby/system/VIPERCoalescer.hh +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -63,11 +63,13 @@ class VIPERCoalescer : public GPUCoalescer ~VIPERCoalescer(); void writeCompleteCallback(Addr address, uint64_t instSeqNum); void invTCPCallback(Addr address); + void invTCCCallback(Addr address); RequestStatus makeRequest(PacketPtr pkt) override; void issueRequest(CoalescedRequest* crequest) override; private: void invTCP(); + void invTCC(PacketPtr pkt); // make write-complete response packets from original write request packets void makeWriteCompletePkts(CoalescedRequest* crequest); @@ -79,6 +81,9 @@ class VIPERCoalescer : public GPUCoalescer // number of remaining cache lines to be invalidated in TCP int m_num_pending_invs; + // outstanding L2 invalidate packets + std::unordered_map<Addr, std::vector<PacketPtr>> m_pending_invl2s; + // a map of instruction sequence number and corresponding pending // write-complete response packets. Each write-complete response // corresponds to a pending store request that is waiting for