diff --git a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm index 000cf5b1e6..bdc5d73f20 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm @@ -48,6 +48,9 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") { state_declaration(State, desc="SQC Cache States", default="SQC_State_I") { I, AccessPermission:Invalid, desc="Invalid"; + // Note: currently IV in the TCP is only for pending loads to a given cache + // line. Since the SQC is read only, there are no stores. + IV, AccessPermission:Invalid, desc="Going from I to V, waiting on TCC data"; V, AccessPermission:Read_Only, desc="Valid"; } @@ -98,6 +101,7 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") void unset_tbe(); void wakeUpAllBuffers(); void wakeUpBuffers(Addr a); + void wakeUpAllBuffers(Addr a); Cycles curCycle(); // Internal functions @@ -270,6 +274,21 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") } } + action(t_allocateTBE, "t", desc="allocate TBE Entry") { + check_allocate(TBEs); + TBEs.allocate(address); + set_tbe(TBEs.lookup(address)); + } + + action(d_deallocateTBE, "d", desc="Deallocate TBE") { + TBEs.deallocate(address); + unset_tbe(); + } + + action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") { + stall_and_wait(mandatoryQueue_in, address); + } + action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { mandatoryQueue_in.dequeue(clockEdge()); } @@ -278,6 +297,10 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") responseToSQC_in.dequeue(clockEdge()); } + action(wada_wakeUpAllDependentsAddr, "wada", desc="Wake up any requests waiting for this address") { + wakeUpAllBuffers(address); + } + action(l_loadDoneHit, "ldh", desc="local load done (hits in SQC)") { assert(is_valid(cache_entry)); sequencer.readCallback(address, cache_entry.DataBlk, true, MachineType:L1Cache); @@ -313,22 +336,52 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") // Transitions + // if another request arrives for 
the same cache line that has a pending + load, put it on the wakeup buffer. This reduces resource contention since + they won't try again every cycle and will instead only try again once woken + up + transition(IV, {Fetch}) { + st_stallAndWaitRequest; + } + // transitions from base - transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} { + transition({I, IV, V}, Repl, I) {TagArrayRead, TagArrayWrite} { // since we're evicting something, don't bother classifying as hit/miss ic_invCache; } - transition(I, Data, V) {TagArrayRead, TagArrayWrite, DataArrayRead} { + // if we got a response for a load where the line is in I, then + // another request must have come in that replaced the line in question in + // the cache. Thus, complete this request without allocating the line, but + // still deallocate TBE and wakeup any dependent addresses. + transition(I, Data) {TagArrayRead, TagArrayWrite, DataArrayRead} { + // don't profile this as a hit/miss since it's a response from L2, + // so we already counted it + l_loadDoneMiss; + wada_wakeUpAllDependentsAddr; + d_deallocateTBE; + pr_popResponseQueue; + } + + // if line is currently in IV, then Data is returning the data for a + // pending load, so transition to V, deallocate TBE, and wakeup any dependent + // requests so they will be replayed now that this request has returned. 
+ transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayRead} { a_allocate; // don't profile this as a hit/miss since it's a response from L2, // so we already counted it w_writeCache; l_loadDoneMiss; + wada_wakeUpAllDependentsAddr; + d_deallocateTBE; pr_popResponseQueue; } - transition(I, Fetch) {TagArrayRead, TagArrayWrite} { + // if we have a load that misses, allocate TBE entry and transition to IV + // to prevent subsequent requests to same cache line from also going to TCC + // while this request is pending + transition(I, Fetch, IV) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; nS_issueRdBlkS; uu_profileDataMiss; // since line wasn't in SQC, we missed p_popMandatoryQueue; diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index 7cb3a00e26..14c9c8c1cc 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -61,6 +61,7 @@ machine(MachineType:TCC, "TCC Cache") WrVicBlk, desc="L1 Write Through"; WrVicBlkBack, desc="L1 Write Through(dirty cache)"; WrVicBlkEvict, desc="L1 Write Through(dirty cache) and evict"; + AtomicWait, desc="Atomic Op that must wait for pending loads"; Atomic, desc="Atomic Op"; AtomicPassOn, desc="Atomic Op Passed on to Directory"; AtomicDone, desc="AtomicOps Complete"; @@ -113,6 +114,7 @@ machine(MachineType:TCC, "TCC Cache") bool Shared, desc="Victim hit by shared probe"; MachineID From, desc="Waiting for writeback from..."; NetDest Destination, desc="Data destination"; + int numPending, desc="num pending requests"; int numPendingDirectoryAtomics, desc="number of pending atomics to be performed in directory"; int atomicDoneCnt, desc="number AtomicDones triggered"; bool isGLCSet, desc="Bypass L1 Cache"; @@ -293,11 +295,14 @@ machine(MachineType:TCC, "TCC Cache") peek(responseFromNB_in, ResponseMsg, block_on="addr") { TBE tbe := TBEs.lookup(in_msg.addr); Entry cache_entry := getCacheEntry(in_msg.addr); - bool is_slc_set := false; - - if 
(!is_invalid(tbe)) { - is_slc_set := tbe.isSLCSet; - } + /* + MOESI_AMD_Base-dir acts as the directory, and it always passes + SLC information back to L2 because of races at L2 with requests + from different CUs sending requests to same cache line in parallel. + If these requests have different GLC/SLC settings, the L2 TBE may + not have the correct GLC/SLC information for a given request. + */ + bool is_slc_set := in_msg.isSLCSet; // Whether the SLC bit is set or not, WB acks should invoke the // WBAck event. For cases where a read response will follow a @@ -372,16 +377,29 @@ machine(MachineType:TCC, "TCC Cache") } else if (in_msg.Type == CoherenceRequestType:Atomic || in_msg.Type == CoherenceRequestType:AtomicReturn || in_msg.Type == CoherenceRequestType:AtomicNoReturn) { - // If the request is system-level, if the address isn't in the cache, - // or if this cache is write-through, then send the request to the - // directory. Since non-SLC atomics won't be performed by the directory, - // TCC will perform the atomic on the return path on Event:Data. - // The action will invalidate the cache line if SLC is set and the address is - // in the cache. - if(in_msg.isSLCSet || !WB) { - trigger(Event:AtomicPassOn, in_msg.addr, cache_entry, tbe); + /* + If there are pending requests for this line already and those + requests are not atomics, because we can't easily differentiate + between different request types on return and because decrementing + the atomic count assumes all returned requests in the A state are + atomics, we will need to put this atomic to sleep and wake it up + when the loads return. 
+ */ + if (is_valid(tbe) && (tbe.numPending > 0) && + (tbe.numPendingDirectoryAtomics == 0)) { + trigger(Event:AtomicWait, in_msg.addr, cache_entry, tbe); } else { - trigger(Event:Atomic, in_msg.addr, cache_entry, tbe); + // If the request is system-level, if the address isn't in the cache, + // or if this cache is write-through, then send the request to the + // directory. Since non-SLC atomics won't be performed by the directory, + // TCC will perform the atomic on the return path on Event:Data. + // The action will invalidate the cache line if SLC is set and the address is + // in the cache. + if(in_msg.isSLCSet || !WB) { + trigger(Event:AtomicPassOn, in_msg.addr, cache_entry, tbe); + } else { + trigger(Event:Atomic, in_msg.addr, cache_entry, tbe); + } } } else if (in_msg.Type == CoherenceRequestType:RdBlk) { if (in_msg.isSLCSet) { @@ -433,24 +451,35 @@ machine(MachineType:TCC, "TCC Cache") out_msg.addr := address; out_msg.Type := CoherenceResponseType:TDSysResp; out_msg.Sender := machineID; - out_msg.Destination := tbe.Destination; - out_msg.DataBlk := cache_entry.DataBlk; out_msg.MessageSize := MessageSizeType:Response_Data; out_msg.Dirty := false; out_msg.State := CoherenceState:Shared; - DPRINTF(RubySlicc, "%s\n", out_msg); peek(responseFromNB_in, ResponseMsg) { - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; + // if line state is Invalid, then we must be doing the transition(I, Data) + // so use the DataBlk from the incoming message + if ((getAccessPermission(address) == AccessPermission:NotPresent) || + (getAccessPermission(address) == AccessPermission:Invalid)) { + out_msg.DataBlk := in_msg.DataBlk; + } else { + out_msg.DataBlk := cache_entry.DataBlk; + } + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; + // reuse CURequestor field to allow multiple concurrent loads and + // track where they should go back to (since TBE can't distinguish + // destinations) + out_msg.Destination.clear(); + 
out_msg.Destination.add(in_msg.CURequestor); } + DPRINTF(RubySlicc, "%s\n", out_msg); } enqueue(unblockToNB_out, UnblockMsg, 1) { out_msg.addr := address; out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); out_msg.MessageSize := MessageSizeType:Unblock_Control; peek(responseFromNB_in, ResponseMsg) { - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; } DPRINTF(RubySlicc, "%s\n", out_msg); } @@ -462,13 +491,17 @@ machine(MachineType:TCC, "TCC Cache") out_msg.addr := address; out_msg.Type := CoherenceResponseType:TDSysResp; out_msg.Sender := machineID; - out_msg.Destination := tbe.Destination; + // reuse CURequestor field to allow multiple concurrent loads and + // track where they should go back to (since TBE can't distinguish + // destinations) + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.CURequestor); out_msg.DataBlk := in_msg.DataBlk; out_msg.MessageSize := MessageSizeType:Response_Data; out_msg.Dirty := false; out_msg.State := CoherenceState:Shared; - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; DPRINTF(RubySlicc, "%s\n", out_msg); } enqueue(unblockToNB_out, UnblockMsg, 1) { @@ -481,19 +514,25 @@ machine(MachineType:TCC, "TCC Cache") } action(rd_requestData, "r", desc="Miss in L2, pass on") { - if(tbe.Destination.count()==1){ - peek(coreRequestNetwork_in, CPURequestMsg) { - enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { - out_msg.addr := address; - out_msg.Type := in_msg.Type; - out_msg.Requestor := machineID; - out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); - out_msg.Shared := false; // unneeded for this request - out_msg.MessageSize := in_msg.MessageSize; - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; - DPRINTF(RubySlicc, "%s\n", out_msg); - } + 
peek(coreRequestNetwork_in, CPURequestMsg) { + DPRINTF(RubySlicc, "in_msg: %s\n", in_msg); + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := machineID; + /* + To allow multiple concurrent requests from different CUs, we pass + the origin information along to the directory, which stores it in its + TBE as appropriate before passing it back to the TCC on the return + path. + */ + out_msg.CURequestor := in_msg.Requestor; + out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); + out_msg.Shared := false; // unneeded for this request + out_msg.MessageSize := in_msg.MessageSize; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; + DPRINTF(RubySlicc, "out_msg: %s\n", out_msg); } } } @@ -504,7 +543,7 @@ machine(MachineType:TCC, "TCC Cache") out_msg.addr := address; out_msg.Type := CoherenceResponseType:TDSysWBAck; out_msg.Destination.clear(); - out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Destination.add(in_msg.CURequestor); out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Writeback_Control; out_msg.instSeqNum := in_msg.instSeqNum; @@ -562,7 +601,7 @@ machine(MachineType:TCC, "TCC Cache") enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { out_msg.addr := address; out_msg.Type := CoherenceResponseType:TDSysResp; - out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Destination.add(in_msg.CURequestor); out_msg.Sender := machineID; out_msg.MessageSize := in_msg.MessageSize; out_msg.DataBlk := cache_entry.DataBlk; @@ -578,12 +617,12 @@ machine(MachineType:TCC, "TCC Cache") enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { out_msg.addr := address; out_msg.Type := CoherenceResponseType:TDSysResp; - out_msg.Destination.add(in_msg.WTRequestor); + out_msg.Destination.add(in_msg.CURequestor); out_msg.Sender := machineID; out_msg.MessageSize := in_msg.MessageSize; out_msg.DataBlk := 
in_msg.DataBlk; - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; } } } @@ -611,7 +650,10 @@ machine(MachineType:TCC, "TCC Cache") tbe.Destination.clear(); tbe.numPendingDirectoryAtomics := 0; tbe.atomicDoneCnt := 0; + tbe.numPending := 0; } + // each pending requests increments this count by 1 + tbe.numPending := tbe.numPending + 1; if (coreRequestNetwork_in.isReady(clockEdge())) { peek(coreRequestNetwork_in, CPURequestMsg) { if(in_msg.Type == CoherenceRequestType:RdBlk || @@ -620,6 +662,16 @@ machine(MachineType:TCC, "TCC Cache") in_msg.Type == CoherenceRequestType:AtomicNoReturn){ tbe.Destination.add(in_msg.Requestor); } + /* + If there are multiple concurrent requests to the same cache line, each + one will overwrite the previous ones GLC/SLC information here. + If these requests have different GLC/SLC information, this causes + a segfault. Hence, currently the support relies on the directory to + pass back the GLC/SLC information instead of relying on L2 TBE to be + correct. + + This message is left here as an FYI for future developers. 
+ */ tbe.isGLCSet := in_msg.isGLCSet; tbe.isSLCSet := in_msg.isSLCSet; if(in_msg.Type == CoherenceRequestType:Atomic || @@ -633,9 +685,14 @@ machine(MachineType:TCC, "TCC Cache") } action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") { - tbe.Destination.clear(); - TBEs.deallocate(address); - unset_tbe(); + // since we may have multiple destinations, can't deallocate if we aren't + // last one + tbe.numPending := tbe.numPending - 1; + if (tbe.numPending == 0) { + tbe.Destination.clear(); + TBEs.deallocate(address); + unset_tbe(); + } } action(wcb_writeCacheBlock, "wcb", desc="write data to TCC") { @@ -672,7 +729,7 @@ machine(MachineType:TCC, "TCC Cache") enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { out_msg.addr := address; out_msg.Requestor := machineID; - out_msg.WTRequestor := in_msg.Requestor; + out_msg.CURequestor := in_msg.Requestor; out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); out_msg.MessageSize := MessageSizeType:Data; out_msg.Type := CoherenceRequestType:WriteThrough; @@ -680,6 +737,8 @@ machine(MachineType:TCC, "TCC Cache") out_msg.DataBlk := in_msg.DataBlk; out_msg.writeMask.orMask(in_msg.writeMask); out_msg.instSeqNum := in_msg.instSeqNum; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; } } } @@ -688,7 +747,7 @@ machine(MachineType:TCC, "TCC Cache") enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { out_msg.addr := address; out_msg.Requestor := machineID; - out_msg.WTRequestor := machineID; + out_msg.CURequestor := machineID; out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); out_msg.MessageSize := MessageSizeType:Data; out_msg.Type := CoherenceRequestType:WriteThrough; @@ -703,13 +762,15 @@ machine(MachineType:TCC, "TCC Cache") enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { out_msg.addr := address; out_msg.Requestor := machineID; - out_msg.WTRequestor := in_msg.Requestor; + out_msg.CURequestor := in_msg.Requestor; 
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); out_msg.MessageSize := MessageSizeType:Data; out_msg.Type := CoherenceRequestType:WriteFlush; out_msg.Dirty := true; out_msg.DataBlk := cache_entry.DataBlk; out_msg.writeMask.orMask(cache_entry.writeMask); + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; } } } @@ -719,7 +780,7 @@ machine(MachineType:TCC, "TCC Cache") enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { out_msg.addr := address; out_msg.Requestor := machineID; - out_msg.WTRequestor := in_msg.Requestor; + out_msg.CURequestor := in_msg.Requestor; out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); out_msg.MessageSize := MessageSizeType:Data; out_msg.Type := in_msg.Type; @@ -768,9 +829,17 @@ machine(MachineType:TCC, "TCC Cache") wakeUpAllBuffers(address); } + /* + Currently z_stall is unused because it can lead to Protocol Stalls that + eventually lead to deadlock. Instead, it is recommended to use + st_stallAndWaitRequest in combination with a wakeupBuffer call (e.g., + wada_wakeUpAllDependentsAddr) to put the pending requests to sleep instead of + them causing head of line blocking -- wada_wakeUpAllDependentsAddr should wake + the request up once the request preventing it from completing is done. action(z_stall, "z", desc="stall") { // built-in } + */ action(inpa_incrementNumPendingDirectoryAtomics, "inpa", desc="inc num atomics") { @@ -792,8 +861,8 @@ machine(MachineType:TCC, "TCC Cache") out_msg.addr := address; out_msg.Type := TriggerType:AtomicDone; peek(responseFromNB_in, ResponseMsg) { - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; } } } @@ -832,31 +901,53 @@ machine(MachineType:TCC, "TCC Cache") // they can cause a resource stall deadlock! 
transition(WI, {RdBlk, WrVicBlk, Atomic, AtomicPassOn, WrVicBlkBack}) { //TagArrayRead} { - // by putting the stalled requests in a buffer, we reduce resource contention - // since they won't try again every cycle and will instead only try again once - // woken up + // don't profile as hit or miss since it will be tried again + /* + By putting the stalled requests in a buffer, we reduce resource contention + since they won't try again every cycle and will instead only try again once + woken up. + */ st_stallAndWaitRequest; } transition(WIB, {RdBlk, WrVicBlk, Atomic, WrVicBlkBack}) { //TagArrayRead} { - // by putting the stalled requests in a buffer, we reduce resource contention - // since they won't try again every cycle and will instead only try again once - // woken up + // don't profile as hit or miss since it will be tried again + /* + By putting the stalled requests in a buffer, we reduce resource contention + since they won't try again every cycle and will instead only try again once + woken up. + */ st_stallAndWaitRequest; } transition(A, {RdBlk, WrVicBlk, WrVicBlkBack}) { //TagArrayRead} { - // by putting the stalled requests in a buffer, we reduce resource contention - // since they won't try again every cycle and will instead only try again once - // woken up + // don't profile as hit or miss since it will be tried again + /* + By putting the stalled requests in a buffer, we reduce resource contention + since they won't try again every cycle and will instead only try again once + woken up. 
+ */ st_stallAndWaitRequest; } transition(IV, {WrVicBlk, Atomic, AtomicPassOn, WrVicBlkBack}) { //TagArrayRead} { - // by putting the stalled requests in a buffer, we reduce resource contention - // since they won't try again every cycle and will instead only try again once - // woken up + // don't profile as hit or miss since it will be tried again + /* + By putting the stalled requests in a buffer, we reduce resource contention + since they won't try again every cycle and will instead only try again once + woken up. + */ st_stallAndWaitRequest; } + transition({I, IV, V}, AtomicWait) { + // don't profile as hit or miss since it will be tried again + /* + By putting the stalled requests in a buffer, we reduce resource contention + since they won't try again every cycle and will instead only try again once + woken up. + */ + st_stallAndWaitRequest; + } + transition({M, V}, RdBlk) {TagArrayRead, DataArrayRead} { p_profileHit; sd_sendData; @@ -865,12 +956,15 @@ machine(MachineType:TCC, "TCC Cache") } transition(W, RdBlk, WI) {TagArrayRead, DataArrayRead} { + // don't profile as hit or miss since it will be tried again t_allocateTBE; wb_writeBack; - // need to try this request again after writing back the current entry -- to - // do so, put it with other stalled requests in a buffer to reduce resource - // contention since they won't try again every cycle and will instead only - // try again once woken up + /* + Need to try this request again after writing back the current entry -- to + do so, put it with other stalled requests in a buffer to reduce resource + contention since they won't try again every cycle and will instead only + try again once woken up. + */ st_stallAndWaitRequest; } @@ -933,6 +1027,7 @@ machine(MachineType:TCC, "TCC Cache") // Transition to be called when a read request with SLC flag arrives at entry // in transient state. The request stalls until the pending transition is complete. 
transition({WI, WIB, IV}, RdBypassEvict) { + // don't profile as hit or miss since it will be tried again st_stallAndWaitRequest; } @@ -945,8 +1040,8 @@ machine(MachineType:TCC, "TCC Cache") p_popRequestQueue; } - transition(A, Atomic) { - p_profileMiss; + transition(A, {Atomic, AtomicWait}) { + // don't profile as hit or miss since it will be tried again // by putting the stalled requests in a buffer, we reduce resource contention // since they won't try again every cycle and will instead only try again once // woken up @@ -993,7 +1088,7 @@ machine(MachineType:TCC, "TCC Cache") } transition(A, AtomicPassOn) { - p_profileMiss; + // don't profile as hit or miss since it will be tried again // by putting the stalled requests in a buffer, we reduce resource contention // since they won't try again every cycle and will instead only try again once // woken up @@ -1136,9 +1231,41 @@ machine(MachineType:TCC, "TCC Cache") ut_updateTag; wcb_writeCacheBlock; sdr_sendDataResponse; - pr_popResponseQueue; wada_wakeUpAllDependentsAddr; dt_deallocateTBE; + pr_popResponseQueue; + } + + /* + Since the L2 now allows multiple loads from different CUs to proceed in + parallel to the directory, we may get Event:Data back when the line is + already in V. In this case, send the response to the appropriate TCP + and update MRU/data in TCC, but don't need to allocate line. + */ + transition(V, Data) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + ut_updateTag; + wcb_writeCacheBlock; + sdr_sendDataResponse; + wada_wakeUpAllDependentsAddr; + // tracks # pending requests, so need to decrement here too + dt_deallocateTBE; + pr_popResponseQueue; + } + + /* + Since the L2 now allows multiple loads from different CUs to proceed in + parallel to the directory, we may get Event:Data back when the line is + now in I because it has been evicted by an intervening request to the same + set index. 
In this case, send the response to the appropriate TCP without + affecting the TCC (essentially, treat it similar to a bypass request except + we also send the unblock back to the directory). + */ + transition(I, Data) { + sdr_sendDataResponse; + wada_wakeUpAllDependentsAddr; + // tracks # pending requests, so need to decrement here too + dt_deallocateTBE; + pr_popResponseQueue; } transition(A, Data, M) {TagArrayRead, TagArrayWrite, DataArrayWrite, AtomicALUOperation} { diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm index ae35d4c5f7..97997a12b5 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm @@ -1,5 +1,6 @@ /* * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * Copyright (c) 2023 Matthew D. Sinclair * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -53,10 +54,14 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") { state_declaration(State, desc="TCP Cache States", default="TCP_State_I") { I, AccessPermission:Invalid, desc="Invalid"; + // Note: currently IV in the TCP is only for pending loads to a given cache + // line. Since the TCP is write through, stores should be allowed to pass + // through without requiring them to wait. 
+ IV, AccessPermission:Invalid, desc="Going from I to V, waiting on TCC data"; V, AccessPermission:Read_Only, desc="Valid"; A, AccessPermission:Invalid, desc="Waiting on Atomic"; - F, AccessPermission:Invalid, desc="Flushing; Waiting for Ack"; + F, AccessPermission:Invalid, desc="Flushing; Waiting for Ack"; } enumeration(Event, desc="TCP Events") { @@ -102,6 +107,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") bool Dirty, desc="Is the data dirty (different than memory)?"; int NumPendingMsgs,desc="Number of acks/data messages that this processor is waiting for"; bool Shared, desc="Victim hit by shared probe"; + bool isGLCSet, desc="Bypass L1 Cache"; + bool isSLCSet, desc="Bypass L1 and L2 Cache"; } structure(TBETable, external="yes") { @@ -123,6 +130,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") void unset_tbe(); void wakeUpAllBuffers(); void wakeUpBuffers(Addr a); + void wakeUpAllBuffers(Addr a); Cycles curCycle(); // Internal functions @@ -472,6 +480,15 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") check_allocate(TBEs); TBEs.allocate(address); set_tbe(TBEs.lookup(address)); + + // pass GLC/SLC information along + if (mandatoryQueue_in.isReady(clockEdge())) { + peek(mandatoryQueue_in, RubyRequest) { + DPRINTF(RubySlicc, "Address: %p\n", address); + tbe.isGLCSet := in_msg.isGLCSet; + tbe.isSLCSet := in_msg.isSLCSet; + } + } } action(d_deallocateTBE, "d", desc="Deallocate TBE") { @@ -510,6 +527,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") responseToTCP_in.dequeue(clockEdge()); } + action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") { + stall_and_wait(mandatoryQueue_in, address); + } + action(l_loadDoneHit, "ldh", desc="local load done (hits in TCP)") { assert(is_valid(cache_entry)); if (use_seq_not_coal) { @@ -528,6 +549,20 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } } + action(ldmi_loadDoneMissInv, "ldmi", + desc="local load done (misses in TCP and line was evicted)") { + // since line was 
evicted, can't rely on data from cache entry, so use from + // the response message + peek(responseToTCP_in, ResponseMsg) { + DataBlock tmp:= in_msg.DataBlk; + if (use_seq_not_coal) { + sequencer.readCallback(address, tmp, false, MachineType:L1Cache); + } else { + coalescer.readCallback(address, MachineType:L1Cache, tmp); + } + } + } + action(ad_atomicDone, "ad", desc="atomic done") { assert(is_valid(cache_entry)); coalescer.atomicCallback(address, MachineType:L1Cache, cache_entry.DataBlk); @@ -604,6 +639,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") L1cache.setMRU(address); } + action(wada_wakeUpAllDependentsAddr, "wada", desc="Wake up any requests waiting for this address") { + wakeUpAllBuffers(address); + } + // action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") { // mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency)); // } @@ -632,11 +671,19 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") // Stalling transitions do NOT check the tag array...and if they do, // they can cause a resource stall deadlock! - transition({A}, {Load, Atomic, StoreThrough}) { //TagArrayRead} { - z_stall; + // if another request arrives for the same cache line that has a pending + // atomic or load, put it on the wakeup buffer instead of z_stall'ing it. 
By + // doing so we reduce resource contention since they won't try again every cycle + // and will instead only try again once woken up + transition({A, IV}, {Load, LoadBypassEvict, Atomic, Store, StoreThrough, Flush}) { + st_stallAndWaitRequest; } - transition(I, Load) {TagArrayRead} { + // if we have a load that misses, allocate TBE entry and transition to IV + // to prevent subsequent requests to same cache line from also going to TCC + // while this request is pending + transition(I, Load, IV) {TagArrayRead} { + t_allocateTBE; n_issueRdBlk; uu_profileDataMiss; p_popMandatoryQueue; @@ -694,14 +741,38 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") p_popMandatoryQueue; } - transition(I, TCC_Ack, V) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} { - a_allocate; - w_writeCache; - l_loadDoneMiss; + // if we got a response for a load where the line is in I, then + // another request must have come in that replaced the line in question in + // the cache. Thus, complete this request without allocating the line, but + // still deallocate TBE and wakeup any dependent addresses. + // (Note: this assumes TCC_AckWB is what stores use) + transition(I, TCC_Ack) {TagArrayRead, TagArrayWrite} { + wada_wakeUpAllDependentsAddr; + // NOTE: Because we invalidated the cache line, the assert in l_loadDoneMiss + // will fail -- unlike atomics that automatically go to I when the line returns + // loads do not automatically go to I. Resolve this by passing data from + // message. + ldmi_loadDoneMissInv; + d_deallocateTBE; pr_popResponseQueue; } - transition(I, Bypass, I) { + // if line is currently in IV, then TCC_Ack is returning the data for a + // pending load, so transition to V, deallocate TBE, and wakeup any dependent + // requests so they will be replayed now that this request has returned. 
+ transition(IV, TCC_Ack, V) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} { + a_allocate; + w_writeCache; + wada_wakeUpAllDependentsAddr; + l_loadDoneMiss; + d_deallocateTBE; + pr_popResponseQueue; + } + + // if a bypass request arrives back at the TCP, regardless of whether the line + // is in I (from the bypass request) or IV (from a subsequent non-bypassing + // load), retain the current state and complete the bypassing request. + transition({I, IV}, Bypass) { rb_bypassDone; pr_popResponseQueue; } @@ -713,12 +784,13 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } transition(A, TCC_Ack, I) {TagArrayRead, DataArrayRead, DataArrayWrite} { - d_deallocateTBE; a_allocate; w_writeCache; ad_atomicDone; - pr_popResponseQueue; ic_invCache; + wada_wakeUpAllDependentsAddr; + d_deallocateTBE; + pr_popResponseQueue; } transition(V, TCC_Ack, V) {TagArrayRead, DataArrayRead, DataArrayWrite} { @@ -735,20 +807,22 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") ic_invCache; } + // if a line with a pending load gets evicted, transition the line to I and + // invalidate it. + transition(IV, Repl, I) {TagArrayRead, TagArrayWrite} { + ic_invCache; + } + transition({V,I}, Flush, F) {TagArrayFlash} { a_allocate; sf_setFlush; p_popMandatoryQueue; } - transition(A, Flush) { - z_stall; - } - transition({I, V}, Evict, I) {TagArrayFlash} { inv_invDone; - p_popMandatoryQueue; ic_invCache; + p_popMandatoryQueue; } transition(A, Evict) {TagArrayFlash} { @@ -756,8 +830,11 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") p_popMandatoryQueue; } + // if a line is in IV and a TCC_AckWB comes back, we must have had a WT + // store followed by a load. Thus, complete the store without affecting + // TBE or line state. 
// TCC_AckWB only snoops TBE - transition({V, I, A}, TCC_AckWB) { + transition({V, I, IV, A}, TCC_AckWB) { wd_wtDone; pr_popResponseQueue; } diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm index b9401d680a..c36fc9ec93 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm @@ -154,7 +154,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") bool Dirty, desc="Is the data dirty?"; int NumPendingAcks, desc="num acks expected"; MachineID OriginalRequestor, desc="Original Requestor"; - MachineID WTRequestor, desc="WT Requestor"; + MachineID CURequestor, desc="CU that initiated the request"; bool Cached, desc="data hit in Cache"; bool MemData, desc="Got MemData?",default="false"; bool wtData, desc="Got write through data?",default="false"; @@ -170,7 +170,9 @@ machine(MachineType:Directory, "AMD Baseline protocol") uint64_t probe_id, desc="probe id for lifetime profiling"; WriteMask writeMask, desc="outstanding write through mask"; int Len, desc="Length of memory request for DMA"; - bool isSLCSet, desc="Bypass L1 and L2 Cache"; + // GLC is passed along because it is needed in the return path + bool isGLCSet, desc="Bypass GPU L1 Cache"; + bool isSLCSet, desc="Bypass GPU L1 and L2 Cache"; } structure(TBETable, external="yes") { @@ -470,6 +472,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.ForwardRequestTime := tbe.ForwardRequestTime; out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; out_msg.OriginalResponder := tbe.LastSender; + out_msg.CURequestor := tbe.CURequestor; out_msg.L3Hit := tbe.L3Hit; DPRINTF(RubySlicc, "%s\n", out_msg); } @@ -498,6 +501,9 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; out_msg.OriginalResponder := tbe.LastSender; out_msg.L3Hit := tbe.L3Hit; + out_msg.isGLCSet := tbe.isGLCSet; + out_msg.isSLCSet := tbe.isSLCSet; + out_msg.CURequestor 
:= tbe.CURequestor; DPRINTF(RubySlicc, "%s\n", out_msg); } } @@ -527,9 +533,11 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.ForwardRequestTime := tbe.ForwardRequestTime; out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; out_msg.OriginalResponder := tbe.LastSender; - if(tbe.atomicData){ - out_msg.WTRequestor := tbe.WTRequestor; - } + out_msg.isGLCSet := tbe.isGLCSet; + out_msg.isSLCSet := tbe.isSLCSet; + if(tbe.atomicData){ + out_msg.CURequestor := tbe.CURequestor; + } out_msg.L3Hit := tbe.L3Hit; DPRINTF(RubySlicc, "%s\n", out_msg); } @@ -555,6 +563,8 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.InitialRequestTime := tbe.InitialRequestTime; out_msg.ForwardRequestTime := curCycle(); out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; + out_msg.isGLCSet := tbe.isGLCSet; + out_msg.isSLCSet := tbe.isSLCSet; DPRINTF(RubySlicc, "%s\n", out_msg); } } @@ -565,13 +575,15 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.addr := address; out_msg.Type := CoherenceResponseType:NBSysWBAck; out_msg.Destination.add(in_msg.Requestor); - out_msg.WTRequestor := in_msg.WTRequestor; + out_msg.CURequestor := in_msg.CURequestor; out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Writeback_Control; out_msg.InitialRequestTime := in_msg.InitialRequestTime; out_msg.ForwardRequestTime := curCycle(); out_msg.ProbeRequestStartTime := curCycle(); out_msg.instSeqNum := in_msg.instSeqNum; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; } } } @@ -582,7 +594,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.addr := address; out_msg.Type := CoherenceResponseType:NBSysWBAck; out_msg.Destination.add(tbe.OriginalRequestor); - out_msg.WTRequestor := tbe.WTRequestor; + out_msg.CURequestor := tbe.CURequestor; out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Writeback_Control; out_msg.InitialRequestTime := tbe.InitialRequestTime; @@ -773,6 +785,8 
@@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.MessageSize := MessageSizeType:Control; out_msg.Destination := probe_dests; tbe.NumPendingAcks := out_msg.Destination.count(); + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; DPRINTF(RubySlicc, "%s\n", out_msg); APPEND_TRANSITION_COMMENT(" dc: Acks remaining: "); APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); @@ -877,6 +891,8 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.MessageSize := MessageSizeType:Control; out_msg.Destination := probe_dests; tbe.NumPendingAcks := out_msg.Destination.count(); + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; DPRINTF(RubySlicc, "%s\n", (out_msg)); APPEND_TRANSITION_COMMENT(" sc: Acks remaining: "); APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); @@ -931,6 +947,8 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.ReturnData := false; out_msg.MessageSize := MessageSizeType:Control; out_msg.Destination := probe_dests; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; tbe.NumPendingAcks := out_msg.Destination.count(); APPEND_TRANSITION_COMMENT(" ic: Acks remaining: "); APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); @@ -1017,7 +1035,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") tbe.writeMask.clear(); tbe.writeMask.orMask(in_msg.writeMask); tbe.wtData := true; - tbe.WTRequestor := in_msg.WTRequestor; + tbe.CURequestor := in_msg.CURequestor; tbe.LastSender := in_msg.Requestor; } if (in_msg.Type == CoherenceRequestType:Atomic || @@ -1032,10 +1050,14 @@ machine(MachineType:Directory, "AMD Baseline protocol") assert(in_msg.Type == CoherenceRequestType:AtomicNoReturn); tbe.atomicDataNoReturn := true; } - tbe.WTRequestor := in_msg.WTRequestor; + tbe.CURequestor := in_msg.CURequestor; tbe.LastSender := in_msg.Requestor; tbe.isSLCSet := in_msg.isSLCSet; } + // GPU read requests also need to track where the requestor came from + if (in_msg.Type 
== CoherenceRequestType:RdBlk) { + tbe.CURequestor := in_msg.CURequestor; + } tbe.Dirty := false; if (in_msg.Type == CoherenceRequestType:WriteThrough) { tbe.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask); @@ -1045,6 +1067,9 @@ machine(MachineType:Directory, "AMD Baseline protocol") tbe.NumPendingAcks := 0; tbe.Cached := in_msg.ForceShared; tbe.InitialRequestTime := in_msg.InitialRequestTime; + tbe.isGLCSet := in_msg.isGLCSet; + tbe.isSLCSet := in_msg.isSLCSet; + DPRINTF(RubySlicc, "t_allocateTBE in_msg: %s, tbe: %s\n", in_msg, tbe.CURequestor); } } @@ -1277,11 +1302,20 @@ machine(MachineType:Directory, "AMD Baseline protocol") } action(wada_wakeUpAllDependentsAddr, "wada", desc="Wake up any requests waiting for this address") { + DPRINTF(RubySlicc, "wada wakeup: 0x%x\n", address); wakeUpAllBuffers(address); } + /* + Currently z_stall is unused because it can lead to Protocol Stalls that + eventually lead to deadlock. Instead, it is recommended to use + st_stallAndWaitRequest in combination with a wakeupBuffer call (e.g., + wada_wakeUpAllDependentsAddr) to put the pending requests to sleep instead of + them causing head of line blocking -- wada_wakeUpAllDependentsAddr should wake + the request up once the request preventing it from completing is done. 
action(z_stall, "z", desc="...") { } + */ // TRANSITIONS transition({BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) { @@ -1383,19 +1417,19 @@ machine(MachineType:Directory, "AMD Baseline protocol") d_writeDataToMemory; al_allocateL3Block; pr_profileL3HitMiss; //Must come after al_allocateL3Block and before dt_deallocateTBE - wad_wakeUpDependents; + wada_wakeUpAllDependentsAddr; dt_deallocateTBE; pr_popResponseQueue; } transition(BL, StaleWB, U) {L3TagArrayWrite} { dt_deallocateTBE; - wa_wakeUpAllDependents; + wada_wakeUpAllDependentsAddr; pr_popResponseQueue; } transition({B, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm}, {VicDirty, VicClean}) { - z_stall; + st_stallAndWaitRequest; } transition({U, BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, WBAck) { diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm index 984362da39..b860ff1681 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm @@ -134,14 +134,14 @@ structure(CPURequestMsg, desc="...", interface="Message") { int Acks, default="0", desc="Acks that the dir (mem ctrl) should expect to receive"; CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer"; WriteMask writeMask, desc="Write Through Data"; - MachineID WTRequestor, desc="Node who initiated the write through"; + MachineID CURequestor, desc="Node who initiated the request"; int wfid, default="0", desc="wavefront id"; uint64_t instSeqNum, desc="instruction sequence number"; bool NoWriteConflict, default="true", desc="write collided with CAB entry"; int ProgramCounter, desc="PC that accesses to this block"; - bool isGLCSet, default="false", desc="GLC flag 
value in the request"; - bool isSLCSet, default="false", desc="SLC flag value in the request"; + bool isGLCSet, default="false", desc="GLC flag value in the request"; + bool isSLCSet, default="false", desc="SLC flag value in the request"; bool functionalRead(Packet *pkt) { // Only PUTX messages contains the data block @@ -170,6 +170,8 @@ structure(NBProbeRequestMsg, desc="...", interface="Message") { MachineID Requestor, desc="Requestor id for 3-hop requests"; bool NoAckNeeded, default="false", desc="For short circuting acks"; int ProgramCounter, desc="PC that accesses to this block"; + bool isGLCSet, default="false", desc="GLC flag value in the request"; + bool isSLCSet, default="false", desc="SLC flag value in the request"; bool functionalRead(Packet *pkt) { return false; @@ -240,7 +242,7 @@ structure(ResponseMsg, desc="...", interface="Message") { bool L3Hit, default="false", desc="Did memory or L3 supply the data?"; MachineID OriginalResponder, desc="Mach which wrote the data to the L3"; - MachineID WTRequestor, desc="Node who started the writethrough"; + MachineID CURequestor, desc="Node who started the access"; bool NotCached, default="false", desc="True when the Region buffer has already evicted the line"; diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index f9d071ba62..90d6031c6e 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -526,26 +526,16 @@ GPUCoalescer::readCallback(Addr address, fatal_if(crequest->getRubyType() != RubyRequestType_LD, "readCallback received non-read type response\n"); - // Iterate over the coalesced requests to respond to as many loads as - // possible until another request type is seen. Models MSHR for TCP. 
- while (crequest->getRubyType() == RubyRequestType_LD) { - hitCallback(crequest, mach, data, true, crequest->getIssueTime(), - forwardRequestTime, firstResponseTime, isRegion); - - delete crequest; - coalescedTable.at(address).pop_front(); - if (coalescedTable.at(address).empty()) { - break; - } - - crequest = coalescedTable.at(address).front(); - } + hitCallback(crequest, mach, data, true, crequest->getIssueTime(), + forwardRequestTime, firstResponseTime, isRegion); + delete crequest; + coalescedTable.at(address).pop_front(); if (coalescedTable.at(address).empty()) { - coalescedTable.erase(address); + coalescedTable.erase(address); } else { - auto nextRequest = coalescedTable.at(address).front(); - issueRequest(nextRequest); + auto nextRequest = coalescedTable.at(address).front(); + issueRequest(nextRequest); } }