diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 9238dbec00..a80b918798 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -1100,6 +1100,16 @@ class Packet : public Printable flags.set(VALID_SIZE); } + /** + * Accessor functions for the cache bypass flags. The cache bypass + * can specify which levels in the hierarchy to bypass. If GLC_BIT + * is set, the requests are globally coherent and bypass TCP. + * If SLC_BIT is set, then the requests are system level coherent + * and bypass both TCP and TCC. + */ + bool isGLCSet() const { return req->isGLCSet();} + bool isSLCSet() const { return req->isSLCSet();} + /** * Check if packet corresponds to a given block-aligned address and * address space. diff --git a/src/mem/request.hh b/src/mem/request.hh index 39d9d7281c..6a0cbc21d4 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -1071,6 +1071,17 @@ class Request bool isAcquire() const { return _cacheCoherenceFlags.isSet(ACQUIRE); } + + /** + * Accessor functions for the cache bypass flags. The cache bypass + * can specify which levels in the hierarchy to bypass. If GLC_BIT + * is set, the requests are globally coherent and bypass TCP. + * If SLC_BIT is set, then the requests are system level coherent + * and bypass both TCP and TCC. + */ + bool isGLCSet() const {return _cacheCoherenceFlags.isSet(GLC_BIT); } + bool isSLCSet() const {return _cacheCoherenceFlags.isSet(SLC_BIT); } + /** * Accessor functions for the memory space configuration flags and used by * GPU ISAs such as the Heterogeneous System Architecture (HSA). Note that diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index 032a64cec4..ae142471fa 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -56,8 +56,10 @@ machine(MachineType:TCC, "TCC Cache") enumeration(Event, desc="TCC Events") { // Requests coming from the Cores RdBlk, desc="RdBlk event"; + RdBypassEvict, desc="Bypass L2 on reads. Evict if cache block already allocated"; WrVicBlk, desc="L1 Write Through"; WrVicBlkBack, desc="L1 Write Through(dirty cache)"; + WrVicBlkEvict, desc="L1 Write Through(dirty cache) and evict"; Atomic, desc="Atomic Op"; AtomicDone, desc="AtomicOps Complete"; AtomicNotDone, desc="AtomicOps not Complete"; @@ -68,6 +70,7 @@ machine(MachineType:TCC, "TCC Cache") PrbInv, desc="Invalidating probe"; // Coming from Memory Controller WBAck, desc="writethrough ack from memory"; + Bypass, desc="Bypass the entire L2 cache"; } // STATES @@ -107,6 +110,8 @@ machine(MachineType:TCC, "TCC Cache") NetDest Destination, desc="Data destination"; int numAtomics, desc="number remaining atomics"; int atomicDoneCnt, desc="number AtomicDones triggered"; + bool isGLCSet, desc="Bypass L1 Cache"; + bool isSLCSet, desc="Bypass L1 and L2 Cache"; } structure(TBETable, external="yes") { @@ -173,7 +178,6 @@ machine(MachineType:TCC, "TCC Cache") int functionalWrite(Addr addr, Packet *pkt) { int num_functional_writes := 0; - TBE tbe := TBEs.lookup(addr); if(is_valid(tbe)) { num_functional_writes := num_functional_writes + @@ -279,7 +283,11 @@ machine(MachineType:TCC, "TCC Cache") peek(responseFromNB_in, ResponseMsg, block_on="addr") { TBE tbe := TBEs.lookup(in_msg.addr); Entry cache_entry := getCacheEntry(in_msg.addr); - if (in_msg.Type == CoherenceResponseType:NBSysResp) { + if (in_msg.isSLCSet) { + // If the SLC bit is set, the response needs to bypass the cache + // and should not be allocated an entry. 
+          trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:NBSysResp) {
           if(presentOrAvail(in_msg.addr)) {
             trigger(Event:Data, in_msg.addr, cache_entry, tbe);
           } else {
@@ -313,7 +321,18 @@ machine(MachineType:TCC, "TCC Cache")
         TBE tbe := TBEs.lookup(in_msg.addr);
         Entry cache_entry := getCacheEntry(in_msg.addr);
         if (in_msg.Type == CoherenceRequestType:WriteThrough) {
-          if(WB) {
+          if (in_msg.isSLCSet) {
+            // The request should bypass the cache if the SLC bit is set.
+            // If the cache entry already exists, then evict it.
+            // Otherwise, perform a normal cache access.
+            // The cache entry is allocated only on response, and the
+            // bypass is handled there.
+            if(presentOrAvail(in_msg.addr)) {
+              trigger(Event:WrVicBlkEvict, in_msg.addr, cache_entry, tbe);
+            } else {
+              trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
+            }
+          } else if(WB) {
             if(presentOrAvail(in_msg.addr)) {
               trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe);
             } else {
@@ -326,7 +345,13 @@ machine(MachineType:TCC, "TCC Cache")
       } else if (in_msg.Type == CoherenceRequestType:Atomic) {
         trigger(Event:Atomic, in_msg.addr, cache_entry, tbe);
       } else if (in_msg.Type == CoherenceRequestType:RdBlk) {
-        trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+        if (in_msg.isSLCSet) {
+          // If the SLC bit is set, the request needs to go directly to
+          // memory. If a cache block already exists, then evict it.
+          trigger(Event:RdBypassEvict, in_msg.addr, cache_entry, tbe);
+        } else {
+          trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+        }
       } else {
         DPRINTF(RubySlicc, "%s\n", in_msg);
         error("Unexpected Response Message to Core");
@@ -354,6 +379,8 @@ machine(MachineType:TCC, "TCC Cache")
           out_msg.MessageSize := MessageSizeType:Response_Data;
           out_msg.Dirty := false;
           out_msg.State := CoherenceState:Shared;
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
           DPRINTF(RubySlicc, "%s\n", out_msg);
         }
       }
@@ -371,15 +398,46 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Dirty := false;
       out_msg.State := CoherenceState:Shared;
       DPRINTF(RubySlicc, "%s\n", out_msg);
+      peek(responseFromNB_in, ResponseMsg) {
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+      }
     }
     enqueue(unblockToNB_out, UnblockMsg, 1) {
       out_msg.addr := address;
       out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
       out_msg.MessageSize := MessageSizeType:Unblock_Control;
+      peek(responseFromNB_in, ResponseMsg) {
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+      }
       DPRINTF(RubySlicc, "%s\n", out_msg);
     }
   }

+  action(rb_bypassDone, "rb", desc="Bypass the L2 on a read access") {
+    peek(responseFromNB_in, ResponseMsg) {
+      enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:TDSysResp;
+        out_msg.Sender := machineID;
+        out_msg.Destination := tbe.Destination;
+        out_msg.DataBlk := in_msg.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.Dirty := false;
+        out_msg.State := CoherenceState:Shared;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+      enqueue(unblockToNB_out, UnblockMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
+        out_msg.MessageSize := MessageSizeType:Unblock_Control;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }

   action(rd_requestData, "r", desc="Miss in L2, pass on") {
     if(tbe.Destination.count()==1){
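Aside on the response path above (commentary, not part of the patch): `rb_bypassDone` forwards north-bridge data straight to the requesting core without touching the cache arrays, while the normal data path allocates on response. A minimal, self-contained C++ sketch of that decision; all names here are hypothetical stand-ins, not gem5 types, and the non-bypass else-branch event name is assumed since it is truncated in this excerpt:

```cpp
#include <iostream>

// Hypothetical stand-in for the fields consulted by responseFromNB_in.
struct ResponseModel { bool isSLCSet; };

// An SLC response bypasses the L2 outright; otherwise the controller
// allocates only if the tag is present or a way is available.
const char *classify(const ResponseModel &rsp, bool presentOrAvail)
{
    if (rsp.isSLCSet)
        return "Bypass";                               // no L2 entry allocated
    return presentOrAvail ? "Data" : "DataNoAlloc";    // else-name assumed
}

int main()
{
    std::cout << classify({true}, true) << '\n';   // Bypass: SLC wins even
                                                   // when a way is available
    std::cout << classify({false}, true) << '\n';  // Data: normal allocation
}
```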
@@ -391,6 +449,8 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
       out_msg.Shared := false; // unneeded for this request
       out_msg.MessageSize := in_msg.MessageSize;
+      out_msg.isGLCSet := in_msg.isGLCSet;
+      out_msg.isSLCSet := in_msg.isSLCSet;
       DPRINTF(RubySlicc, "%s\n", out_msg);
     }
   }
@@ -407,6 +467,9 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Sender := machineID;
       out_msg.MessageSize := MessageSizeType:Writeback_Control;
       out_msg.instSeqNum := in_msg.instSeqNum;
+      out_msg.isGLCSet := in_msg.isGLCSet;
+      out_msg.isSLCSet := in_msg.isSLCSet;
+
     }
   }
 }
@@ -421,6 +484,9 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Sender := machineID;
       out_msg.MessageSize := MessageSizeType:Writeback_Control;
       out_msg.instSeqNum := in_msg.instSeqNum;
+      out_msg.isGLCSet := in_msg.isGLCSet;
+      out_msg.isSLCSet := in_msg.isSLCSet;
+
     }
   }
 }
@@ -434,6 +500,9 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Sender := machineID;
       out_msg.MessageSize := in_msg.MessageSize;
       out_msg.DataBlk := in_msg.DataBlk;
+      out_msg.isGLCSet := in_msg.isGLCSet;
+      out_msg.isSLCSet := in_msg.isSLCSet;
+
     }
   }
 }
@@ -466,6 +535,8 @@ machine(MachineType:TCC, "TCC Cache")
     peek(coreRequestNetwork_in, CPURequestMsg) {
       if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){
         tbe.Destination.add(in_msg.Requestor);
+        tbe.isGLCSet := in_msg.isGLCSet;
+        tbe.isSLCSet := in_msg.isSLCSet;
       }
     }
   }
@@ -505,6 +576,8 @@ machine(MachineType:TCC, "TCC Cache")
         out_msg.DataBlk := in_msg.DataBlk;
         out_msg.writeMask.orMask(in_msg.writeMask);
         out_msg.instSeqNum := in_msg.instSeqNum;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
       }
     }
   }
@@ -520,6 +593,10 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Dirty := true;
       out_msg.DataBlk := cache_entry.DataBlk;
       out_msg.writeMask.orMask(cache_entry.writeMask);
+      peek(coreRequestNetwork_in, CPURequestMsg) {
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+      }
     }
   }
@@ -534,6 +611,8 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Type := CoherenceRequestType:Atomic;
       out_msg.Dirty := true;
       out_msg.writeMask.orMask(in_msg.writeMask);
+      out_msg.isGLCSet := in_msg.isGLCSet;
+      out_msg.isSLCSet := in_msg.isSLCSet;
     }
   }
 }
@@ -549,6 +628,10 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Ntsl := true;
       out_msg.State := CoherenceState:NA;
       out_msg.MessageSize := MessageSizeType:Response_Control;
+      peek(probeNetwork_in, NBProbeRequestMsg) {
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+      }
     }
   }
   action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
@@ -592,6 +675,10 @@ machine(MachineType:TCC, "TCC Cache")
         tbe.atomicDoneCnt := tbe.atomicDoneCnt + 1;
         out_msg.addr := address;
         out_msg.Type := TriggerType:AtomicDone;
+        peek(responseFromNB_in, ResponseMsg) {
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
+        }
       }
     }
   }
@@ -659,6 +746,54 @@ machine(MachineType:TCC, "TCC Cache")
     p_popRequestQueue;
   }

+  transition(I, RdBypassEvict) {TagArrayRead} {
+    p_profileMiss;
+    t_allocateTBE;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+
+  // Transition to be called when a read request with the SLC flag set
+  // arrives at an entry in state W. It evicts and invalidates the cache
+  // entry before forwarding the request to global memory.
+  transition(W, RdBypassEvict, I) {TagArrayRead} {
+    p_profileMiss;
+    t_allocateTBE;
+    wb_writeBack;
+    i_invL2;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+
+  // Transition to be called when a read request with the SLC flag set
+  // arrives at an entry in state M. It evicts and invalidates the cache
+  // entry before forwarding the request to main memory.
+  transition(M, RdBypassEvict, I) {TagArrayRead} {
+    p_profileMiss;
+    t_allocateTBE;
+    wb_writeBack;
+    i_invL2;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+
+  // Transition to be called when a read request with the SLC flag set
+  // arrives at an entry in state V. It invalidates the cache entry before
+  // forwarding the request to global memory.
+  transition(V, RdBypassEvict, I) {TagArrayRead} {
+    p_profileMiss;
+    t_allocateTBE;
+    i_invL2;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+
+  // Transition to be called when a read request with the SLC flag set arrives
+  // at an entry in a transient state. The request stalls until the pending transition completes.
+  transition({WI, IV}, RdBypassEvict) {
+    st_stallAndWaitRequest;
+  }
+
   transition(V, Atomic, A) {TagArrayRead} {
     p_profileHit;
     i_invL2;
@@ -730,6 +865,31 @@ transition(I, Atomic, A) {TagArrayRead} {
     p_popRequestQueue;
   }

+  // Transition to be called when a write request with the SLC bit set
+  // arrives at an entry in state V. The entry has to be evicted and
+  // invalidated before the request is forwarded to global memory.
+  transition(V, WrVicBlkEvict, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    p_profileMiss;
+    ut_updateTag;
+    t_allocateTBE;
+    wt_writeThrough;
+    i_invL2;
+    p_popRequestQueue;
+  }
+
+  // Transition to be called when a write request with the SLC bit set
+  // arrives at an entry in state W. The entry has to be evicted and
+  // invalidated before the request is forwarded to global memory.
+  transition(W, WrVicBlkEvict, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    p_profileMiss;
+    ut_updateTag;
+    wdb_writeDirtyBytes;
+    t_allocateTBE;
+    wb_writeBack;
+    i_invL2;
+    p_popRequestQueue;
+  }
+
   transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} {
     t_allocateTBE;
     wb_writeBack;
@@ -764,6 +924,16 @@ transition(I, Atomic, A) {TagArrayRead} {
     pp_popProbeQueue;
   }

+  // Transition to be called when the response for a request with the SLC
+  // bit set arrives. The request has to be forwarded to the core that
+  // needs it while making sure no entry is allocated.
+  transition(I, Bypass, I) {
+    rb_bypassDone;
+    pr_popResponseQueue;
+    wada_wakeUpAllDependentsAddr;
+    dt_deallocateTBE;
+  }
+
   transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
     a_allocateBlock;
     ut_updateTag;
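The TCC transitions above all share one pattern: before an SLC access may bypass the L2, any resident copy must be removed, and dirty data (states W and M) must be written back first. A standalone C++ sketch of that policy table; the state names follow the SLICC, everything else is illustrative and not gem5 code:

```cpp
#include <string>
#include <vector>

// Cache-block states as used by the TCC state machine above.
enum class State { I, V, W, M, Transient };

// Action sequence for a read that must bypass the L2 (RdBypassEvict).
std::vector<std::string> onReadBypassEvict(State s)
{
    switch (s) {
    case State::I:
        return {"requestData"};                            // nothing cached
    case State::V:
        return {"invalidate", "requestData"};              // clean: drop copy
    case State::W:
    case State::M:
        return {"writeBack", "invalidate", "requestData"}; // dirty: flush first
    case State::Transient:
        return {"stallAndWait"};                           // let pending op finish
    }
    return {};
}
```

The write-through case (WrVicBlkEvict) follows the same shape, with the store forwarded after the eviction instead of a read.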
diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
index 775a62b174..3be1397d49 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
@@ -60,6 +60,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
   enumeration(Event, desc="TCP Events") {
     // Core initiated
     Load, desc="Load";
+    LoadBypassEvict, desc="Bypass L1 on a load. Evict if cache block already allocated";
     Store, desc="Store to L1 (L1 is dirty)";
     StoreThrough, desc="Store directly to L2(L1 is clean)";
     Atomic, desc="Atomic";
@@ -256,8 +257,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
       Entry cache_entry := getCacheEntry(in_msg.addr);
       TBE tbe := TBEs.lookup(in_msg.addr);
       if (in_msg.Type == CoherenceResponseType:TDSysResp) {
-        // disable L1 cache
-        if (disableL1) {
+        if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
+          // If the L1 is disabled or the request has the GLC or SLC flag
+          // set, the response should not be cached in the L1: it should
+          // bypass the cache on its way back from the L2/global memory.
           trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
         } else {
           if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) {
@@ -284,13 +287,23 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
       TBE tbe := TBEs.lookup(in_msg.LineAddress);
       DPRINTF(RubySlicc, "%s\n", in_msg);
       if (in_msg.Type == RubyRequestType:LD) {
-        trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
+        if ((in_msg.isGLCSet || in_msg.isSLCSet) && is_valid(cache_entry)) {
+          // Read requests with the GLC or SLC bit set should not be cached
+          // in the L1. They need to bypass the L1 and go to the L2. If an
+          // entry exists in the L1, it needs to be evicted.
+          trigger(Event:LoadBypassEvict, in_msg.LineAddress, cache_entry, tbe);
+        }
+        else {
+          trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
+        }
       } else if (in_msg.Type == RubyRequestType:ATOMIC ||
                  in_msg.Type == RubyRequestType:ATOMIC_RETURN ||
                  in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN) {
         trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe);
       } else if (in_msg.Type == RubyRequestType:ST) {
-        if(disableL1) {
+        if(disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
+          // Write requests with the GLC or SLC bit set, or when the L1 is
+          // disabled, should not be cached in the L1; they do a store through.
           trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
         } else {
           if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
@@ -330,6 +343,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
                               TCC_select_low_bit, TCC_select_num_bits));
       out_msg.MessageSize := MessageSizeType:Request_Control;
       out_msg.InitialRequestTime := curCycle();
+      peek(mandatoryQueue_in, RubyRequest) {
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+      }
     }
   }
@@ -375,6 +392,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
                               TCC_select_low_bit, TCC_select_num_bits));
       out_msg.MessageSize := MessageSizeType:Request_Control;
       out_msg.InitialRequestTime := curCycle();
+      out_msg.isGLCSet := in_msg.isGLCSet;
+      out_msg.isSLCSet := in_msg.isSLCSet;
     }
   }
 }
@@ -401,6 +420,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
       // forward inst sequence number to lower TCC
       peek(mandatoryQueue_in, RubyRequest) {
         out_msg.instSeqNum := in_msg.instSeqNum;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
       }
     }
   }
@@ -418,6 +439,11 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
       out_msg.Type := CoherenceRequestType:Atomic;
       out_msg.InitialRequestTime := curCycle();
       out_msg.Shared := false;
+      peek(mandatoryQueue_in, RubyRequest) {
+        out_msg.instSeqNum := in_msg.instSeqNum;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+      }
     }
   }
 }
@@ -583,6 +609,17 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
     p_popMandatoryQueue;
   }

+  // Transition to be called when a load request with the GLC or SLC flag
+  // set arrives at the L1. This transition invalidates any existing entry
+  // and forwards the request to the L2.
+  transition(V, LoadBypassEvict, I) {TagArrayRead, TagArrayWrite} {
+    uu_profileDataMiss;
+    inv_invDone;
+    ic_invCache;
+    n_issueRdBlk;
+    p_popMandatoryQueue;
+  }
+
   transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} {
     t_allocateTBE;
     mru_updateMRU;
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
index 3b38e3b1ff..57edef8f2b 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
@@ -161,6 +161,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
     uint64_t probe_id, desc="probe id for lifetime profiling";
     WriteMask writeMask, desc="outstanding write through mask";
     int Len, desc="Length of memory request for DMA";
+    bool isGLCSet, desc="Bypass L1 Cache";
+    bool isSLCSet, desc="Bypass L1 and L2 Cache";
   }

   structure(TBETable, external="yes") {
@@ -483,6 +485,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
         out_msg.OriginalResponder := tbe.LastSender;
         out_msg.L3Hit := tbe.L3Hit;
+        out_msg.isGLCSet := tbe.isGLCSet;
+        out_msg.isSLCSet := tbe.isSLCSet;
         DPRINTF(RubySlicc, "%s\n", out_msg);
       }
     }
@@ -512,6 +516,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
         out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
         out_msg.OriginalResponder := tbe.LastSender;
+        out_msg.isGLCSet := tbe.isGLCSet;
+        out_msg.isSLCSet := tbe.isSLCSet;
         if(tbe.atomicData){
           out_msg.WTRequestor := tbe.WTRequestor;
         }
@@ -540,6 +546,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.InitialRequestTime := tbe.InitialRequestTime;
         out_msg.ForwardRequestTime := curCycle();
         out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+        out_msg.isGLCSet := tbe.isGLCSet;
+        out_msg.isSLCSet := tbe.isSLCSet;
         DPRINTF(RubySlicc, "%s\n", out_msg);
       }
     }
@@ -557,6 +565,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.ForwardRequestTime := curCycle();
         out_msg.ProbeRequestStartTime := curCycle();
         out_msg.instSeqNum := in_msg.instSeqNum;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
       }
     }
   }
@@ -569,6 +579,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
           out_msg.Sender := machineID;
           out_msg.MessageSize := MessageSizeType:Writeback_Data;
           out_msg.DataBlk := in_msg.DataBlk;
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
         }
       }
     }
@@ -624,6 +636,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.Type := MemoryRequestType:MEMORY_READ;
         out_msg.Sender := machineID;
         out_msg.MessageSize := MessageSizeType:Request_Control;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
       }
     }
   }
@@ -739,6 +753,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.MessageSize := MessageSizeType:Control;
         out_msg.Destination := probe_dests;
         tbe.NumPendingAcks := out_msg.Destination.count();
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
         DPRINTF(RubySlicc, "%s\n", out_msg);
         APPEND_TRANSITION_COMMENT(" dc: Acks remaining: ");
         APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
@@ -842,6 +858,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.ReturnData := true;
         out_msg.MessageSize := MessageSizeType:Control;
         out_msg.Destination := probe_dests;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
         tbe.NumPendingAcks := out_msg.Destination.count();
         DPRINTF(RubySlicc, "%s\n", (out_msg));
         APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
@@ -897,6 +915,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.ReturnData := false;
         out_msg.MessageSize := MessageSizeType:Control;
         out_msg.Destination := probe_dests;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
         tbe.NumPendingAcks := out_msg.Destination.count();
         APPEND_TRANSITION_COMMENT(" ic: Acks remaining: ");
         APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
@@ -923,6 +943,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
           out_msg.Sender := machineID;
           out_msg.MessageSize := MessageSizeType:Writeback_Data;
           out_msg.DataBlk := in_msg.DataBlk;
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
         }
         if (tbe.Dirty == false) {
           // have to update the TBE, too, because of how this
@@ -985,6 +1007,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
       tbe.NumPendingAcks := 0;
       tbe.Cached := in_msg.ForceShared;
       tbe.InitialRequestTime := in_msg.InitialRequestTime;
+      tbe.isGLCSet := in_msg.isGLCSet;
+      tbe.isSLCSet := in_msg.isSLCSet;
     }
   }
@@ -1004,6 +1028,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
           out_msg.Sender := machineID;
           out_msg.MessageSize := MessageSizeType:Writeback_Data;
           out_msg.DataBlk := tbe.DataBlk;
+          out_msg.isGLCSet := tbe.isGLCSet;
+          out_msg.isSLCSet := tbe.isSLCSet;
           DPRINTF(ProtocolTrace, "%s\n", out_msg);
         }
       }
@@ -1104,6 +1130,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
           out_msg.Sender := machineID;
           out_msg.MessageSize := MessageSizeType:Writeback_Data;
           out_msg.DataBlk := victim_entry.DataBlk;
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
         }
         L3CacheMemory.deallocate(victim);
       }
@@ -1136,6 +1164,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
           out_msg.Sender := machineID;
           out_msg.MessageSize := MessageSizeType:Writeback_Data;
           out_msg.DataBlk := victim_entry.DataBlk;
+          out_msg.isGLCSet := tbe.isGLCSet;
+          out_msg.isSLCSet := tbe.isSLCSet;
         }
         L3CacheMemory.deallocate(victim);
       }
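In the directory changes above, requests arriving from the cores carry the flags in the message itself, while responses and writebacks built later recover them from the TBE (note the `tbe.isGLCSet`/`tbe.isSLCSet` copies). A small C++ sketch of that latch-then-echo pattern; the types are hypothetical stand-ins, not gem5 classes:

```cpp
// Illustrative only: models the TBE plumbing pattern used by the
// directory. Flags are captured once when the transaction begins and
// copied into every message derived from the TBE afterwards.
struct Flags { bool glc = false; bool slc = false; };

struct Tbe { Flags flags; /* ... other per-transaction state ... */ };
struct Msg { Flags flags; /* ... payload ... */ };

Tbe allocateTbe(const Msg &req)
{
    Tbe tbe;
    tbe.flags = req.flags;   // like t_allocateTBE: latch GLC/SLC once
    return tbe;
}

Msg makeResponse(const Tbe &tbe)
{
    Msg rsp;
    rsp.flags = tbe.flags;   // responses echo the originating request
    return rsp;
}
```

Latching in the TBE is what lets messages generated long after the request (memory acks, victim writebacks) still carry the right flags.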
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
index 46bab43c22..6ff19e953b 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
@@ -138,6 +138,9 @@ structure(CPURequestMsg, desc="...", interface="Message") {
   bool NoWriteConflict, default="true", desc="write collided with CAB entry";
   int ProgramCounter, desc="PC that accesses to this block";

+  bool isGLCSet, default="false", desc="GLC flag value in the request";
+  bool isSLCSet, default="false", desc="SLC flag value in the request";
+
   bool functionalRead(Packet *pkt) {
     // Only PUTX messages contains the data block
     if (Type == CoherenceRequestType:VicDirty) {
@@ -165,6 +168,8 @@ structure(NBProbeRequestMsg, desc="...", interface="Message") {
   MachineID Requestor, desc="Requestor id for 3-hop requests";
   bool NoAckNeeded, default="false", desc="For short circuting acks";
   int ProgramCounter, desc="PC that accesses to this block";
+  bool isGLCSet, desc="Bypass L1 Cache";
+  bool isSLCSet, desc="Bypass L1 and L2 Caches";

   bool functionalRead(Packet *pkt) {
     return false;
@@ -248,6 +253,9 @@ structure(ResponseMsg, desc="...", interface="Message") {
   int ProgramCounter, desc="PC that issues this request";
   bool mispred, desc="tell TCP if the block should not be bypassed";

+  bool isGLCSet, default="false", desc="GLC flag value in the request that triggered response";
+  bool isSLCSet, default="false", desc="SLC flag value in the request that triggered response";
+
   bool functionalRead(Packet *pkt) {
     // Only PUTX messages contains the data block
@@ -277,6 +285,8 @@ structure(UnblockMsg, desc="...", interface="Message") {
   bool wasValid, default="false", desc="Was block valid when evicted";
   bool valid, default="false", desc="Is block valid";
   bool validToInvalid, default="false", desc="Was block valid when evicted";
+  bool isGLCSet, default="false", desc="GLC flag value in the request";
+  bool isSLCSet, default="false", desc="SLC flag value in the request";

   bool functionalRead(Packet *pkt) {
     return false;
@@ -321,6 +331,8 @@ structure(TriggerMsg, desc="...", interface="Message") {
   TriggerType Type, desc="Type of trigger";
   CacheId Dest, default="CacheId_NA", desc="Cache to invalidate";
   int ProgramCounter, desc="PC that accesses to this block";
+  bool isGLCSet, default="false", desc="GLC flag value in the request";
+  bool isSLCSet, default="false", desc="SLC flag value in the request";

   bool functionalRead(Packet *pkt) {
     return false;
diff --git a/src/mem/ruby/protocol/RubySlicc_MemControl.sm b/src/mem/ruby/protocol/RubySlicc_MemControl.sm
index e8517a4a07..012b169dea 100644
--- a/src/mem/ruby/protocol/RubySlicc_MemControl.sm
+++ b/src/mem/ruby/protocol/RubySlicc_MemControl.sm
@@ -74,6 +74,8 @@ structure(MemoryMsg, desc="...", interface="Message") {
   PrefetchBit Prefetch, desc="Is this a prefetch request";
   bool ReadX, desc="Exclusive";
   int Acks, desc="How many acks to expect";
+  bool isGLCSet, desc="Bypass L1 Cache";
+  bool isSLCSet, desc="Bypass L1 and L2 Caches";

   bool functionalRead(Packet *pkt) {
     if ((MessageSize == MessageSizeType:Response_Data) ||
diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm
index 8d76f78f76..8ba9d935ff 100644
--- a/src/mem/ruby/protocol/RubySlicc_Types.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Types.sm
@@ -177,6 +177,8 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") {
   int htmTransactionUid, desc="Used to identify the unique HTM transaction that produced this request";
   bool isTlbi, desc="Memory request is a TLB shootdown (invalidation) operation";
   Addr tlbiTransactionUid, desc="Unique identifier of the TLB shootdown operation that produced this request";
+  bool isGLCSet, default="false", desc="If flag is set, bypass GPU L1 cache";
+  bool isSLCSet, default="false", desc="If flag is set, bypass GPU L1 and L2 caches";

   RequestPtr getRequestPtr();
 }
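The RubyRequest changes below read the flags from the packet when one is attached and fall back to false otherwise, since some internally generated requests carry no packet. A minimal sketch of that guarded-initialization pattern, using illustrative stand-in types rather than gem5's Request/Packet:

```cpp
// Illustrative only: Req/Pkt stand in for gem5's Request/Packet.
struct Req { bool glc = false; bool slc = false; };
struct Pkt { const Req *req = nullptr; };

struct RubyReqModel
{
    bool isGLCSet = false;  // defaults used when no packet is attached
    bool isSLCSet = false;

    explicit RubyReqModel(const Pkt *pkt)
    {
        if (pkt) {          // internal requests may carry no packet
            isGLCSet = pkt->req->glc;
            isSLCSet = pkt->req->slc;
        }
    }
};

// Usage: RubyReqModel r(nullptr); leaves both flags false.
```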
diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh
index 2345c224fb..89ce83451e 100644
--- a/src/mem/ruby/slicc_interface/RubyRequest.hh
+++ b/src/mem/ruby/slicc_interface/RubyRequest.hh
@@ -79,6 +79,11 @@ class RubyRequest : public Message
     bool m_isTlbi;
     // Should be uint64, but SLICC complains about casts
     Addr m_tlbiTransactionUid;
+    // GPU cache bypass flags. If set to true, GLC bypasses the L1 and SLC
+    // bypasses both the L1 and the L2. They are false by default and must
+    // be explicitly set to true by the program in order to bypass caches.
+    bool m_isGLCSet;
+    bool m_isSLCSet;

     RubyRequest(Tick curTime, uint64_t _paddr, int _len, uint64_t _pc,
                 RubyRequestType _type, RubyAccessMode _access_mode,
@@ -99,6 +104,13 @@ class RubyRequest : public Message
           m_tlbiTransactionUid(0)
     {
         m_LineAddress = makeLineAddress(m_PhysicalAddress);
+        if (_pkt) {
+            m_isGLCSet = m_pkt->req->isGLCSet();
+            m_isSLCSet = m_pkt->req->isSLCSet();
+        } else {
+            m_isGLCSet = false;
+            m_isSLCSet = false;
+        }
     }

     /** RubyRequest for memory management commands */
@@ -120,6 +132,13 @@ class RubyRequest : public Message
           m_tlbiTransactionUid(0)
     {
         assert(m_pkt->req->isMemMgmt());
+        if (_pkt) {
+            m_isGLCSet = m_pkt->req->isGLCSet();
+            m_isSLCSet = m_pkt->req->isSLCSet();
+        } else {
+            m_isGLCSet = false;
+            m_isSLCSet = false;
+        }
     }

     RubyRequest(Tick curTime, uint64_t _paddr, int _len,
@@ -148,6 +167,13 @@ class RubyRequest : public Message
           m_tlbiTransactionUid(0)
     {
         m_LineAddress = makeLineAddress(m_PhysicalAddress);
+        if (_pkt) {
+            m_isGLCSet = m_pkt->req->isGLCSet();
+            m_isSLCSet = m_pkt->req->isSLCSet();
+        } else {
+            m_isGLCSet = false;
+            m_isSLCSet = false;
+        }
     }

     RubyRequest(Tick curTime, uint64_t _paddr, int _len,
@@ -177,6 +203,14 @@ class RubyRequest : public Message
           m_tlbiTransactionUid(0)
     {
         m_LineAddress = makeLineAddress(m_PhysicalAddress);
+        if (_pkt) {
+            m_isGLCSet = m_pkt->req->isGLCSet();
+            m_isSLCSet = m_pkt->req->isSLCSet();
+
+        } else {
+            m_isGLCSet = false;
+            m_isSLCSet = false;
+        }
     }

     RubyRequest(Tick curTime) : Message(curTime) {}
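Taken together, the flags define a simple cacheability contract, as documented in the packet.hh/request.hh comments at the top of this patch: GLC marks the request globally coherent and skips the TCP (GPU L1), while SLC marks it system-level coherent and skips both TCP and TCC (L1 and L2). A consumer-side sketch using the Packet accessors added by this change; the helper functions below are hypothetical and the GLC_BIT/SLC_BIT flag definitions live outside this excerpt:

```cpp
#include "mem/packet.hh"

namespace gem5
{

// Hypothetical helpers (not part of the patch) a cache model could use
// to decide where a request's data may be allocated.

// GLC: globally coherent, so skip the TCP (GPU L1).
bool cacheableInL1(const Packet *pkt)
{
    return !pkt->isGLCSet() && !pkt->isSLCSet();
}

// SLC: system-level coherent, so skip both TCP (L1) and TCC (L2).
bool cacheableInL2(const Packet *pkt)
{
    return !pkt->isSLCSet();
}

} // namespace gem5
```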