gpu-compute,mem-ruby: Add support for GPU cache bypassing

The GPU cache models do not support cache bypassing when the GLC or SLC
AMDGPU instruction modifiers are used in a load or store. This commit
adds cache bypass support by introducing new transitions in the
coherence protocol used by the GPU memory system. Now, instructions with
the GLC bit set will not cache in the L1 and instructions with SLC bit
set will not cache in L1 or L2.

Change-Id: Id29a47b0fa7e16a21a7718949db802f85e9897c3
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/66991
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
This commit is contained in:
Vishnu Ramadas
2022-12-26 19:14:11 -06:00
committed by VISHNU RAMADAS
parent 5db889572a
commit 66d4a15820
9 changed files with 316 additions and 8 deletions

View File

@@ -1100,6 +1100,16 @@ class Packet : public Printable
flags.set(VALID_SIZE);
}
/**
* Accessor functions for the cache bypass flags. The cache bypass
* can specify which levels in the hierarchy to bypass. If GLC_BIT
* is set, the requests are globally coherent and bypass TCP.
* If SLC_BIT is set, then the requests are system level coherent
* and bypass both TCP and TCC.
*/
bool isGLCSet() const { return req->isGLCSet();}
bool isSLCSet() const { return req->isSLCSet();}
/**
* Check if packet corresponds to a given block-aligned address and
* address space.

View File

@@ -1071,6 +1071,17 @@ class Request
bool isAcquire() const { return _cacheCoherenceFlags.isSet(ACQUIRE); }
/**
* Accessor functions for the cache bypass flags. The cache bypass
* can specify which levels in the hierarchy to bypass. If GLC_BIT
* is set, the requests are globally coherent and bypass TCP.
* If SLC_BIT is set, then the requests are system level coherent
* and bypass both TCP and TCC.
*/
bool isGLCSet() const {return _cacheCoherenceFlags.isSet(GLC_BIT); }
bool isSLCSet() const {return _cacheCoherenceFlags.isSet(SLC_BIT); }
/**
* Accessor functions for the memory space configuration flags and used by
* GPU ISAs such as the Heterogeneous System Architecture (HSA). Note that

View File

@@ -56,8 +56,10 @@ machine(MachineType:TCC, "TCC Cache")
enumeration(Event, desc="TCC Events") {
// Requests coming from the Cores
RdBlk, desc="RdBlk event";
RdBypassEvict, desc="Bypass L2 on reads. Evict if cache block already allocated";
WrVicBlk, desc="L1 Write Through";
WrVicBlkBack, desc="L1 Write Through(dirty cache)";
WrVicBlkEvict, desc="L1 Write Through(dirty cache) and evict";
Atomic, desc="Atomic Op";
AtomicDone, desc="AtomicOps Complete";
AtomicNotDone, desc="AtomicOps not Complete";
@@ -68,6 +70,7 @@ machine(MachineType:TCC, "TCC Cache")
PrbInv, desc="Invalidating probe";
// Coming from Memory Controller
WBAck, desc="writethrough ack from memory";
Bypass, desc="Bypass the entire L2 cache";
}
// STATES
@@ -107,6 +110,8 @@ machine(MachineType:TCC, "TCC Cache")
NetDest Destination, desc="Data destination";
int numAtomics, desc="number remaining atomics";
int atomicDoneCnt, desc="number AtomicDones triggered";
bool isGLCSet, desc="Bypass L1 Cache";
bool isSLCSet, desc="Bypass L1 and L2 Cache";
}
structure(TBETable, external="yes") {
@@ -173,7 +178,6 @@ machine(MachineType:TCC, "TCC Cache")
int functionalWrite(Addr addr, Packet *pkt) {
int num_functional_writes := 0;
TBE tbe := TBEs.lookup(addr);
if(is_valid(tbe)) {
num_functional_writes := num_functional_writes +
@@ -279,7 +283,11 @@ machine(MachineType:TCC, "TCC Cache")
peek(responseFromNB_in, ResponseMsg, block_on="addr") {
TBE tbe := TBEs.lookup(in_msg.addr);
Entry cache_entry := getCacheEntry(in_msg.addr);
if (in_msg.Type == CoherenceResponseType:NBSysResp) {
if (in_msg.isSLCSet) {
// If the SLC bit is set, the response needs to bypass the cache
// and should not be allocated an entry.
trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
} else if (in_msg.Type == CoherenceResponseType:NBSysResp) {
if(presentOrAvail(in_msg.addr)) {
trigger(Event:Data, in_msg.addr, cache_entry, tbe);
} else {
@@ -313,7 +321,18 @@ machine(MachineType:TCC, "TCC Cache")
TBE tbe := TBEs.lookup(in_msg.addr);
Entry cache_entry := getCacheEntry(in_msg.addr);
if (in_msg.Type == CoherenceRequestType:WriteThrough) {
if(WB) {
if (in_msg.isSLCSet) {
// The request should bypass the cache if SLC bit is set.
// If the cache entry exists already, then evict it.
// Else, perform a normal cache access.
// The cache entry is allocated only on response and bypass is
// handled there
if(presentOrAvail(in_msg.addr)) {
trigger(Event:WrVicBlkEvict, in_msg.addr, cache_entry, tbe);
} else {
trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
}
} else if(WB) {
if(presentOrAvail(in_msg.addr)) {
trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe);
} else {
@@ -326,7 +345,13 @@ machine(MachineType:TCC, "TCC Cache")
} else if (in_msg.Type == CoherenceRequestType:Atomic) {
trigger(Event:Atomic, in_msg.addr, cache_entry, tbe);
} else if (in_msg.Type == CoherenceRequestType:RdBlk) {
trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
if (in_msg.isSLCSet) {
// If SLC bit is set, the request needs to go directly to memory.
// If a cache block already exists, then evict it.
trigger(Event:RdBypassEvict, in_msg.addr, cache_entry, tbe);
} else {
trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
}
} else {
DPRINTF(RubySlicc, "%s\n", in_msg);
error("Unexpected Response Message to Core");
@@ -354,6 +379,8 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.MessageSize := MessageSizeType:Response_Data;
out_msg.Dirty := false;
out_msg.State := CoherenceState:Shared;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
@@ -371,15 +398,46 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Dirty := false;
out_msg.State := CoherenceState:Shared;
DPRINTF(RubySlicc, "%s\n", out_msg);
peek(responseFromNB_in, ResponseMsg) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
enqueue(unblockToNB_out, UnblockMsg, 1) {
out_msg.addr := address;
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.MessageSize := MessageSizeType:Unblock_Control;
peek(responseFromNB_in, ResponseMsg) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
action(rb_bypassDone, "rb", desc="bypass L2 of read access") {
peek(responseFromNB_in, ResponseMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:TDSysResp;
out_msg.Sender := machineID;
out_msg.Destination := tbe.Destination;
out_msg.DataBlk := in_msg.DataBlk;
out_msg.MessageSize := MessageSizeType:Response_Data;
out_msg.Dirty := false;
out_msg.State := CoherenceState:Shared;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
enqueue(unblockToNB_out, UnblockMsg, 1) {
out_msg.addr := address;
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.MessageSize := MessageSizeType:Unblock_Control;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
}
action(rd_requestData, "r", desc="Miss in L2, pass on") {
if(tbe.Destination.count()==1){
@@ -391,6 +449,8 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.Shared := false; // unneeded for this request
out_msg.MessageSize := in_msg.MessageSize;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
@@ -407,6 +467,9 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
out_msg.instSeqNum := in_msg.instSeqNum;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -421,6 +484,9 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
out_msg.instSeqNum := in_msg.instSeqNum;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -434,6 +500,9 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Sender := machineID;
out_msg.MessageSize := in_msg.MessageSize;
out_msg.DataBlk := in_msg.DataBlk;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -466,6 +535,8 @@ machine(MachineType:TCC, "TCC Cache")
peek(coreRequestNetwork_in, CPURequestMsg) {
if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){
tbe.Destination.add(in_msg.Requestor);
tbe.isGLCSet := in_msg.isGLCSet;
tbe.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -505,6 +576,8 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.DataBlk := in_msg.DataBlk;
out_msg.writeMask.orMask(in_msg.writeMask);
out_msg.instSeqNum := in_msg.instSeqNum;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -520,6 +593,10 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Dirty := true;
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.writeMask.orMask(cache_entry.writeMask);
peek(coreRequestNetwork_in, CPURequestMsg) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -534,6 +611,8 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Type := CoherenceRequestType:Atomic;
out_msg.Dirty := true;
out_msg.writeMask.orMask(in_msg.writeMask);
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -549,6 +628,10 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Ntsl := true;
out_msg.State := CoherenceState:NA;
out_msg.MessageSize := MessageSizeType:Response_Control;
peek(probeNetwork_in, NBProbeRequestMsg) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
@@ -592,6 +675,10 @@ machine(MachineType:TCC, "TCC Cache")
tbe.atomicDoneCnt := tbe.atomicDoneCnt + 1;
out_msg.addr := address;
out_msg.Type := TriggerType:AtomicDone;
peek(responseFromNB_in, ResponseMsg) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
}
@@ -659,6 +746,54 @@ machine(MachineType:TCC, "TCC Cache")
p_popRequestQueue;
}
transition(I, RdBypassEvict) {TagArrayRead} {
p_profileMiss;
t_allocateTBE;
rd_requestData;
p_popRequestQueue;
}
// Transition to be called when a read request with SLC flag set arrives at
// entry in state W. It evicts and invalidates the cache entry before
// forwarding the request to global memory
transition(W, RdBypassEvict, I) {TagArrayRead} {
p_profileMiss;
t_allocateTBE;
wb_writeBack;
i_invL2;
rd_requestData;
p_popRequestQueue;
}
// Transition to be called when a read request with SLC flag set arrives at
// entry in state M. It evicts and invalidates the cache entry before
// forwarding the request to main memory
transition(M, RdBypassEvict, I) {TagArrayRead} {
p_profileMiss;
t_allocateTBE;
wb_writeBack;
i_invL2;
rd_requestData;
p_popRequestQueue;
}
// Transition to be called when a read request with SLC flag set arrives at
// entry in state V. It invalidates the cache entry before forwarding the
// request to global memory.
transition(V, RdBypassEvict, I) {TagArrayRead} {
p_profileMiss;
t_allocateTBE;
i_invL2;
rd_requestData;
p_popRequestQueue;
}
// Transition to be called when a read request with SLC flag arrives at entry
// in transient state. The request stalls until the pending transition is complete.
transition({WI, IV}, RdBypassEvict) {
st_stallAndWaitRequest;
}
transition(V, Atomic, A) {TagArrayRead} {
p_profileHit;
i_invL2;
@@ -730,6 +865,31 @@ transition(I, Atomic, A) {TagArrayRead} {
p_popRequestQueue;
}
// Transition to be called when a write request with SLC bit set arrives at an
// entry with state V. The entry has to be evicted and invalidated before the
// request is forwarded to global memory
transition(V, WrVicBlkEvict, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
p_profileMiss;
ut_updateTag;
t_allocateTBE;
wt_writeThrough;
i_invL2;
p_popRequestQueue;
}
// Transition to be called when a write request with SLC bit set arrives at an
// entry with state W. The entry has to be evicted and invalidated before the
// request is forwarded to global memory.
transition(W, WrVicBlkEvict, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
p_profileMiss;
ut_updateTag;
wdb_writeDirtyBytes;
t_allocateTBE;
wb_writeBack;
i_invL2;
p_popRequestQueue;
}
transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} {
t_allocateTBE;
wb_writeBack;
@@ -764,6 +924,16 @@ transition(I, Atomic, A) {TagArrayRead} {
pp_popProbeQueue;
}
// Transition to be called when the response for a request with SLC bit set
// arrives. The request has to be forwarded to the core that needs it while
// making sure no entry is allocated.
transition(I, Bypass, I) {
rb_bypassDone;
pr_popResponseQueue;
wada_wakeUpAllDependentsAddr;
dt_deallocateTBE;
}
transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
a_allocateBlock;
ut_updateTag;

View File

@@ -60,6 +60,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
enumeration(Event, desc="TCP Events") {
// Core initiated
Load, desc="Load";
LoadBypassEvict, desc="Bypass L1 on a load. Evict if cache block already allocated";
Store, desc="Store to L1 (L1 is dirty)";
StoreThrough, desc="Store directly to L2(L1 is clean)";
Atomic, desc="Atomic";
@@ -256,8 +257,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
Entry cache_entry := getCacheEntry(in_msg.addr);
TBE tbe := TBEs.lookup(in_msg.addr);
if (in_msg.Type == CoherenceResponseType:TDSysResp) {
// disable L1 cache
if (disableL1) {
if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
// If L1 is disabled or requests have GLC or SLC flag set,
// then, the requests should not cache in the L1. The response
// from L2/global memory should bypass the cache
trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) {
@@ -284,13 +287,23 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
TBE tbe := TBEs.lookup(in_msg.LineAddress);
DPRINTF(RubySlicc, "%s\n", in_msg);
if (in_msg.Type == RubyRequestType:LD) {
trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
if ((in_msg.isGLCSet || in_msg.isSLCSet) && is_valid(cache_entry)) {
// Read requests with GLC or SLC bit set should not cache in the L1.
// They need to bypass the L1 and go to the L2. If an entry exists
// in the L1, it needs to be evicted
trigger(Event:LoadBypassEvict, in_msg.LineAddress, cache_entry, tbe);
}
else {
trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
}
} else if (in_msg.Type == RubyRequestType:ATOMIC ||
in_msg.Type == RubyRequestType:ATOMIC_RETURN ||
in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN) {
trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:ST) {
if(disableL1) {
if(disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
// Write requests with GLC or SLC bit set, or when L1 is disabled,
// should not cache in the L1. They need to perform a store through
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
@@ -330,6 +343,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := curCycle();
peek(mandatoryQueue_in, RubyRequest) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -375,6 +392,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := curCycle();
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -401,6 +420,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
// forward inst sequence number to lower TCC
peek(mandatoryQueue_in, RubyRequest) {
out_msg.instSeqNum := in_msg.instSeqNum;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -418,6 +439,11 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
out_msg.Type := CoherenceRequestType:Atomic;
out_msg.InitialRequestTime := curCycle();
out_msg.Shared := false;
peek(mandatoryQueue_in, RubyRequest) {
out_msg.instSeqNum := in_msg.instSeqNum;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
}
@@ -583,6 +609,17 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
p_popMandatoryQueue;
}
// Transition to be called when a load request with GLC or SLC flag set arrives
// at L1. This transition invalidates any existing entry and forwards the
// request to L2.
transition(V, LoadBypassEvict, I) {TagArrayRead, TagArrayWrite} {
uu_profileDataMiss;
inv_invDone;
ic_invCache;
n_issueRdBlk;
p_popMandatoryQueue;
}
transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} {
t_allocateTBE;
mru_updateMRU;

View File

@@ -161,6 +161,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
uint64_t probe_id, desc="probe id for lifetime profiling";
WriteMask writeMask, desc="outstanding write through mask";
int Len, desc="Length of memory request for DMA";
bool isGLCSet, desc="Bypass L1 Cache";
bool isSLCSet, desc="Bypass L1 and L2 Cache";
}
structure(TBETable, external="yes") {
@@ -483,6 +485,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
out_msg.OriginalResponder := tbe.LastSender;
out_msg.L3Hit := tbe.L3Hit;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
@@ -512,6 +516,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
out_msg.OriginalResponder := tbe.LastSender;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
if(tbe.atomicData){
out_msg.WTRequestor := tbe.WTRequestor;
}
@@ -540,6 +546,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.InitialRequestTime := tbe.InitialRequestTime;
out_msg.ForwardRequestTime := curCycle();
out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
@@ -557,6 +565,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.ForwardRequestTime := curCycle();
out_msg.ProbeRequestStartTime := curCycle();
out_msg.instSeqNum := in_msg.instSeqNum;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -569,6 +579,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := in_msg.DataBlk;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -624,6 +636,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.Type := MemoryRequestType:MEMORY_READ;
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -739,6 +753,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
tbe.NumPendingAcks := out_msg.Destination.count();
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
APPEND_TRANSITION_COMMENT(" dc: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
@@ -842,6 +858,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.ReturnData := true;
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
tbe.NumPendingAcks := out_msg.Destination.count();
DPRINTF(RubySlicc, "%s\n", (out_msg));
APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
@@ -897,6 +915,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.ReturnData := false;
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
tbe.NumPendingAcks := out_msg.Destination.count();
APPEND_TRANSITION_COMMENT(" ic: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
@@ -923,6 +943,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := in_msg.DataBlk;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
if (tbe.Dirty == false) {
// have to update the TBE, too, because of how this
@@ -985,6 +1007,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
tbe.NumPendingAcks := 0;
tbe.Cached := in_msg.ForceShared;
tbe.InitialRequestTime := in_msg.InitialRequestTime;
tbe.isGLCSet := in_msg.isGLCSet;
tbe.isSLCSet := in_msg.isSLCSet;
}
}
@@ -1004,6 +1028,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := tbe.DataBlk;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(ProtocolTrace, "%s\n", out_msg);
}
}
@@ -1104,6 +1130,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := victim_entry.DataBlk;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
L3CacheMemory.deallocate(victim);
}
@@ -1136,6 +1164,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := victim_entry.DataBlk;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
}
L3CacheMemory.deallocate(victim);
}

View File

@@ -138,6 +138,9 @@ structure(CPURequestMsg, desc="...", interface="Message") {
bool NoWriteConflict, default="true", desc="write collided with CAB entry";
int ProgramCounter, desc="PC that accesses to this block";
bool isGLCSet, default="false", desc="GLC flag value in the request";
bool isSLCSet, default="false", desc="SLC flag value in the request";
bool functionalRead(Packet *pkt) {
// Only PUTX messages contains the data block
if (Type == CoherenceRequestType:VicDirty) {
@@ -165,6 +168,8 @@ structure(NBProbeRequestMsg, desc="...", interface="Message") {
MachineID Requestor, desc="Requestor id for 3-hop requests";
bool NoAckNeeded, default="false", desc="For short circuting acks";
int ProgramCounter, desc="PC that accesses to this block";
bool isGLCSet, desc="Bypass L1 Cache";
bool isSLCSet, desc="Bypass L1 and L2 Caches";
bool functionalRead(Packet *pkt) {
return false;
@@ -248,6 +253,9 @@ structure(ResponseMsg, desc="...", interface="Message") {
int ProgramCounter, desc="PC that issues this request";
bool mispred, desc="tell TCP if the block should not be bypassed";
bool isGLCSet, default="false", desc="GLC flag value in the request that triggered response";
bool isSLCSet, default="false", desc="SLC flag value in the request that triggered response";
bool functionalRead(Packet *pkt) {
// Only PUTX messages contains the data block
@@ -277,6 +285,8 @@ structure(UnblockMsg, desc="...", interface="Message") {
bool wasValid, default="false", desc="Was block valid when evicted";
bool valid, default="false", desc="Is block valid";
bool validToInvalid, default="false", desc="Was block valid when evicted";
bool isGLCSet, default="false", desc="GLC flag value in the request";
bool isSLCSet, default="false", desc="SLC flag value in the request";
bool functionalRead(Packet *pkt) {
return false;
@@ -321,6 +331,8 @@ structure(TriggerMsg, desc="...", interface="Message") {
TriggerType Type, desc="Type of trigger";
CacheId Dest, default="CacheId_NA", desc="Cache to invalidate";
int ProgramCounter, desc="PC that accesses to this block";
bool isGLCSet, default="false", desc="GLC flag value in the request";
bool isSLCSet, default="false", desc="SLC flag value in the request";
bool functionalRead(Packet *pkt) {
return false;

View File

@@ -74,6 +74,8 @@ structure(MemoryMsg, desc="...", interface="Message") {
PrefetchBit Prefetch, desc="Is this a prefetch request";
bool ReadX, desc="Exclusive";
int Acks, desc="How many acks to expect";
bool isGLCSet, desc="Bypass L1 Cache";
bool isSLCSet, desc="Bypass L1 and L2 Caches";
bool functionalRead(Packet *pkt) {
if ((MessageSize == MessageSizeType:Response_Data) ||

View File

@@ -177,6 +177,8 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") {
int htmTransactionUid, desc="Used to identify the unique HTM transaction that produced this request";
bool isTlbi, desc="Memory request is a TLB shootdown (invalidation) operation";
Addr tlbiTransactionUid, desc="Unique identifier of the TLB shootdown operation that produced this request";
bool isGLCSet, default="false",desc="If flag is set, bypass GPU L1 cache";
bool isSLCSet, default="false",desc="If flag is set, bypass GPU L1 and L2 caches";
RequestPtr getRequestPtr();
}

View File

@@ -79,6 +79,11 @@ class RubyRequest : public Message
bool m_isTlbi;
// Should be uint64, but SLICC complains about casts
Addr m_tlbiTransactionUid;
// GPU cache bypass flags. GLC bypasses L1 while SLC bypasses both L1 and
// L2 if set to true. They are set to false by default and they must be
// explicitly set to true in the program in order to bypass caches
bool m_isGLCSet;
bool m_isSLCSet;
RubyRequest(Tick curTime, uint64_t _paddr, int _len,
uint64_t _pc, RubyRequestType _type, RubyAccessMode _access_mode,
@@ -99,6 +104,13 @@ class RubyRequest : public Message
m_tlbiTransactionUid(0)
{
m_LineAddress = makeLineAddress(m_PhysicalAddress);
if (_pkt) {
m_isGLCSet = m_pkt->req->isGLCSet();
m_isSLCSet = m_pkt->req->isSLCSet();
} else {
m_isGLCSet = 0;
m_isSLCSet = 0;
}
}
/** RubyRequest for memory management commands */
@@ -120,6 +132,13 @@ class RubyRequest : public Message
m_tlbiTransactionUid(0)
{
assert(m_pkt->req->isMemMgmt());
if (_pkt) {
m_isGLCSet = m_pkt->req->isGLCSet();
m_isSLCSet = m_pkt->req->isSLCSet();
} else {
m_isGLCSet = 0;
m_isSLCSet = 0;
}
}
RubyRequest(Tick curTime, uint64_t _paddr, int _len,
@@ -148,6 +167,13 @@ class RubyRequest : public Message
m_tlbiTransactionUid(0)
{
m_LineAddress = makeLineAddress(m_PhysicalAddress);
if (_pkt) {
m_isGLCSet = m_pkt->req->isGLCSet();
m_isSLCSet = m_pkt->req->isSLCSet();
} else {
m_isGLCSet = 0;
m_isSLCSet = 0;
}
}
RubyRequest(Tick curTime, uint64_t _paddr, int _len,
@@ -177,6 +203,14 @@ class RubyRequest : public Message
m_tlbiTransactionUid(0)
{
m_LineAddress = makeLineAddress(m_PhysicalAddress);
if (_pkt) {
m_isGLCSet = m_pkt->req->isGLCSet();
m_isSLCSet = m_pkt->req->isSLCSet();
} else {
m_isGLCSet = 0;
m_isSLCSet = 0;
}
}
RubyRequest(Tick curTime) : Message(curTime) {}