diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 9238dbec00..a80b918798 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -1100,6 +1100,16 @@ class Packet : public Printable flags.set(VALID_SIZE); } + /** + * Accessor functions for the cache bypass flags. The cache bypass + * can specify which levels in the hierarchy to bypass. If GLC_BIT + * is set, the requests are globally coherent and bypass TCP. + * If SLC_BIT is set, then the requests are system level coherent + * and bypass both TCP and TCC. + */ + bool isGLCSet() const { return req->isGLCSet();} + bool isSLCSet() const { return req->isSLCSet();} + /** * Check if packet corresponds to a given block-aligned address and * address space. diff --git a/src/mem/request.hh b/src/mem/request.hh index 39d9d7281c..6a0cbc21d4 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -1071,6 +1071,17 @@ class Request bool isAcquire() const { return _cacheCoherenceFlags.isSet(ACQUIRE); } + + /** + * Accessor functions for the cache bypass flags. The cache bypass + * can specify which levels in the hierarchy to bypass. If GLC_BIT + * is set, the requests are globally coherent and bypass TCP. + * If SLC_BIT is set, then the requests are system level coherent + * and bypass both TCP and TCC. + */ + bool isGLCSet() const {return _cacheCoherenceFlags.isSet(GLC_BIT); } + bool isSLCSet() const {return _cacheCoherenceFlags.isSet(SLC_BIT); } + /** * Accessor functions for the memory space configuration flags and used by * GPU ISAs such as the Heterogeneous System Architecture (HSA). Note that diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index 032a64cec4..ae142471fa 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -56,8 +56,10 @@ machine(MachineType:TCC, "TCC Cache") enumeration(Event, desc="TCC Events") { // Requests coming from the Cores RdBlk, desc="RdBlk event"; + RdBypassEvict, desc="Bypass L2 on reads. Evict if cache block already allocated"; WrVicBlk, desc="L1 Write Through"; WrVicBlkBack, desc="L1 Write Through(dirty cache)"; + WrVicBlkEvict, desc="L1 Write Through(dirty cache) and evict"; Atomic, desc="Atomic Op"; AtomicDone, desc="AtomicOps Complete"; AtomicNotDone, desc="AtomicOps not Complete"; @@ -68,6 +70,7 @@ machine(MachineType:TCC, "TCC Cache") PrbInv, desc="Invalidating probe"; // Coming from Memory Controller WBAck, desc="writethrough ack from memory"; + Bypass, desc="Bypass the entire L2 cache"; } // STATES @@ -107,6 +110,8 @@ machine(MachineType:TCC, "TCC Cache") NetDest Destination, desc="Data destination"; int numAtomics, desc="number remaining atomics"; int atomicDoneCnt, desc="number AtomicDones triggered"; + bool isGLCSet, desc="Bypass L1 Cache"; + bool isSLCSet, desc="Bypass L1 and L2 Cache"; } structure(TBETable, external="yes") { @@ -173,7 +178,6 @@ machine(MachineType:TCC, "TCC Cache") int functionalWrite(Addr addr, Packet *pkt) { int num_functional_writes := 0; - TBE tbe := TBEs.lookup(addr); if(is_valid(tbe)) { num_functional_writes := num_functional_writes + @@ -279,7 +283,11 @@ machine(MachineType:TCC, "TCC Cache") peek(responseFromNB_in, ResponseMsg, block_on="addr") { TBE tbe := TBEs.lookup(in_msg.addr); Entry cache_entry := getCacheEntry(in_msg.addr); - if (in_msg.Type == CoherenceResponseType:NBSysResp) { + if (in_msg.isSLCSet) { + // If the SLC bit is set, the response needs to bypass the cache + // and should not be allocated an entry. 
+          trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceResponseType:NBSysResp) {
           if(presentOrAvail(in_msg.addr)) {
             trigger(Event:Data, in_msg.addr, cache_entry, tbe);
           } else {
@@ -313,7 +321,18 @@ machine(MachineType:TCC, "TCC Cache")
         TBE tbe := TBEs.lookup(in_msg.addr);
         Entry cache_entry := getCacheEntry(in_msg.addr);
         if (in_msg.Type == CoherenceRequestType:WriteThrough) {
-          if(WB) {
+          if (in_msg.isSLCSet) {
+            // The request should bypass the cache if the SLC bit is set.
+            // If the cache entry already exists, then evict it.
+            // Otherwise, perform a normal cache access.
+            // The cache entry is allocated only on response, and the
+            // bypass is handled there.
+            if(presentOrAvail(in_msg.addr)) {
+              trigger(Event:WrVicBlkEvict, in_msg.addr, cache_entry, tbe);
+            } else {
+              trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
+            }
+          } else if(WB) {
             if(presentOrAvail(in_msg.addr)) {
               trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe);
             } else {
@@ -326,7 +345,13 @@ machine(MachineType:TCC, "TCC Cache")
       } else if (in_msg.Type == CoherenceRequestType:Atomic) {
         trigger(Event:Atomic, in_msg.addr, cache_entry, tbe);
       } else if (in_msg.Type == CoherenceRequestType:RdBlk) {
-        trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+        if (in_msg.isSLCSet) {
+          // If the SLC bit is set, the request needs to go directly to
+          // memory. If a cache block already exists, then evict it.
+          trigger(Event:RdBypassEvict, in_msg.addr, cache_entry, tbe);
+        } else {
+          trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+        }
       } else {
         DPRINTF(RubySlicc, "%s\n", in_msg);
         error("Unexpected Response Message to Core");
@@ -354,6 +379,8 @@ machine(MachineType:TCC, "TCC Cache")
           out_msg.MessageSize := MessageSizeType:Response_Data;
           out_msg.Dirty := false;
           out_msg.State := CoherenceState:Shared;
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
           DPRINTF(RubySlicc, "%s\n", out_msg);
         }
       }
@@ -371,15 +398,46 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Dirty := false;
       out_msg.State := CoherenceState:Shared;
       DPRINTF(RubySlicc, "%s\n", out_msg);
+      peek(responseFromNB_in, ResponseMsg) {
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+      }
     }
     enqueue(unblockToNB_out, UnblockMsg, 1) {
       out_msg.addr := address;
       out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
       out_msg.MessageSize := MessageSizeType:Unblock_Control;
+      peek(responseFromNB_in, ResponseMsg) {
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+      }
       DPRINTF(RubySlicc, "%s\n", out_msg);
     }
   }

+  action(rb_bypassDone, "rb", desc="Bypass the L2 on a read access") {
+    peek(responseFromNB_in, ResponseMsg) {
+      enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:TDSysResp;
+        out_msg.Sender := machineID;
+        out_msg.Destination := tbe.Destination;
+        out_msg.DataBlk := in_msg.DataBlk;
+        out_msg.MessageSize := MessageSizeType:Response_Data;
+        out_msg.Dirty := false;
+        out_msg.State := CoherenceState:Shared;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+      enqueue(unblockToNB_out, UnblockMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
+        out_msg.MessageSize := MessageSizeType:Unblock_Control;
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+      }
+    }
+  }

   action(rd_requestData, "r", desc="Miss in L2, pass on") {
     if(tbe.Destination.count()==1){
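Aside on the response path above (commentary, not part of the patch): `rb_bypassDone` forwards north-bridge data straight to the requesting core without touching the cache arrays, while the normal data path allocates on response. A minimal, self-contained C++ sketch of that decision; all names here are hypothetical stand-ins, not gem5 types, and the non-bypass else-branch event name is assumed since it is truncated in this excerpt:

```cpp
#include <iostream>

// Hypothetical stand-in for the fields consulted by responseFromNB_in.
struct ResponseModel { bool isSLCSet; };

// An SLC response bypasses the L2 outright; otherwise the controller
// allocates only if the tag is present or a way is available.
const char *classify(const ResponseModel &rsp, bool presentOrAvail)
{
    if (rsp.isSLCSet)
        return "Bypass";                               // no L2 entry allocated
    return presentOrAvail ? "Data" : "DataNoAlloc";    // else-name assumed
}

int main()
{
    std::cout << classify({true}, true) << '\n';   // Bypass: SLC wins even
                                                   // when a way is available
    std::cout << classify({false}, true) << '\n';  // Data: normal allocation
}
```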
@@ -391,6 +449,8 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
       out_msg.Shared := false; // unneeded for this request
       out_msg.MessageSize := in_msg.MessageSize;
+      out_msg.isGLCSet := in_msg.isGLCSet;
+      out_msg.isSLCSet := in_msg.isSLCSet;
       DPRINTF(RubySlicc, "%s\n", out_msg);
     }
   }
@@ -407,6 +467,9 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Sender := machineID;
       out_msg.MessageSize := MessageSizeType:Writeback_Control;
       out_msg.instSeqNum := in_msg.instSeqNum;
+      out_msg.isGLCSet := in_msg.isGLCSet;
+      out_msg.isSLCSet := in_msg.isSLCSet;
+
     }
   }
 }
@@ -421,6 +484,9 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Sender := machineID;
       out_msg.MessageSize := MessageSizeType:Writeback_Control;
       out_msg.instSeqNum := in_msg.instSeqNum;
+      out_msg.isGLCSet := in_msg.isGLCSet;
+      out_msg.isSLCSet := in_msg.isSLCSet;
+
     }
   }
 }
@@ -434,6 +500,9 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Sender := machineID;
       out_msg.MessageSize := in_msg.MessageSize;
       out_msg.DataBlk := in_msg.DataBlk;
+      out_msg.isGLCSet := in_msg.isGLCSet;
+      out_msg.isSLCSet := in_msg.isSLCSet;
+
     }
   }
 }
@@ -466,6 +535,8 @@ machine(MachineType:TCC, "TCC Cache")
     peek(coreRequestNetwork_in, CPURequestMsg) {
       if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){
         tbe.Destination.add(in_msg.Requestor);
+        tbe.isGLCSet := in_msg.isGLCSet;
+        tbe.isSLCSet := in_msg.isSLCSet;
       }
     }
   }
@@ -505,6 +576,8 @@ machine(MachineType:TCC, "TCC Cache")
         out_msg.DataBlk := in_msg.DataBlk;
         out_msg.writeMask.orMask(in_msg.writeMask);
         out_msg.instSeqNum := in_msg.instSeqNum;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
       }
     }
   }
@@ -520,6 +593,10 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Dirty := true;
       out_msg.DataBlk := cache_entry.DataBlk;
       out_msg.writeMask.orMask(cache_entry.writeMask);
+      peek(coreRequestNetwork_in, CPURequestMsg) {
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+      }
     }
   }
@@ -534,6 +611,8 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Type := CoherenceRequestType:Atomic;
       out_msg.Dirty := true;
       out_msg.writeMask.orMask(in_msg.writeMask);
+      out_msg.isGLCSet := in_msg.isGLCSet;
+      out_msg.isSLCSet := in_msg.isSLCSet;
     }
   }
 }
@@ -549,6 +628,10 @@ machine(MachineType:TCC, "TCC Cache")
       out_msg.Ntsl := true;
       out_msg.State := CoherenceState:NA;
       out_msg.MessageSize := MessageSizeType:Response_Control;
+      peek(probeNetwork_in, NBProbeRequestMsg) {
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+      }
     }
   }
   action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
@@ -592,6 +675,10 @@ machine(MachineType:TCC, "TCC Cache")
         tbe.atomicDoneCnt := tbe.atomicDoneCnt + 1;
         out_msg.addr := address;
         out_msg.Type := TriggerType:AtomicDone;
+        peek(responseFromNB_in, ResponseMsg) {
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
+        }
       }
     }
   }
@@ -659,6 +746,54 @@ machine(MachineType:TCC, "TCC Cache")
     p_popRequestQueue;
   }

+  transition(I, RdBypassEvict) {TagArrayRead} {
+    p_profileMiss;
+    t_allocateTBE;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+
+  // Transition to be called when a read request with the SLC flag set
+  // arrives at an entry in state W. It evicts and invalidates the cache
+  // entry before forwarding the request to global memory.
+  transition(W, RdBypassEvict, I) {TagArrayRead} {
+    p_profileMiss;
+    t_allocateTBE;
+    wb_writeBack;
+    i_invL2;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+
+  // Transition to be called when a read request with the SLC flag set
+  // arrives at an entry in state M. It evicts and invalidates the cache
+  // entry before forwarding the request to main memory.
+  transition(M, RdBypassEvict, I) {TagArrayRead} {
+    p_profileMiss;
+    t_allocateTBE;
+    wb_writeBack;
+    i_invL2;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+
+  // Transition to be called when a read request with the SLC flag set
+  // arrives at an entry in state V. It invalidates the cache entry before
+  // forwarding the request to global memory.
+  transition(V, RdBypassEvict, I) {TagArrayRead} {
+    p_profileMiss;
+    t_allocateTBE;
+    i_invL2;
+    rd_requestData;
+    p_popRequestQueue;
+  }
+
+  // Transition to be called when a read request with the SLC flag set arrives
+  // at an entry in a transient state. The request stalls until the pending transition completes.
+  transition({WI, IV}, RdBypassEvict) {
+    st_stallAndWaitRequest;
+  }
+
   transition(V, Atomic, A) {TagArrayRead} {
     p_profileHit;
     i_invL2;
@@ -730,6 +865,31 @@ transition(I, Atomic, A) {TagArrayRead} {
     p_popRequestQueue;
   }

+  // Transition to be called when a write request with the SLC bit set
+  // arrives at an entry in state V. The entry has to be evicted and
+  // invalidated before the request is forwarded to global memory.
+  transition(V, WrVicBlkEvict, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    p_profileMiss;
+    ut_updateTag;
+    t_allocateTBE;
+    wt_writeThrough;
+    i_invL2;
+    p_popRequestQueue;
+  }
+
+  // Transition to be called when a write request with the SLC bit set
+  // arrives at an entry in state W. The entry has to be evicted and
+  // invalidated before the request is forwarded to global memory.
+  transition(W, WrVicBlkEvict, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+    p_profileMiss;
+    ut_updateTag;
+    wdb_writeDirtyBytes;
+    t_allocateTBE;
+    wb_writeBack;
+    i_invL2;
+    p_popRequestQueue;
+  }
+
   transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} {
     t_allocateTBE;
     wb_writeBack;
@@ -764,6 +924,16 @@ transition(I, Atomic, A) {TagArrayRead} {
     pp_popProbeQueue;
   }

+  // Transition to be called when the response for a request with the SLC
+  // bit set arrives. The request has to be forwarded to the core that
+  // needs it while making sure no entry is allocated.
+  transition(I, Bypass, I) {
+    rb_bypassDone;
+    pr_popResponseQueue;
+    wada_wakeUpAllDependentsAddr;
+    dt_deallocateTBE;
+  }
+
   transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
     a_allocateBlock;
     ut_updateTag;
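The TCC transitions above all share one pattern: before an SLC access may bypass the L2, any resident copy must be removed, and dirty data (states W and M) must be written back first. A standalone C++ sketch of that policy table; the state names follow the SLICC, everything else is illustrative and not gem5 code:

```cpp
#include <string>
#include <vector>

// Cache-block states as used by the TCC state machine above.
enum class State { I, V, W, M, Transient };

// Action sequence for a read that must bypass the L2 (RdBypassEvict).
std::vector<std::string> onReadBypassEvict(State s)
{
    switch (s) {
    case State::I:
        return {"requestData"};                            // nothing cached
    case State::V:
        return {"invalidate", "requestData"};              // clean: drop copy
    case State::W:
    case State::M:
        return {"writeBack", "invalidate", "requestData"}; // dirty: flush first
    case State::Transient:
        return {"stallAndWait"};                           // let pending op finish
    }
    return {};
}
```

The write-through case (WrVicBlkEvict) follows the same shape, with the store forwarded after the eviction instead of a read.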
diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
index 775a62b174..3be1397d49 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
@@ -60,6 +60,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
   enumeration(Event, desc="TCP Events") {
     // Core initiated
     Load, desc="Load";
+    LoadBypassEvict, desc="Bypass L1 on a load. Evict if cache block already allocated";
     Store, desc="Store to L1 (L1 is dirty)";
     StoreThrough, desc="Store directly to L2(L1 is clean)";
     Atomic, desc="Atomic";
@@ -256,8 +257,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
       Entry cache_entry := getCacheEntry(in_msg.addr);
       TBE tbe := TBEs.lookup(in_msg.addr);
       if (in_msg.Type == CoherenceResponseType:TDSysResp) {
-        // disable L1 cache
-        if (disableL1) {
+        if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
+          // If the L1 is disabled or the request has the GLC or SLC flag
+          // set, the response should not be cached in the L1: it should
+          // bypass the cache on its way back from the L2/global memory.
           trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
         } else {
           if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) {
@@ -284,13 +287,23 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
       TBE tbe := TBEs.lookup(in_msg.LineAddress);
       DPRINTF(RubySlicc, "%s\n", in_msg);
       if (in_msg.Type == RubyRequestType:LD) {
-        trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
+        if ((in_msg.isGLCSet || in_msg.isSLCSet) && is_valid(cache_entry)) {
+          // Read requests with the GLC or SLC bit set should not be cached
+          // in the L1. They need to bypass the L1 and go to the L2. If an
+          // entry exists in the L1, it needs to be evicted.
+          trigger(Event:LoadBypassEvict, in_msg.LineAddress, cache_entry, tbe);
+        }
+        else {
+          trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
+        }
       } else if (in_msg.Type == RubyRequestType:ATOMIC ||
                  in_msg.Type == RubyRequestType:ATOMIC_RETURN ||
                  in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN) {
         trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe);
       } else if (in_msg.Type == RubyRequestType:ST) {
-        if(disableL1) {
+        if(disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
+          // Write requests with the GLC or SLC bit set, or when the L1 is
+          // disabled, should not be cached in the L1; they do a store through.
           trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
         } else {
           if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
@@ -330,6 +343,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
                               TCC_select_low_bit, TCC_select_num_bits));
       out_msg.MessageSize := MessageSizeType:Request_Control;
       out_msg.InitialRequestTime := curCycle();
+      peek(mandatoryQueue_in, RubyRequest) {
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+      }
     }
   }
@@ -375,6 +392,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
                               TCC_select_low_bit, TCC_select_num_bits));
       out_msg.MessageSize := MessageSizeType:Request_Control;
       out_msg.InitialRequestTime := curCycle();
+      out_msg.isGLCSet := in_msg.isGLCSet;
+      out_msg.isSLCSet := in_msg.isSLCSet;
     }
   }
 }
@@ -401,6 +420,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
       // forward inst sequence number to lower TCC
       peek(mandatoryQueue_in, RubyRequest) {
         out_msg.instSeqNum := in_msg.instSeqNum;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
       }
     }
   }
@@ -418,6 +439,11 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
       out_msg.Type := CoherenceRequestType:Atomic;
       out_msg.InitialRequestTime := curCycle();
       out_msg.Shared := false;
+      peek(mandatoryQueue_in, RubyRequest) {
+        out_msg.instSeqNum := in_msg.instSeqNum;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
+      }
     }
   }
 }
@@ -583,6 +609,17 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
     p_popMandatoryQueue;
   }

+  // Transition to be called when a load request with the GLC or SLC flag
+  // set arrives at the L1. This transition invalidates any existing entry
+  // and forwards the request to the L2.
+  transition(V, LoadBypassEvict, I) {TagArrayRead, TagArrayWrite} {
+    uu_profileDataMiss;
+    inv_invDone;
+    ic_invCache;
+    n_issueRdBlk;
+    p_popMandatoryQueue;
+  }
+
   transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} {
     t_allocateTBE;
     mru_updateMRU;
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
index 3b38e3b1ff..57edef8f2b 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
@@ -161,6 +161,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
     uint64_t probe_id, desc="probe id for lifetime profiling";
     WriteMask writeMask, desc="outstanding write through mask";
     int Len, desc="Length of memory request for DMA";
+    bool isGLCSet, desc="Bypass L1 Cache";
+    bool isSLCSet, desc="Bypass L1 and L2 Cache";
   }

   structure(TBETable, external="yes") {
@@ -483,6 +485,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
         out_msg.OriginalResponder := tbe.LastSender;
         out_msg.L3Hit := tbe.L3Hit;
+        out_msg.isGLCSet := tbe.isGLCSet;
+        out_msg.isSLCSet := tbe.isSLCSet;
         DPRINTF(RubySlicc, "%s\n", out_msg);
       }
     }
@@ -512,6 +516,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
         out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
         out_msg.OriginalResponder := tbe.LastSender;
+        out_msg.isGLCSet := tbe.isGLCSet;
+        out_msg.isSLCSet := tbe.isSLCSet;
         if(tbe.atomicData){
           out_msg.WTRequestor := tbe.WTRequestor;
         }
@@ -540,6 +546,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.InitialRequestTime := tbe.InitialRequestTime;
         out_msg.ForwardRequestTime := curCycle();
         out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+        out_msg.isGLCSet := tbe.isGLCSet;
+        out_msg.isSLCSet := tbe.isSLCSet;
         DPRINTF(RubySlicc, "%s\n", out_msg);
       }
     }
@@ -557,6 +565,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.ForwardRequestTime := curCycle();
         out_msg.ProbeRequestStartTime := curCycle();
         out_msg.instSeqNum := in_msg.instSeqNum;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
       }
     }
   }
@@ -569,6 +579,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
           out_msg.Sender := machineID;
           out_msg.MessageSize := MessageSizeType:Writeback_Data;
           out_msg.DataBlk := in_msg.DataBlk;
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
         }
       }
     }
@@ -624,6 +636,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.Type := MemoryRequestType:MEMORY_READ;
         out_msg.Sender := machineID;
         out_msg.MessageSize := MessageSizeType:Request_Control;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
       }
     }
   }
@@ -739,6 +753,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.MessageSize := MessageSizeType:Control;
         out_msg.Destination := probe_dests;
         tbe.NumPendingAcks := out_msg.Destination.count();
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
         DPRINTF(RubySlicc, "%s\n", out_msg);
         APPEND_TRANSITION_COMMENT(" dc: Acks remaining: ");
         APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
@@ -842,6 +858,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.ReturnData := true;
         out_msg.MessageSize := MessageSizeType:Control;
         out_msg.Destination := probe_dests;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
         tbe.NumPendingAcks := out_msg.Destination.count();
         DPRINTF(RubySlicc, "%s\n", (out_msg));
         APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
@@ -897,6 +915,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
         out_msg.ReturnData := false;
         out_msg.MessageSize := MessageSizeType:Control;
         out_msg.Destination := probe_dests;
+        out_msg.isGLCSet := in_msg.isGLCSet;
+        out_msg.isSLCSet := in_msg.isSLCSet;
         tbe.NumPendingAcks := out_msg.Destination.count();
         APPEND_TRANSITION_COMMENT(" ic: Acks remaining: ");
         APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
@@ -923,6 +943,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
           out_msg.Sender := machineID;
           out_msg.MessageSize := MessageSizeType:Writeback_Data;
           out_msg.DataBlk := in_msg.DataBlk;
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
         }
         if (tbe.Dirty == false) {
           // have to update the TBE, too, because of how this
@@ -985,6 +1007,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
       tbe.NumPendingAcks := 0;
       tbe.Cached := in_msg.ForceShared;
       tbe.InitialRequestTime := in_msg.InitialRequestTime;
+      tbe.isGLCSet := in_msg.isGLCSet;
+      tbe.isSLCSet := in_msg.isSLCSet;
     }
   }
@@ -1004,6 +1028,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
           out_msg.Sender := machineID;
           out_msg.MessageSize := MessageSizeType:Writeback_Data;
           out_msg.DataBlk := tbe.DataBlk;
+          out_msg.isGLCSet := tbe.isGLCSet;
+          out_msg.isSLCSet := tbe.isSLCSet;
           DPRINTF(ProtocolTrace, "%s\n", out_msg);
         }
       }
@@ -1104,6 +1130,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
           out_msg.Sender := machineID;
           out_msg.MessageSize := MessageSizeType:Writeback_Data;
           out_msg.DataBlk := victim_entry.DataBlk;
+          out_msg.isGLCSet := in_msg.isGLCSet;
+          out_msg.isSLCSet := in_msg.isSLCSet;
         }
         L3CacheMemory.deallocate(victim);
       }
@@ -1136,6 +1164,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
           out_msg.Sender := machineID;
           out_msg.MessageSize := MessageSizeType:Writeback_Data;
           out_msg.DataBlk := victim_entry.DataBlk;
+          out_msg.isGLCSet := tbe.isGLCSet;
+          out_msg.isSLCSet := tbe.isSLCSet;
         }
         L3CacheMemory.deallocate(victim);
       }
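In the directory changes above, requests arriving from the cores carry the flags in the message itself, while responses and writebacks built later recover them from the TBE (note the `tbe.isGLCSet`/`tbe.isSLCSet` copies). A small C++ sketch of that latch-then-echo pattern; the types are hypothetical stand-ins, not gem5 classes:

```cpp
// Illustrative only: models the TBE plumbing pattern used by the
// directory. Flags are captured once when the transaction begins and
// copied into every message derived from the TBE afterwards.
struct Flags { bool glc = false; bool slc = false; };

struct Tbe { Flags flags; /* ... other per-transaction state ... */ };
struct Msg { Flags flags; /* ... payload ... */ };

Tbe allocateTbe(const Msg &req)
{
    Tbe tbe;
    tbe.flags = req.flags;   // like t_allocateTBE: latch GLC/SLC once
    return tbe;
}

Msg makeResponse(const Tbe &tbe)
{
    Msg rsp;
    rsp.flags = tbe.flags;   // responses echo the originating request
    return rsp;
}
```

Latching in the TBE is what lets messages generated long after the request (memory acks, victim writebacks) still carry the right flags.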
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
index 46bab43c22..6ff19e953b 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm
@@ -138,6 +138,9 @@ structure(CPURequestMsg, desc="...", interface="Message") {
   bool NoWriteConflict, default="true", desc="write collided with CAB entry";
   int ProgramCounter, desc="PC that accesses to this block";

+  bool isGLCSet, default="false", desc="GLC flag value in the request";
+  bool isSLCSet, default="false", desc="SLC flag value in the request";
+
   bool functionalRead(Packet *pkt) {
     // Only PUTX messages contains the data block
     if (Type == CoherenceRequestType:VicDirty) {
@@ -165,6 +168,8 @@ structure(NBProbeRequestMsg, desc="...", interface="Message") {
   MachineID Requestor, desc="Requestor id for 3-hop requests";
   bool NoAckNeeded, default="false", desc="For short circuting acks";
   int ProgramCounter, desc="PC that accesses to this block";
+  bool isGLCSet, desc="Bypass L1 Cache";
+  bool isSLCSet, desc="Bypass L1 and L2 Caches";

   bool functionalRead(Packet *pkt) {
     return false;
@@ -248,6 +253,9 @@ structure(ResponseMsg, desc="...", interface="Message") {
   int ProgramCounter, desc="PC that issues this request";
   bool mispred, desc="tell TCP if the block should not be bypassed";

+  bool isGLCSet, default="false", desc="GLC flag value in the request that triggered response";
+  bool isSLCSet, default="false", desc="SLC flag value in the request that triggered response";
+
   bool functionalRead(Packet *pkt) {
     // Only PUTX messages contains the data block
@@ -277,6 +285,8 @@ structure(UnblockMsg, desc="...", interface="Message") {
   bool wasValid, default="false", desc="Was block valid when evicted";
   bool valid, default="false", desc="Is block valid";
   bool validToInvalid, default="false", desc="Was block valid when evicted";
+  bool isGLCSet, default="false", desc="GLC flag value in the request";
+  bool isSLCSet, default="false", desc="SLC flag value in the request";

   bool functionalRead(Packet *pkt) {
     return false;
@@ -321,6 +331,8 @@ structure(TriggerMsg, desc="...", interface="Message") {
   TriggerType Type, desc="Type of trigger";
   CacheId Dest, default="CacheId_NA", desc="Cache to invalidate";
   int ProgramCounter, desc="PC that accesses to this block";
+  bool isGLCSet, default="false", desc="GLC flag value in the request";
+  bool isSLCSet, default="false", desc="SLC flag value in the request";

   bool functionalRead(Packet *pkt) {
     return false;
diff --git a/src/mem/ruby/protocol/RubySlicc_MemControl.sm b/src/mem/ruby/protocol/RubySlicc_MemControl.sm
index e8517a4a07..012b169dea 100644
--- a/src/mem/ruby/protocol/RubySlicc_MemControl.sm
+++ b/src/mem/ruby/protocol/RubySlicc_MemControl.sm
@@ -74,6 +74,8 @@ structure(MemoryMsg, desc="...", interface="Message") {
   PrefetchBit Prefetch, desc="Is this a prefetch request";
   bool ReadX, desc="Exclusive";
   int Acks, desc="How many acks to expect";
+  bool isGLCSet, desc="Bypass L1 Cache";
+  bool isSLCSet, desc="Bypass L1 and L2 Caches";

   bool functionalRead(Packet *pkt) {
     if ((MessageSize == MessageSizeType:Response_Data) ||
diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm
index 8d76f78f76..8ba9d935ff 100644
--- a/src/mem/ruby/protocol/RubySlicc_Types.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Types.sm
@@ -177,6 +177,8 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") {
   int htmTransactionUid, desc="Used to identify the unique HTM transaction that produced this request";
   bool isTlbi, desc="Memory request is a TLB shootdown (invalidation) operation";
   Addr tlbiTransactionUid, desc="Unique identifier of the TLB shootdown operation that produced this request";
+  bool isGLCSet, default="false", desc="If flag is set, bypass GPU L1 cache";
+  bool isSLCSet, default="false", desc="If flag is set, bypass GPU L1 and L2 caches";

   RequestPtr getRequestPtr();
 }
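The RubyRequest changes below read the flags from the packet when one is attached and fall back to false otherwise, since some internally generated requests carry no packet. A minimal sketch of that guarded-initialization pattern, using illustrative stand-in types rather than gem5's Request/Packet:

```cpp
// Illustrative only: Req/Pkt stand in for gem5's Request/Packet.
struct Req { bool glc = false; bool slc = false; };
struct Pkt { const Req *req = nullptr; };

struct RubyReqModel
{
    bool isGLCSet = false;  // defaults used when no packet is attached
    bool isSLCSet = false;

    explicit RubyReqModel(const Pkt *pkt)
    {
        if (pkt) {          // internal requests may carry no packet
            isGLCSet = pkt->req->glc;
            isSLCSet = pkt->req->slc;
        }
    }
};

// Usage: RubyReqModel r(nullptr); leaves both flags false.
```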
diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh
index 2345c224fb..89ce83451e 100644
--- a/src/mem/ruby/slicc_interface/RubyRequest.hh
+++ b/src/mem/ruby/slicc_interface/RubyRequest.hh
@@ -79,6 +79,11 @@ class RubyRequest : public Message
     bool m_isTlbi;
     // Should be uint64, but SLICC complains about casts
     Addr m_tlbiTransactionUid;
+    // GPU cache bypass flags. If set to true, GLC bypasses the L1 and SLC
+    // bypasses both the L1 and the L2. They are false by default and must
+    // be explicitly set to true by the program in order to bypass caches.
+    bool m_isGLCSet;
+    bool m_isSLCSet;

     RubyRequest(Tick curTime, uint64_t _paddr, int _len, uint64_t _pc,
                 RubyRequestType _type, RubyAccessMode _access_mode,
@@ -99,6 +104,13 @@ class RubyRequest : public Message
           m_tlbiTransactionUid(0)
     {
         m_LineAddress = makeLineAddress(m_PhysicalAddress);
+        if (_pkt) {
+            m_isGLCSet = m_pkt->req->isGLCSet();
+            m_isSLCSet = m_pkt->req->isSLCSet();
+        } else {
+            m_isGLCSet = false;
+            m_isSLCSet = false;
+        }
     }

     /** RubyRequest for memory management commands */
@@ -120,6 +132,13 @@ class RubyRequest : public Message
           m_tlbiTransactionUid(0)
     {
         assert(m_pkt->req->isMemMgmt());
+        if (_pkt) {
+            m_isGLCSet = m_pkt->req->isGLCSet();
+            m_isSLCSet = m_pkt->req->isSLCSet();
+        } else {
+            m_isGLCSet = false;
+            m_isSLCSet = false;
+        }
     }

     RubyRequest(Tick curTime, uint64_t _paddr, int _len,
@@ -148,6 +167,13 @@ class RubyRequest : public Message
           m_tlbiTransactionUid(0)
     {
         m_LineAddress = makeLineAddress(m_PhysicalAddress);
+        if (_pkt) {
+            m_isGLCSet = m_pkt->req->isGLCSet();
+            m_isSLCSet = m_pkt->req->isSLCSet();
+        } else {
+            m_isGLCSet = false;
+            m_isSLCSet = false;
+        }
     }

     RubyRequest(Tick curTime, uint64_t _paddr, int _len,
@@ -177,6 +203,14 @@ class RubyRequest : public Message
           m_tlbiTransactionUid(0)
     {
         m_LineAddress = makeLineAddress(m_PhysicalAddress);
+        if (_pkt) {
+            m_isGLCSet = m_pkt->req->isGLCSet();
+            m_isSLCSet = m_pkt->req->isSLCSet();
+
+        } else {
+            m_isGLCSet = false;
+            m_isSLCSet = false;
+        }
     }

     RubyRequest(Tick curTime) : Message(curTime) {}
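Taken together, the flags define a simple cacheability contract, as documented in the packet.hh/request.hh comments at the top of this patch: GLC marks the request globally coherent and skips the TCP (GPU L1), while SLC marks it system-level coherent and skips both TCP and TCC (L1 and L2). A consumer-side sketch using the Packet accessors added by this change; the helper functions below are hypothetical and the GLC_BIT/SLC_BIT flag definitions live outside this excerpt:

```cpp
#include "mem/packet.hh"

namespace gem5
{

// Hypothetical helpers (not part of the patch) a cache model could use
// to decide where a request's data may be allocated.

// GLC: globally coherent, so skip the TCP (GPU L1).
bool cacheableInL1(const Packet *pkt)
{
    return !pkt->isGLCSet() && !pkt->isSLCSet();
}

// SLC: system-level coherent, so skip both TCP (L1) and TCC (L2).
bool cacheableInL2(const Packet *pkt)
{
    return !pkt->isSLCSet();
}

} // namespace gem5
```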