From ddf43726ef95fb9b64f89109413a62aa070fada8 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Wed, 4 Jan 2023 21:07:54 -0600 Subject: [PATCH] gpu-compute, mem-ruby: Update GPU cache bypassing to use TBE An earlier commit added support for GLC and SLC AMDGPU instruction modifiers. These modifiers enable cache bypassing when set. The GLC/SLC flag information was being threaded through all the way to memory and back so that appropriate actions could be taken upon receiving a request and corresponding response. This commit removes the threading and adds the bypass flag information to TBE. Requests populate this entry and responses access it to determine the correct set of actions to execute. Change-Id: I20ffa6682d109270adb921de078cfd47fb4e137c Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/67191 Maintainer: Matt Sinclair Tested-by: kokoro Reviewed-by: Matt Sinclair Reviewed-by: Jason Lowe-Power --- src/mem/ruby/protocol/GPU_VIPER-TCC.sm | 59 +++++++++------------ src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm | 30 ----------- src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm | 2 - 3 files changed, 25 insertions(+), 66 deletions(-) diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index ae142471fa..ca4c543722 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -283,7 +283,13 @@ machine(MachineType:TCC, "TCC Cache") peek(responseFromNB_in, ResponseMsg, block_on="addr") { TBE tbe := TBEs.lookup(in_msg.addr); Entry cache_entry := getCacheEntry(in_msg.addr); - if (in_msg.isSLCSet) { + bool is_slc_set := false; + + if (!is_invalid(tbe)) { + is_slc_set := tbe.isSLCSet; + } + + if (is_slc_set) { // If the SLC bit is set, the response needs to bypass the cache // and should not be allocated an entry. 
trigger(Event:Bypass, in_msg.addr, cache_entry, tbe); @@ -343,6 +349,10 @@ machine(MachineType:TCC, "TCC Cache") trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe); } } else if (in_msg.Type == CoherenceRequestType:Atomic) { + // Currently the Atomic requests do not have GLC/SLC bit handling + // support. The assert ensures that the requests do not have + // these set, and therefore do not expect to bypass the cache + assert(!in_msg.isSLCSet); trigger(Event:Atomic, in_msg.addr, cache_entry, tbe); } else if (in_msg.Type == CoherenceRequestType:RdBlk) { if (in_msg.isSLCSet) { @@ -399,8 +409,8 @@ machine(MachineType:TCC, "TCC Cache") out_msg.State := CoherenceState:Shared; DPRINTF(RubySlicc, "%s\n", out_msg); peek(responseFromNB_in, ResponseMsg) { - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; + out_msg.isGLCSet := tbe.isGLCSet; + out_msg.isSLCSet := tbe.isSLCSet; } } enqueue(unblockToNB_out, UnblockMsg, 1) { @@ -408,8 +418,8 @@ machine(MachineType:TCC, "TCC Cache") out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); out_msg.MessageSize := MessageSizeType:Unblock_Control; peek(responseFromNB_in, ResponseMsg) { - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; + out_msg.isGLCSet := tbe.isGLCSet; + out_msg.isSLCSet := tbe.isSLCSet; } DPRINTF(RubySlicc, "%s\n", out_msg); } @@ -426,8 +436,8 @@ machine(MachineType:TCC, "TCC Cache") out_msg.MessageSize := MessageSizeType:Response_Data; out_msg.Dirty := false; out_msg.State := CoherenceState:Shared; - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; + out_msg.isGLCSet := tbe.isGLCSet; + out_msg.isSLCSet := tbe.isSLCSet; DPRINTF(RubySlicc, "%s\n", out_msg); } enqueue(unblockToNB_out, UnblockMsg, 1) { @@ -449,8 +459,8 @@ machine(MachineType:TCC, "TCC Cache") out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); out_msg.Shared := false; // unneeded for this request out_msg.MessageSize := 
in_msg.MessageSize; - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; + out_msg.isGLCSet := tbe.isGLCSet; + out_msg.isSLCSet := tbe.isSLCSet; DPRINTF(RubySlicc, "%s\n", out_msg); } } @@ -467,9 +477,6 @@ machine(MachineType:TCC, "TCC Cache") out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Writeback_Control; out_msg.instSeqNum := in_msg.instSeqNum; - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; - } } } @@ -484,9 +491,6 @@ machine(MachineType:TCC, "TCC Cache") out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Writeback_Control; out_msg.instSeqNum := in_msg.instSeqNum; - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; - } } } @@ -500,9 +504,8 @@ machine(MachineType:TCC, "TCC Cache") out_msg.Sender := machineID; out_msg.MessageSize := in_msg.MessageSize; out_msg.DataBlk := in_msg.DataBlk; - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; - + out_msg.isGLCSet := tbe.isGLCSet; + out_msg.isSLCSet := tbe.isSLCSet; } } } @@ -535,9 +538,9 @@ machine(MachineType:TCC, "TCC Cache") peek(coreRequestNetwork_in, CPURequestMsg) { if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){ tbe.Destination.add(in_msg.Requestor); - tbe.isGLCSet := in_msg.isGLCSet; - tbe.isSLCSet := in_msg.isSLCSet; } + tbe.isGLCSet := in_msg.isGLCSet; + tbe.isSLCSet := in_msg.isSLCSet; } } } @@ -576,8 +579,6 @@ machine(MachineType:TCC, "TCC Cache") out_msg.DataBlk := in_msg.DataBlk; out_msg.writeMask.orMask(in_msg.writeMask); out_msg.instSeqNum := in_msg.instSeqNum; - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; } } } @@ -593,10 +594,6 @@ machine(MachineType:TCC, "TCC Cache") out_msg.Dirty := true; out_msg.DataBlk := cache_entry.DataBlk; out_msg.writeMask.orMask(cache_entry.writeMask); - peek(coreRequestNetwork_in, CPURequestMsg) { - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet 
:= in_msg.isSLCSet; - } } } @@ -611,8 +608,6 @@ machine(MachineType:TCC, "TCC Cache") out_msg.Type := CoherenceRequestType:Atomic; out_msg.Dirty := true; out_msg.writeMask.orMask(in_msg.writeMask); - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; } } } @@ -628,10 +623,6 @@ machine(MachineType:TCC, "TCC Cache") out_msg.Ntsl := true; out_msg.State := CoherenceState:NA; out_msg.MessageSize := MessageSizeType:Response_Control; - peek(probeNetwork_in, NBProbeRequestMsg) { - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; - } } } action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") { @@ -676,8 +667,8 @@ machine(MachineType:TCC, "TCC Cache") out_msg.addr := address; out_msg.Type := TriggerType:AtomicDone; peek(responseFromNB_in, ResponseMsg) { - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; + out_msg.isGLCSet := tbe.isGLCSet; + out_msg.isSLCSet := tbe.isSLCSet; } } } diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm index 57edef8f2b..3b38e3b1ff 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm @@ -161,8 +161,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") uint64_t probe_id, desc="probe id for lifetime profiling"; WriteMask writeMask, desc="outstanding write through mask"; int Len, desc="Length of memory request for DMA"; - bool isGLCSet, desc="Bypass L1 Cache"; - bool isSLCSet, desc="Bypass L1 and L2 Cache"; } structure(TBETable, external="yes") { @@ -485,8 +483,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; out_msg.OriginalResponder := tbe.LastSender; out_msg.L3Hit := tbe.L3Hit; - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; DPRINTF(RubySlicc, "%s\n", out_msg); } } @@ -516,8 +512,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") 
out_msg.ForwardRequestTime := tbe.ForwardRequestTime; out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; out_msg.OriginalResponder := tbe.LastSender; - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; if(tbe.atomicData){ out_msg.WTRequestor := tbe.WTRequestor; } @@ -546,8 +540,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.InitialRequestTime := tbe.InitialRequestTime; out_msg.ForwardRequestTime := curCycle(); out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime; - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; DPRINTF(RubySlicc, "%s\n", out_msg); } } @@ -565,8 +557,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.ForwardRequestTime := curCycle(); out_msg.ProbeRequestStartTime := curCycle(); out_msg.instSeqNum := in_msg.instSeqNum; - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; } } } @@ -579,8 +569,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Writeback_Data; out_msg.DataBlk := in_msg.DataBlk; - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; } } } @@ -636,8 +624,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.Type := MemoryRequestType:MEMORY_READ; out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Request_Control; - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; } } } @@ -753,8 +739,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.MessageSize := MessageSizeType:Control; out_msg.Destination := probe_dests; tbe.NumPendingAcks := out_msg.Destination.count(); - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; DPRINTF(RubySlicc, "%s\n", out_msg); APPEND_TRANSITION_COMMENT(" dc: Acks remaining: "); APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); @@ -858,8 +842,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") 
out_msg.ReturnData := true; out_msg.MessageSize := MessageSizeType:Control; out_msg.Destination := probe_dests; - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; tbe.NumPendingAcks := out_msg.Destination.count(); DPRINTF(RubySlicc, "%s\n", (out_msg)); APPEND_TRANSITION_COMMENT(" sc: Acks remaining: "); @@ -915,8 +897,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.ReturnData := false; out_msg.MessageSize := MessageSizeType:Control; out_msg.Destination := probe_dests; - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; tbe.NumPendingAcks := out_msg.Destination.count(); APPEND_TRANSITION_COMMENT(" ic: Acks remaining: "); APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks); @@ -943,8 +923,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Writeback_Data; out_msg.DataBlk := in_msg.DataBlk; - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; } if (tbe.Dirty == false) { // have to update the TBE, too, because of how this @@ -1007,8 +985,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") tbe.NumPendingAcks := 0; tbe.Cached := in_msg.ForceShared; tbe.InitialRequestTime := in_msg.InitialRequestTime; - tbe.isGLCSet := in_msg.isGLCSet; - tbe.isSLCSet := in_msg.isSLCSet; } } @@ -1028,8 +1004,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Writeback_Data; out_msg.DataBlk := tbe.DataBlk; - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; DPRINTF(ProtocolTrace, "%s\n", out_msg); } } @@ -1130,8 +1104,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Writeback_Data; out_msg.DataBlk := victim_entry.DataBlk; - out_msg.isGLCSet := in_msg.isGLCSet; - out_msg.isSLCSet := in_msg.isSLCSet; } L3CacheMemory.deallocate(victim); } @@ 
-1164,8 +1136,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.Sender := machineID; out_msg.MessageSize := MessageSizeType:Writeback_Data; out_msg.DataBlk := victim_entry.DataBlk; - out_msg.isGLCSet := tbe.isGLCSet; - out_msg.isSLCSet := tbe.isSLCSet; } L3CacheMemory.deallocate(victim); } diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm index 6ff19e953b..bb3a013325 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-msg.sm @@ -168,8 +168,6 @@ structure(NBProbeRequestMsg, desc="...", interface="Message") { MachineID Requestor, desc="Requestor id for 3-hop requests"; bool NoAckNeeded, default="false", desc="For short circuting acks"; int ProgramCounter, desc="PC that accesses to this block"; - bool isGLCSet, desc="Bypass L1 Cache"; - bool isSLCSet, desc="Bypass L1 and L2 Caches"; bool functionalRead(Packet *pkt) { return false;