gpu-compute, mem-ruby: Update GPU cache bypassing to use TBE

An earlier commit added support for GLC and SLC AMDGPU instruction
modifiers. These modifiers enable cache bypassing when set. The GLC/SLC
flag information was being threaded through all the way to memory and
back so that appropriate actions could be taken upon receiving a request
and corresponding response. This commit removes the threading and adds
the bypass flag information to TBE. Requests populate this
entry and responses access it to determine the correct set of actions to
execute.

Change-Id: I20ffa6682d109270adb921de078cfd47fb4e137c
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/67191
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
This commit is contained in:
Vishnu Ramadas
2023-01-04 21:07:54 -06:00
committed by VISHNU RAMADAS
parent 03083ba5e3
commit ddf43726ef
3 changed files with 25 additions and 66 deletions

View File

@@ -283,7 +283,13 @@ machine(MachineType:TCC, "TCC Cache")
peek(responseFromNB_in, ResponseMsg, block_on="addr") {
TBE tbe := TBEs.lookup(in_msg.addr);
Entry cache_entry := getCacheEntry(in_msg.addr);
if (in_msg.isSLCSet) {
bool is_slc_set := false;
if (!is_invalid(tbe)) {
is_slc_set := tbe.isSLCSet;
}
if (is_slc_set) {
// If the SLC bit is set, the response needs to bypass the cache
// and should not be allocated an entry.
trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
@@ -343,6 +349,10 @@ machine(MachineType:TCC, "TCC Cache")
trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
}
} else if (in_msg.Type == CoherenceRequestType:Atomic) {
// Currently the Atomic requests do not have GLC/SLC bit handing
// support. The assert ensures that the requests do not have
// these set, and therefore do not expect to bypass the cache
assert(!in_msg.isSLCSet);
trigger(Event:Atomic, in_msg.addr, cache_entry, tbe);
} else if (in_msg.Type == CoherenceRequestType:RdBlk) {
if (in_msg.isSLCSet) {
@@ -399,8 +409,8 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.State := CoherenceState:Shared;
DPRINTF(RubySlicc, "%s\n", out_msg);
peek(responseFromNB_in, ResponseMsg) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
}
}
enqueue(unblockToNB_out, UnblockMsg, 1) {
@@ -408,8 +418,8 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.MessageSize := MessageSizeType:Unblock_Control;
peek(responseFromNB_in, ResponseMsg) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
}
DPRINTF(RubySlicc, "%s\n", out_msg);
}
@@ -426,8 +436,8 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.MessageSize := MessageSizeType:Response_Data;
out_msg.Dirty := false;
out_msg.State := CoherenceState:Shared;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
enqueue(unblockToNB_out, UnblockMsg, 1) {
@@ -449,8 +459,8 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.Shared := false; // unneeded for this request
out_msg.MessageSize := in_msg.MessageSize;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
@@ -467,9 +477,6 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
out_msg.instSeqNum := in_msg.instSeqNum;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -484,9 +491,6 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
out_msg.instSeqNum := in_msg.instSeqNum;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -500,9 +504,8 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Sender := machineID;
out_msg.MessageSize := in_msg.MessageSize;
out_msg.DataBlk := in_msg.DataBlk;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
}
}
}
@@ -535,9 +538,9 @@ machine(MachineType:TCC, "TCC Cache")
peek(coreRequestNetwork_in, CPURequestMsg) {
if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){
tbe.Destination.add(in_msg.Requestor);
tbe.isGLCSet := in_msg.isGLCSet;
tbe.isSLCSet := in_msg.isSLCSet;
}
tbe.isGLCSet := in_msg.isGLCSet;
tbe.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -576,8 +579,6 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.DataBlk := in_msg.DataBlk;
out_msg.writeMask.orMask(in_msg.writeMask);
out_msg.instSeqNum := in_msg.instSeqNum;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -593,10 +594,6 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Dirty := true;
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.writeMask.orMask(cache_entry.writeMask);
peek(coreRequestNetwork_in, CPURequestMsg) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -611,8 +608,6 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Type := CoherenceRequestType:Atomic;
out_msg.Dirty := true;
out_msg.writeMask.orMask(in_msg.writeMask);
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -628,10 +623,6 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.Ntsl := true;
out_msg.State := CoherenceState:NA;
out_msg.MessageSize := MessageSizeType:Response_Control;
peek(probeNetwork_in, NBProbeRequestMsg) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
@@ -676,8 +667,8 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.addr := address;
out_msg.Type := TriggerType:AtomicDone;
peek(responseFromNB_in, ResponseMsg) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
}
}
}

View File

@@ -161,8 +161,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
uint64_t probe_id, desc="probe id for lifetime profiling";
WriteMask writeMask, desc="outstanding write through mask";
int Len, desc="Length of memory request for DMA";
bool isGLCSet, desc="Bypass L1 Cache";
bool isSLCSet, desc="Bypass L1 and L2 Cache";
}
structure(TBETable, external="yes") {
@@ -485,8 +483,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
out_msg.OriginalResponder := tbe.LastSender;
out_msg.L3Hit := tbe.L3Hit;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
@@ -516,8 +512,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
out_msg.OriginalResponder := tbe.LastSender;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
if(tbe.atomicData){
out_msg.WTRequestor := tbe.WTRequestor;
}
@@ -546,8 +540,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.InitialRequestTime := tbe.InitialRequestTime;
out_msg.ForwardRequestTime := curCycle();
out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
@@ -565,8 +557,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.ForwardRequestTime := curCycle();
out_msg.ProbeRequestStartTime := curCycle();
out_msg.instSeqNum := in_msg.instSeqNum;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -579,8 +569,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := in_msg.DataBlk;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -636,8 +624,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.Type := MemoryRequestType:MEMORY_READ;
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
@@ -753,8 +739,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
tbe.NumPendingAcks := out_msg.Destination.count();
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
DPRINTF(RubySlicc, "%s\n", out_msg);
APPEND_TRANSITION_COMMENT(" dc: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
@@ -858,8 +842,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.ReturnData := true;
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
tbe.NumPendingAcks := out_msg.Destination.count();
DPRINTF(RubySlicc, "%s\n", (out_msg));
APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
@@ -915,8 +897,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.ReturnData := false;
out_msg.MessageSize := MessageSizeType:Control;
out_msg.Destination := probe_dests;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
tbe.NumPendingAcks := out_msg.Destination.count();
APPEND_TRANSITION_COMMENT(" ic: Acks remaining: ");
APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
@@ -943,8 +923,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := in_msg.DataBlk;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
if (tbe.Dirty == false) {
// have to update the TBE, too, because of how this
@@ -1007,8 +985,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
tbe.NumPendingAcks := 0;
tbe.Cached := in_msg.ForceShared;
tbe.InitialRequestTime := in_msg.InitialRequestTime;
tbe.isGLCSet := in_msg.isGLCSet;
tbe.isSLCSet := in_msg.isSLCSet;
}
}
@@ -1028,8 +1004,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := tbe.DataBlk;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
DPRINTF(ProtocolTrace, "%s\n", out_msg);
}
}
@@ -1130,8 +1104,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := victim_entry.DataBlk;
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
L3CacheMemory.deallocate(victim);
}
@@ -1164,8 +1136,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := victim_entry.DataBlk;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
}
L3CacheMemory.deallocate(victim);
}

View File

@@ -168,8 +168,6 @@ structure(NBProbeRequestMsg, desc="...", interface="Message") {
MachineID Requestor, desc="Requestor id for 3-hop requests";
bool NoAckNeeded, default="false", desc="For short circuting acks";
int ProgramCounter, desc="PC that accesses to this block";
bool isGLCSet, desc="Bypass L1 Cache";
bool isSLCSet, desc="Bypass L1 and L2 Caches";
bool functionalRead(Packet *pkt) {
return false;