mem-ruby: SLICC Fixes to GLC Atomics in WB L2 (#397)

Made the following changes to fix the behavior of GLC atomics in a WB
L2:
- Stored the atomic write mask in the TBE for GLC atomics on an invalid
line that are bypassed to the directory but have their atomics performed
on the return path.
- Replaced the !presentOrAvail() check for bypassing atomics to the
directory (which are then performed on the return path) with a check for
an invalid line state.
- Replaced the wdb_writeDirtyBytes action used when performing atomics
with the owm_orWriteMask action, which does not write from the invalid
atomic request's data block.
   - Fixed atomic return path actions

Change-Id: I6a406c313d2f9c88cd75bfe39187ef94ce84098f
This commit is contained in:
Daniel Kouchekinia
2023-11-09 15:15:10 -06:00
committed by GitHub
parent 0442c9a88c
commit 1204267fd8

View File

@@ -106,16 +106,17 @@ machine(MachineType:TCC, "TCC Cache")
}
structure(TBE, desc="...") {
State TBEState, desc="Transient state";
DataBlock DataBlk, desc="data for the block";
bool Dirty, desc="Is the data dirty?";
bool Shared, desc="Victim hit by shared probe";
MachineID From, desc="Waiting for writeback from...";
NetDest Destination, desc="Data destination";
int numAtomics, desc="number remaining atomics";
int atomicDoneCnt, desc="number AtomicDones triggered";
bool isGLCSet, desc="Bypass L1 Cache";
bool isSLCSet, desc="Bypass L1 and L2 Cache";
State TBEState, desc="Transient state";
DataBlock DataBlk, desc="data for the block";
bool Dirty, desc="Is the data dirty?";
bool Shared, desc="Victim hit by shared probe";
MachineID From, desc="Waiting for writeback from...";
NetDest Destination, desc="Data destination";
int numPendingDirectoryAtomics, desc="number of pending atomics to be performed in directory";
int atomicDoneCnt, desc="number AtomicDones triggered";
bool isGLCSet, desc="Bypass L1 Cache";
bool isSLCSet, desc="Bypass L1 and L2 Cache";
WriteMask atomicWriteMask, desc="Atomic write mask";
}
structure(TBETable, external="yes") {
@@ -265,13 +266,15 @@ machine(MachineType:TCC, "TCC Cache")
TBE tbe := TBEs.lookup(in_msg.addr);
Entry cache_entry := getCacheEntry(in_msg.addr);
// The trigger queue applies only to atomics performed in the directory.
// There is a possible race where multiple AtomicDone triggers can be
// sent if another Atomic to the same address is issued after the
// AtomicDone is triggered but before the message arrives here. For
// that case we count the number of AtomicDones in flight for this
// address and only call AtomicDone to deallocate the TBE when it is
// the last in flight message.
if (tbe.numAtomics == 0 && tbe.atomicDoneCnt == 1) {
if (tbe.numPendingDirectoryAtomics == 0 && tbe.atomicDoneCnt == 1) {
trigger(Event:AtomicDone, in_msg.addr, cache_entry, tbe);
} else {
trigger(Event:AtomicNotDone, in_msg.addr, cache_entry, tbe);
@@ -370,7 +373,7 @@ machine(MachineType:TCC, "TCC Cache")
// TCC will perform the atomic on the return path on Event:Data.
// The action will invalidate the cache line if SLC is set and the address is
// in the cache.
if(in_msg.isSLCSet || !WB || !presentOrAvail(in_msg.addr)) {
if(in_msg.isSLCSet || !WB) {
trigger(Event:AtomicPassOn, in_msg.addr, cache_entry, tbe);
} else {
trigger(Event:Atomic, in_msg.addr, cache_entry, tbe);
@@ -549,7 +552,23 @@ machine(MachineType:TCC, "TCC Cache")
cache_entry.DataBlk.clearAtomicLogEntries();
}
action(bar_sendBypassedAtomicResponse, "bar", desc="send bypassed Atomic Ack") {
action(baplr_sendBypassedAtomicPerformedLocallyResponse, "barplr", desc="send locally-performed bypassed Atomic Ack") {
peek(responseFromNB_in, ResponseMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:TDSysResp;
out_msg.Destination.add(in_msg.WTRequestor);
out_msg.Sender := machineID;
out_msg.MessageSize := in_msg.MessageSize;
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
}
}
cache_entry.DataBlk.clearAtomicLogEntries();
}
action(bapdr_sendBypassedAtomicPerformedInDirectoryResponse, "bapdr", desc="send bypassed Atomic Ack") {
peek(responseFromNB_in, ResponseMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
@@ -585,7 +604,7 @@ machine(MachineType:TCC, "TCC Cache")
TBEs.allocate(address);
set_tbe(TBEs.lookup(address));
tbe.Destination.clear();
tbe.numAtomics := 0;
tbe.numPendingDirectoryAtomics := 0;
tbe.atomicDoneCnt := 0;
}
if (coreRequestNetwork_in.isReady(clockEdge())) {
@@ -595,6 +614,10 @@ machine(MachineType:TCC, "TCC Cache")
}
tbe.isGLCSet := in_msg.isGLCSet;
tbe.isSLCSet := in_msg.isSLCSet;
if(in_msg.Type == CoherenceRequestType:Atomic){
tbe.atomicWriteMask.clear();
tbe.atomicWriteMask.orMask(in_msg.writeMask);
}
}
}
}
@@ -620,6 +643,20 @@ machine(MachineType:TCC, "TCC Cache")
}
}
action(wardb_writeAtomicResponseDirtyBytes, "wardb", desc="write data to TCC") {
peek(responseFromNB_in, ResponseMsg) {
cache_entry.DataBlk := in_msg.DataBlk;
cache_entry.writeMask.orMask(tbe.atomicWriteMask);
DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
}
}
action(owm_orWriteMask, "owm", desc="or TCCs write mask") {
peek(coreRequestNetwork_in, CPURequestMsg) {
cache_entry.writeMask.orMask(in_msg.writeMask);
}
}
action(wt_writeThrough, "wt", desc="write back data") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
@@ -726,14 +763,20 @@ machine(MachineType:TCC, "TCC Cache")
}
action(ina_incrementNumAtomics, "ina", desc="inc num atomics") {
tbe.numAtomics := tbe.numAtomics + 1;
action(inpa_incrementNumPendingDirectoryAtomics, "inpa", desc="inc num atomics") {
// Only increment number of atomics if they will actually be performed in directory
// That is, if the SLC bit is set or if the cache is write through
peek(coreRequestNetwork_in, CPURequestMsg) {
if (in_msg.isSLCSet || !WB) {
tbe.numPendingDirectoryAtomics := tbe.numPendingDirectoryAtomics + 1;
}
}
}
action(dna_decrementNumAtomics, "dna", desc="inc num atomics") {
tbe.numAtomics := tbe.numAtomics - 1;
if (tbe.numAtomics==0) {
action(dnpa_decrementNumPendingDirectoryAtomics, "dnpa", desc="dec num atomics") {
tbe.numPendingDirectoryAtomics := tbe.numPendingDirectoryAtomics - 1;
if (tbe.numPendingDirectoryAtomics==0) {
enqueue(triggerQueue_out, TriggerMsg, 1) {
tbe.atomicDoneCnt := tbe.atomicDoneCnt + 1;
out_msg.addr := address;
@@ -876,17 +919,8 @@ machine(MachineType:TCC, "TCC Cache")
transition(V, Atomic, M) {TagArrayRead, DataArrayWrite} {
p_profileHit;
wdb_writeDirtyBytes;
pa_performAtomic;
ar_sendAtomicResponse;
p_popRequestQueue;
}
transition(I, Atomic, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
p_profileMiss;
a_allocateBlock;
ut_updateTag;
wdb_writeDirtyBytes;
owm_orWriteMask;
pa_performAtomic;
ar_sendAtomicResponse;
p_popRequestQueue;
@@ -900,9 +934,19 @@ machine(MachineType:TCC, "TCC Cache")
st_stallAndWaitRequest;
}
transition({M, W}, Atomic) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
transition(W, Atomic, WI) {
t_allocateTBE;
wb_writeBack;
// need to try this request again after writing back the current entry -- to
// do so, put it with other stalled requests in a buffer to reduce resource
// contention since they won't try again every cycle and will instead only
// try again once woken up
st_stallAndWaitRequest;
}
transition(M, Atomic) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
p_profileHit;
wdb_writeDirtyBytes;
owm_orWriteMask;
pa_performAtomic;
ar_sendAtomicResponse;
p_popRequestQueue;
@@ -917,16 +961,16 @@ machine(MachineType:TCC, "TCC Cache")
i_invL2;
t_allocateTBE;
at_atomicThrough;
ina_incrementNumAtomics;
inpa_incrementNumPendingDirectoryAtomics;
p_popRequestQueue;
}
transition(I, AtomicPassOn, A) {TagArrayRead} {
transition(I, {Atomic, AtomicPassOn}, A) {TagArrayRead} {
p_profileMiss;
i_invL2;
t_allocateTBE;
at_atomicThrough;
ina_incrementNumAtomics;
inpa_incrementNumPendingDirectoryAtomics;
p_popRequestQueue;
}
@@ -1058,8 +1102,8 @@ machine(MachineType:TCC, "TCC Cache")
}
transition(A, Bypass) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
bar_sendBypassedAtomicResponse;
dna_decrementNumAtomics;
bapdr_sendBypassedAtomicPerformedInDirectoryResponse;
dnpa_decrementNumPendingDirectoryAtomics;
pr_popResponseQueue;
}
@@ -1081,9 +1125,9 @@ machine(MachineType:TCC, "TCC Cache")
transition(A, Data, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
a_allocateBlock;
wardb_writeAtomicResponseDirtyBytes;
pa_performAtomic;
bar_sendBypassedAtomicResponse;
dna_decrementNumAtomics;
baplr_sendBypassedAtomicPerformedLocallyResponse;
pr_popResponseQueue;
}