mem-ruby: Reduce handshaking between CorePair and dir (#1117)

Currently, when data is downgraded by MOESI_AMD_Base-CorePair (e.g., due
to a replacement), this requires a 4-way handshake between the CorePair
and the dir. Specifically, the CorePair sends a message telling the dir
it would like to downgrade, then the dir sends an ACK back, then the
CorePair writes the data back, and finally the dir ACKs the writeback.
This is very inefficient and not representative of how modern protocols
downgrade a request. Accordingly, this commit updates the downgrade
support such that the CorePair writes back the data immediately and then
the dir ACKs it. Thus, this approach requires only a 2-way handshake.

Change-Id: I7ebc85bb03e8ce46a8847e3240fc170120e9fcd6

Co-authored-by: Neeraj Surawar <neerajs@hyrule.cs.wisc.edu>
This commit is contained in:
NSurawar
2024-05-30 09:36:29 -07:00
committed by GitHub
parent 7c1207d5c4
commit efbfdeabd7
2 changed files with 19 additions and 42 deletions

View File

@@ -730,7 +730,7 @@ machine(MachineType:CorePair, "CP-like Core Coherence")
out_msg.DataBlk := cache_entry.DataBlk;
assert(cache_entry.Dirty);
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.Type := CoherenceRequestType:VicDirty;
out_msg.InitialRequestTime := curCycle();
if (cache_entry.CacheState == State:O) {
@@ -1114,27 +1114,6 @@ machine(MachineType:CorePair, "CP-like Core Coherence")
}
}
action(wb_data, "wb", desc="write back data") {
peek(responseToCore_in, ResponseMsg) {
enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:CPUData;
out_msg.Sender := machineID;
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.DataBlk := tbe.DataBlk;
out_msg.Dirty := tbe.Dirty;
if (tbe.Shared) {
out_msg.NbReqShared := true;
} else {
out_msg.NbReqShared := false;
}
out_msg.State := CoherenceState:Shared; // faux info
out_msg.MessageSize := MessageSizeType:Writeback_Data;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
}
action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
out_msg.addr := address;
@@ -2427,19 +2406,16 @@ machine(MachineType:CorePair, "CP-like Core Coherence")
}
transition(MO_I, NB_AckWB, I) {L2TagArrayWrite} {
wb_data;
d_deallocateTBE;
pr_popResponseQueue;
}
transition(ES_I, NB_AckWB, I) {L2TagArrayWrite} {
wb_data;
d_deallocateTBE;
pr_popResponseQueue;
}
transition(MO_S0, NB_AckWB, S0) {L2TagArrayWrite} {
wb_data;
i2_invL2;
a2_allocateL2;
d_deallocateTBE; // FOO
@@ -2448,7 +2424,6 @@ machine(MachineType:CorePair, "CP-like Core Coherence")
}
transition(MO_S1, NB_AckWB, S1) {L2TagArrayWrite} {
wb_data;
i2_invL2;
a2_allocateL2;
d_deallocateTBE; // FOO

View File

@@ -101,7 +101,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
// writebacks
VicDirty, desc="...";
VicClean, desc="...";
CPUData, desc="WB data from CPU";
StaleWB, desc="Notification that WB has been superceded by a probe";
// probe responses
@@ -361,8 +360,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe);
} else if (in_msg.Type == CoherenceResponseType:CPUData) {
trigger(Event:CPUData, in_msg.addr, entry, tbe);
} else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
trigger(Event:StaleWB, in_msg.addr, entry, tbe);
} else {
@@ -968,7 +965,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
}
action(d_writeDataToMemory, "d", desc="Write data to memory") {
peek(responseNetwork_in, ResponseMsg) {
peek(requestNetwork_in, CPURequestMsg) {
enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
out_msg.addr := address;
out_msg.Type := MemoryRequestType:MEMORY_WB;
@@ -1175,12 +1172,12 @@ machine(MachineType:Directory, "AMD Baseline protocol")
}
action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") {
peek(responseNetwork_in, ResponseMsg) {
peek(requestNetwork_in, CPURequestMsg) {
if (L3CacheMemory.isTagPresent(address)) {
CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
entry.DataBlk := in_msg.DataBlk;
entry.LastSender := in_msg.Sender;
entry.LastSender := in_msg.Requestor;
assert(is_valid(tbe));
//The controller always allocates a TBE entry upon receipt of a request from L2 caches.
//L3Hit flag is used by the hit profiling action pr_profileL3HitMiss to determine hit or miss.
@@ -1205,7 +1202,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
entry.DataBlk := in_msg.DataBlk;
entry.LastSender := in_msg.Sender;
entry.LastSender := in_msg.Requestor;
}
}
}
@@ -1397,15 +1394,25 @@ machine(MachineType:Directory, "AMD Baseline protocol")
p_popRequestQueue;
}
transition(U, VicDirty, BL) {L3TagArrayRead} {
transition(U, VicDirty) {L3TagArrayRead, L3DataArrayWrite} {
t_allocateTBE;
w_sendResponseWBAck;
d_writeDataToMemory;
al_allocateL3Block;
pr_profileL3HitMiss; //Must come after al_allocateL3Block and before dt_deallocateTBE
wada_wakeUpAllDependentsAddr;
dt_deallocateTBE;
p_popRequestQueue;
}
transition(U, VicClean, BL) {L3TagArrayRead} {
transition(U, VicClean) {L3TagArrayRead, L3DataArrayWrite} {
t_allocateTBE;
w_sendResponseWBAck;
d_writeDataToMemory;
al_allocateL3Block;
pr_profileL3HitMiss; //Must come after al_allocateL3Block and before dt_deallocateTBE
wada_wakeUpAllDependentsAddr;
dt_deallocateTBE;
p_popRequestQueue;
}
@@ -1413,17 +1420,12 @@ machine(MachineType:Directory, "AMD Baseline protocol")
zz_recycleRequestQueue;
}
transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} {
d_writeDataToMemory;
al_allocateL3Block;
pr_profileL3HitMiss; //Must come after al_allocateL3Block and before dt_deallocateTBE
transition(BL, StaleWB, U) {L3TagArrayWrite} {
wada_wakeUpAllDependentsAddr;
dt_deallocateTBE;
pr_popResponseQueue;
}
transition(BL, StaleWB, U) {L3TagArrayWrite} {
dt_deallocateTBE;
transition(U, StaleWB, U) {L3TagArrayWrite} {
wada_wakeUpAllDependentsAddr;
pr_popResponseQueue;
}