From efbfdeabd785c3b01ae66094a4d3801a390c3635 Mon Sep 17 00:00:00 2001 From: NSurawar <139829981+NSurawar@users.noreply.github.com> Date: Thu, 30 May 2024 09:36:29 -0700 Subject: [PATCH] mem-ruby: Reduce handshaking between CorePair and dir (#1117) Currently when data is downgraded by MOESI_AMD_Base-CorePair (e.g. due to a replacement) this requires a 4-way handshake between the CorePair and the dir. Specifically, the CorePair sends a message telling the dir it'd like to downgrade; then the dir sends an ACK back; then the CorePair writes the data back; and finally the dir ACKs the writeback. This is very inefficient and not representative of how modern protocols downgrade a request. Accordingly, this commit updates the downgrade support such that the CorePair writes back the data immediately and then the dir ACKs it. Thus, this approach requires only a 2-way handshake. Change-Id: I7ebc85bb03e8ce46a8847e3240fc170120e9fcd6 Co-authored-by: Neeraj Surawar --- .../ruby/protocol/MOESI_AMD_Base-CorePair.sm | 27 +-------------- src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm | 34 ++++++++++--------- 2 files changed, 19 insertions(+), 42 deletions(-) diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-CorePair.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-CorePair.sm index 7d1bde04dd..0e12b0a9e4 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-CorePair.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-CorePair.sm @@ -730,7 +730,7 @@ machine(MachineType:CorePair, "CP-like Core Coherence") out_msg.DataBlk := cache_entry.DataBlk; assert(cache_entry.Dirty); out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); - out_msg.MessageSize := MessageSizeType:Request_Control; + out_msg.MessageSize := MessageSizeType:Writeback_Data; out_msg.Type := CoherenceRequestType:VicDirty; out_msg.InitialRequestTime := curCycle(); if (cache_entry.CacheState == State:O) { @@ -1114,27 +1114,6 @@ machine(MachineType:CorePair, "CP-like Core Coherence") } } - action(wb_data, "wb", 
desc="write back data") { - peek(responseToCore_in, ResponseMsg) { - enqueue(responseNetwork_out, ResponseMsg, issue_latency) { - out_msg.addr := address; - out_msg.Type := CoherenceResponseType:CPUData; - out_msg.Sender := machineID; - out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); - out_msg.DataBlk := tbe.DataBlk; - out_msg.Dirty := tbe.Dirty; - if (tbe.Shared) { - out_msg.NbReqShared := true; - } else { - out_msg.NbReqShared := false; - } - out_msg.State := CoherenceState:Shared; // faux info - out_msg.MessageSize := MessageSizeType:Writeback_Data; - DPRINTF(RubySlicc, "%s\n", out_msg); - } - } - } - action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") { enqueue(responseNetwork_out, ResponseMsg, issue_latency) { out_msg.addr := address; @@ -2427,19 +2406,16 @@ machine(MachineType:CorePair, "CP-like Core Coherence") } transition(MO_I, NB_AckWB, I) {L2TagArrayWrite} { - wb_data; d_deallocateTBE; pr_popResponseQueue; } transition(ES_I, NB_AckWB, I) {L2TagArrayWrite} { - wb_data; d_deallocateTBE; pr_popResponseQueue; } transition(MO_S0, NB_AckWB, S0) {L2TagArrayWrite} { - wb_data; i2_invL2; a2_allocateL2; d_deallocateTBE; // FOO @@ -2448,7 +2424,6 @@ machine(MachineType:CorePair, "CP-like Core Coherence") } transition(MO_S1, NB_AckWB, S1) {L2TagArrayWrite} { - wb_data; i2_invL2; a2_allocateL2; d_deallocateTBE; // FOO diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm index c36fc9ec93..17a92f5f90 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm @@ -101,7 +101,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") // writebacks VicDirty, desc="..."; VicClean, desc="..."; - CPUData, desc="WB data from CPU"; StaleWB, desc="Notification that WB has been superceded by a probe"; // probe responses @@ -361,8 +360,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") CacheEntry entry := 
static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr)); if (in_msg.Type == CoherenceResponseType:CPUPrbResp) { trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe); - } else if (in_msg.Type == CoherenceResponseType:CPUData) { - trigger(Event:CPUData, in_msg.addr, entry, tbe); } else if (in_msg.Type == CoherenceResponseType:StaleNotif) { trigger(Event:StaleWB, in_msg.addr, entry, tbe); } else { @@ -968,7 +965,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") } action(d_writeDataToMemory, "d", desc="Write data to memory") { - peek(responseNetwork_in, ResponseMsg) { + peek(requestNetwork_in, CPURequestMsg) { enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) { out_msg.addr := address; out_msg.Type := MemoryRequestType:MEMORY_WB; @@ -1175,12 +1172,12 @@ machine(MachineType:Directory, "AMD Baseline protocol") } action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") { - peek(responseNetwork_in, ResponseMsg) { + peek(requestNetwork_in, CPURequestMsg) { if (L3CacheMemory.isTagPresent(address)) { CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) "); entry.DataBlk := in_msg.DataBlk; - entry.LastSender := in_msg.Sender; + entry.LastSender := in_msg.Requestor; assert(is_valid(tbe)); //The controller always allocates a TBE entry upon receipt of a request from L2 caches. //L3Hit flag is used by the hit profiling action pr_profileL3HitMiss to determine hit or miss. 
@@ -1205,7 +1202,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") APPEND_TRANSITION_COMMENT(" al wrote data to L3 "); entry.DataBlk := in_msg.DataBlk; - entry.LastSender := in_msg.Sender; + entry.LastSender := in_msg.Requestor; } } } @@ -1397,15 +1394,25 @@ machine(MachineType:Directory, "AMD Baseline protocol") p_popRequestQueue; } - transition(U, VicDirty, BL) {L3TagArrayRead} { + transition(U, VicDirty) {L3TagArrayRead, L3DataArrayWrite} { t_allocateTBE; w_sendResponseWBAck; + d_writeDataToMemory; + al_allocateL3Block; + pr_profileL3HitMiss; //Must come after al_allocateL3Block and before dt_deallocateTBE + wada_wakeUpAllDependentsAddr; + dt_deallocateTBE; p_popRequestQueue; } - transition(U, VicClean, BL) {L3TagArrayRead} { + transition(U, VicClean) {L3TagArrayRead, L3DataArrayWrite} { t_allocateTBE; w_sendResponseWBAck; + d_writeDataToMemory; + al_allocateL3Block; + pr_profileL3HitMiss; //Must come after al_allocateL3Block and before dt_deallocateTBE + wada_wakeUpAllDependentsAddr; + dt_deallocateTBE; p_popRequestQueue; } @@ -1413,17 +1420,12 @@ machine(MachineType:Directory, "AMD Baseline protocol") zz_recycleRequestQueue; } - transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} { - d_writeDataToMemory; - al_allocateL3Block; - pr_profileL3HitMiss; //Must come after al_allocateL3Block and before dt_deallocateTBE + transition(BL, StaleWB, U) {L3TagArrayWrite} { wada_wakeUpAllDependentsAddr; - dt_deallocateTBE; pr_popResponseQueue; } - transition(BL, StaleWB, U) {L3TagArrayWrite} { - dt_deallocateTBE; + transition(U, StaleWB, U) {L3TagArrayWrite} { wada_wakeUpAllDependentsAddr; pr_popResponseQueue; }