From 7018c2b34e83c592843bd4ad714f93bc6179866d Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 4 Feb 2022 14:25:32 -0600 Subject: [PATCH] mem-ruby: Remove DirectoryMemory storage in MOESI_AMD_BASE-dir This protocol is using an old style where read/writes to memory were being done by writing to a DataBlock in a DirectoryMemory entry. This results in having multiple copies of memory, leads to stale copies in at least one memory (usually DRAM), and require --access-backing-store in most cases to work properly. This changeset removes all references to getDirectoryEntry(...).DataBlk and instead forwards those reads and writes to DRAM always. This results in new transient states BL_WM, BDW_WM, and B_WM which are blocked states waiting on memory acks indicating a write request is complete. The appropriate transitions are updates to move to these new states and stall states are updated to include them. DMA write ACK is also moved to when the request is sent to memory, rather than when the request is received. Change-Id: Ic5bd6a8a8881d7df782e0f7eed8be9d873610e04 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/56446 Reviewed-by: Matt Sinclair Maintainer: Matt Sinclair Tested-by: kokoro --- src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm | 158 ++++++++++++-------- 1 file changed, 94 insertions(+), 64 deletions(-) diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm index 249693576b..bcac2c1cdb 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm @@ -61,26 +61,29 @@ machine(MachineType:Directory, "AMD Baseline protocol") { // STATES state_declaration(State, desc="Directory states", default="Directory_State_U") { - U, AccessPermission:Backing_Store, desc="unblocked"; - BL, AccessPermission:Busy, desc="got L3 WB request"; + U, AccessPermission:Backing_Store, desc="unblocked"; // BL is Busy because it's possible for the data only to be in the network // in the WB, L3 has sent it and gone on with its business in possibly I // state. - BDR_M, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for memory"; - BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; - BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; - B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; - BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; - BDR_PM, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for probes and memory"; - BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; - BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; - B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; - BDW_P, AccessPermission:Backing_Store, desc="DMA write, blocked waiting for probes, no need for memory"; - BDR_Pm, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for probes, already got memory"; - BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; - BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; - B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; - B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; + BL, AccessPermission:Busy, desc="got L3 WB request"; + BL_WM, AccessPermission:Busy, desc="writing L3 WB to memory, waiting for ack"; + BDR_M, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for memory"; + BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + BDR_PM, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for probes and memory"; + BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BDW_P, AccessPermission:Backing_Store, desc="DMA write, blocked waiting for probes, no need for memory"; + BDW_WM, AccessPermission:Backing_Store, desc="DMA write, writing to memory, waiting for ack"; + BDR_Pm, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for probes, already got memory"; + BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_WM, AccessPermission:Backing_Store, desc="writing to memory, waiting for ack"; + B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; } // Events @@ -132,7 +135,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") // DirectoryEntry structure(Entry, desc="...", interface="AbstractCacheEntry", main="false") { State DirectoryState, desc="Directory state"; - DataBlock DataBlk, desc="data for the block"; NetDest VicDirtyIgnore, desc="VicDirty coming from whom to ignore"; } @@ -195,16 +197,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") return dir_entry; } - DataBlock getDataBlock(Addr addr), return_by_ref="yes" { - TBE tbe := TBEs.lookup(addr); - if (is_valid(tbe) && tbe.MemData) { - DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe); - return tbe.DataBlk; - } - DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr)); - return getDirectoryEntry(addr).DataBlk; - } - State getState(TBE tbe, CacheEntry entry, Addr addr) { return getDirectoryEntry(addr).DirectoryState; } @@ -379,7 +371,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") trigger(Event:MemData, in_msg.addr, entry, tbe); DPRINTF(RubySlicc, "%s\n", in_msg); } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) { - trigger(Event:WBAck, in_msg.addr, entry, tbe); // ignore WBAcks, don't care about them. + trigger(Event:WBAck, in_msg.addr, entry, tbe); } else { DPRINTF(RubySlicc, "%s\n", in_msg.Type); error("Invalid message"); @@ -920,7 +912,13 @@ machine(MachineType:Directory, "AMD Baseline protocol") action(d_writeDataToMemory, "d", desc="Write data to memory") { peek(responseNetwork_in, ResponseMsg) { - getDirectoryEntry(address).DataBlk := in_msg.DataBlk; + enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) { + out_msg.addr := address; + out_msg.Type := MemoryRequestType:MEMORY_WB; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Data; + out_msg.DataBlk := in_msg.DataBlk; + } if (tbe.Dirty == false) { // have to update the TBE, too, because of how this // directory deals with functional writes @@ -966,7 +964,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") tbe.WTRequestor := in_msg.WTRequestor; tbe.LastSender := in_msg.Requestor; } - tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs tbe.Dirty := false; if (in_msg.Type == CoherenceRequestType:WriteThrough) { tbe.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask); @@ -980,30 +977,37 @@ machine(MachineType:Directory, "AMD Baseline protocol") } action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") { - if (tbe.Dirty == false) { - getDirectoryEntry(address).DataBlk := tbe.DataBlk; - } TBEs.deallocate(address); unset_tbe(); } action(wd_writeBackData, "wd", desc="Write back data if needed") { - if (tbe.wtData) { - getDirectoryEntry(address).DataBlk.copyPartial(tbe.DataBlk, tbe.writeMask); - } else if (tbe.atomicData) { - tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk,tbe.writeMask); - getDirectoryEntry(address).DataBlk := tbe.DataBlk; - } else if (tbe.Dirty == false) { - getDirectoryEntry(address).DataBlk := tbe.DataBlk; + if (tbe.wtData || tbe.atomicData || tbe.Dirty == false) { + if (tbe.atomicData) { + tbe.DataBlk.atomicPartial(tbe.DataBlk, tbe.writeMask); + } + enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) { + out_msg.addr := address; + out_msg.Type := MemoryRequestType:MEMORY_WB; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Data; + out_msg.DataBlk := tbe.DataBlk; + out_msg.Len := tbe.Len; + DPRINTF(ProtocolTrace, "%s\n", out_msg); + } } } action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") { peek(memQueue_in, MemoryMsg) { if (tbe.wtData == true) { - // do nothing + // Keep the write-through data based on mask, but use the memory block + // for the masked-off data. + DataBlock tmpBlk := in_msg.DataBlk; + tmpBlk.copyPartial(tbe.DataBlk, tbe.writeMask); + tbe.DataBlk := tmpBlk; } else if (tbe.Dirty == false) { - tbe.DataBlk := getDirectoryEntry(address).DataBlk; + tbe.DataBlk := in_msg.DataBlk; } tbe.MemData := true; } @@ -1183,6 +1187,10 @@ machine(MachineType:Directory, "AMD Baseline protocol") stall_and_wait(dmaRequestQueue_in, address); } + action(su_stallAndWaitRequest, "su", desc="Stall and wait on the address") { + stall_and_wait(unblockNetwork_in, address); + } + action(wad_wakeUpDependents, "wad", desc="Wake up any requests waiting for this address") { wakeUpBuffers(address); } @@ -1199,18 +1207,18 @@ machine(MachineType:Directory, "AMD Baseline protocol") } // TRANSITIONS - transition({BL, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) { + transition({BL, BL_WM, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BDW_WM, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, B, B_WM}, {RdBlkS, RdBlkM, RdBlk, CtoD}) { st_stallAndWaitRequest; } // It may be possible to save multiple invalidations here! - transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {Atomic, WriteThrough}) { + transition({BL, BL_WM, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BDW_WM, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, B, B_WM}, {Atomic, WriteThrough}) { st_stallAndWaitRequest; } // The exit state is always going to be U, so wakeUpDependents logic should be covered in all the // transitions which are flowing into U. - transition({BL, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, B}, {DmaRead,DmaWrite}){ + transition({BL, BL_WM, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BDW_WM, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, B, B_WM}, {DmaRead,DmaWrite}){ sd_stallAndWaitRequest; } @@ -1233,7 +1241,6 @@ machine(MachineType:Directory, "AMD Baseline protocol") transition(U, DmaWrite, BDW_P) {L3TagArrayRead} { atd_allocateTBEforDMA; - da_sendResponseDmaAck; icd_probeInvCoreDataForDMA; pd_popDmaRequestQueue; } @@ -1293,12 +1300,10 @@ machine(MachineType:Directory, "AMD Baseline protocol") zz_recycleRequestQueue; } - transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} { + transition(BL, CPUData, BL_WM) {L3TagArrayWrite, L3DataArrayWrite} { d_writeDataToMemory; al_allocateL3Block; pr_profileL3HitMiss; //Must come after al_allocateL3Block and before dt_deallocateTBE - wad_wakeUpDependents; - dt_deallocateTBE; pr_popResponseQueue; } @@ -1308,7 +1313,13 @@ machine(MachineType:Directory, "AMD Baseline protocol") pr_popResponseQueue; } - transition({B, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm}, {VicDirty, VicClean}) { + transition(BL_WM, WBAck, U) { + wad_wakeUpDependents; + dt_deallocateTBE; + pm_popMemQueue; + } + + transition({B, B_WM, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BDW_WM, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm}, {VicDirty, VicClean}) { z_stall; } @@ -1316,7 +1327,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") pm_popMemQueue; } - transition({U, BL, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, B}, StaleVicDirty) { + transition({U, BL, BL_WM, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BDW_WM, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, B, B_WM}, StaleVicDirty) { rv_removeVicDirtyIgnore; w_sendResponseWBAck; p_popRequestQueue; @@ -1376,7 +1387,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") pm_popMemQueue; } - transition(BS_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + transition(BS_M, MemData, B_WM){L3TagArrayWrite, L3DataArrayWrite} { mt_writeMemDataToTBE; s_sendResponseS; wd_writeBackData; @@ -1385,7 +1396,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") pm_popMemQueue; } - transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + transition(BM_M, MemData, B_WM){L3TagArrayWrite, L3DataArrayWrite} { mt_writeMemDataToTBE; m_sendResponseM; wd_writeBackData; @@ -1394,7 +1405,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") pm_popMemQueue; } - transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { + transition(B_M, MemData, B_WM){L3TagArrayWrite, L3DataArrayWrite} { mt_writeMemDataToTBE; es_sendResponseES; wd_writeBackData; @@ -1403,7 +1414,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") pm_popMemQueue; } - transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} { + transition(BS_M, L3Hit, B_WM) {L3TagArrayWrite, L3DataArrayWrite} { s_sendResponseS; wd_writeBackData; alwt_allocateL3BlockOnWT; @@ -1411,7 +1422,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") ptl_popTriggerQueue; } - transition(BM_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} { + transition(BM_M, L3Hit, B_WM) {L3DataArrayWrite, L3TagArrayWrite} { m_sendResponseM; wd_writeBackData; alwt_allocateL3BlockOnWT; @@ -1419,7 +1430,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") ptl_popTriggerQueue; } - transition(B_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} { + transition(B_M, L3Hit, B_WM) {L3DataArrayWrite, L3TagArrayWrite} { es_sendResponseES; wd_writeBackData; alwt_allocateL3BlockOnWT; @@ -1453,12 +1464,18 @@ machine(MachineType:Directory, "AMD Baseline protocol") pt_popTriggerQueue; } - transition(BDW_P, ProbeAcksComplete, U) { + transition(BDW_P, ProbeAcksComplete, BDW_WM) { + wd_writeBackData; + da_sendResponseDmaAck; + pt_popTriggerQueue; + } + + transition(BDW_WM, WBAck, U) { // Check for pending requests from the core we put to sleep while waiting // for a response wada_wakeUpAllDependentsAddr; dt_deallocateTBE; - pt_popTriggerQueue; + pm_popMemQueue; } transition(BDR_Pm, ProbeAcksComplete, U) { @@ -1470,7 +1487,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") pt_popTriggerQueue; } - transition(BS_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + transition(BS_Pm, ProbeAcksComplete, B_WM){L3DataArrayWrite, L3TagArrayWrite} { sf_setForwardReqTime; s_sendResponseS; wd_writeBackData; @@ -1479,7 +1496,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") pt_popTriggerQueue; } - transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + transition(BM_Pm, ProbeAcksComplete, B_WM){L3DataArrayWrite, L3TagArrayWrite} { sf_setForwardReqTime; m_sendResponseM; wd_writeBackData; @@ -1488,7 +1505,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") pt_popTriggerQueue; } - transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} { + transition(B_Pm, ProbeAcksComplete, B_WM){L3DataArrayWrite, L3TagArrayWrite} { sf_setForwardReqTime; es_sendResponseES; wd_writeBackData; @@ -1497,7 +1514,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") pt_popTriggerQueue; } - transition(BP, ProbeAcksComplete, B){L3TagArrayWrite, L3TagArrayWrite} { + transition(BP, ProbeAcksComplete, B_WM){L3TagArrayWrite, L3TagArrayWrite} { sf_setForwardReqTime; c_sendResponseCtoD; wd_writeBackData; @@ -1505,4 +1522,17 @@ machine(MachineType:Directory, "AMD Baseline protocol") dt_deallocateTBE; pt_popTriggerQueue; } + + transition(B_WM, WBAck, B) { + wada_wakeUpAllDependentsAddr; + pm_popMemQueue; + } + + transition(B_WM, UnblockWriteThrough) { + z_stall; + } + + transition(B_WM, CoreUnblock) { + su_stallAndWaitRequest; + } }