From 20d8b388ad5de20dfe746c8734fb5cdbb0f6f415 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 8 Mar 2022 12:56:23 -0800 Subject: [PATCH] mem-ruby: Enhance MOESI_AMD DmaWrite This enhances MOESI_AMD_Base-dir DmaWrite to enable partial writes. This is currently done by assuming a full cache line, invalidating caches, and transitioning back to unblocked state. The enhanced write supports partial writes (i.e., smaller than cache line size) by first reading memory, merging the modified data, and then writing back to memory. Implementation of this mirrors that of DmaRead in terms of state. This means for each DmaRead state (BDR_PM, BDR_Pm, and BDR_M) there is a write analogue (BDW_PM, BDW_Pm, and BDR_M) and the BDR_P state is removed. Furthermore, this enhanced DmaWrite ... actually writes data to memory instead of relying on DirectoryEntry / backing store for correct data. There are two possible state transitions for DmaWrite now. (1) Memory data arrives before probe response and (2) probe response arrives before memory data. In case (1), probe data overwrites memory data and merges the partial write using the TBE write mask then updates write mask to 'filled' state. In case (2), probe data is merged with the partial data using the TBE write mask then updates write mask to 'filled' state. The memory data will then be clobbered by copying the TBE data over the response since the write mask is now full. Change-Id: I1eebb882b464c4c5ee5fd60932fd38d271ada4d7 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/57410 Reviewed-by: Matthew Poremba Maintainer: Matthew Poremba Reviewed-by: Matt Sinclair Maintainer: Matt Sinclair Tested-by: kokoro --- src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm | 102 ++++++++++++++------ 1 file changed, 72 insertions(+), 30 deletions(-) diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm index 9176df5313..3b38e3b1ff 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm @@ -61,26 +61,28 @@ machine(MachineType:Directory, "AMD Baseline protocol") { // STATES state_declaration(State, desc="Directory states", default="Directory_State_U") { - U, AccessPermission:Backing_Store, desc="unblocked"; + U, AccessPermission:Backing_Store, desc="unblocked"; BL, AccessPermission:Busy, desc="got L3 WB request"; // BL is Busy because it's possible for the data only to be in the network // in the WB, L3 has sent it and gone on with its business in possibly I // state. - BDR_M, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for memory"; - BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; - BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; - B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; - BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; - BDR_PM, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for probes and memory"; - BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; - BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; - B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; - BDW_P, AccessPermission:Backing_Store, desc="DMA write, blocked waiting for probes, no need for memory"; - BDR_Pm, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for probes, already got memory"; - BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; - BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; - B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; - B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; + BDR_M, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for memory"; + BDW_M, AccessPermission:Backing_Store, desc="DMA write, blocked waiting for memory"; + BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory"; + BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory"; + BDR_PM, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for probes and memory"; + BDW_PM, AccessPermission:Backing_Store, desc="DMA write, blocked waiting for probes and memory"; + BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory"; + BDR_Pm, AccessPermission:Backing_Store, desc="DMA read, blocked waiting for probes, already got memory"; + BDW_Pm, AccessPermission:Backing_Store, desc="DMA write, blocked waiting for probes, already got memory"; + BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; + B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; } // Events @@ -579,7 +581,13 @@ machine(MachineType:Directory, "AMD Baseline protocol") out_msg.Type := TriggerType:L3Hit; } CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address)); - tbe.DataBlk := entry.DataBlk; + + // tbe.DataBlk may have partial data (e.g., for DMA writes). Make sure + // not to clobber the data before merging it with the L3 cache data. + DataBlock tmpBlk := entry.DataBlk; + tmpBlk.copyPartial(tbe.DataBlk, tbe.writeMask); + tbe.DataBlk := tmpBlk; + tbe.L3Hit := true; tbe.MemData := true; L3CacheMemory.deallocate(address); @@ -937,7 +945,14 @@ machine(MachineType:Directory, "AMD Baseline protocol") tbe.wtData := true; tbe.Dirty := true; tbe.DataBlk := in_msg.DataBlk; - tbe.writeMask.fillMask(); + // DMAs can be partial cache line. This is indicated by a "Len" value + // which is non-zero. In this case make sure to set the writeMask + // appropriately. This can occur for either reads or writes. + if (in_msg.Len != 0) { + tbe.writeMask.setMask(addressOffset(in_msg.PhysicalAddress, address), in_msg.Len); + } else { + tbe.writeMask.fillMask(); + } } } } @@ -998,7 +1013,9 @@ machine(MachineType:Directory, "AMD Baseline protocol") peek(memQueue_in, MemoryMsg) { if (tbe.wtData == true) { // Keep the write-through data based on mask, but use the memory block - // for the masked-off data. + // for the masked-off data. If we received a probe with data, the mask + // will be filled and the tbe data will fully overwrite the memory + // data in the temp block. DataBlock tmpBlk := in_msg.DataBlk; tmpBlk.copyPartial(tbe.DataBlk, tbe.writeMask); tbe.DataBlk := tmpBlk; @@ -1199,7 +1216,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") } // TRANSITIONS - transition({BL, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) { + transition({BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) { st_stallAndWaitRequest; } @@ -1210,7 +1227,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") // The exit state is always going to be U, so wakeUpDependents logic should be covered in all the // transitions which are flowing into U. - transition({BL, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, B}, {DmaRead,DmaWrite}){ + transition({BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, {DmaRead,DmaWrite}){ sd_stallAndWaitRequest; } @@ -1231,9 +1248,10 @@ machine(MachineType:Directory, "AMD Baseline protocol") p_popRequestQueue; } - transition(U, DmaWrite, BDW_P) {L3TagArrayRead} { + transition(U, DmaWrite, BDW_PM) {L3TagArrayRead} { atd_allocateTBEforDMA; - da_sendResponseDmaAck; + qdr_queueDmaRdReq; + pr_profileL3HitMiss; //Must come after qdr_queueDmaRdReq icd_probeInvCoreDataForDMA; pd_popDmaRequestQueue; } @@ -1308,15 +1326,15 @@ machine(MachineType:Directory, "AMD Baseline protocol") pr_popResponseQueue; } - transition({B, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm}, {VicDirty, VicClean}) { + transition({B, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm}, {VicDirty, VicClean}) { z_stall; } - transition({U, BL, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, B}, WBAck) { + transition({U, BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, WBAck) { pm_popMemQueue; } - transition({U, BL, BDR_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_P, BS_PM, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, B}, StaleVicDirty) { + transition({U, BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, StaleVicDirty) { rv_removeVicDirtyIgnore; w_sendResponseWBAck; p_popRequestQueue; @@ -1337,6 +1355,11 @@ machine(MachineType:Directory, "AMD Baseline protocol") pm_popMemQueue; } + transition(BDW_PM, MemData, BDW_Pm) { + mt_writeMemDataToTBE; + pm_popMemQueue; + } + transition(BS_PM, MemData, BS_Pm) {} { mt_writeMemDataToTBE; pm_popMemQueue; @@ -1356,6 +1379,10 @@ machine(MachineType:Directory, "AMD Baseline protocol") ptl_popTriggerQueue; } + transition(BDW_PM, L3Hit, BDW_Pm) { + ptl_popTriggerQueue; + } + transition(BS_PM, L3Hit, BS_Pm) {} { ptl_popTriggerQueue; } @@ -1376,6 +1403,15 @@ machine(MachineType:Directory, "AMD Baseline protocol") pm_popMemQueue; } + transition(BDW_M, MemData, U) { + mt_writeMemDataToTBE; + da_sendResponseDmaAck; + wd_writeBackData; + wada_wakeUpAllDependentsAddr; + dt_deallocateTBE; + pm_popMemQueue; + } + transition(BS_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} { mt_writeMemDataToTBE; s_sendResponseS; @@ -1427,7 +1463,7 @@ machine(MachineType:Directory, "AMD Baseline protocol") ptl_popTriggerQueue; } - transition({BDR_PM, BS_PM, BDW_P, BM_PM, B_PM, BDR_Pm, BS_Pm, BM_Pm, B_Pm, BP}, CPUPrbResp) { + transition({BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, BP}, CPUPrbResp) { y_writeProbeDataToTBE; x_decrementAcks; o_checkForCompletion; @@ -1438,6 +1474,10 @@ machine(MachineType:Directory, "AMD Baseline protocol") pt_popTriggerQueue; } + transition(BDW_PM, ProbeAcksComplete, BDW_M) { + pt_popTriggerQueue; + } + transition(BS_PM, ProbeAcksComplete, BS_M) {} { sf_setForwardReqTime; pt_popTriggerQueue; @@ -1453,7 +1493,8 @@ machine(MachineType:Directory, "AMD Baseline protocol") pt_popTriggerQueue; } - transition(BDW_P, ProbeAcksComplete, U) { + transition(BDR_Pm, ProbeAcksComplete, U) { + dd_sendResponseDmaData; // Check for pending requests from the core we put to sleep while waiting // for a response wada_wakeUpAllDependentsAddr; @@ -1461,8 +1502,9 @@ machine(MachineType:Directory, "AMD Baseline protocol") pt_popTriggerQueue; } - transition(BDR_Pm, ProbeAcksComplete, U) { - dd_sendResponseDmaData; + transition(BDW_Pm, ProbeAcksComplete, U) { + da_sendResponseDmaAck; + wd_writeBackData; // Check for pending requests from the core we put to sleep while waiting // for a response wada_wakeUpAllDependentsAddr;