From e82cf20150daffbc06ccb9308267eff4517f728c Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 28 May 2024 11:02:00 -0700 Subject: [PATCH] mem-ruby: Remove VIPER StoreThrough temp cache storage (#1156) StoreThrough in VIPER when the TCP is disabled, GLC bit is set, or SLC bit is set will bypass the TCP, but will temporarily allocate a cache entry seemingly to handle write coalescing with valid blocks. It does not attempt to evict a block if the set is full and the address is invalid. This causes a panic if the set is full as there is no spare cache entry to use temporarily for DataBlk manipulation. However, a cache block is not required for this. This commit removes using a cache block for StoreThrough with invalid blocks as there is no existing data to coalesce with. It creates no-allocate variants of the actions needed in StoreThrough and pulls the DataBlk information from the in_msg instead. Non-invalid blocks do not have this panic as they have a cache entry already. Fixes issues with StoreThroughs on more aggressive architectures like MI300. 
Change-Id: Id8687eccb991e967bb5292068cbe7686e0930d7d --- src/mem/ruby/protocol/GPU_VIPER-TCP.sm | 42 +++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm index 1ad935324c..0d740ef473 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm @@ -449,6 +449,28 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } } + action(wtna_writeThroughNoAlloc, "wtna", desc="Write through without allocation") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + peek(mandatoryQueue_in, RubyRequest) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.DataBlk.copyPartial(in_msg.WTData, in_msg.writeMask); + out_msg.writeMask := in_msg.writeMask; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteThrough; + out_msg.InitialRequestTime := curCycle(); + out_msg.Shared := false; + + // forward inst sequence number to lower TCC + out_msg.instSeqNum := in_msg.instSeqNum; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; + } + } + } + action(at_atomicThrough, "at", desc="send Atomic") { peek(mandatoryQueue_in, RubyRequest) { enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { @@ -597,6 +619,19 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") cache_entry.Dirty := true; } + action(sna_storeDoneMissNoAlloc, "sna", desc="local store done (misses in TCP)") { + peek(mandatoryQueue_in, RubyRequest) { + // writeCallback requires pass-by-reference and in_msg.WTData is a const value. 
+ DataBlock tmp := in_msg.WTData; + + if (use_seq_not_coal) { + sequencer.writeCallback(address, tmp, false, MachineType:L1Cache); + } else { + coalescer.writeCallback(address, MachineType:L1Cache, tmp); + } + } + } + action(f_flushDone, "f", desc="flush done") { assert(is_valid(cache_entry)); @@ -755,12 +790,9 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } transition(I, StoreThrough) {TagArrayRead, TagArrayWrite, DataArrayWrite} { - a_allocate; - dw_dirtyWrite; - s_storeDoneMiss; + sna_storeDoneMissNoAlloc; uu_profileDataMiss; - wt_writeThrough; - ic_invCache; + wtna_writeThroughNoAlloc; p_popMandatoryQueue; }