From e82cf20150daffbc06ccb9308267eff4517f728c Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 28 May 2024 11:02:00 -0700 Subject: [PATCH] mem-ruby: Remove VIPER StoreThrough temp cache storage (#1156) StoreThrough in VIPER when the TCP is disabled, GLC bit is set, or SLC bit is set will bypass the TCP, but will temporarily allocate a cache entry seemingly to handle write coalescing with valid blocks. It does not attempt to evict a block if the set is full and the address is invalid. This causes a panic if the set is full as there is no spare cache entry to use temporarily for DataBlk manipulation. However, a cache block is not required for this. This commit removes using a cache block for StoreThrough with invalid blocks as there is no existing data to coalesce with. It creates no-allocate variants of the actions needed in StoreThrough and pulls the DataBlk information from the in_msg instead. Non-invalid blocks do not have this panic as they have a cache entry already. Fixes issues with StoreThroughs on more aggressive architectures like MI300. 
Change-Id: Id8687eccb991e967bb5292068cbe7686e0930d7d --- src/mem/ruby/protocol/GPU_VIPER-TCP.sm | 42 +++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm index 1ad935324c..0d740ef473 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm @@ -449,6 +449,28 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } } + action(wtna_writeThroughNoAlloc, "wtna", desc="Write through without allocation") { + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + peek(mandatoryQueue_in, RubyRequest) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.DataBlk.copyPartial(in_msg.WTData, in_msg.writeMask); + out_msg.writeMask := in_msg.writeMask; + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteThrough; + out_msg.InitialRequestTime := curCycle(); + out_msg.Shared := false; + + // forward inst sequence number to lower TCC + out_msg.instSeqNum := in_msg.instSeqNum; + out_msg.isGLCSet := in_msg.isGLCSet; + out_msg.isSLCSet := in_msg.isSLCSet; + } + } + } + action(at_atomicThrough, "at", desc="send Atomic") { peek(mandatoryQueue_in, RubyRequest) { enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { @@ -597,6 +619,19 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") cache_entry.Dirty := true; } + action(sna_storeDoneMissNoAlloc, "sna", desc="local store done (misses in TCP)") { + peek(mandatoryQueue_in, RubyRequest) { + // writeCallback requires pass-by-reference and in_msg.WTData is a const value. 
+ DataBlock tmp := in_msg.WTData; + + if (use_seq_not_coal) { + sequencer.writeCallback(address, tmp, false, MachineType:L1Cache); + } else { + coalescer.writeCallback(address, MachineType:L1Cache, tmp); + } + } + } + action(f_flushDone, "f", desc="flush done") { assert(is_valid(cache_entry)); @@ -755,12 +790,9 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } transition(I, StoreThrough) {TagArrayRead, TagArrayWrite, DataArrayWrite} { - a_allocate; - dw_dirtyWrite; - s_storeDoneMiss; + sna_storeDoneMissNoAlloc; uu_profileDataMiss; - wt_writeThrough; - ic_invCache; + wtna_writeThroughNoAlloc; p_popMandatoryQueue; }