mem-ruby: Remove VIPER StoreThrough temp cache storage (#1156)

StoreThrough in VIPER when the TCP is disabled, GLC bit is set, or SLC
bit is set will bypass the TCP, but will temporarily allocate a cache
entry seemingly to handle write coalescing with valid blocks. It does
not attempt to evict a block if the set is full and the address is
invalid. This causes a panic if the set is full as there is no spare
cache entry to use temporarily for DataBlk manipulation. However,
a cache block is not required for this.

This commit removes using a cache block for StoreThrough with invalid
blocks as there is no existing data to coalesce with. It creates no
allocate variants of the actions needed in StoreThrough and pulls the
DataBlk information from the in_msg instead. Non-invalid blocks do not
have this panic as they have a cache entry already.

Fixes issues with StoreThroughs on more aggressive architectures like
MI300.

Change-Id: Id8687eccb991e967bb5292068cbe7686e0930d7d
This commit is contained in:
Matthew Poremba
2024-05-28 11:02:00 -07:00
committed by GitHub
parent 5ec1acaf5f
commit e82cf20150

View File

@@ -449,6 +449,28 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
}
}
// Issue a WriteThrough request to the TCC without allocating a TCP cache
// entry. No-allocate variant of the write-through action: the store data and
// write mask are taken directly from the mandatory-queue request (in_msg)
// rather than from a cache_entry, so no temporary block is needed when the
// address is not resident in the TCP.
action(wtna_writeThroughNoAlloc, "wtna", desc="Write through without allocation") {
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
peek(mandatoryQueue_in, RubyRequest) {
out_msg.addr := address;
out_msg.Requestor := machineID;
// Copy only the bytes covered by the request's write mask into the
// outgoing data block.
out_msg.DataBlk.copyPartial(in_msg.WTData, in_msg.writeMask);
out_msg.writeMask := in_msg.writeMask;
// Route to the TCC bank responsible for this address range.
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Data;
out_msg.Type := CoherenceRequestType:WriteThrough;
out_msg.InitialRequestTime := curCycle();
out_msg.Shared := false;
// forward inst sequence number to lower TCC
out_msg.instSeqNum := in_msg.instSeqNum;
// Propagate the GLC/SLC cache-bypass bits from the original request.
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
action(at_atomicThrough, "at", desc="send Atomic") {
peek(mandatoryQueue_in, RubyRequest) {
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
@@ -597,6 +619,19 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
cache_entry.Dirty := true;
}
// Complete a store that missed in the TCP without allocating a cache entry.
// No-allocate variant of the store-done-miss action: the callback data comes
// from the in_msg (the original request) instead of a cache_entry.
action(sna_storeDoneMissNoAlloc, "sna", desc="local store done (misses in TCP)") {
peek(mandatoryQueue_in, RubyRequest) {
// writeCallback requires pass-by-reference and in_msg.WTData is a const value.
DataBlock tmp := in_msg.WTData;
// use_seq_not_coal selects whether completion goes through the
// sequencer or the coalescer.
if (use_seq_not_coal) {
sequencer.writeCallback(address, tmp, false, MachineType:L1Cache);
} else {
coalescer.writeCallback(address, MachineType:L1Cache, tmp);
}
}
}
action(f_flushDone, "f", desc="flush done") {
assert(is_valid(cache_entry));
@@ -755,12 +790,9 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
}
// StoreThrough from the Invalid state: complete the store locally and write
// it through to the TCC without allocating a TCP cache entry.
// NOTE(review): this span is a diff rendering with the +/- markers stripped,
// so removed and added lines appear interleaved. Per the commit message,
// a_allocate, dw_dirtyWrite, s_storeDoneMiss, wt_writeThrough and
// ic_invCache are the REMOVED lines, and sna_storeDoneMissNoAlloc /
// wtna_writeThroughNoAlloc are their ADDED no-alloc replacements — the
// final transition should contain only the no-alloc variants. Confirm
// against the upstream gem5 source.
transition(I, StoreThrough) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
a_allocate;
dw_dirtyWrite;
s_storeDoneMiss;
sna_storeDoneMissNoAlloc;
uu_profileDataMiss;
wt_writeThrough;
ic_invCache;
wtna_writeThroughNoAlloc;
p_popMandatoryQueue;
}