mem-ruby: SLICC Fixes to GLC Atomics in WB L2 (#397)

Made the following changes to fix the behavior of GLC atomics in a WB
L2:
- Stored the atomic write mask in the TBE for GLC atomics on an invalid
line that are bypassed to the directory but have their atomics performed
on the return path.
- Replaced the !presentOrAvail() check for bypassing atomics to the
directory (which are then performed on the return path) with a check for
an invalid line state.
- Replaced the wdb_writeDirtyBytes action used when performing atomics
with the owm_orWriteMask action, which does not write from the invalid
atomic request's data block.
   - Fixed atomic return path actions

Change-Id: I6a406c313d2f9c88cd75bfe39187ef94ce84098f
This commit is contained in:
Daniel Kouchekinia
2023-11-09 15:15:10 -06:00
committed by GitHub
parent 0442c9a88c
commit 1204267fd8

View File

@@ -106,16 +106,17 @@ machine(MachineType:TCC, "TCC Cache")
}
structure(TBE, desc="...") {
State TBEState, desc="Transient state";
DataBlock DataBlk, desc="data for the block";
bool Dirty, desc="Is the data dirty?";
bool Shared, desc="Victim hit by shared probe";
MachineID From, desc="Waiting for writeback from...";
NetDest Destination, desc="Data destination";
int numAtomics, desc="number remaining atomics";
int atomicDoneCnt, desc="number AtomicDones triggered";
bool isGLCSet, desc="Bypass L1 Cache";
bool isSLCSet, desc="Bypass L1 and L2 Cache";
State TBEState, desc="Transient state";
DataBlock DataBlk, desc="data for the block";
bool Dirty, desc="Is the data dirty?";
bool Shared, desc="Victim hit by shared probe";
MachineID From, desc="Waiting for writeback from...";
NetDest Destination, desc="Data destination";
int numPendingDirectoryAtomics, desc="number of pending atomics to be performed in directory";
int atomicDoneCnt, desc="number AtomicDones triggered";
bool isGLCSet, desc="Bypass L1 Cache";
bool isSLCSet, desc="Bypass L1 and L2 Cache";
WriteMask atomicWriteMask, desc="Atomic write mask";
}
structure(TBETable, external="yes") {
@@ -265,13 +266,15 @@ machine(MachineType:TCC, "TCC Cache")
TBE tbe := TBEs.lookup(in_msg.addr);
Entry cache_entry := getCacheEntry(in_msg.addr);
// The trigger queue applies only to atomics performed in the directory.
// There is a possible race where multiple AtomicDone triggers can be
// sent if another Atomic to the same address is issued after the
// AtomicDone is triggered but before the message arrives here. For
// that case we count the number of AtomicDones in flight for this
// address and only call AtomicDone to deallocate the TBE when it is
// the last in flight message.
if (tbe.numAtomics == 0 && tbe.atomicDoneCnt == 1) {
if (tbe.numPendingDirectoryAtomics == 0 && tbe.atomicDoneCnt == 1) {
trigger(Event:AtomicDone, in_msg.addr, cache_entry, tbe);
} else {
trigger(Event:AtomicNotDone, in_msg.addr, cache_entry, tbe);
@@ -370,7 +373,7 @@ machine(MachineType:TCC, "TCC Cache")
// TCC will perform the atomic on the return path on Event:Data.
// The action will invalidate the cache line if SLC is set and the address is
// in the cache.
if(in_msg.isSLCSet || !WB || !presentOrAvail(in_msg.addr)) {
if(in_msg.isSLCSet || !WB) {
trigger(Event:AtomicPassOn, in_msg.addr, cache_entry, tbe);
} else {
trigger(Event:Atomic, in_msg.addr, cache_entry, tbe);
@@ -549,7 +552,23 @@ machine(MachineType:TCC, "TCC Cache")
cache_entry.DataBlk.clearAtomicLogEntries();
}
action(bar_sendBypassedAtomicResponse, "bar", desc="send bypassed Atomic Ack") {
action(baplr_sendBypassedAtomicPerformedLocallyResponse, "barplr", desc="send locally-performed bypassed Atomic Ack") {
peek(responseFromNB_in, ResponseMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:TDSysResp;
out_msg.Destination.add(in_msg.WTRequestor);
out_msg.Sender := machineID;
out_msg.MessageSize := in_msg.MessageSize;
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.isGLCSet := tbe.isGLCSet;
out_msg.isSLCSet := tbe.isSLCSet;
}
}
cache_entry.DataBlk.clearAtomicLogEntries();
}
action(bapdr_sendBypassedAtomicPerformedInDirectoryResponse, "bapdr", desc="send bypassed Atomic Ack") {
peek(responseFromNB_in, ResponseMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
@@ -585,7 +604,7 @@ machine(MachineType:TCC, "TCC Cache")
TBEs.allocate(address);
set_tbe(TBEs.lookup(address));
tbe.Destination.clear();
tbe.numAtomics := 0;
tbe.numPendingDirectoryAtomics := 0;
tbe.atomicDoneCnt := 0;
}
if (coreRequestNetwork_in.isReady(clockEdge())) {
@@ -595,6 +614,10 @@ machine(MachineType:TCC, "TCC Cache")
}
tbe.isGLCSet := in_msg.isGLCSet;
tbe.isSLCSet := in_msg.isSLCSet;
if(in_msg.Type == CoherenceRequestType:Atomic){
tbe.atomicWriteMask.clear();
tbe.atomicWriteMask.orMask(in_msg.writeMask);
}
}
}
}
@@ -620,6 +643,20 @@ machine(MachineType:TCC, "TCC Cache")
}
}
action(wardb_writeAtomicResponseDirtyBytes, "wardb", desc="write data to TCC") {
peek(responseFromNB_in, ResponseMsg) {
cache_entry.DataBlk := in_msg.DataBlk;
cache_entry.writeMask.orMask(tbe.atomicWriteMask);
DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
}
}
action(owm_orWriteMask, "owm", desc="or TCCs write mask") {
peek(coreRequestNetwork_in, CPURequestMsg) {
cache_entry.writeMask.orMask(in_msg.writeMask);
}
}
action(wt_writeThrough, "wt", desc="write back data") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
@@ -726,14 +763,20 @@ machine(MachineType:TCC, "TCC Cache")
}
action(ina_incrementNumAtomics, "ina", desc="inc num atomics") {
tbe.numAtomics := tbe.numAtomics + 1;
action(inpa_incrementNumPendingDirectoryAtomics, "inpa", desc="inc num atomics") {
// Only increment number of atomics if they will actually be performed in directory
// That is, if the SLC bit is set or if the cache is write through
peek(coreRequestNetwork_in, CPURequestMsg) {
if (in_msg.isSLCSet || !WB) {
tbe.numPendingDirectoryAtomics := tbe.numPendingDirectoryAtomics + 1;
}
}
}
action(dna_decrementNumAtomics, "dna", desc="inc num atomics") {
tbe.numAtomics := tbe.numAtomics - 1;
if (tbe.numAtomics==0) {
action(dnpa_decrementNumPendingDirectoryAtomics, "dnpa", desc="dec num atomics") {
tbe.numPendingDirectoryAtomics := tbe.numPendingDirectoryAtomics - 1;
if (tbe.numPendingDirectoryAtomics==0) {
enqueue(triggerQueue_out, TriggerMsg, 1) {
tbe.atomicDoneCnt := tbe.atomicDoneCnt + 1;
out_msg.addr := address;
@@ -876,17 +919,8 @@ machine(MachineType:TCC, "TCC Cache")
transition(V, Atomic, M) {TagArrayRead, DataArrayWrite} {
p_profileHit;
wdb_writeDirtyBytes;
pa_performAtomic;
ar_sendAtomicResponse;
p_popRequestQueue;
}
transition(I, Atomic, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
p_profileMiss;
a_allocateBlock;
ut_updateTag;
wdb_writeDirtyBytes;
owm_orWriteMask;
pa_performAtomic;
ar_sendAtomicResponse;
p_popRequestQueue;
@@ -900,9 +934,19 @@ machine(MachineType:TCC, "TCC Cache")
st_stallAndWaitRequest;
}
transition({M, W}, Atomic) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
transition(W, Atomic, WI) {
t_allocateTBE;
wb_writeBack;
// need to try this request again after writing back the current entry -- to
// do so, put it with other stalled requests in a buffer to reduce resource
// contention since they won't try again every cycle and will instead only
// try again once woken up
st_stallAndWaitRequest;
}
transition(M, Atomic) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
p_profileHit;
wdb_writeDirtyBytes;
owm_orWriteMask;
pa_performAtomic;
ar_sendAtomicResponse;
p_popRequestQueue;
@@ -917,16 +961,16 @@ machine(MachineType:TCC, "TCC Cache")
i_invL2;
t_allocateTBE;
at_atomicThrough;
ina_incrementNumAtomics;
inpa_incrementNumPendingDirectoryAtomics;
p_popRequestQueue;
}
transition(I, AtomicPassOn, A) {TagArrayRead} {
transition(I, {Atomic, AtomicPassOn}, A) {TagArrayRead} {
p_profileMiss;
i_invL2;
t_allocateTBE;
at_atomicThrough;
ina_incrementNumAtomics;
inpa_incrementNumPendingDirectoryAtomics;
p_popRequestQueue;
}
@@ -1058,8 +1102,8 @@ machine(MachineType:TCC, "TCC Cache")
}
transition(A, Bypass) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
bar_sendBypassedAtomicResponse;
dna_decrementNumAtomics;
bapdr_sendBypassedAtomicPerformedInDirectoryResponse;
dnpa_decrementNumPendingDirectoryAtomics;
pr_popResponseQueue;
}
@@ -1081,9 +1125,9 @@ machine(MachineType:TCC, "TCC Cache")
transition(A, Data, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
a_allocateBlock;
wardb_writeAtomicResponseDirtyBytes;
pa_performAtomic;
bar_sendBypassedAtomicResponse;
dna_decrementNumAtomics;
baplr_sendBypassedAtomicPerformedLocallyResponse;
pr_popResponseQueue;
}