mem-ruby: Add flush support to GPU_VIPER protocol

This commit adds flush support to the GPU VIPER coherence protocol. The
L1 cache will now initiate a flush request if the packet it receives
is of type RubyRequestType_FLUSH. During the flush process, the L1 cache
will send a request to L2 if it is in either the V or I state. L2 will issue a
flush request to the directory if its cache line is in the valid
state before invalidating its copy. The directory, on receiving this
request, writes data to memory and sends an ack back to the L2. L2
forwards this ack back to the L1, which then ends the flush by calling
the write callback.

Change-Id: I9dfc0c7b71a1e9f6d5e9e6ed4977c1e6a3b5ba46
This commit is contained in:
Vishnu Ramadas
2023-09-29 18:19:37 -05:00
parent 61e39d5b26
commit 085789d00c
3 changed files with 151 additions and 2 deletions

View File

@@ -65,7 +65,8 @@ machine(MachineType:TCC, "TCC Cache")
AtomicPassOn, desc="Atomic Op Passed on to Directory";
AtomicDone, desc="AtomicOps Complete";
AtomicNotDone, desc="AtomicOps not Complete";
Data, desc="data messgae";
Data, desc="Data message";
Flush, desc="Flush cache entry";
// Coming from this TCC
L2_Repl, desc="L2 Replacement";
// Probes
@@ -376,6 +377,8 @@ machine(MachineType:TCC, "TCC Cache")
} else {
trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
}
} else if (in_msg.Type == CoherenceRequestType:WriteFlush) {
trigger(Event:Flush, in_msg.addr, cache_entry, tbe);
} else {
DPRINTF(RubySlicc, "%s\n", in_msg);
error("Unexpected Response Message to Core");
@@ -509,6 +512,20 @@ machine(MachineType:TCC, "TCC Cache")
}
}
// Acknowledge a flush that required no writeback (line not valid in L2):
// send a TDSysWBAck straight back to the requesting core.
action(fw_sendFlushResponse, "fw", desc="send Flush Response") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:TDSysWBAck;
// Route the ack only to the core that issued the flush.
out_msg.Destination.clear();
out_msg.Destination.add(in_msg.Requestor);
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
// Echo the request's sequence number so the requestor can
// match this ack to its outstanding flush.
out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency + glc_atomic_latency, true) {
@@ -628,6 +645,22 @@ machine(MachineType:TCC, "TCC Cache")
}
}
// Forward the flushed line to the directory as a WriteFlush so the
// data is written back to memory; the directory will ack when done.
action(f_flush, "f", desc="write back data") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
out_msg.addr := address;
out_msg.Requestor := machineID;
// Record the original L1 requestor so the directory's ack can
// be routed back through us to the right core.
out_msg.WTRequestor := in_msg.Requestor;
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.MessageSize := MessageSizeType:Data;
out_msg.Type := CoherenceRequestType:WriteFlush;
out_msg.Dirty := true;
out_msg.DataBlk := cache_entry.DataBlk;
// Carry the byte-enable mask so only written bytes are merged
// into memory.
out_msg.writeMask.orMask(cache_entry.writeMask);
}
}
}
action(at_atomicThrough, "at", desc="write back data") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
@@ -1075,4 +1108,21 @@ machine(MachineType:TCC, "TCC Cache")
transition(WIB, WBAck,I) {
pr_popResponseQueue;
}
// A flush arriving while another transaction is in flight for this
// line (atomic or writeback pending) must wait for it to resolve.
transition({A, IV, WI, WIB}, Flush) {
st_stallAndWaitRequest;
}
// No valid copy in L2: nothing to write back, ack the flush at once.
transition(I, Flush) {
fw_sendFlushResponse;
p_popRequestQueue;
}
// Valid (or written) copy: push the data to the directory via a
// WriteFlush, then invalidate our copy; the directory's WBAck will be
// forwarded to the core later.
transition({V, W}, Flush, I) {TagArrayRead, TagArrayWrite} {
t_allocateTBE;
ut_updateTag;
f_flush;
i_invL2;
p_popRequestQueue;
}
}

View File

@@ -55,6 +55,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
I, AccessPermission:Invalid, desc="Invalid";
V, AccessPermission:Read_Only, desc="Valid";
A, AccessPermission:Invalid, desc="Waiting on Atomic";
F, AccessPermission:Invalid, desc="Flushing; Waiting for Ack";
}
enumeration(Event, desc="TCP Events") {
@@ -256,6 +258,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
peek(responseToTCP_in, ResponseMsg, block_on="addr") {
Entry cache_entry := getCacheEntry(in_msg.addr);
TBE tbe := TBEs.lookup(in_msg.addr);
DPRINTF(RubySlicc, "In responseToTCP_in with %s\n", in_msg);
if (in_msg.Type == CoherenceResponseType:TDSysResp) {
if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
// If L1 is disabled or requests have GLC or SLC flag set,
@@ -273,6 +277,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
} else if (in_msg.Type == CoherenceResponseType:TDSysWBAck ||
in_msg.Type == CoherenceResponseType:NBSysWBAck) {
trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
DPRINTF(RubySlicc, "Issuing TCC_AckWB\n");
} else {
error("Unexpected Response Message to Core");
}
@@ -469,6 +474,24 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
// Start a flush from the L1 (TCP): mark the flush in progress and send
// a WriteFlush request, carrying the line's data and write mask, to the
// TCC slice responsible for this address.
action(sf_setFlush, "sf", desc="set flush") {
inFlush := true;
APPEND_TRANSITION_COMMENT(" inFlush is true");
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Requestor := machineID;
assert(is_valid(cache_entry));
out_msg.DataBlk := cache_entry.DataBlk;
// Reset then copy the entry's mask so exactly the locally-written
// bytes are flushed.
out_msg.writeMask.clear();
out_msg.writeMask.orMask(cache_entry.writeMask);
// Interleaved TCC slices: pick the slice owning this address.
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Data;
out_msg.Type := CoherenceRequestType:WriteFlush;
out_msg.InitialRequestTime := curCycle();
out_msg.Shared := false;
out_msg.isSLCSet := false;
// Propagate the triggering request's sequence number so the
// eventual ack can be matched to this flush.
peek(mandatoryQueue_in, RubyRequest) {
out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
@@ -524,6 +547,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
cache_entry.Dirty := true;
}
// Complete the flush at the L1: notify the CPU-side sequencer or the
// GPU-side coalescer (whichever fronts this cache) via writeCallback.
action(f_flushDone, "f", desc="flush done") {
assert(is_valid(cache_entry));
if (use_seq_not_coal) {
sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
} else {
coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
}
action(inv_invDone, "inv", desc="local inv done") {
if (use_seq_not_coal) {
DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n");
@@ -695,11 +728,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
ic_invCache;
}
transition({V, I, A},Flush) {TagArrayFlash} {
// Begin a flush from V or I: allocate an entry if needed, send the
// WriteFlush toward the TCC, and move to F to await the ack.
transition({V,I}, Flush, F) {TagArrayFlash} {
a_allocate;
sf_setFlush;
p_popMandatoryQueue;
}
// An atomic is outstanding for this line: hold the flush until it
// completes.
transition(A, Flush) {
z_stall;
}
transition({I, V}, Evict, I) {TagArrayFlash} {
inv_invDone;
p_popMandatoryQueue;
@@ -716,4 +754,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
wd_wtDone;
pr_popResponseQueue;
}
// Flush ack arrived from the TCC: signal completion to the core via
// the write callback and invalidate the local copy.
transition(F, TCC_AckWB, I) {
f_flushDone;
pr_popResponseQueue;
ic_invCache;
}
}

View File

@@ -83,6 +83,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack";
F, AccessPermission:Busy, desc="sent Flus, blocked till ack";
}
// Events
@@ -120,6 +122,9 @@ machine(MachineType:Directory, "AMD Baseline protocol")
// DMA
DmaRead, desc="DMA read";
DmaWrite, desc="DMA write";
// Flush
Flush, desc="Flush entry";
}
enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
@@ -411,6 +416,9 @@ machine(MachineType:Directory, "AMD Baseline protocol")
DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr);
trigger(Event:VicClean, in_msg.addr, entry, tbe);
}
} else if (in_msg.Type == CoherenceRequestType:WriteFlush) {
DPRINTF(RubySlicc, "Got Flush from %s on %s\n", in_msg.Requestor, in_msg.addr);
trigger(Event:Flush, in_msg.addr, entry, tbe);
} else {
error("Bad request message type");
}
@@ -562,6 +570,23 @@ machine(MachineType:Directory, "AMD Baseline protocol")
}
}
// After memory confirms the flush writeback, send an NBSysWBAck back
// to the original requestor recorded in the TBE.
action(rf_sendResponseFlush, "rf", desc="send Flush Ack") {
peek(memQueue_in, MemoryMsg) {
enqueue(responseNetwork_out, ResponseMsg, 1) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:NBSysWBAck;
// Destination and WT requestor come from the TBE, not the
// memory message, since memory replies carry no requestor info.
out_msg.Destination.add(tbe.OriginalRequestor);
out_msg.WTRequestor := tbe.WTRequestor;
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
out_msg.InitialRequestTime := tbe.InitialRequestTime;
out_msg.ForwardRequestTime := curCycle();
out_msg.ProbeRequestStartTime := curCycle();
// NOTE(review): MemoryMsg does not carry instSeqNum, hence the
// disabled line below — presumably the L2 restores it when
// forwarding this ack; verify against the TCC path.
//out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
action(l_queueMemWBReq, "lq", desc="Write WB data to memory") {
peek(responseNetwork_in, ResponseMsg) {
enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
@@ -933,6 +958,23 @@ machine(MachineType:Directory, "AMD Baseline protocol")
}
}
// Issue the memory writeback for a WriteFlush request, pushing the
// flushed data block to the memory controller.
action(f_writeFlushDataToMemory, "f", desc="Write flush data to memory") {
peek(requestNetwork_in, CPURequestMsg) {
enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
out_msg.addr := address;
out_msg.Type := MemoryRequestType:MEMORY_WB;
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := in_msg.DataBlk;
}
if (tbe.Dirty == false) {
// have to update the TBE, too, because of how this
// directory deals with functional writes
tbe.DataBlk := in_msg.DataBlk;
}
}
}
action(atd_allocateTBEforDMA, "atd", desc="allocate TBE Entry for DMA") {
check_allocate(TBEs);
peek(dmaRequestQueue_in, DMARequestMsg) {
@@ -1553,4 +1595,17 @@ machine(MachineType:Directory, "AMD Baseline protocol")
dt_deallocateTBE;
pt_popTriggerQueue;
}
// Flush request reaches the directory: allocate a TBE, start the
// memory writeback, send the WB ack, and wait in F for memory's reply.
transition(U, Flush, F) {L3TagArrayRead, L3TagArrayWrite} {
t_allocateTBE;
f_writeFlushDataToMemory;
w_sendResponseWBAck;
p_popRequestQueue;
}
// Memory confirmed the writeback: clean up and return to U.
transition(F, WBAck, U) {
pm_popMemQueue;
dt_deallocateTBE;
}
}