From 085789d00c4391b6b863981fb25e9cb8a7e7a445 Mon Sep 17 00:00:00 2001
From: Vishnu Ramadas <vramadas@outlook.com>
Date: Fri, 29 Sep 2023 18:19:37 -0500
Subject: [PATCH] mem-ruby: Add flush support to GPU_VIPER protocol

This commit adds flush support to the GPU VIPER coherence protocol. The
L1 cache will now initiate a flush request if the packet it receives
is of type RubyRequestType_FLUSH. During the flush process, the L1 cache
will send a request to L2 if it is in either the V or I state. L2 will
issue a flush request to the directory if its cache line is in the valid
state before invalidating its copy. The directory, on receiving this
request, writes data to memory and sends an ack back to the L2. L2
forwards this ack back to the L1, which then ends the flush by calling
the write callback.

Change-Id: I9dfc0c7b71a1e9f6d5e9e6ed4977c1e6a3b5ba46
---
 src/mem/ruby/protocol/GPU_VIPER-TCC.sm      | 52 ++++++++++++++++++-
 src/mem/ruby/protocol/GPU_VIPER-TCP.sm      | 46 ++++++++++++++++-
 src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm | 55 +++++++++++++++++++++
 3 files changed, 151 insertions(+), 2 deletions(-)

diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
index 20a0979af1..be1243aaa5 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
@@ -65,7 +65,8 @@ machine(MachineType:TCC, "TCC Cache")
     AtomicPassOn,           desc="Atomic Op Passed on to Directory";
     AtomicDone,             desc="AtomicOps Complete";
     AtomicNotDone,          desc="AtomicOps not Complete";
-    Data,                   desc="data messgae";
+    Data,                   desc="Data message";
+    Flush,                  desc="Flush cache entry";
     // Coming from this TCC
     L2_Repl,                desc="L2 Replacement";
     // Probes
@@ -376,6 +377,8 @@ machine(MachineType:TCC, "TCC Cache")
           } else {
             trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
           }
+        } else if (in_msg.Type == CoherenceRequestType:WriteFlush) {
+            trigger(Event:Flush, in_msg.addr, cache_entry, tbe);
         } else {
           DPRINTF(RubySlicc, "%s\n", in_msg);
           error("Unexpected Response Message to Core");
@@ -509,6 +512,20 @@ machine(MachineType:TCC, "TCC Cache")
     }
   }
 
+  action(fw_sendFlushResponse, "fw", desc="send Flush Response") { // Acks a flush from the TCC itself; used by transition(I, Flush).
+    peek(coreRequestNetwork_in, CPURequestMsg) {
+      enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:TDSysWBAck; // the TCP turns TDSysWBAck into its TCC_AckWB event
+        out_msg.Destination.clear();
+        out_msg.Destination.add(in_msg.Requestor); // reply to the requestor of the peeked core request
+        out_msg.Sender := machineID;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+        out_msg.instSeqNum := in_msg.instSeqNum; // echo the request's sequence number
+      }
+    }
+  }
+
   action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") {
     peek(coreRequestNetwork_in, CPURequestMsg) {
         enqueue(responseToCore_out, ResponseMsg, l2_response_latency + glc_atomic_latency, true) {
@@ -628,6 +645,22 @@ machine(MachineType:TCC, "TCC Cache")
     }
   }
 
+  action(f_flush, "f", desc="write back data") { // Sends the cached line to the directory as a WriteFlush request.
+    peek(coreRequestNetwork_in, CPURequestMsg) {
+      enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+        out_msg.addr := address;
+        out_msg.Requestor := machineID;
+        out_msg.WTRequestor := in_msg.Requestor; // requestor of the peeked core request
+        out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
+        out_msg.MessageSize := MessageSizeType:Data;
+        out_msg.Type := CoherenceRequestType:WriteFlush;
+        out_msg.Dirty := true; // always marked dirty, whatever the entry's own state
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.writeMask.orMask(cache_entry.writeMask); // NOTE(review): no writeMask.clear() first, unlike sf_setFlush in the TCP - confirm the mask starts empty
+      }
+    }
+  }
+
   action(at_atomicThrough, "at", desc="write back data") {
     peek(coreRequestNetwork_in, CPURequestMsg) {
       enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
@@ -1075,4 +1108,21 @@ machine(MachineType:TCC, "TCC Cache")
   transition(WIB, WBAck,I) {
     pr_popResponseQueue;
   }
+
+  transition({A, IV, WI, WIB}, Flush) { // no next state given: the state is unchanged by a Flush here
+    st_stallAndWaitRequest; // the request queue is not popped in this transition
+  }
+
+  transition(I, Flush) { // state stays I; nothing is sent to the directory
+    fw_sendFlushResponse; // TDSysWBAck straight back to the requestor
+    p_popRequestQueue;
+  }
+
+  transition({V, W}, Flush, I) {TagArrayRead, TagArrayWrite} {
+    t_allocateTBE; // NOTE(review): no TBE deallocation for this flush path is visible in this patch - confirm
+    ut_updateTag;
+    f_flush; // forwards the line to the directory as a WriteFlush
+    i_invL2;
+    p_popRequestQueue;
+  }
 }
diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
index 7e0ad4ed96..8244879c55 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
@@ -55,6 +55,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
     I, AccessPermission:Invalid, desc="Invalid";
     V, AccessPermission:Read_Only, desc="Valid";
     A, AccessPermission:Invalid, desc="Waiting on Atomic";
+
+    F, AccessPermission:Invalid, desc="Flushing; Waiting for Ack";
   }
 
   enumeration(Event, desc="TCP Events") {
@@ -256,6 +258,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
       peek(responseToTCP_in, ResponseMsg, block_on="addr") {
         Entry cache_entry := getCacheEntry(in_msg.addr);
         TBE tbe := TBEs.lookup(in_msg.addr);
+        DPRINTF(RubySlicc, "In responseToTCP_in with %s\n", in_msg);
+
         if (in_msg.Type == CoherenceResponseType:TDSysResp) {
           if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
               // If L1 is disabled or requests have GLC or SLC flag set,
@@ -273,6 +277,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
         } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck ||
                      in_msg.Type == CoherenceResponseType:NBSysWBAck) {
             trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
+            DPRINTF(RubySlicc, "Issuing TCC_AckWB\n");
           } else {
             error("Unexpected Response Message to Core");
           }
@@ -469,6 +474,24 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
   action(sf_setFlush, "sf", desc="set flush") {
     inFlush := true;
     APPEND_TRANSITION_COMMENT(" inFlush is true");
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { // also send the line to the TCC as a WriteFlush request
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      assert(is_valid(cache_entry)); // transition({V,I}, Flush, F) runs a_allocate before this action
+      out_msg.DataBlk := cache_entry.DataBlk;
+      out_msg.writeMask.clear();
+      out_msg.writeMask.orMask(cache_entry.writeMask);
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Data;
+      out_msg.Type := CoherenceRequestType:WriteFlush;
+      out_msg.InitialRequestTime := curCycle();
+      out_msg.Shared := false;
+      out_msg.isSLCSet := false;
+      peek(mandatoryQueue_in, RubyRequest) { // copy the sequence number from the peeked mandatory-queue request
+        out_msg.instSeqNum := in_msg.instSeqNum;
+      }
+    }
   }
 
   action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
@@ -524,6 +547,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
     cache_entry.Dirty := true;
   }
 
+  action(f_flushDone, "f", desc="flush done") { // Ends a flush by invoking the write callback; used by transition(F, TCC_AckWB, I).
+    assert(is_valid(cache_entry)); // the callback below reads cache_entry.DataBlk
+
+    if (use_seq_not_coal) {
+        sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
+    } else {
+        coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
+    }
+  }
+
   action(inv_invDone, "inv", desc="local inv done") {
     if (use_seq_not_coal) {
         DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n");
@@ -695,11 +728,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
     ic_invCache;
   }
 
-  transition({V, I, A},Flush) {TagArrayFlash} {
+  transition({V,I}, Flush, F) {TagArrayFlash} { // A is now handled by its own Flush transition
+    a_allocate; // runs before sf_setFlush, which asserts is_valid(cache_entry)
     sf_setFlush;
     p_popMandatoryQueue;
   }
 
+  transition(A, Flush) { // A no longer shares the {V,I} Flush transition; state is unchanged
+    z_stall;
+  }
+
   transition({I, V}, Evict, I) {TagArrayFlash} {
     inv_invDone;
     p_popMandatoryQueue;
@@ -716,4 +754,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
     wd_wtDone;
     pr_popResponseQueue;
   }
+
+  transition(F, TCC_AckWB, I) { // the ack (TDSysWBAck or NBSysWBAck) ends the flush
+    f_flushDone; // write callback to the sequencer or coalescer; reads cache_entry
+    pr_popResponseQueue;
+    ic_invCache; // NOTE(review): presumably invalidates the entry, hence ordered after f_flushDone - confirm
+  }
 }
diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
index 774b54a432..eed750832f 100644
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
@@ -83,6 +83,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
     BM_Pm, AccessPermission:Backing_Store,      desc="blocked waiting for probes, already got memory";
     B_Pm, AccessPermission:Backing_Store,       desc="blocked waiting for probes, already got memory";
     B, AccessPermission:Backing_Store,          desc="sent response, Blocked til ack";
+
+    F, AccessPermission:Busy, desc="sent Flush, blocked till ack";
   }
 
   // Events
@@ -120,6 +122,9 @@ machine(MachineType:Directory, "AMD Baseline protocol")
     // DMA
     DmaRead,            desc="DMA read";
     DmaWrite,           desc="DMA write";
+
+    // Flush
+    Flush,              desc="Flush entry";
   }
 
   enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
@@ -411,6 +416,9 @@ machine(MachineType:Directory, "AMD Baseline protocol")
             DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr);
             trigger(Event:VicClean, in_msg.addr, entry, tbe);
           }
+        } else if (in_msg.Type == CoherenceRequestType:WriteFlush) {
+            DPRINTF(RubySlicc, "Got Flush from %s on %s\n", in_msg.Requestor, in_msg.addr);
+            trigger(Event:Flush, in_msg.addr, entry, tbe);
         } else {
           error("Bad request message type");
         }
@@ -562,6 +570,23 @@ machine(MachineType:Directory, "AMD Baseline protocol")
     }
   }
 
+  action(rf_sendResponseFlush, "rf", desc="send Flush Ack") { // NOTE(review): no transition in this patch references this action; U->F uses w_sendResponseWBAck - confirm it is needed
+    peek(memQueue_in, MemoryMsg) { // in_msg is not read below; its only use is commented out
+      enqueue(responseNetwork_out, ResponseMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysWBAck;
+        out_msg.Destination.add(tbe.OriginalRequestor); // addressing comes from the TBE, not from the memory message
+        out_msg.WTRequestor := tbe.WTRequestor;
+        out_msg.Sender := machineID;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+        out_msg.InitialRequestTime := tbe.InitialRequestTime;
+        out_msg.ForwardRequestTime := curCycle();
+        out_msg.ProbeRequestStartTime := curCycle();
+        //out_msg.instSeqNum := in_msg.instSeqNum;
+      }
+    }
+  }
+
   action(l_queueMemWBReq, "lq", desc="Write WB data to memory") {
     peek(responseNetwork_in, ResponseMsg) {
       enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
@@ -933,6 +958,23 @@ machine(MachineType:Directory, "AMD Baseline protocol")
     }
   }
 
+  action(f_writeFlushDataToMemory, "f", desc="Write flush data to memory") { // Queues the flushed block to memory as a MEMORY_WB.
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
+        out_msg.addr := address;
+        out_msg.Type := MemoryRequestType:MEMORY_WB;
+        out_msg.Sender := machineID;
+        out_msg.MessageSize := MessageSizeType:Writeback_Data;
+        out_msg.DataBlk := in_msg.DataBlk; // the whole block from the request; the request's writeMask is not consulted here
+      }
+      if (tbe.Dirty == false) { // reads tbe; transition(U, Flush, F) runs t_allocateTBE before this action
+          // have to update the TBE, too, because of how this
+          // directory deals with functional writes
+        tbe.DataBlk := in_msg.DataBlk;
+      }
+    }
+  }
+
   action(atd_allocateTBEforDMA, "atd", desc="allocate TBE Entry for DMA") {
     check_allocate(TBEs);
     peek(dmaRequestQueue_in, DMARequestMsg) {
@@ -1553,4 +1595,17 @@ machine(MachineType:Directory, "AMD Baseline protocol")
     dt_deallocateTBE;
     pt_popTriggerQueue;
   }
+
+  transition(U, Flush, F) {L3TagArrayRead, L3TagArrayWrite} {
+    t_allocateTBE; // f_writeFlushDataToMemory reads tbe.Dirty, so the TBE is allocated first
+    f_writeFlushDataToMemory; // enqueue the flushed block to memory as a MEMORY_WB
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+  }
+
+  transition(F, WBAck, U) { // return to U and release the TBE allocated on U->F
+    pm_popMemQueue;
+    dt_deallocateTBE;
+  }
+
 }