mem-ruby: Reduce handshaking between CorePair and dir (#1117)

Currently, when data is downgraded by MOESI_AMD_Base-CorePair (e.g., due to a replacement), a 4-way handshake between the CorePair and the dir is required. Specifically, the CorePair sends a message telling the dir it would like to downgrade, then the dir sends an ACK back, then the CorePair writes the data back, and finally the dir ACKs the writeback. This is very inefficient and not representative of how modern protocols downgrade a request. Accordingly, this commit updates the downgrade support such that the CorePair writes back the data immediately and the dir then ACKs it. Thus, this approach requires only a 2-way handshake. Change-Id: I7ebc85bb03e8ce46a8847e3240fc170120e9fcd6 Co-authored-by: Neeraj Surawar <neerajs@hyrule.cs.wisc.edu>
2024-05-30 09:36:29 -07:00
parent 7c1207d5c4
commit efbfdeabd7
2 changed files with 19 additions and 42 deletions
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-CorePair.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-CorePair.sm
@@ -730,7 +730,7 @@ machine(MachineType:CorePair, "CP-like Core Coherence")
      out_msg.DataBlk := cache_entry.DataBlk;
      assert(cache_entry.Dirty);
      out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
-      out_msg.MessageSize := MessageSizeType:Request_Control;
+      out_msg.MessageSize := MessageSizeType:Writeback_Data;
      out_msg.Type := CoherenceRequestType:VicDirty;
      out_msg.InitialRequestTime := curCycle();
      if (cache_entry.CacheState == State:O) {
@@ -1114,27 +1114,6 @@ machine(MachineType:CorePair, "CP-like Core Coherence")
    }
  }

-  action(wb_data, "wb", desc="write back data") {
-    peek(responseToCore_in, ResponseMsg) {
-      enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
-        out_msg.addr := address;
-        out_msg.Type := CoherenceResponseType:CPUData;
-        out_msg.Sender := machineID;
-        out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
-        out_msg.DataBlk := tbe.DataBlk;
-        out_msg.Dirty := tbe.Dirty;
-        if (tbe.Shared) {
-          out_msg.NbReqShared := true;
-        } else {
-          out_msg.NbReqShared := false;
-        }
-        out_msg.State := CoherenceState:Shared; // faux info
-        out_msg.MessageSize := MessageSizeType:Writeback_Data;
-        DPRINTF(RubySlicc, "%s\n", out_msg);
-      }
-    }
-  }
-
  action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
    enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
      out_msg.addr := address;
@@ -2427,19 +2406,16 @@ machine(MachineType:CorePair, "CP-like Core Coherence")
  }

  transition(MO_I, NB_AckWB, I) {L2TagArrayWrite} {
-    wb_data;
    d_deallocateTBE;
    pr_popResponseQueue;
  }

  transition(ES_I, NB_AckWB, I) {L2TagArrayWrite} {
-    wb_data;
    d_deallocateTBE;
    pr_popResponseQueue;
  }

  transition(MO_S0, NB_AckWB, S0) {L2TagArrayWrite} {
-    wb_data;
    i2_invL2;
    a2_allocateL2;
    d_deallocateTBE; // FOO
@@ -2448,7 +2424,6 @@ machine(MachineType:CorePair, "CP-like Core Coherence")
  }

  transition(MO_S1, NB_AckWB, S1) {L2TagArrayWrite} {
-    wb_data;
    i2_invL2;
    a2_allocateL2;
    d_deallocateTBE; // FOO
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
@@ -101,7 +101,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
    // writebacks
    VicDirty,           desc="...";
    VicClean,           desc="...";
-    CPUData,            desc="WB data from CPU";
    StaleWB,         desc="Notification that WB has been superceded by a probe";

    // probe responses
@@ -361,8 +360,6 @@ machine(MachineType:Directory, "AMD Baseline protocol")
        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
        if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
          trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe);
-        } else if (in_msg.Type == CoherenceResponseType:CPUData) {
-          trigger(Event:CPUData, in_msg.addr, entry, tbe);
        } else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
            trigger(Event:StaleWB, in_msg.addr, entry, tbe);
        } else {
@@ -968,7 +965,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
  }

  action(d_writeDataToMemory, "d", desc="Write data to memory") {
-    peek(responseNetwork_in, ResponseMsg) {
+    peek(requestNetwork_in, CPURequestMsg) {
      enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
        out_msg.addr := address;
        out_msg.Type := MemoryRequestType:MEMORY_WB;
@@ -1175,12 +1172,12 @@ machine(MachineType:Directory, "AMD Baseline protocol")
  }

  action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") {
-    peek(responseNetwork_in, ResponseMsg) {
+    peek(requestNetwork_in, CPURequestMsg) {
      if (L3CacheMemory.isTagPresent(address)) {
        CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
        APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
        entry.DataBlk := in_msg.DataBlk;
-        entry.LastSender := in_msg.Sender;
+        entry.LastSender := in_msg.Requestor;
        assert(is_valid(tbe));
        //The controller always allocates a TBE entry upon receipt of a request from L2 caches.
        //L3Hit flag is used by the hit profiling action pr_profileL3HitMiss to determine hit or miss.
@@ -1205,7 +1202,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
        APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
        entry.DataBlk := in_msg.DataBlk;

-        entry.LastSender := in_msg.Sender;
+        entry.LastSender := in_msg.Requestor;
      }
    }
  }
@@ -1397,15 +1394,25 @@ machine(MachineType:Directory, "AMD Baseline protocol")
    p_popRequestQueue;
  }

-  transition(U, VicDirty, BL) {L3TagArrayRead} {
+  transition(U, VicDirty) {L3TagArrayRead, L3DataArrayWrite} {
    t_allocateTBE;
    w_sendResponseWBAck;
+    d_writeDataToMemory;
+    al_allocateL3Block;
+    pr_profileL3HitMiss; //Must come after al_allocateL3Block and before dt_deallocateTBE
+    wada_wakeUpAllDependentsAddr;
+    dt_deallocateTBE;
    p_popRequestQueue;
  }

-  transition(U, VicClean, BL) {L3TagArrayRead} {
+  transition(U, VicClean) {L3TagArrayRead, L3DataArrayWrite} {
    t_allocateTBE;
    w_sendResponseWBAck;
+    d_writeDataToMemory;
+    al_allocateL3Block;
+    pr_profileL3HitMiss; //Must come after al_allocateL3Block and before dt_deallocateTBE
+    wada_wakeUpAllDependentsAddr;
+    dt_deallocateTBE;
    p_popRequestQueue;
  }

@@ -1413,17 +1420,12 @@ machine(MachineType:Directory, "AMD Baseline protocol")
    zz_recycleRequestQueue;
  }

-  transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} {
-    d_writeDataToMemory;
-    al_allocateL3Block;
-    pr_profileL3HitMiss; //Must come after al_allocateL3Block and before dt_deallocateTBE
+  transition(BL, StaleWB, U) {L3TagArrayWrite} {
    wada_wakeUpAllDependentsAddr;
-    dt_deallocateTBE;
    pr_popResponseQueue;
  }

-  transition(BL, StaleWB, U) {L3TagArrayWrite} {
-    dt_deallocateTBE;
+  transition(U, StaleWB, U) {L3TagArrayWrite} {
    wada_wakeUpAllDependentsAddr;
    pr_popResponseQueue;
  }