mem-ruby: Add SQC cache invalidation support to GPU VIPER
This commit adds support for cache invalidation in the GPU VIPER protocol's SQC cache. To support this, it also adds an L1 cache invalidation framework to the Sequencer: the Sequencer sends out an invalidation request for each line in the cache and declares completion once all lines have been evicted. Change-Id: I2f52eacabb2412b16f467f994e985c378230f841
This commit is contained in:
@@ -60,6 +60,7 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
|
||||
// Mem sys initiated
|
||||
Repl, desc="Replacing block from cache";
|
||||
Data, desc="Received Data";
|
||||
Evict, desc="Evict cache line";
|
||||
}
|
||||
|
||||
enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
|
||||
@@ -67,6 +68,7 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
|
||||
DataArrayWrite, desc="Write the data array";
|
||||
TagArrayRead, desc="Read the data array";
|
||||
TagArrayWrite, desc="Write the data array";
|
||||
TagArrayFlash, desc="Flash clear the data array";
|
||||
}
|
||||
|
||||
|
||||
@@ -242,7 +244,12 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
|
||||
peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
|
||||
Entry cache_entry := getCacheEntry(in_msg.LineAddress);
|
||||
TBE tbe := TBEs.lookup(in_msg.LineAddress);
|
||||
trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe);
|
||||
DPRINTF(RubySlicc, "%s\n", in_msg);
|
||||
if (in_msg.Type == RubyRequestType:REPLACEMENT) {
|
||||
trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe);
|
||||
} else {
|
||||
trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -313,6 +320,11 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
|
||||
APPEND_TRANSITION_COMMENT(cache_entry.DataBlk);
|
||||
}
|
||||
|
||||
action(inv_invDone, "inv", desc="local inv done") {
|
||||
sequencer.invL1Callback();
|
||||
}
|
||||
|
||||
|
||||
action(w_writeCache, "w", desc="write data to cache") {
|
||||
peek(responseToSQC_in, ResponseMsg) {
|
||||
assert(is_valid(cache_entry));
|
||||
@@ -350,6 +362,13 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
|
||||
ic_invCache;
|
||||
}
|
||||
|
||||
transition({I, IV, V}, Evict, I) {TagArrayRead, TagArrayWrite} {
|
||||
// since we're evicting something, don't bother classifying as hit/miss
|
||||
ic_invCache;
|
||||
inv_invDone;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
// if we got a response for a load where the line is in I, then
|
||||
// another request must have come in that replaced the line in question in
|
||||
// the cache. Thus, complete this request without allocating the line, but
|
||||
|
||||
@@ -157,6 +157,9 @@ structure (Sequencer, external = "yes") {
|
||||
void llscClearLocalMonitor();
|
||||
|
||||
void evictionCallback(Addr);
|
||||
|
||||
void invL1Callback();
|
||||
|
||||
void recordRequestType(SequencerRequestType);
|
||||
bool checkResourceAvailable(CacheResourceType, Addr);
|
||||
}
|
||||
|
||||
@@ -85,6 +85,8 @@ Sequencer::Sequencer(const Params &p)
|
||||
|
||||
m_runningGarnetStandalone = p.garnet_standalone;
|
||||
|
||||
m_num_pending_invs = 0;
|
||||
m_cache_inv_pkt = nullptr;
|
||||
|
||||
// These statistical variables are not for display.
|
||||
// The profiler will collate these across different
|
||||
@@ -348,6 +350,10 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType primary_type,
|
||||
return RequestStatus_Ready;
|
||||
}
|
||||
|
||||
if (pkt->cmd == MemCmd::MemSyncReq) {
|
||||
return RequestStatus_Aliased;
|
||||
}
|
||||
|
||||
Addr line_addr = makeLineAddress(pkt->getAddr());
|
||||
// Check if there is any outstanding request for the same cache line.
|
||||
auto &seq_req_list = m_RequestTable[line_addr];
|
||||
@@ -576,7 +582,8 @@ Sequencer::readCallback(Addr address, DataBlock& data,
|
||||
}
|
||||
if ((seq_req.m_type != RubyRequestType_LD) &&
|
||||
(seq_req.m_type != RubyRequestType_Load_Linked) &&
|
||||
(seq_req.m_type != RubyRequestType_IFETCH)) {
|
||||
(seq_req.m_type != RubyRequestType_IFETCH) &&
|
||||
(seq_req.m_type != RubyRequestType_REPLACEMENT)) {
|
||||
// Write request: reissue request to the cache hierarchy
|
||||
issueRequest(seq_req.pkt, seq_req.m_second_type);
|
||||
break;
|
||||
@@ -811,6 +818,86 @@ Sequencer::unaddressedCallback(Addr unaddressedReqId,
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Sequencer::completeHitCallback(std::vector<PacketPtr> & mylist)
|
||||
{
|
||||
for (auto& pkt : mylist) {
|
||||
// When Ruby is in warmup or cooldown phase, the requests come
|
||||
// from the cache recorder. They do not track which port to use
|
||||
// and do not need to send the response back
|
||||
if (!RubySystem::getWarmupEnabled()
|
||||
&& !RubySystem::getCooldownEnabled()) {
|
||||
RubyPort::SenderState *ss =
|
||||
safe_cast<RubyPort::SenderState *>(pkt->senderState);
|
||||
MemResponsePort *port = ss->port;
|
||||
assert(port != NULL);
|
||||
|
||||
pkt->senderState = ss->predecessor;
|
||||
|
||||
if (pkt->cmd != MemCmd::WriteReq) {
|
||||
// for WriteReq, we keep the original senderState until
|
||||
// writeCompleteCallback
|
||||
delete ss;
|
||||
}
|
||||
|
||||
port->hitCallback(pkt);
|
||||
trySendRetries();
|
||||
}
|
||||
}
|
||||
|
||||
RubySystem *rs = m_ruby_system;
|
||||
if (RubySystem::getWarmupEnabled()) {
|
||||
rs->m_cache_recorder->enqueueNextFetchRequest();
|
||||
} else if (RubySystem::getCooldownEnabled()) {
|
||||
rs->m_cache_recorder->enqueueNextFlushRequest();
|
||||
} else {
|
||||
testDrainComplete();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Sequencer::invL1Callback()
|
||||
{
|
||||
// Since L1 invalidate is currently done with paddr = 0
|
||||
assert(m_cache_inv_pkt && m_num_pending_invs > 0);
|
||||
|
||||
m_num_pending_invs--;
|
||||
|
||||
if (m_num_pending_invs == 0) {
|
||||
std::vector<PacketPtr> pkt_list { m_cache_inv_pkt };
|
||||
m_cache_inv_pkt = nullptr;
|
||||
completeHitCallback(pkt_list);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Sequencer::invL1()
|
||||
{
|
||||
int size = m_dataCache_ptr->getNumBlocks();
|
||||
DPRINTF(RubySequencer,
|
||||
"There are %d Invalidations outstanding before Cache Walk\n",
|
||||
m_num_pending_invs);
|
||||
// Walk the cache
|
||||
for (int i = 0; i < size; i++) {
|
||||
Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
|
||||
// Evict Read-only data
|
||||
RubyRequestType request_type = RubyRequestType_REPLACEMENT;
|
||||
std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
|
||||
clockEdge(), addr, 0, 0,
|
||||
request_type, RubyAccessMode_Supervisor,
|
||||
nullptr);
|
||||
DPRINTF(RubySequencer, "Evicting addr 0x%x\n", addr);
|
||||
assert(m_mandatory_q_ptr != NULL);
|
||||
Tick latency = cyclesToTicks(
|
||||
m_controller->mandatoryQueueLatency(request_type));
|
||||
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
|
||||
m_num_pending_invs++;
|
||||
}
|
||||
DPRINTF(RubySequencer,
|
||||
"There are %d Invalidations outstanding after Cache Walk\n",
|
||||
m_num_pending_invs);
|
||||
}
|
||||
|
||||
bool
|
||||
Sequencer::empty() const
|
||||
{
|
||||
@@ -915,6 +1002,11 @@ Sequencer::makeRequest(PacketPtr pkt)
|
||||
}
|
||||
} else if (pkt->isFlush()) {
|
||||
primary_type = secondary_type = RubyRequestType_FLUSH;
|
||||
} else if (pkt->cmd == MemCmd::MemSyncReq) {
|
||||
primary_type = secondary_type = RubyRequestType_REPLACEMENT;
|
||||
assert(!m_cache_inv_pkt);
|
||||
m_cache_inv_pkt = pkt;
|
||||
invL1();
|
||||
} else {
|
||||
panic("Unsupported ruby packet type\n");
|
||||
}
|
||||
|
||||
@@ -141,6 +141,10 @@ class Sequencer : public RubyPort
|
||||
const Cycles forwardRequestTime = Cycles(0),
|
||||
const Cycles firstResponseTime = Cycles(0));
|
||||
|
||||
void completeHitCallback(std::vector<PacketPtr>& list);
|
||||
void invL1Callback();
|
||||
void invL1();
|
||||
|
||||
RequestStatus makeRequest(PacketPtr pkt) override;
|
||||
virtual bool empty() const;
|
||||
int outstandingCount() const override { return m_outstanding_count; }
|
||||
@@ -243,6 +247,10 @@ class Sequencer : public RubyPort
|
||||
private:
|
||||
int m_max_outstanding_requests;
|
||||
|
||||
int m_num_pending_invs;
|
||||
|
||||
PacketPtr m_cache_inv_pkt;
|
||||
|
||||
CacheMemory* m_dataCache_ptr;
|
||||
|
||||
// The cache access latency for top-level caches (L0/L1). These are
|
||||
|
||||
Reference in New Issue
Block a user