mem,gpu-compute: Implement GPU TCC directed invalidate (#1011)

The GPU device currently supports large BAR which means that the driver
can write directly to GPU memory over the PCI bus without using SDMA or
PM4 packets. The gem5 PCI interface only provides an atomic interface
for BAR reads/writes, which means the values cannot go through timing
mode Ruby caches. This causes bugs as the TCC cache is allowed to keep
clean data between kernels for performance reasons. If there is a BAR
write directly to memory bypassing the cache, the value in the cache is
stale and must be invalidated.

In this commit a TCC invalidate is generated for all writes over PCI
that go directly to GPU memory. This will also invalidate TCP along the
way if necessary. This currently relies on the driver synchronization
which only allows BAR writes in between kernels. Therefore, the cache
should only be in I or V state.

To handle a race condition between invalidates and launching the next
kernel, the invalidates return a response and the GPU command processor
will wait for all TCC invalidates to be complete before launching the
next kernel.

This fixes issues with stale data in nanoGPT and possibly PENNANT.
This commit is contained in:
Matthew Poremba
2024-04-15 13:18:01 -07:00
committed by GitHub
15 changed files with 252 additions and 15 deletions

View File

@@ -1096,6 +1096,7 @@ class Request : public Extensible<Request>
* setting extraFlags should be done via setCacheCoherenceFlags().
*/
bool isInvL1() const { return _cacheCoherenceFlags.isSet(INV_L1); }
bool isInvL2() const { return _cacheCoherenceFlags.isSet(GL2_CACHE_INV); }
bool
isGL2CacheFlush() const

View File

@@ -72,6 +72,7 @@ machine(MachineType:TCC, "TCC Cache")
L2_Repl, desc="L2 Replacement";
// Probes
PrbInv, desc="Invalidating probe";
InvCache, desc="Invalidating probe from TCP";
// Coming from Memory Controller
WBAck, desc="writethrough ack from memory";
Bypass, desc="Bypass the entire L2 cache";
@@ -413,6 +414,8 @@ machine(MachineType:TCC, "TCC Cache")
}
} else if (in_msg.Type == CoherenceRequestType:WriteFlush) {
trigger(Event:Flush, in_msg.addr, cache_entry, tbe);
} else if (in_msg.Type == CoherenceRequestType:InvCache) {
trigger(Event:InvCache, in_msg.addr, cache_entry, tbe);
} else {
DPRINTF(RubySlicc, "%s\n", in_msg);
error("Unexpected Response Message to Core");
@@ -429,6 +432,19 @@ machine(MachineType:TCC, "TCC Cache")
unset_cache_entry();
}
// Ack a directed L2 invalidate (InvCache) back to the requesting TCP.
// The requestor is taken from the CPURequestMsg currently at the head
// of the core request network.
action(ir_invL2Resp, "ir", desc="send L2 invalidate ack") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
// InvL2Resp tells the TCP that the TCC invalidate has completed.
out_msg.Type := CoherenceResponseType:InvL2Resp;
out_msg.Sender := machineID;
out_msg.Destination.add(in_msg.Requestor);
// Control-only message; carries no data payload.
out_msg.MessageSize := MessageSizeType:Response_Control;
DPRINTF(RubySlicc, "%s\n", out_msg);
}
}
}
action(sd_sendData, "sd", desc="send Shared response") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
@@ -1188,6 +1204,12 @@ machine(MachineType:TCC, "TCC Cache")
i_invL2;
}
// Directed invalidate from a TCP (generated for BAR writes straight to
// GPU memory): drop the line if present, ack the requestor, and consume
// the request. Only I and V are legal here because the driver permits
// BAR writes only between kernels (see commit description above).
transition({I, V}, InvCache, I) {TagArrayRead, TagArrayWrite} {
i_invL2;
ir_invL2Resp;
p_popRequestQueue;
}
transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
pi_sendProbeResponseInv;
pp_popProbeQueue;

View File

@@ -75,6 +75,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
Evict, desc="Evict if clean(invL1 for Load Acquire)";
// Mem sys initiated
Repl, desc="Replacing block from cache";
InvL2, desc="Invalidate to L2";
InvL2Resp, desc="Invalidate L2 completed";
// TCC initiated
TCC_Ack, desc="TCC Ack to Core Request";
@@ -286,9 +288,12 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
in_msg.Type == CoherenceResponseType:NBSysWBAck) {
trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
DPRINTF(RubySlicc, "Issuing TCC_AckWB\n");
} else {
error("Unexpected Response Message to Core");
}
} else if (in_msg.Type == CoherenceResponseType:InvL2Resp) {
DPRINTF(RubySlicc, "Issuing InvL2Resp\n");
trigger(Event:InvL2Resp, in_msg.addr, cache_entry, tbe);
} else {
error("Unexpected Response Message to Core");
}
}
}
}
@@ -333,6 +338,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
trigger(Event:Flush, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:REPLACEMENT){
trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe);
} else if (in_msg.Type == RubyRequestType:InvL2){
trigger(Event:InvL2, in_msg.LineAddress, cache_entry, tbe);
} else {
error("Unexpected Request Message from VIC");
}
@@ -609,6 +616,31 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
}
}
// Forward an invalidate for this address to the TCC (L2).
action(il2_invL2, "il2", desc="Invalidate address in L2") {
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceRequestType:InvCache;
out_msg.Requestor := machineID;
// Route to the TCC slice that owns this address range.
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Request_Control;
out_msg.InitialRequestTime := curCycle();
// Propagate the GLC/SLC cache-bypass bits from the originating
// mandatory-queue request.
peek(mandatoryQueue_in, RubyRequest) {
out_msg.isGLCSet := in_msg.isGLCSet;
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
}
// Notify the request issuer that the TCC invalidate has completed.
// Only the VIPER coalescer implements invTCCCallback; the Sequencer
// path is unsupported and fails the assert deliberately.
action(i2r_invL2Resp, "i2r", desc="Invalidate L2 completed") {
if (use_seq_not_coal) {
DPRINTF(RubySlicc, "Sequencer does not define invTCCCallback!\n");
assert(false);
} else {
coalescer.invTCCCallback(address);
}
}
action(wd_wtDone, "wd", desc="writethrough done") {
if (use_seq_not_coal) {
DPRINTF(RubySlicc, "Sequencer does not define writeCompleteCallback!\n");
@@ -830,6 +862,22 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
p_popMandatoryQueue;
}
// Line not present in TCP: just forward the invalidate to the TCC and
// consume the mandatory-queue request.
transition(I, InvL2) {
    il2_invL2;
    p_popMandatoryQueue;
}

// Line valid in TCP: drop the local copy first, then forward the
// invalidate to the TCC.
transition(V, InvL2, I) {
    ic_invCache;  // fixed: action invocation was missing its ';'
    il2_invL2;
    p_popMandatoryQueue;
}

// TCC acknowledged the invalidate: invoke the coalescer callback for
// the completed TCC invalidate and pop the response queue.
transition(I, InvL2Resp) {
    i2r_invL2Resp;
    pr_popResponseQueue;
}
// if a line is in IV and a TCC_AckWB comes back, we must have had a WT
// store followed by a load. Thus, complete the store without affecting
// TBE or line state.

View File

@@ -62,6 +62,7 @@ structure (VIPERCoalescer, external = "yes") {
Cycles, Cycles, Cycles, bool);
void atomicCallback(Addr, MachineType, DataBlock);
void invTCPCallback(Addr);
void invTCCCallback(Addr);
void writeCompleteCallback(Addr, uint64_t);
void evictionCallback(Addr);
}

View File

@@ -46,6 +46,7 @@ enumeration(CoherenceRequestType, desc="Coherence Request Types") {
WriteThroughFifo, desc="WriteThrough with no data";
WriteThroughDummy, desc="WriteThrough with no data for atomic operation";
WriteFlush, desc="Release Flush";
InvCache, desc="Invalidate Cache";
WrCancel, desc="want to cancel WB to Memory"; // should this be here?
@@ -95,6 +96,7 @@ enumeration(CoherenceResponseType, desc="Coherence Response Types") {
StaleNotif, desc="Notification of Stale WBAck, No data to writeback";
CPUCancelWB, desc="want to cancel WB to Memory";
MemData, desc="Data from Memory";
InvL2Resp, desc="Invalidate L2 response";
// for regions
PrivateAck, desc="Ack that r-buf received private notify";

View File

@@ -181,6 +181,7 @@ enumeration(RubyRequestType, desc="...", default="RubyRequestType_NULL") {
COMMIT, desc="Commit version";
NULL, desc="Invalid request type";
FLUSH, desc="Flush request type";
InvL2, desc="Invalidate L2";
Release, desc="Release operation";
Acquire, desc="Acquire opertion";
AcquireRelease, desc="Acquire and Release opertion";

View File

@@ -669,14 +669,14 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
// all packets must have valid instruction sequence numbers
assert(pkt->req->hasInstSeqNum());
if (pkt->cmd == MemCmd::MemSyncReq) {
// issue mem_sync requests immediately to the cache system without
// going through uncoalescedTable like normal LD/ST/Atomic requests
issueMemSyncRequest(pkt);
} else {
// all packets must have valid instruction sequence numbers
assert(pkt->req->hasInstSeqNum());
// otherwise, this must be either read or write command
assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush());

View File

@@ -80,6 +80,7 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
// VIPER does not expect MemSyncReq & Release since compute unit
// does not specify an equivalent type of memory request.
assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) ||
(pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL2()) ||
pkt->cmd == MemCmd::ReadReq ||
pkt->cmd == MemCmd::WriteReq ||
pkt->cmd == MemCmd::FlushReq ||
@@ -106,6 +107,10 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
invTCP();
}
if (pkt->req->isInvL2()) {
invTCC(pkt);
}
return RequestStatus_Issued;
}
@@ -306,5 +311,51 @@ VIPERCoalescer::invTCP()
m_num_pending_invs);
}
/**
 * Callback invoked when a directed TCC (L2) invalidate completes.
 * Converts every packet that was waiting on the invalidate of this
 * address into a response, hands it back to its originating port, and
 * removes the bookkeeping entry.
 *
 * @param addr Line address whose TCC invalidate has completed.
 */
void
VIPERCoalescer::invTCCCallback(Addr addr)
{
    // Use find() rather than operator[] so a callback for an address
    // with no pending invalidates does not default-construct (and then
    // erase) an empty vector in the map.
    auto entry = m_pending_invl2s.find(addr);
    if (entry == m_pending_invl2s.end()) {
        return;
    }

    for (auto& pkt : entry->second) {
        RubyPort::SenderState *ss =
            safe_cast<RubyPort::SenderState *>(pkt->senderState);
        MemResponsePort *port = ss->port;
        assert(port != nullptr);

        // Now convert to MemSyncResp
        pkt->makeResponse();
        pkt->senderState = ss->predecessor;
        delete ss;
        port->hitCallback(pkt);
    }
    m_pending_invl2s.erase(entry);
}
/*
 * Send an invalidate to a specific address in the TCC.
 */
void
VIPERCoalescer::invTCC(PacketPtr pkt)
{
    assert(pkt);
    assert(pkt->req);

    const Addr line_addr = pkt->req->getPaddr();
    const RubyRequestType req_type = RubyRequestType_InvL2;

    // Build the Ruby request for the directed L2 invalidate. Size and
    // PC are zero; there is no data payload.
    auto msg = std::make_shared<RubyRequest>(clockEdge(), line_addr, 0, 0,
                                             req_type,
                                             RubyAccessMode_Supervisor,
                                             nullptr);

    DPRINTF(GPUCoalescer, "Sending L2 invalidate to 0x%x\n", line_addr);

    assert(m_mandatory_q_ptr);
    const Tick latency =
        cyclesToTicks(m_controller->mandatoryQueueLatency(req_type));
    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);

    // Remember the packet so invTCCCallback can respond once the TCC
    // acknowledges the invalidate.
    m_pending_invl2s[line_addr].push_back(pkt);
}
} // namespace ruby
} // namespace gem5

View File

@@ -63,11 +63,13 @@ class VIPERCoalescer : public GPUCoalescer
~VIPERCoalescer();
void writeCompleteCallback(Addr address, uint64_t instSeqNum);
void invTCPCallback(Addr address);
void invTCCCallback(Addr address);
RequestStatus makeRequest(PacketPtr pkt) override;
void issueRequest(CoalescedRequest* crequest) override;
private:
void invTCP();
void invTCC(PacketPtr pkt);
// make write-complete response packets from original write request packets
void makeWriteCompletePkts(CoalescedRequest* crequest);
@@ -79,6 +81,9 @@ class VIPERCoalescer : public GPUCoalescer
// number of remaining cache lines to be invalidated in TCP
int m_num_pending_invs;
// outstanding L2 invalidate packets
std::unordered_map<Addr, std::vector<PacketPtr>> m_pending_invl2s;
// a map of instruction sequence number and corresponding pending
// write-complete response packets. Each write-complete response
// corresponds to a pending store request that is waiting for