mem,gpu-compute: Implement GPU TCC directed invalidate (#1011)
The GPU device currently supports large BAR which means that the driver can write directly to GPU memory over the PCI bus without using SDMA or PM4 packets. The gem5 PCI interface only provides an atomic interface for BAR reads/writes, which means the values cannot go through timing mode Ruby caches. This causes bugs as the TCC cache is allowed to keep clean data between kernels for performance reasons. If there is a BAR write directly to memory bypassing the cache, the value in the cache is stale and must be invalidated. In this commit a TCC invalidate is generated for all writes over PCI that go directly to GPU memory. This will also invalidate TCP along the way if necessary. This currently relies on the driver synchronization which only allows BAR writes in between kernels. Therefore, the cache should only be in I or V state. To handle a race condition between invalidates and launching the next kernel, the invalidates return a response and the GPU command processor will wait for all TCC invalidates to be complete before launching the next kernel. This fixes issues with stale data in nanoGPT and possibly PENNANT.
This commit is contained in:
@@ -420,6 +420,12 @@ AMDGPUDevice::writeFrame(PacketPtr pkt, Addr offset)
|
||||
{
|
||||
DPRINTF(AMDGPUDevice, "Wrote framebuffer address %#lx\n", offset);
|
||||
|
||||
for (auto& cu: CP()->shader()->cuList) {
|
||||
auto system = CP()->shader()->gpuCmdProc.system();
|
||||
Addr aligned_addr = offset & ~(system->cacheLineSize() - 1);
|
||||
cu->sendInvL2(aligned_addr);
|
||||
}
|
||||
|
||||
Addr aperture = gpuvm.getFrameAperture(offset);
|
||||
Addr aperture_offset = offset - aperture;
|
||||
|
||||
|
||||
@@ -864,6 +864,25 @@ ComputeUnit::DataPort::handleResponse(PacketPtr pkt)
|
||||
// - kernel end
|
||||
// - non-kernel mem sync
|
||||
|
||||
// Non-kernel mem sync not from an instruction
|
||||
if (!gpuDynInst) {
|
||||
// If there is no dynamic instruction, a CU must be present.
|
||||
ComputeUnit *cu = sender_state->computeUnit;
|
||||
assert(cu != nullptr);
|
||||
|
||||
if (pkt->req->isInvL2()) {
|
||||
cu->shader->decNumOutstandingInvL2s();
|
||||
assert(cu->shader->getNumOutstandingInvL2s() >= 0);
|
||||
} else {
|
||||
panic("Unknown MemSyncResp not from an instruction");
|
||||
}
|
||||
|
||||
// Cleanup and return, no other response events needed.
|
||||
delete pkt->senderState;
|
||||
delete pkt;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Kernel Launch
|
||||
// wavefront was nullptr when launching kernel, so it is meaningless
|
||||
// here (simdId=-1, wfSlotId=-1)
|
||||
@@ -1403,6 +1422,23 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ComputeUnit::sendInvL2(Addr paddr)
|
||||
{
|
||||
auto req = std::make_shared<Request>(paddr, 64, 0, vramRequestorId());
|
||||
req->setCacheCoherenceFlags(Request::GL2_CACHE_INV);
|
||||
|
||||
auto pkt = new Packet(req, MemCmd::MemSyncReq);
|
||||
pkt->pushSenderState(
|
||||
new ComputeUnit::DataPort::SenderState(this, 0, nullptr));
|
||||
|
||||
EventFunctionWrapper *mem_req_event = memPort[0].createMemReqEvent(pkt);
|
||||
|
||||
schedule(mem_req_event, curTick() + req_tick_latency);
|
||||
|
||||
shader->incNumOutstandingInvL2s();
|
||||
}
|
||||
|
||||
void
|
||||
ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
|
||||
{
|
||||
@@ -1701,16 +1737,20 @@ ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
|
||||
} else if (!(sendTimingReq(pkt))) {
|
||||
retries.push_back(std::make_pair(pkt, gpuDynInst));
|
||||
|
||||
DPRINTF(GPUPort,
|
||||
"CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
|
||||
compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
|
||||
id, pkt->req->getPaddr());
|
||||
if (gpuDynInst) {
|
||||
DPRINTF(GPUPort,
|
||||
"CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
|
||||
compute_unit->cu_id, gpuDynInst->simdId,
|
||||
gpuDynInst->wfSlotId, id, pkt->req->getPaddr());
|
||||
}
|
||||
} else {
|
||||
DPRINTF(GPUPort,
|
||||
"CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
|
||||
"req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
|
||||
gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
|
||||
pkt->req->getPaddr());
|
||||
if (gpuDynInst) {
|
||||
DPRINTF(GPUPort,
|
||||
"CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data"
|
||||
" req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
|
||||
gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id,
|
||||
pkt->req->getPaddr());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -474,6 +474,8 @@ class ComputeUnit : public ClockedObject
|
||||
|
||||
void handleSQCReturn(PacketPtr pkt);
|
||||
|
||||
void sendInvL2(Addr paddr);
|
||||
|
||||
protected:
|
||||
RequestorID _requestorId;
|
||||
|
||||
@@ -527,6 +529,7 @@ class ComputeUnit : public ClockedObject
|
||||
|
||||
struct SenderState : public Packet::SenderState
|
||||
{
|
||||
ComputeUnit *computeUnit = nullptr;
|
||||
GPUDynInstPtr _gpuDynInst;
|
||||
PortID port_index;
|
||||
Packet::SenderState *saved;
|
||||
@@ -536,6 +539,12 @@ class ComputeUnit : public ClockedObject
|
||||
: _gpuDynInst(gpuDynInst),
|
||||
port_index(_port_index),
|
||||
saved(sender_state) { }
|
||||
|
||||
SenderState(ComputeUnit *cu, PortID _port_index,
|
||||
Packet::SenderState *sender_state=nullptr)
|
||||
: computeUnit(cu),
|
||||
port_index(_port_index),
|
||||
saved(sender_state) { }
|
||||
};
|
||||
|
||||
class SystemHubEvent : public Event
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
#include "debug/GPUKernelInfo.hh"
|
||||
#include "dev/amdgpu/amdgpu_device.hh"
|
||||
#include "gpu-compute/dispatcher.hh"
|
||||
#include "gpu-compute/shader.hh"
|
||||
#include "mem/abstract_mem.hh"
|
||||
#include "mem/packet_access.hh"
|
||||
#include "mem/se_translating_port_proxy.hh"
|
||||
@@ -126,6 +127,21 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
|
||||
unsigned akc_alignment_granularity = 64;
|
||||
assert(!(disp_pkt->kernel_object & (akc_alignment_granularity - 1)));
|
||||
|
||||
/**
|
||||
* Make sure there is not a race condition with invalidates in the L2
|
||||
* cache. The full system driver may write directly to memory using
|
||||
* large BAR while the L2 cache is allowed to keep data in the valid
|
||||
* state between kernel launches. This is a rare event but is required
|
||||
* for correctness.
|
||||
*/
|
||||
if (shader()->getNumOutstandingInvL2s() > 0) {
|
||||
DPRINTF(GPUCommandProc,
|
||||
"Deferring kernel launch due to outstanding L2 invalidates\n");
|
||||
shader()->addDeferredDispatch(raw_pkt, queue_id, host_pkt_addr);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/**
|
||||
* Need to use a raw pointer for DmaVirtDevice API. This is deleted
|
||||
* in the dispatchKernelObject method.
|
||||
|
||||
@@ -552,6 +552,29 @@ Shader::notifyCuSleep() {
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Shader::decNumOutstandingInvL2s()
|
||||
{
|
||||
num_outstanding_invl2s--;
|
||||
|
||||
if (num_outstanding_invl2s == 0 && !deferred_dispatches.empty()) {
|
||||
for (auto &dispatch : deferred_dispatches) {
|
||||
gpuCmdProc.submitDispatchPkt(std::get<0>(dispatch),
|
||||
std::get<1>(dispatch),
|
||||
std::get<2>(dispatch));
|
||||
}
|
||||
deferred_dispatches.clear();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Shader::addDeferredDispatch(void *raw_pkt, uint32_t queue_id,
|
||||
Addr host_pkt_addr)
|
||||
{
|
||||
deferred_dispatches.push_back(
|
||||
std::make_tuple(raw_pkt, queue_id, host_pkt_addr));
|
||||
}
|
||||
|
||||
/**
|
||||
* Forward the VRAM requestor ID needed for device memory from CP.
|
||||
*/
|
||||
|
||||
@@ -104,6 +104,11 @@ class Shader : public ClockedObject
|
||||
// Set to true by the dispatcher if the current kernel is a blit kernel
|
||||
bool blitKernel = false;
|
||||
|
||||
// Number of pending non-instruction invalidates outstanding. The shader
|
||||
// should wait for these to be done to ensure correctness.
|
||||
int num_outstanding_invl2s = 0;
|
||||
std::vector<std::tuple<void *, uint32_t, Addr>> deferred_dispatches;
|
||||
|
||||
public:
|
||||
typedef ShaderParams Params;
|
||||
enum hsail_mode_e {SIMT,VECTOR_SCALAR};
|
||||
@@ -330,6 +335,13 @@ class Shader : public ClockedObject
|
||||
blitKernel = is_blit_kernel;
|
||||
}
|
||||
|
||||
void decNumOutstandingInvL2s();
|
||||
void incNumOutstandingInvL2s() { num_outstanding_invl2s++; };
|
||||
int getNumOutstandingInvL2s() const { return num_outstanding_invl2s; };
|
||||
|
||||
void addDeferredDispatch(void *raw_pkt, uint32_t queue_id,
|
||||
Addr host_pkt_addr);
|
||||
|
||||
protected:
|
||||
struct ShaderStats : public statistics::Group
|
||||
{
|
||||
|
||||
@@ -1096,6 +1096,7 @@ class Request : public Extensible<Request>
|
||||
* setting extraFlags should be done via setCacheCoherenceFlags().
|
||||
*/
|
||||
bool isInvL1() const { return _cacheCoherenceFlags.isSet(INV_L1); }
|
||||
bool isInvL2() const { return _cacheCoherenceFlags.isSet(GL2_CACHE_INV); }
|
||||
|
||||
bool
|
||||
isGL2CacheFlush() const
|
||||
|
||||
@@ -72,6 +72,7 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
L2_Repl, desc="L2 Replacement";
|
||||
// Probes
|
||||
PrbInv, desc="Invalidating probe";
|
||||
InvCache, desc="Invalidating probe from TCP";
|
||||
// Coming from Memory Controller
|
||||
WBAck, desc="writethrough ack from memory";
|
||||
Bypass, desc="Bypass the entire L2 cache";
|
||||
@@ -413,6 +414,8 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
}
|
||||
} else if (in_msg.Type == CoherenceRequestType:WriteFlush) {
|
||||
trigger(Event:Flush, in_msg.addr, cache_entry, tbe);
|
||||
} else if (in_msg.Type == CoherenceRequestType:InvCache) {
|
||||
trigger(Event:InvCache, in_msg.addr, cache_entry, tbe);
|
||||
} else {
|
||||
DPRINTF(RubySlicc, "%s\n", in_msg);
|
||||
error("Unexpected Response Message to Core");
|
||||
@@ -429,6 +432,19 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
unset_cache_entry();
|
||||
}
|
||||
|
||||
action(ir_invL2Resp, "ir", desc="send L2 invalidate ack") {
  // Ack a directed TCC invalidate back to the requesting TCP so the
  // coalescer can complete the corresponding MemSyncReq packet.
  peek(coreRequestNetwork_in, CPURequestMsg) {
    enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
      out_msg.addr := address;
      out_msg.Type := CoherenceResponseType:InvL2Resp;
      out_msg.Sender := machineID;
      out_msg.Destination.add(in_msg.Requestor);
      out_msg.MessageSize := MessageSizeType:Response_Control;
      DPRINTF(RubySlicc, "%s\n", out_msg);
    }
  }
}
|
||||
|
||||
action(sd_sendData, "sd", desc="send Shared response") {
|
||||
peek(coreRequestNetwork_in, CPURequestMsg) {
|
||||
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
|
||||
@@ -1188,6 +1204,12 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
i_invL2;
|
||||
}
|
||||
|
||||
transition({I, V}, InvCache, I) {TagArrayRead, TagArrayWrite} {
  // Directed invalidate forwarded by a TCP on behalf of a BAR write:
  // drop the line (clean by construction, I or V only) and ack so the
  // command processor can launch the next kernel.
  i_invL2;
  ir_invL2Resp;
  p_popRequestQueue;
}
|
||||
|
||||
transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
|
||||
pi_sendProbeResponseInv;
|
||||
pp_popProbeQueue;
|
||||
|
||||
@@ -75,6 +75,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
Evict, desc="Evict if clean(invL1 for Load Acquire)";
|
||||
// Mem sys initiated
|
||||
Repl, desc="Replacing block from cache";
|
||||
InvL2, desc="Invalidate to L2";
|
||||
InvL2Resp, desc="Invalidate L2 completed";
|
||||
|
||||
// TCC initiated
|
||||
TCC_Ack, desc="TCC Ack to Core Request";
|
||||
@@ -286,9 +288,12 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
in_msg.Type == CoherenceResponseType:NBSysWBAck) {
|
||||
trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
|
||||
DPRINTF(RubySlicc, "Issuing TCC_AckWB\n");
|
||||
} else {
|
||||
error("Unexpected Response Message to Core");
|
||||
}
|
||||
} else if (in_msg.Type == CoherenceResponseType:InvL2Resp) {
|
||||
DPRINTF(RubySlicc, "Issuing InvL2Resp\n");
|
||||
trigger(Event:InvL2Resp, in_msg.addr, cache_entry, tbe);
|
||||
} else {
|
||||
error("Unexpected Response Message to Core");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -333,6 +338,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
trigger(Event:Flush, in_msg.LineAddress, cache_entry, tbe);
|
||||
} else if (in_msg.Type == RubyRequestType:REPLACEMENT){
|
||||
trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe);
|
||||
} else if (in_msg.Type == RubyRequestType:InvL2){
|
||||
trigger(Event:InvL2, in_msg.LineAddress, cache_entry, tbe);
|
||||
} else {
|
||||
error("Unexpected Request Message from VIC");
|
||||
}
|
||||
@@ -609,6 +616,31 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
}
|
||||
}
|
||||
|
||||
action(il2_invL2, "il2", desc="Invalidate address in L2") {
  // Forward a directed invalidate for this line to the TCC bank that
  // owns the address.
  enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
    out_msg.addr := address;
    out_msg.Type := CoherenceRequestType:InvCache;
    out_msg.Requestor := machineID;
    out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
                            TCC_select_low_bit, TCC_select_num_bits));
    out_msg.MessageSize := MessageSizeType:Request_Control;
    out_msg.InitialRequestTime := curCycle();
    // Propagate the GLC/SLC bits from the triggering mandatory request.
    peek(mandatoryQueue_in, RubyRequest) {
      out_msg.isGLCSet := in_msg.isGLCSet;
      out_msg.isSLCSet := in_msg.isSLCSet;
    }
  }
}
|
||||
|
||||
action(i2r_invL2Resp, "i2r", desc="Invalidate L2 completed") {
  // Completion of a directed TCC invalidate. Only the VIPER coalescer
  // implements invTCCCallback; the plain sequencer path is unsupported.
  if (use_seq_not_coal) {
    DPRINTF(RubySlicc, "Sequencer does not define invTCCCallback!\n");
    assert(false);
  } else {
    coalescer.invTCCCallback(address);
  }
}
|
||||
|
||||
action(wd_wtDone, "wd", desc="writethrough done") {
|
||||
if (use_seq_not_coal) {
|
||||
DPRINTF(RubySlicc, "Sequencer does not define writeCompleteCallback!\n");
|
||||
@@ -830,6 +862,22 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
transition(I, InvL2) {
    // Line not present in this TCP: just forward the invalidate to TCC.
    il2_invL2;
    p_popMandatoryQueue;
}

transition(V, InvL2, I) {
    // Invalidate the local copy, then forward the invalidate to TCC.
    // NOTE: added the missing ';' after ic_invCache — SLICC statements
    // are semicolon-terminated.
    ic_invCache;
    il2_invL2;
    p_popMandatoryQueue;
}

transition(I, InvL2Resp) {
    // TCC acked the directed invalidate; notify the coalescer so it can
    // complete the pending MemSyncReq.
    i2r_invL2Resp;
    pr_popResponseQueue;
}
|
||||
|
||||
// if a line is in IV and a TCC_AckWB comes back, we must have had a WT
|
||||
// store followed by a load. Thus, complete the store without affecting
|
||||
// TBE or line state.
|
||||
|
||||
@@ -62,6 +62,7 @@ structure (VIPERCoalescer, external = "yes") {
|
||||
Cycles, Cycles, Cycles, bool);
|
||||
void atomicCallback(Addr, MachineType, DataBlock);
|
||||
void invTCPCallback(Addr);
|
||||
void invTCCCallback(Addr);
|
||||
void writeCompleteCallback(Addr, uint64_t);
|
||||
void evictionCallback(Addr);
|
||||
}
|
||||
|
||||
@@ -46,6 +46,7 @@ enumeration(CoherenceRequestType, desc="Coherence Request Types") {
|
||||
WriteThroughFifo, desc="WriteThrough with no data";
|
||||
WriteThroughDummy, desc="WriteThrough with no data for atomic operation";
|
||||
WriteFlush, desc="Release Flush";
|
||||
InvCache, desc="Invalidate Cache";
|
||||
|
||||
WrCancel, desc="want to cancel WB to Memory"; // should this be here?
|
||||
|
||||
@@ -95,6 +96,7 @@ enumeration(CoherenceResponseType, desc="Coherence Response Types") {
|
||||
StaleNotif, desc="Notification of Stale WBAck, No data to writeback";
|
||||
CPUCancelWB, desc="want to cancel WB to Memory";
|
||||
MemData, desc="Data from Memory";
|
||||
InvL2Resp, desc="Invalidate L2 response";
|
||||
|
||||
// for regions
|
||||
PrivateAck, desc="Ack that r-buf received private notify";
|
||||
|
||||
@@ -181,6 +181,7 @@ enumeration(RubyRequestType, desc="...", default="RubyRequestType_NULL") {
|
||||
COMMIT, desc="Commit version";
|
||||
NULL, desc="Invalid request type";
|
||||
FLUSH, desc="Flush request type";
|
||||
InvL2, desc="Invalidate L2";
|
||||
Release, desc="Release operation";
|
||||
Acquire, desc="Acquire opertion";
|
||||
AcquireRelease, desc="Acquire and Release opertion";
|
||||
|
||||
@@ -669,14 +669,14 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
|
||||
RequestStatus
|
||||
GPUCoalescer::makeRequest(PacketPtr pkt)
|
||||
{
|
||||
// all packets must have valid instruction sequence numbers
|
||||
assert(pkt->req->hasInstSeqNum());
|
||||
|
||||
if (pkt->cmd == MemCmd::MemSyncReq) {
|
||||
// issue mem_sync requests immediately to the cache system without
|
||||
// going through uncoalescedTable like normal LD/ST/Atomic requests
|
||||
issueMemSyncRequest(pkt);
|
||||
} else {
|
||||
// all packets must have valid instruction sequence numbers
|
||||
assert(pkt->req->hasInstSeqNum());
|
||||
|
||||
// otherwise, this must be either read or write command
|
||||
assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush());
|
||||
|
||||
|
||||
@@ -80,6 +80,7 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
|
||||
// VIPER does not expect MemSyncReq & Release since compute unit
|
||||
// does not specify an equivalent type of memory request.
|
||||
assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) ||
|
||||
(pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL2()) ||
|
||||
pkt->cmd == MemCmd::ReadReq ||
|
||||
pkt->cmd == MemCmd::WriteReq ||
|
||||
pkt->cmd == MemCmd::FlushReq ||
|
||||
@@ -106,6 +107,10 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
|
||||
invTCP();
|
||||
}
|
||||
|
||||
if (pkt->req->isInvL2()) {
|
||||
invTCC(pkt);
|
||||
}
|
||||
|
||||
return RequestStatus_Issued;
|
||||
}
|
||||
|
||||
@@ -306,5 +311,51 @@ VIPERCoalescer::invTCP()
|
||||
m_num_pending_invs);
|
||||
}
|
||||
|
||||
void
|
||||
VIPERCoalescer::invTCCCallback(Addr addr)
|
||||
{
|
||||
for (auto& pkt : m_pending_invl2s[addr]) {
|
||||
RubyPort::SenderState *ss =
|
||||
safe_cast<RubyPort::SenderState *>(pkt->senderState);
|
||||
MemResponsePort *port = ss->port;
|
||||
assert(port != nullptr);
|
||||
|
||||
// Now convert to MemSyncResp
|
||||
pkt->makeResponse();
|
||||
|
||||
pkt->senderState = ss->predecessor;
|
||||
delete ss;
|
||||
port->hitCallback(pkt);
|
||||
}
|
||||
m_pending_invl2s.erase(addr);
|
||||
}
|
||||
|
||||
/*
|
||||
* Send an invalidate to a specific address in the TCC.
|
||||
*/
|
||||
void
|
||||
VIPERCoalescer::invTCC(PacketPtr pkt)
|
||||
{
|
||||
assert(pkt);
|
||||
assert(pkt->req);
|
||||
|
||||
Addr addr = pkt->req->getPaddr();
|
||||
RubyRequestType request_type = RubyRequestType_InvL2;
|
||||
|
||||
std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
|
||||
clockEdge(), addr, 0, 0,
|
||||
request_type, RubyAccessMode_Supervisor,
|
||||
nullptr);
|
||||
|
||||
DPRINTF(GPUCoalescer, "Sending L2 invalidate to 0x%x\n", addr);
|
||||
|
||||
assert(m_mandatory_q_ptr);
|
||||
Tick latency = cyclesToTicks(
|
||||
m_controller->mandatoryQueueLatency(request_type));
|
||||
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
|
||||
|
||||
m_pending_invl2s[addr].push_back(pkt);
|
||||
}
|
||||
|
||||
} // namespace ruby
|
||||
} // namespace gem5
|
||||
|
||||
@@ -63,11 +63,13 @@ class VIPERCoalescer : public GPUCoalescer
|
||||
~VIPERCoalescer();
|
||||
void writeCompleteCallback(Addr address, uint64_t instSeqNum);
|
||||
void invTCPCallback(Addr address);
|
||||
void invTCCCallback(Addr address);
|
||||
RequestStatus makeRequest(PacketPtr pkt) override;
|
||||
void issueRequest(CoalescedRequest* crequest) override;
|
||||
|
||||
private:
|
||||
void invTCP();
|
||||
void invTCC(PacketPtr pkt);
|
||||
|
||||
// make write-complete response packets from original write request packets
|
||||
void makeWriteCompletePkts(CoalescedRequest* crequest);
|
||||
@@ -79,6 +81,9 @@ class VIPERCoalescer : public GPUCoalescer
|
||||
// number of remaining cache lines to be invalidated in TCP
|
||||
int m_num_pending_invs;
|
||||
|
||||
// outstanding L2 invalidate packets
|
||||
std::unordered_map<Addr, std::vector<PacketPtr>> m_pending_invl2s;
|
||||
|
||||
// a map of instruction sequence number and corresponding pending
|
||||
// write-complete response packets. Each write-complete response
|
||||
// corresponds to a pending store request that is waiting for
|
||||
|
||||
Reference in New Issue
Block a user