arch-vega,gpu-compute,mem-ruby: SQC Invalidation Support (#852)

This PR adds support for SQC (GPU I-cache) invalidation to the GPU
model. It does this by updating the GPU-VIPER-SQC protocol to support
flushes, the sequencer model to send out invalidates, and the GPU compute
model to send invalidates and handle responses. It also adds support for
S_ICACHE_INV, a VEGA ISA instruction that invalidates the entire GPU
I-cache. Additionally, the PR modifies the kernel start behavior to
invalidate the I-cache too. It previously invalidated only the L1
D-cache.
This commit is contained in:
Matt Sinclair
2024-02-09 17:29:56 -06:00
committed by GitHub
16 changed files with 308 additions and 10 deletions

View File

@@ -434,6 +434,7 @@ print(
# shader is the GPU
shader = Shader(
n_wf=args.wfs_per_simd,
cu_per_sqc=args.cu_per_sqc,
clk_domain=SrcClockDomain(
clock=args.gpu_clock,
voltage_domain=VoltageDomain(voltage=args.gpu_voltage),

View File

@@ -33,7 +33,10 @@ from m5.objects import *
def createGPU(system, args):
shader = Shader(
n_wf=args.wfs_per_simd, timing=True, clk_domain=system.clk_domain
n_wf=args.wfs_per_simd,
cu_per_sqc=args.cu_per_sqc,
timing=True,
clk_domain=system.clk_domain,
)
# VIPER GPU protocol implements release consistency at GPU side. So,

View File

@@ -669,6 +669,9 @@ namespace VegaISA
// Constructor: decode s_icache_inv and flag it as a memory-synchronizing
// instruction so the pipeline routes it as a fence rather than an
// ordinary scalar access.
Inst_SOPP__S_ICACHE_INV::Inst_SOPP__S_ICACHE_INV(InFmt_SOPP *iFmt)
: Inst_SOPP(iFmt, "s_icache_inv")
{
setFlag(MemBarrier);
// NOTE(review): these three setFlag lines look like a collapsed diff
// (a removed line rendered next to its replacement) — confirm against
// the actual file which flag set is intended.
setFlag(GPUStaticInst::MemSync);
setFlag(MemSync);
} // Inst_SOPP__S_ICACHE_INV
Inst_SOPP__S_ICACHE_INV::~Inst_SOPP__S_ICACHE_INV()
@@ -683,7 +686,26 @@ namespace VegaISA
// Execute s_icache_inv: inject a scalar memory fence that invalidates the
// entire SQC (GPU instruction cache). The instruction carries no address
// operand; the request's address handling is deferred to the sequencer.
void
Inst_SOPP__S_ICACHE_INV::execute(GPUDynInstPtr gpuDynInst)
{
// NOTE(review): the next line appears to be the pre-change body retained
// by the diff renderer (the instruction used to be unimplemented) —
// confirm it is removed in the actual file.
panicUnimplemented();
Wavefront *wf = gpuDynInst->wavefront();
// No active lanes: undo the LGKM issued-instruction count bump done on
// issue and complete without sending anything.
if (gpuDynInst->exec_mask.none()) {
wf->decLGKMInstsIssued();
return;
}
gpuDynInst->execUnitId = wf->execUnitId;
gpuDynInst->latency.init(gpuDynInst->computeUnit());
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
// Track exactly one outstanding sub-request for this fence.
gpuDynInst->resetEntireStatusVector();
gpuDynInst->setStatusVector(0, 1);
// Build an empty request (addr/size 0); the sequencer interprets a
// MemSyncReq with paddr 0 as a full I-cache invalidate.
RequestPtr req = std::make_shared<Request>(0, 0, 0,
gpuDynInst->computeUnit()->
requestorId(), 0,
gpuDynInst->wfDynId);
gpuDynInst->setRequestFlags(req);
// false => explicit S_ICACHE_INV, not a kernel-launch implicit sync.
gpuDynInst->computeUnit()->scalarMemoryPipe.
injectScalarMemFence(gpuDynInst, false, req);
} // execute
// --- Inst_SOPP__S_INCPERFLEVEL class methods ---

View File

@@ -294,6 +294,7 @@ class Shader(ClockedObject):
dispatcher = Param.GPUDispatcher("GPU workgroup dispatcher")
system_hub = Param.AMDGPUSystemHub(NULL, "GPU System Hub (FS Mode only)")
n_wf = Param.Int(10, "Number of wavefront slots per SIMD")
cu_per_sqc = Param.Int(4, "Number of CUs that share an SQC")
impl_kern_launch_acq = Param.Bool(
True,
"""Insert acq packet into

View File

@@ -397,9 +397,9 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
}
/**
* trigger invalidate operation in the cu
* trigger invalidate operation in the CU
*
* req: request initialized in shader, carrying the invlidate flags
* req: request initialized in shader, carrying the invalidate flags
*/
void
ComputeUnit::doInvalidate(RequestPtr req, int kernId){
@@ -425,6 +425,26 @@ ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) {
injectGlobalMemFence(gpuDynInst, true);
}
/**
 * Trigger an SQC (GPU I-cache) invalidate operation in the CU.
 *
 * req: request initialized in shader, carrying the invalidate flags
 * kernId: dispatch id stored on the dynamic instruction so invalidate
 *         responses can be matched back to the launching kernel.
 */
void
ComputeUnit::doSQCInvalidate(RequestPtr req, int kernId){
GPUDynInstPtr gpuDynInst
= std::make_shared<GPUDynInst>(this, nullptr,
new KernelLaunchStaticInst(), getAndIncSeqNum());
// kern_id will be used in inv responses
gpuDynInst->kern_id = kernId;
// update contextId field
req->setContext(gpuDynInst->wfDynId);
// Mark the synthetic instruction scalar so injectScalarMemFence's
// isScalar() assertion holds.
gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar);
// true => kernel-launch initiated (implicit) invalidate.
scalarMemoryPipe.injectScalarMemFence(gpuDynInst, true, req);
}
// reseting SIMD register pools
// I couldn't think of any other place and
// I think it is needed in my implementation
@@ -1012,7 +1032,14 @@ ComputeUnit::DataPort::recvReqRetry()
// Handle a timing response on the SQC (I-cache) port. Fetch responses are
// forwarded to the CU; responses to kernel-start SQC invalidates carry no
// wavefront and are consumed here.
bool
ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
{
// NOTE(review): the next line appears to be the pre-change body retained
// by the diff renderer (the same call is repeated, guarded, below) —
// confirm it is removed in the actual file.
computeUnit->handleSQCReturn(pkt);
SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
/** Process the response only if there is a wavefront associated with it.
 * Otherwise, it is from an SQC invalidate that was issued at kernel start
 * and doesn't have a wavefront or instruction associated with it.
 */
if (sender_state->wavefront != nullptr) {
computeUnit->handleSQCReturn(pkt);
}
return true;
}
@@ -1046,6 +1073,26 @@ ComputeUnit::SQCPort::recvReqRetry()
}
}
// Human-readable name for this event type (used by event tracing/debug).
const char*
ComputeUnit::SQCPort::MemReqEvent::description() const
{
return "ComputeUnit SQC memory request event";
}
// Fire the deferred SQC request: try to send the packet on the SQC port;
// if the port is busy, queue it (with its originating wavefront, which is
// nullptr for kernel-launch invalidates) so recvReqRetry() can resend it.
void
ComputeUnit::SQCPort::MemReqEvent::process()
{
SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
[[maybe_unused]] ComputeUnit *compute_unit = sqcPort.computeUnit;
// System (host-directed) requests are not expected on the SQC path.
assert(!pkt->req->systemReq());
if (!(sqcPort.sendTimingReq(pkt))) {
sqcPort.retries.push_back(std::pair<PacketPtr, Wavefront*>
(pkt, sender_state->wavefront));
}
}
void
ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
{

View File

@@ -412,6 +412,7 @@ class ComputeUnit : public ClockedObject
void doInvalidate(RequestPtr req, int kernId);
void doFlush(GPUDynInstPtr gpuDynInst);
void doSQCInvalidate(RequestPtr req, int kernId);
void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);
@@ -680,6 +681,23 @@ class ComputeUnit : public ClockedObject
kernId(_kernId){ }
};
// Event used to inject an SQC request (e.g. an I-cache invalidate fence)
// onto the SQC port after a modeled latency. AutoDelete: the event frees
// itself once processed.
class MemReqEvent : public Event
{
private:
SQCPort &sqcPort; // port the packet is sent on
PacketPtr pkt;    // packet to send when the event fires
public:
MemReqEvent(SQCPort &_sqc_port, PacketPtr _pkt)
: Event(), sqcPort(_sqc_port), pkt(_pkt)
{
setFlags(Event::AutoDelete);
}
void process();
const char *description() const;
};
std::deque<std::pair<PacketPtr, Wavefront*>> retries;
protected:

View File

@@ -320,7 +320,7 @@ FetchUnit::processFetchReturn(PacketPtr pkt)
assert(!fetchBuf.at(wavefront->wfSlotId).hasFetchDataToProcess());
wavefront->dropFetch = false;
} else {
fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt->req->getVaddr());
fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt);
}
wavefront->pendingFetch = false;
@@ -469,8 +469,23 @@ FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr)
}
void
FetchUnit::FetchBufDesc::fetchDone(Addr vaddr)
FetchUnit::FetchBufDesc::fetchDone(PacketPtr pkt)
{
// If the return command is MemSyncResp, then it belongs to
// an SQC invalidation request. This request calls
// incLGKMInstsIssued() function in its execution path.
// Since there is no valid memory return response associated with
// this instruction, decLGKMInstsIssued() is not executed. Do this
// here to decrement the counter and invalidate all buffers
if (pkt->cmd == MemCmd::MemSyncResp) {
wavefront->decLGKMInstsIssued();
flushBuf();
restartFromBranch = false;
return;
}
Addr vaddr = pkt->req->getVaddr();
assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n",
wavefront->simdId, wavefront->wfSlotId,

View File

@@ -138,7 +138,7 @@ class FetchUnit
return is_reserved;
}
void fetchDone(Addr vaddr);
void fetchDone(PacketPtr ptr);
/**
* checks if the buffer contains valid data. this essentially

View File

@@ -160,4 +160,55 @@ ScalarMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
issuedRequests.push(gpuDynInst);
}
/**
 * Inject a scalar memory fence (SQC invalidate) into the SQC port.
 *
 * gpuDynInst: the (possibly synthetic) scalar instruction driving the
 *             fence; must have the Scalar flag set.
 * kernelMemSync: true for the implicit invalidate at kernel launch,
 *                false for an explicit S_ICACHE_INV instruction.
 * req: optional pre-built request (from the shader); built here if null.
 */
void
ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst,
                                        bool kernelMemSync,
                                        RequestPtr req)
{
    assert(gpuDynInst->isScalar());

    if (!req) {
        req = std::make_shared<Request>(
            0, 0, 0, computeUnit.requestorId(), 0, gpuDynInst->wfDynId);
    } else {
        req->requestorId(computeUnit.requestorId());
    }

    // When the SQC invalidate instruction is executed, it calls
    // injectScalarMemFence. The instruction does not contain an address
    // as one of its operands. Therefore, set the physical address of the
    // invalidation request to 0 and handle it in the sequencer.
    req->setPaddr(0);

    // If kernelMemSync is true, the invalidation request is from kernel
    // launch and is an implicit invalidation. If false, it is due to an
    // S_ICACHE_INV instruction. Only the request flags differ between
    // the two cases; the packet construction is common.
    if (kernelMemSync) {
        req->setCacheCoherenceFlags(Request::INV_L1);
        req->setFlags(Request::KERNEL);
    } else {
        gpuDynInst->setRequestFlags(req);
    }
    req->setReqInstSeqNum(gpuDynInst->seqNum());

    PacketPtr pkt = new Packet(req, MemCmd::MemSyncReq);
    // No wavefront-less sender (second arg nullptr): responses to these
    // fences are filtered in SQCPort::recvTimingResp.
    pkt->pushSenderState(
        new ComputeUnit::SQCPort::SenderState(
            gpuDynInst->wavefront(), nullptr));

    // Schedule the send after the modeled scalar request latency.
    ComputeUnit::SQCPort::MemReqEvent *sqc_event =
        new ComputeUnit::SQCPort::MemReqEvent(computeUnit.sqcPort, pkt);
    computeUnit.schedule(
        sqc_event, curTick() + computeUnit.scalar_req_tick_latency);
}
} // namespace gem5

View File

@@ -36,6 +36,7 @@
#include <string>
#include "gpu-compute/misc.hh"
#include "mem/request.hh"
#include "params/ComputeUnit.hh"
#include "sim/stats.hh"
@@ -67,6 +68,9 @@ class ScalarMemPipeline
void issueRequest(GPUDynInstPtr gpuDynInst);
void injectScalarMemFence(
GPUDynInstPtr gpuDynInst, bool kernelMemSync, RequestPtr req);
bool
isGMLdRespFIFOWrRdy() const
{

View File

@@ -64,6 +64,7 @@ Shader::Shader(const Params &p) : ClockedObject(p),
impl_kern_end_rel(p.impl_kern_end_rel),
coissue_return(1),
trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
n_cu_per_sqc(p.cu_per_sqc),
globalMemSize(p.globalmem),
nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
_dispatcher(*p.dispatcher), systemHub(p.system_hub),
@@ -221,6 +222,13 @@ Shader::prepareInvalidate(HSAQueueEntry *task) {
// all necessary INV flags are all set now, call cu to execute
cuList[i_cu]->doInvalidate(req, task->dispatchId());
// A set of CUs share a single SQC cache. Send a single invalidate
// request to each SQC
if ((i_cu % n_cu_per_sqc) == 0) {
cuList[i_cu]->doSQCInvalidate(req, task->dispatchId());
}
// I don't like this. This is intrusive coding.
cuList[i_cu]->resetRegisterPool();
}

View File

@@ -237,6 +237,8 @@ class Shader : public ClockedObject
int n_cu;
// Number of wavefront slots per SIMD per CU
int n_wf;
// Number of CUs sharing a single SQC (GPU I-cache) in the shader
int n_cu_per_sqc;
// The size of global memory
int globalMemSize;

View File

@@ -60,6 +60,7 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
// Mem sys initiated
Repl, desc="Replacing block from cache";
Data, desc="Received Data";
Evict, desc="Evict cache line";
}
enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
@@ -67,6 +68,7 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
DataArrayWrite, desc="Write the data array";
TagArrayRead, desc="Read the data array";
TagArrayWrite, desc="Write the data array";
TagArrayFlash, desc="Flash clear the data array";
}
@@ -242,7 +244,12 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
Entry cache_entry := getCacheEntry(in_msg.LineAddress);
TBE tbe := TBEs.lookup(in_msg.LineAddress);
trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe);
DPRINTF(RubySlicc, "%s\n", in_msg);
if (in_msg.Type == RubyRequestType:REPLACEMENT) {
trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe);
} else {
trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe);
}
}
}
}
@@ -313,6 +320,10 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
APPEND_TRANSITION_COMMENT(cache_entry.DataBlk);
}
// Report one completed line invalidation back to the sequencer, which
// counts these acknowledgements down in invL1Callback().
action(inv_invDone, "inv", desc="local inv done") {
sequencer.invL1Callback();
}
action(w_writeCache, "w", desc="write data to cache") {
peek(responseToSQC_in, ResponseMsg) {
assert(is_valid(cache_entry));
@@ -350,6 +361,13 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
ic_invCache;
}
// Evict request (from the sequencer's invalidation cache walk): drop the
// line regardless of its current state and acknowledge the sequencer.
transition({I, IV, V}, Evict, I) {TagArrayRead, TagArrayWrite} {
// since we're evicting something, don't bother classifying as hit/miss
ic_invCache;
inv_invDone;
p_popMandatoryQueue;
}
// if we got a response for a load where the line is in I, then
// another request must have come in that replaced the line in question in
// the cache. Thus, complete this request without allocating the line, but

View File

@@ -157,6 +157,9 @@ structure (Sequencer, external = "yes") {
void llscClearLocalMonitor();
void evictionCallback(Addr);
void invL1Callback();
void recordRequestType(SequencerRequestType);
bool checkResourceAvailable(CacheResourceType, Addr);
}

View File

@@ -85,6 +85,8 @@ Sequencer::Sequencer(const Params &p)
m_runningGarnetStandalone = p.garnet_standalone;
m_num_pending_invs = 0;
m_cache_inv_pkt = nullptr;
// These statistical variables are not for display.
// The profiler will collate these across different
@@ -348,6 +350,15 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType primary_type,
return RequestStatus_Ready;
}
// If command is MemSyncReq, it is used to invalidate the cache.
// As the cache invalidation requests are already issued in invL1(),
// there is no need to create a new request for the same here.
// Instead, return RequestStatus_Aliased, and make the sequencer skip
// an extra issueRequest
if (pkt->cmd == MemCmd::MemSyncReq) {
return RequestStatus_Aliased;
}
Addr line_addr = makeLineAddress(pkt->getAddr());
// Check if there is any outstanding request for the same cache line.
auto &seq_req_list = m_RequestTable[line_addr];
@@ -576,7 +587,8 @@ Sequencer::readCallback(Addr address, DataBlock& data,
}
if ((seq_req.m_type != RubyRequestType_LD) &&
(seq_req.m_type != RubyRequestType_Load_Linked) &&
(seq_req.m_type != RubyRequestType_IFETCH)) {
(seq_req.m_type != RubyRequestType_IFETCH) &&
(seq_req.m_type != RubyRequestType_REPLACEMENT)) {
// Write request: reissue request to the cache hierarchy
issueRequest(seq_req.pkt, seq_req.m_second_type);
break;
@@ -811,6 +823,86 @@ Sequencer::unaddressedCallback(Addr unaddressedReqId,
}
}
// Complete a batch of packets without a memory-system response: unwind
// the RubyPort sender state, hand each packet back to its response port,
// and let waiting ports retry. During Ruby warmup/cooldown the packets
// come from the cache recorder, which is advanced to its next request
// instead of sending responses.
void
Sequencer::completeHitCallback(std::vector<PacketPtr> & mylist)
{
for (auto& pkt : mylist) {
// When Ruby is in warmup or cooldown phase, the requests come
// from the cache recorder. They do not track which port to use
// and do not need to send the response back
if (!RubySystem::getWarmupEnabled()
&& !RubySystem::getCooldownEnabled()) {
RubyPort::SenderState *ss =
safe_cast<RubyPort::SenderState *>(pkt->senderState);
MemResponsePort *port = ss->port;
assert(port != NULL);
pkt->senderState = ss->predecessor;
if (pkt->cmd != MemCmd::WriteReq) {
// for WriteReq, we keep the original senderState until
// writeCompleteCallback
delete ss;
}
port->hitCallback(pkt);
trySendRetries();
}
}
RubySystem *rs = m_ruby_system;
if (RubySystem::getWarmupEnabled()) {
rs->m_cache_recorder->enqueueNextFetchRequest();
} else if (RubySystem::getCooldownEnabled()) {
rs->m_cache_recorder->enqueueNextFlushRequest();
} else {
testDrainComplete();
}
}
void
Sequencer::invL1Callback()
{
// Since L1 invalidate is currently done with paddr = 0
assert(m_cache_inv_pkt && m_num_pending_invs > 0);
m_num_pending_invs--;
if (m_num_pending_invs == 0) {
std::vector<PacketPtr> pkt_list { m_cache_inv_pkt };
m_cache_inv_pkt = nullptr;
completeHitCallback(pkt_list);
}
}
/**
 * Walk every block in m_dataCache_ptr and enqueue a REPLACEMENT request
 * for it on the mandatory queue, counting one pending invalidation per
 * block. The controller acknowledges each line via invL1Callback().
 * NOTE(review): for an SQC sequencer this presumably walks the I-cache
 * bound to m_dataCache_ptr — confirm against the configuration.
 */
void
Sequencer::invL1()
{
int size = m_dataCache_ptr->getNumBlocks();
DPRINTF(RubySequencer,
"There are %d Invalidations outstanding before Cache Walk\n",
m_num_pending_invs);
// Walk the cache
for (int i = 0; i < size; i++) {
Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
// Evict Read-only data
RubyRequestType request_type = RubyRequestType_REPLACEMENT;
std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
clockEdge(), addr, 0, 0,
request_type, RubyAccessMode_Supervisor,
nullptr);
DPRINTF(RubySequencer, "Evicting addr 0x%x\n", addr);
assert(m_mandatory_q_ptr != NULL);
Tick latency = cyclesToTicks(
m_controller->mandatoryQueueLatency(request_type));
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
m_num_pending_invs++;
}
DPRINTF(RubySequencer,
"There are %d Invalidations outstanding after Cache Walk\n",
m_num_pending_invs);
}
bool
Sequencer::empty() const
{
@@ -915,6 +1007,11 @@ Sequencer::makeRequest(PacketPtr pkt)
}
} else if (pkt->isFlush()) {
primary_type = secondary_type = RubyRequestType_FLUSH;
} else if (pkt->cmd == MemCmd::MemSyncReq) {
primary_type = secondary_type = RubyRequestType_REPLACEMENT;
assert(!m_cache_inv_pkt);
m_cache_inv_pkt = pkt;
invL1();
} else {
panic("Unsupported ruby packet type\n");
}

View File

@@ -141,6 +141,10 @@ class Sequencer : public RubyPort
const Cycles forwardRequestTime = Cycles(0),
const Cycles firstResponseTime = Cycles(0));
void completeHitCallback(std::vector<PacketPtr>& list);
void invL1Callback();
void invL1();
RequestStatus makeRequest(PacketPtr pkt) override;
virtual bool empty() const;
int outstandingCount() const override { return m_outstanding_count; }
@@ -243,6 +247,10 @@ class Sequencer : public RubyPort
private:
int m_max_outstanding_requests;
int m_num_pending_invs;
PacketPtr m_cache_inv_pkt;
CacheMemory* m_dataCache_ptr;
// The cache access latency for top-level caches (L0/L1). These are