From 03838afce0650efab482247c1f847657639979aa Mon Sep 17 00:00:00 2001
From: Vishnu Ramadas
Date: Thu, 25 Jan 2024 13:37:31 -0600
Subject: [PATCH] gpu-compute: Add support for injecting scalar memory barrier

This commit adds support for injecting a scalar memory barrier in the
GPU. The barrier will primarily be used to invalidate the entire SQC
cache. The commit also invalidates all buffers and decrements related
counters upon completion of the invalidation request.

Change-Id: Ib8e270bbeb8229a4470d606c96876ba5c87335bf
---
 src/gpu-compute/compute_unit.cc           | 22 ++++++++++
 src/gpu-compute/compute_unit.hh           | 35 ++++++++++++++++
 src/gpu-compute/fetch_unit.cc             | 30 +++++++++++++
 src/gpu-compute/fetch_unit.hh             |  1 +
 src/gpu-compute/scalar_memory_pipeline.cc | 51 +++++++++++++++++++++++
 src/gpu-compute/scalar_memory_pipeline.hh |  4 ++
 6 files changed, 143 insertions(+)

diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 8259f0a950..f28a8e39c7 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -1046,6 +1046,28 @@ ComputeUnit::SQCPort::recvReqRetry()
     }
 }
 
+const char*
+ComputeUnit::SQCPort::MemReqEvent::description() const
+{
+    return "ComputeUnit SQC memory request event";
+}
+
+void
+ComputeUnit::SQCPort::MemReqEvent::process()
+{
+    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
+    [[maybe_unused]] ComputeUnit *compute_unit = sqcPort.computeUnit;
+
+    if (pkt->req->systemReq()) {
+        assert(compute_unit->shader->systemHub);
+        SystemHubEvent *resp_event = new SystemHubEvent(pkt, &sqcPort);
+        compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
+    } else if (!(sqcPort.sendTimingReq(pkt))) {
+        sqcPort.retries.push_back(std::pair<PacketPtr, Wavefront*>
+                (pkt, sender_state->wavefront));
+    }
+}
+
 void
 ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt)
 {
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index e6bc03da7d..24324bb515 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -680,6 +680,41 @@ class ComputeUnit : public ClockedObject
                 kernId(_kernId){ }
         };
 
+        class MemReqEvent : public Event
+        {
+          private:
+            SQCPort &sqcPort;
+            PacketPtr pkt;
+
+          public:
+            MemReqEvent(SQCPort &_sqc_port, PacketPtr _pkt)
+                : Event(), sqcPort(_sqc_port), pkt(_pkt)
+            {
+                setFlags(Event::AutoDelete);
+            }
+
+            void process();
+            const char *description() const;
+        };
+
+        class SystemHubEvent : public Event
+        {
+            SQCPort *sqcPort;
+            PacketPtr reqPkt;
+
+          public:
+            SystemHubEvent(PacketPtr pkt, SQCPort *_sqcPort)
+                : sqcPort(_sqcPort), reqPkt(pkt)
+            {
+                setFlags(Event::AutoDelete);
+            }
+
+            void
+            process()
+            {
+            }
+        };
+
         std::deque<std::pair<PacketPtr, Wavefront*>> retries;
 
       protected:
diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc
index 4dadbd363d..19144d55e2 100644
--- a/src/gpu-compute/fetch_unit.cc
+++ b/src/gpu-compute/fetch_unit.cc
@@ -388,6 +388,29 @@ FetchUnit::FetchBufDesc::flushBuf()
             wavefront->wfDynId);
 }
 
+void
+FetchUnit::FetchBufDesc::invBuf()
+{
+    restartFromBranch = false;
+    /**
+     * free list may have some entries
+     * so we clear it here to avoid duplicates
+     */
+    freeList.clear();
+    bufferedPCs.clear();
+    reservedPCs.clear();
+    readPtr = bufStart;
+
+    for (int i = 0; i < fetchDepth; ++i) {
+        freeList.push_back(bufStart + i * cacheLineSize);
+    }
+
+    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d Invalidation done, resetting fetch "
+            "buffer\n", wavefront->simdId, wavefront->wfSlotId,
+            wavefront->wfDynId);
+
+}
+
 Addr
 FetchUnit::FetchBufDesc::nextFetchAddr()
 {
@@ -471,6 +494,13 @@ FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr)
 void
 FetchUnit::FetchBufDesc::fetchDone(Addr vaddr)
 {
+    if (vaddr == 0) {
+        // S_ICACHE_INV fetch done
+        wavefront->decLGKMInstsIssued();
+        invBuf();
+        return;
+    }
+
     assert(bufferedPCs.find(vaddr) == bufferedPCs.end());
     DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n",
             wavefront->simdId, wavefront->wfSlotId,
diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh
index 0ba88c7d95..99c91b7299 100644
--- a/src/gpu-compute/fetch_unit.hh
+++ b/src/gpu-compute/fetch_unit.hh
@@ -104,6 +104,7 @@ class FetchUnit
         int reservedLines() const { return reservedPCs.size(); }
         bool hasFreeSpace() const { return !freeList.empty(); }
         void flushBuf();
+        void invBuf();
         Addr nextFetchAddr();
 
         /**
diff --git a/src/gpu-compute/scalar_memory_pipeline.cc b/src/gpu-compute/scalar_memory_pipeline.cc
index de24f9448b..767e4e05a7 100644
--- a/src/gpu-compute/scalar_memory_pipeline.cc
+++ b/src/gpu-compute/scalar_memory_pipeline.cc
@@ -160,4 +160,55 @@ ScalarMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
     issuedRequests.push(gpuDynInst);
 }
 
+void
+ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst,
+                                        bool kernelMemSync,
+                                        RequestPtr req)
+{
+    assert(gpuDynInst->isScalar());
+
+    if (!req) {
+        req = std::make_shared<Request>(
+            0, 0, 0, computeUnit.requestorId(), 0, gpuDynInst->wfDynId);
+    } else {
+        req->requestorId(computeUnit.requestorId());
+    }
+
+    req->setPaddr(0);
+
+    PacketPtr pkt = nullptr;
+
+    if (kernelMemSync) {
+        req->setCacheCoherenceFlags(Request::INV_L1);
+        req->setReqInstSeqNum(gpuDynInst->seqNum());
+        req->setFlags(Request::KERNEL);
+        pkt = new Packet(req, MemCmd::MemSyncReq);
+        pkt->pushSenderState(
+                new ComputeUnit::SQCPort::SenderState(
+                    gpuDynInst->wavefront(), nullptr));
+        ComputeUnit::SQCPort::MemReqEvent *sqc_event =
+                new ComputeUnit::SQCPort::MemReqEvent
+                (computeUnit.sqcPort, pkt);
+
+        computeUnit.schedule(
+                sqc_event, curTick() + computeUnit.scalar_req_tick_latency);
+    } else {
+        gpuDynInst->setRequestFlags(req);
+
+        req->setReqInstSeqNum(gpuDynInst->seqNum());
+
+        pkt = new Packet(req, MemCmd::MemSyncReq);
+        pkt->pushSenderState(
+                new ComputeUnit::SQCPort::SenderState(
+                    gpuDynInst->wavefront(), nullptr));
+
+        ComputeUnit::SQCPort::MemReqEvent *sqc_event =
+                new ComputeUnit::SQCPort::MemReqEvent
+                (computeUnit.sqcPort, pkt);
+
+        computeUnit.schedule(
+                sqc_event, curTick() + computeUnit.scalar_req_tick_latency);
+    }
+}
+
 } // namespace gem5
diff --git a/src/gpu-compute/scalar_memory_pipeline.hh b/src/gpu-compute/scalar_memory_pipeline.hh
index 5512c7c01f..e5dc7b4292 100644
--- a/src/gpu-compute/scalar_memory_pipeline.hh
+++ b/src/gpu-compute/scalar_memory_pipeline.hh
@@ -36,6 +36,7 @@
 #include <string>
 
 #include "gpu-compute/misc.hh"
+#include "mem/request.hh"
 #include "params/ComputeUnit.hh"
 #include "sim/stats.hh"
 
@@ -67,6 +68,9 @@ class ScalarMemPipeline
 
     void issueRequest(GPUDynInstPtr gpuDynInst);
 
+    void injectScalarMemFence(
+        GPUDynInstPtr gpuDynInst, bool kernelMemSync, RequestPtr req);
+
     bool
     isGMLdRespFIFOWrRdy() const
     {