diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index 9c023fe5c6..17e46268ef 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -434,6 +434,7 @@ print( # shader is the GPU shader = Shader( n_wf=args.wfs_per_simd, + cu_per_sqc=args.cu_per_sqc, clk_domain=SrcClockDomain( clock=args.gpu_clock, voltage_domain=VoltageDomain(voltage=args.gpu_voltage), diff --git a/configs/example/gpufs/system/amdgpu.py b/configs/example/gpufs/system/amdgpu.py index 30e059d154..0813759e2a 100644 --- a/configs/example/gpufs/system/amdgpu.py +++ b/configs/example/gpufs/system/amdgpu.py @@ -33,7 +33,10 @@ from m5.objects import * def createGPU(system, args): shader = Shader( - n_wf=args.wfs_per_simd, timing=True, clk_domain=system.clk_domain + n_wf=args.wfs_per_simd, + cu_per_sqc=args.cu_per_sqc, + timing=True, + clk_domain=system.clk_domain, ) # VIPER GPU protocol implements release consistency at GPU side. So, diff --git a/src/arch/amdgpu/vega/insts/sopp.cc b/src/arch/amdgpu/vega/insts/sopp.cc index df5cdbf681..781113b204 100644 --- a/src/arch/amdgpu/vega/insts/sopp.cc +++ b/src/arch/amdgpu/vega/insts/sopp.cc @@ -669,6 +669,9 @@ namespace VegaISA Inst_SOPP__S_ICACHE_INV::Inst_SOPP__S_ICACHE_INV(InFmt_SOPP *iFmt) : Inst_SOPP(iFmt, "s_icache_inv") { + // Issued as a scalar memory fence in execute(): flag as barrier and sync. + setFlag(MemBarrier); + setFlag(MemSync); } // Inst_SOPP__S_ICACHE_INV Inst_SOPP__S_ICACHE_INV::~Inst_SOPP__S_ICACHE_INV() @@ -683,7 +686,27 @@ namespace VegaISA void Inst_SOPP__S_ICACHE_INV::execute(GPUDynInstPtr gpuDynInst) { - panicUnimplemented(); + Wavefront *wf = gpuDynInst->wavefront(); + + // Empty execution mask: skip the fence and decrement the LGKM counter. + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + gpuDynInst->resetEntireStatusVector(); + gpuDynInst->setStatusVector(0, 1); + RequestPtr req = std::make_shared(0, 0, 0, 
gpuDynInst->computeUnit()-> + requestorId(), 0, + gpuDynInst->wfDynId); + gpuDynInst->setRequestFlags(req); + gpuDynInst->computeUnit()->scalarMemoryPipe. + injectScalarMemFence(gpuDynInst, false, req); } // execute // --- Inst_SOPP__S_INCPERFLEVEL class methods --- diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index 78baa596a7..b9a13dc85b 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -294,6 +294,7 @@ class Shader(ClockedObject): dispatcher = Param.GPUDispatcher("GPU workgroup dispatcher") system_hub = Param.AMDGPUSystemHub(NULL, "GPU System Hub (FS Mode only)") n_wf = Param.Int(10, "Number of wavefront slots per SIMD") + cu_per_sqc = Param.Int(4, "Number of CUs that share an SQC") impl_kern_launch_acq = Param.Bool( True, """Insert acq packet into diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 8259f0a950..e485aa6161 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -397,9 +397,9 @@ ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, } /** - * trigger invalidate operation in the cu + * trigger invalidate operation in the CU * - * req: request initialized in shader, carrying the invlidate flags + * req: request initialized in shader, carrying the invalidate flags */ void ComputeUnit::doInvalidate(RequestPtr req, int kernId){ @@ -425,6 +425,26 @@ ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) { injectGlobalMemFence(gpuDynInst, true); } +/** + * trigger an SQC (L1 instruction cache) invalidate operation in the CU + * + * req: request initialized in shader, carrying the invalidate flags + */ +void +ComputeUnit::doSQCInvalidate(RequestPtr req, int kernId){ + GPUDynInstPtr gpuDynInst + = std::make_shared(this, nullptr, + new KernelLaunchStaticInst(), getAndIncSeqNum()); + + // kern_id will be used in inv responses + gpuDynInst->kern_id = kernId; + // set the request's context id to this dyn inst's wfDynId + req->setContext(gpuDynInst->wfDynId); + 
gpuDynInst->staticInstruction()->setFlag(GPUStaticInst::Scalar); + scalarMemoryPipe.injectScalarMemFence(gpuDynInst, true, req); +} + // reseting SIMD register pools // I couldn't think of any other place and // I think it is needed in my implementation @@ -1012,7 +1032,18 @@ ComputeUnit::DataPort::recvReqRetry() bool ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt) { - computeUnit->handleSQCReturn(pkt); + SenderState *sender_state = safe_cast(pkt->senderState); + /** Process the response only if there is a wavefront associated with it. + * Otherwise, it is from SQC invalidate that was issued at kernel start + * and doesn't have a wavefront or instruction associated with it. + */ + if (sender_state->wavefront != nullptr) { + computeUnit->handleSQCReturn(pkt); + } else { + // Not forwarded to handleSQCReturn(), so release the response here. + delete pkt->senderState; + delete pkt; + } return true; } @@ -1046,6 +1077,26 @@ ComputeUnit::SQCPort::recvReqRetry() } } +const char* +ComputeUnit::SQCPort::MemReqEvent::description() const +{ + return "ComputeUnit SQC memory request event"; +} + +void +ComputeUnit::SQCPort::MemReqEvent::process() +{ + SenderState *sender_state = safe_cast(pkt->senderState); + [[maybe_unused]] ComputeUnit *compute_unit = sqcPort.computeUnit; + + assert(!pkt->req->systemReq()); + + if (!(sqcPort.sendTimingReq(pkt))) { + sqcPort.retries.push_back(std::pair + (pkt, sender_state->wavefront)); + } +} + void ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt) { diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index e6bc03da7d..cfa145551f 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -412,6 +412,7 @@ class ComputeUnit : public ClockedObject void doInvalidate(RequestPtr req, int kernId); void doFlush(GPUDynInstPtr gpuDynInst); + void doSQCInvalidate(RequestPtr req, int kernId); void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg); bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg); @@ -680,6 +681,23 @@ class ComputeUnit : public ClockedObject 
kernId(_kernId){ } }; + class MemReqEvent : public Event + { + private: + SQCPort &sqcPort; + PacketPtr pkt; + + public: + MemReqEvent(SQCPort &_sqc_port, PacketPtr _pkt) + : Event(), sqcPort(_sqc_port), pkt(_pkt) + { + setFlags(Event::AutoDelete); + } + + void process(); + const char *description() const; + }; + std::deque> retries; protected: diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc index 4dadbd363d..20b89f6384 100644 --- a/src/gpu-compute/fetch_unit.cc +++ b/src/gpu-compute/fetch_unit.cc @@ -320,7 +320,7 @@ FetchUnit::processFetchReturn(PacketPtr pkt) assert(!fetchBuf.at(wavefront->wfSlotId).hasFetchDataToProcess()); wavefront->dropFetch = false; } else { - fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt->req->getVaddr()); + fetchBuf.at(wavefront->wfSlotId).fetchDone(pkt); } wavefront->pendingFetch = false; @@ -469,8 +469,23 @@ FetchUnit::FetchBufDesc::reserveBuf(Addr vaddr) } void -FetchUnit::FetchBufDesc::fetchDone(Addr vaddr) +FetchUnit::FetchBufDesc::fetchDone(PacketPtr pkt) { + // If the return command is MemSyncResp, then it belongs to + // an SQC invalidation request. This request calls + // incLGKMInstsIssued() function in its execution path. + // Since there is no valid memory return response associated with + // this instruction, decLGKMInstsIssued() is not executed. 
Do this + // here to decrement the counter and flush this fetch buffer. + if (pkt->cmd == MemCmd::MemSyncResp) { + wavefront->decLGKMInstsIssued(); + flushBuf(); + restartFromBranch = false; + return; + } + + Addr vaddr = pkt->req->getVaddr(); + assert(bufferedPCs.find(vaddr) == bufferedPCs.end()); DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n", wavefront->simdId, wavefront->wfSlotId, diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh index 0ba88c7d95..85bf2472ec 100644 --- a/src/gpu-compute/fetch_unit.hh +++ b/src/gpu-compute/fetch_unit.hh @@ -138,7 +138,7 @@ class FetchUnit return is_reserved; } - void fetchDone(Addr vaddr); + void fetchDone(PacketPtr pkt); /** * checks if the buffer contains valid data. this essentially diff --git a/src/gpu-compute/scalar_memory_pipeline.cc b/src/gpu-compute/scalar_memory_pipeline.cc index de24f9448b..54819e7d3f 100644 --- a/src/gpu-compute/scalar_memory_pipeline.cc +++ b/src/gpu-compute/scalar_memory_pipeline.cc @@ -160,4 +160,55 @@ ScalarMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst) issuedRequests.push(gpuDynInst); } +void +ScalarMemPipeline::injectScalarMemFence(GPUDynInstPtr gpuDynInst, + bool kernelMemSync, + RequestPtr req) +{ + assert(gpuDynInst->isScalar()); + + if (!req) { + req = std::make_shared( + 0, 0, 0, computeUnit.requestorId(), 0, gpuDynInst->wfDynId); + } else { + req->requestorId(computeUnit.requestorId()); + } + + // When the SQC invalidate instruction is executed, it calls + // injectScalarMemFence. The instruction does not contain an address + // as one of its operands. 
Therefore, set the physical address of the + // invalidation request to 0 and handle it in the sequencer. + req->setPaddr(0); + + PacketPtr pkt = nullptr; + + // If kernelMemSync is true, then the invalidation request is from + // kernel launch and is an implicit invalidation. If false, then it is + // due to an S_ICACHE_INV instruction. + if (kernelMemSync) { + req->setCacheCoherenceFlags(Request::INV_L1); + req->setReqInstSeqNum(gpuDynInst->seqNum()); + req->setFlags(Request::KERNEL); + pkt = new Packet(req, MemCmd::MemSyncReq); + pkt->pushSenderState( + new ComputeUnit::SQCPort::SenderState( + gpuDynInst->wavefront(), nullptr)); + } else { + gpuDynInst->setRequestFlags(req); + + req->setReqInstSeqNum(gpuDynInst->seqNum()); + + pkt = new Packet(req, MemCmd::MemSyncReq); + pkt->pushSenderState( + new ComputeUnit::SQCPort::SenderState( + gpuDynInst->wavefront(), nullptr)); + } + + ComputeUnit::SQCPort::MemReqEvent *sqc_event = + new ComputeUnit::SQCPort::MemReqEvent + (computeUnit.sqcPort, pkt); + computeUnit.schedule( + sqc_event, curTick() + computeUnit.scalar_req_tick_latency); +} + } // namespace gem5 diff --git a/src/gpu-compute/scalar_memory_pipeline.hh b/src/gpu-compute/scalar_memory_pipeline.hh index 5512c7c01f..e5dc7b4292 100644 --- a/src/gpu-compute/scalar_memory_pipeline.hh +++ b/src/gpu-compute/scalar_memory_pipeline.hh @@ -36,6 +36,7 @@ #include #include "gpu-compute/misc.hh" +#include "mem/request.hh" #include "params/ComputeUnit.hh" #include "sim/stats.hh" @@ -67,6 +68,9 @@ class ScalarMemPipeline void issueRequest(GPUDynInstPtr gpuDynInst); + void injectScalarMemFence( + GPUDynInstPtr gpuDynInst, bool kernelMemSync, RequestPtr req); + bool isGMLdRespFIFOWrRdy() const { diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc index e13e7c9cf4..b99950568e 100644 --- a/src/gpu-compute/shader.cc +++ b/src/gpu-compute/shader.cc @@ -64,6 +64,7 @@ Shader::Shader(const Params &p) : ClockedObject(p), impl_kern_end_rel(p.impl_kern_end_rel), 
coissue_return(1), trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf), + n_cu_per_sqc(p.cu_per_sqc), globalMemSize(p.globalmem), nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc), _dispatcher(*p.dispatcher), systemHub(p.system_hub), @@ -221,6 +222,13 @@ Shader::prepareInvalidate(HSAQueueEntry *task) { // all necessary INV flags are all set now, call cu to execute cuList[i_cu]->doInvalidate(req, task->dispatchId()); + + // A set of CUs share a single SQC cache. Send a single invalidate + // request to each SQC, through the first CU of each n_cu_per_sqc group. + if ((i_cu % n_cu_per_sqc) == 0) { + cuList[i_cu]->doSQCInvalidate(req, task->dispatchId()); + } + // I don't like this. This is intrusive coding. cuList[i_cu]->resetRegisterPool(); } diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index 32ddf3d15b..89541a8ff4 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ -237,6 +237,8 @@ class Shader : public ClockedObject int n_cu; // Number of wavefront slots per SIMD per CU int n_wf; + // Number of CUs that share one SQC in the shader + int n_cu_per_sqc; // The size of global memory int globalMemSize; diff --git a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm index bdc5d73f20..67c7753f09 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-SQC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-SQC.sm @@ -60,6 +60,7 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") // Mem sys initiated Repl, desc="Replacing block from cache"; Data, desc="Received Data"; + Evict, desc="Evict cache line"; } enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { @@ -67,6 +68,7 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") DataArrayWrite, desc="Write the data array"; TagArrayRead, desc="Read the data array"; TagArrayWrite, desc="Write the data array"; + TagArrayFlash, desc="Flash clear the data array"; } @@ -242,7 +244,12 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") { Entry 
cache_entry := getCacheEntry(in_msg.LineAddress); TBE tbe := TBEs.lookup(in_msg.LineAddress); - trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe); + DPRINTF(RubySlicc, "%s\n", in_msg); + if (in_msg.Type == RubyRequestType:REPLACEMENT) { + trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe); + } else { + trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe); + } } } } @@ -313,6 +320,10 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") APPEND_TRANSITION_COMMENT(cache_entry.DataBlk); } + action(inv_invDone, "inv", desc="local inv done") { + sequencer.invL1Callback(); + } + action(w_writeCache, "w", desc="write data to cache") { peek(responseToSQC_in, ResponseMsg) { assert(is_valid(cache_entry)); @@ -350,6 +361,13 @@ machine(MachineType:SQC, "GPU SQC (L1 I Cache)") ic_invCache; } + transition({I, IV, V}, Evict, I) {TagArrayRead, TagArrayWrite} { + // Evicting on a REPLACEMENT request: invalidate the line and notify the sequencer; no hit/miss classification. + ic_invCache; + inv_invDone; + p_popMandatoryQueue; + } + // if we got a response for a load where the line is in I, then // another request must have come in that replaced the line in question in // the cache. 
Thus, complete this request without allocating the line, but diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm index 2206effa29..cc56d3b1b4 100644 --- a/src/mem/ruby/protocol/RubySlicc_Types.sm +++ b/src/mem/ruby/protocol/RubySlicc_Types.sm @@ -157,6 +157,9 @@ structure (Sequencer, external = "yes") { void llscClearLocalMonitor(); void evictionCallback(Addr); + + void invL1Callback(); + void recordRequestType(SequencerRequestType); bool checkResourceAvailable(CacheResourceType, Addr); } diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 48054febef..4fef7090b6 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -85,6 +85,8 @@ Sequencer::Sequencer(const Params &p) m_runningGarnetStandalone = p.garnet_standalone; + m_num_pending_invs = 0; + m_cache_inv_pkt = nullptr; // These statistical variables are not for display. // The profiler will collate these across different @@ -348,6 +350,15 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType primary_type, return RequestStatus_Ready; } + // A MemSyncReq packet is used to invalidate the cache. + // As the cache invalidation requests are already issued in invL1(), + // there is no need to create a new request-table entry for it here. + // Instead, return RequestStatus_Aliased, and make the sequencer skip + // an extra issueRequest(). + if (pkt->cmd == MemCmd::MemSyncReq) { + return RequestStatus_Aliased; + } + Addr line_addr = makeLineAddress(pkt->getAddr()); // Check if there is any outstanding request for the same cache line. 
auto &seq_req_list = m_RequestTable[line_addr]; @@ -576,7 +587,8 @@ Sequencer::readCallback(Addr address, DataBlock& data, } if ((seq_req.m_type != RubyRequestType_LD) && (seq_req.m_type != RubyRequestType_Load_Linked) && - (seq_req.m_type != RubyRequestType_IFETCH)) { + (seq_req.m_type != RubyRequestType_IFETCH) && + (seq_req.m_type != RubyRequestType_REPLACEMENT)) { // Write request: reissue request to the cache hierarchy issueRequest(seq_req.pkt, seq_req.m_second_type); break; @@ -811,6 +823,86 @@ Sequencer::unaddressedCallback(Addr unaddressedReqId, } } +void +Sequencer::completeHitCallback(std::vector & mylist) +{ + for (auto& pkt : mylist) { + // When Ruby is in warmup or cooldown phase, the requests come + // from the cache recorder. They do not track which port to use + // and do not need to send the response back. + if (!RubySystem::getWarmupEnabled() + && !RubySystem::getCooldownEnabled()) { + RubyPort::SenderState *ss = + safe_cast(pkt->senderState); + MemResponsePort *port = ss->port; + assert(port != NULL); + + pkt->senderState = ss->predecessor; + + if (pkt->cmd != MemCmd::WriteReq) { + // for WriteReq, we keep the original senderState until + // writeCompleteCallback + delete ss; + } + + port->hitCallback(pkt); + trySendRetries(); + } + } + + RubySystem *rs = m_ruby_system; + if (RubySystem::getWarmupEnabled()) { + rs->m_cache_recorder->enqueueNextFetchRequest(); + } else if (RubySystem::getCooldownEnabled()) { + rs->m_cache_recorder->enqueueNextFlushRequest(); + } else { + testDrainComplete(); + } +} + +void +Sequencer::invL1Callback() +{ + // L1 invalidates use paddr = 0, so completion is counted here, not looked up by address. + assert(m_cache_inv_pkt && m_num_pending_invs > 0); + + m_num_pending_invs--; + + if (m_num_pending_invs == 0) { + std::vector pkt_list { m_cache_inv_pkt }; + m_cache_inv_pkt = nullptr; + completeHitCallback(pkt_list); + } +} + +void +Sequencer::invL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + DPRINTF(RubySequencer, + "There are %d Invalidations 
outstanding before Cache Walk\n", + m_num_pending_invs); + // Walk every block index of the data cache + for (int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Queue a REPLACEMENT request for this address on the mandatory queue + RubyRequestType request_type = RubyRequestType_REPLACEMENT; + std::shared_ptr msg = std::make_shared( + clockEdge(), addr, 0, 0, + request_type, RubyAccessMode_Supervisor, + nullptr); + DPRINTF(RubySequencer, "Evicting addr 0x%x\n", addr); + assert(m_mandatory_q_ptr != NULL); + Tick latency = cyclesToTicks( + m_controller->mandatoryQueueLatency(request_type)); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency); + m_num_pending_invs++; + } + DPRINTF(RubySequencer, + "There are %d Invalidations outstanding after Cache Walk\n", + m_num_pending_invs); +} + bool Sequencer::empty() const { @@ -915,6 +1007,11 @@ Sequencer::makeRequest(PacketPtr pkt) } } else if (pkt->isFlush()) { primary_type = secondary_type = RubyRequestType_FLUSH; + } else if (pkt->cmd == MemCmd::MemSyncReq) { + primary_type = secondary_type = RubyRequestType_REPLACEMENT; + assert(!m_cache_inv_pkt); + m_cache_inv_pkt = pkt; + invL1(); } else { panic("Unsupported ruby packet type\n"); } diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh index 8f736da6d5..3dc61ab4fa 100644 --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -141,6 +141,11 @@ class Sequencer : public RubyPort const Cycles forwardRequestTime = Cycles(0), const Cycles firstResponseTime = Cycles(0)); + // Cache invalidation (MemSyncReq) support. + void completeHitCallback(std::vector& mylist); + void invL1Callback(); + void invL1(); + RequestStatus makeRequest(PacketPtr pkt) override; virtual bool empty() const; int outstandingCount() const override { return m_outstanding_count; } @@ -243,6 +248,10 @@ class Sequencer : public RubyPort private: int m_max_outstanding_requests; + int m_num_pending_invs; + + PacketPtr m_cache_inv_pkt; + CacheMemory* m_dataCache_ptr; // The cache access latency for top-level caches (L0/L1). These are