/* * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. * All rights reserved. * * For use for simulation and test purposes only * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "gpu-compute/compute_unit.hh" #include #include "base/output.hh" #include "debug/GPUDisp.hh" #include "debug/GPUExec.hh" #include "debug/GPUFetch.hh" #include "debug/GPUMem.hh" #include "debug/GPUPort.hh" #include "debug/GPUPrefetch.hh" #include "debug/GPUReg.hh" #include "debug/GPURename.hh" #include "debug/GPUSync.hh" #include "debug/GPUTLB.hh" #include "gpu-compute/dispatcher.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/gpu_static_inst.hh" #include "gpu-compute/scalar_register_file.hh" #include "gpu-compute/shader.hh" #include "gpu-compute/simple_pool_manager.hh" #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" #include "mem/page_table.hh" #include "sim/process.hh" #include "sim/sim_exit.hh" ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), numVectorGlobalMemUnits(p->num_global_mem_pipes), numVectorSharedMemUnits(p->num_shared_mem_pipes), numScalarMemUnits(p->num_scalar_mem_pipes), numVectorALUs(p->num_SIMDs), numScalarALUs(p->num_scalar_cores), vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width), coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width), registerManager(p->register_manager), fetchStage(p), scoreboardCheckStage(p), scheduleStage(p, this), execStage(p), globalMemoryPipe(p), localMemoryPipe(p), scalarMemoryPipe(p), tickEvent([this]{ exec(); }, "Compute unit tick event", false, Event::CPU_Tick_Pri), cu_id(p->cu_id), vrf(p->vector_register_file), srf(p->scalar_register_file), simdWidth(p->simd_width), spBypassPipeLength(p->spbypass_pipe_length), dpBypassPipeLength(p->dpbypass_pipe_length), scalarPipeStages(p->scalar_pipe_length), operandNetworkLength(p->operand_network_length), issuePeriod(p->issue_period), vrf_gm_bus_latency(p->vrf_gm_bus_latency), srf_scm_bus_latency(p->srf_scm_bus_latency), vrf_lm_bus_latency(p->vrf_lm_bus_latency), perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth), prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type), debugSegFault(p->debugSegFault), functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier), countPages(p->countPages), barrier_id(0), req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()), resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), _masterId(p->system->getMasterId(this, "ComputeUnit")), lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this), _cacheLineSize(p->system->cacheLineSize()), globalSeqNum(0), wavefrontSize(p->wf_size) { /** * This check is necessary because std::bitset only provides conversion * to unsigned long or unsigned long long via to_ulong() or to_ullong(). * there are a few places in the code where to_ullong() is used, however * if wavefrontSize is larger than a value the host can support then * bitset will throw a runtime exception. We should remove all use of * to_long() or to_ullong() so we can have wavefrontSize greater than 64b, * however until that is done this assert is required. */ fatal_if(p->wf_size > std::numeric_limits::digits || p->wf_size <= 0, "WF size is larger than the host can support"); fatal_if(!isPowerOf2(wavefrontSize), "Wavefront size should be a power of 2"); // calculate how many cycles a vector load or store will need to transfer // its data over the corresponding buses numCyclesPerStoreTransfer = (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) / (double)vrfToCoalescerBusWidth); numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t)) / coalescerToVrfBusWidth; // Initialization: all WF slots are assumed STOPPED idleWfs = p->n_wf * numVectorALUs; lastVaddrWF.resize(numVectorALUs); wfList.resize(numVectorALUs); for (int j = 0; j < numVectorALUs; ++j) { lastVaddrWF[j].resize(p->n_wf); for (int i = 0; i < p->n_wf; ++i) { lastVaddrWF[j][i].resize(wfSize()); wfList[j].push_back(p->wavefronts[j * p->n_wf + i]); wfList[j][i]->setParent(this); for (int k = 0; k < wfSize(); ++k) { lastVaddrWF[j][i][k] = 0; } } } lastVaddrSimd.resize(numVectorALUs); for (int i = 0; i < numVectorALUs; ++i) { lastVaddrSimd[i].resize(wfSize(), 0); } lastVaddrCU.resize(wfSize()); lds.setParent(this); if (p->execPolicy == "OLDEST-FIRST") { exec_policy = EXEC_POLICY::OLDEST; } else if (p->execPolicy == "ROUND-ROBIN") { exec_policy = EXEC_POLICY::RR; } else { fatal("Invalid WF execution policy (CU)\n"); } memPort.resize(wfSize()); // Setup tokens for slave ports. The number of tokens in memSlaveTokens // is the total token count for the entire vector port (i.e., this CU). memPortTokens = new TokenManager(p->max_cu_tokens); // resize the tlbPort vectorArray int tlbPort_width = perLaneTLB ? wfSize() : 1; tlbPort.resize(tlbPort_width); cuExitCallback = new CUExitCallback(this); registerExitCallback(cuExitCallback); lastExecCycle.resize(numVectorALUs, 0); for (int i = 0; i < vrf.size(); ++i) { vrf[i]->setParent(this); } for (int i = 0; i < srf.size(); ++i) { srf[i]->setParent(this); } numVecRegsPerSimd = vrf[0]->numRegs(); numScalarRegsPerSimd = srf[0]->numRegs(); registerManager->setParent(this); activeWaves = 0; instExecPerSimd.resize(numVectorALUs, 0); // Calculate the number of bits to address a cache line panic_if(!isPowerOf2(_cacheLineSize), "Cache line size should be a power of two."); cacheLineBits = floorLog2(_cacheLineSize); } ComputeUnit::~ComputeUnit() { // Delete wavefront slots for (int j = 0; j < numVectorALUs; ++j) { for (int i = 0; i < shader->n_wf; ++i) { delete wfList[j][i]; } lastVaddrSimd[j].clear(); } lastVaddrCU.clear(); readyList.clear(); dispatchList.clear(); delete cuExitCallback; delete ldsPort; } int ComputeUnit::numExeUnits() const { return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits + numVectorSharedMemUnits + numScalarMemUnits; } // index into readyList of the first memory unit int ComputeUnit::firstMemUnit() const { return numVectorALUs + numScalarALUs; } // index into readyList of the last memory unit int ComputeUnit::lastMemUnit() const { return numExeUnits() - 1; } // index into scalarALUs vector of SALU used by the wavefront int ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const { if (numScalarALUs == 1) { return 0; } else { return w->simdId % numScalarALUs; } } // index into readyList of Scalar ALU unit used by wavefront int ComputeUnit::mapWaveToScalarAluGlobalIdx(Wavefront *w) const { return numVectorALUs + mapWaveToScalarAlu(w); } // index into readyList of Global Memory unit used by wavefront int ComputeUnit::mapWaveToGlobalMem(Wavefront *w) const { // TODO: FIXME if more than 1 GM pipe supported return numVectorALUs + numScalarALUs; } // index into readyList of Local Memory unit used by wavefront int ComputeUnit::mapWaveToLocalMem(Wavefront *w) const { // TODO: FIXME if more than 1 LM pipe supported return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits; } // index into readyList of Scalar Memory unit used by wavefront int ComputeUnit::mapWaveToScalarMem(Wavefront *w) const { // TODO: FIXME if more than 1 ScM pipe supported return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits + numVectorSharedMemUnits; } void ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task) { w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs()); w->workGroupSz[0] = task->wgSize(0); w->workGroupSz[1] = task->wgSize(1); w->workGroupSz[2] = task->wgSize(2); w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2]; w->gridSz[0] = task->gridSize(0); w->gridSz[1] = task->gridSize(1); w->gridSz[2] = task->gridSize(2); w->computeActualWgSz(task); } // delete all wavefronts that have been marked as ready at SCB stage // but are found to have empty instruction buffers at SCH stage void ComputeUnit::updateReadyList(int unitId) { if (!readyList[unitId].empty()) { for (std::vector::iterator it = readyList[unitId].begin(); it != readyList[unitId].end();) { if ((*it)->instructionBuffer.empty()) { it = readyList[unitId].erase(it); } else { ++it; } } } } void ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, HSAQueueEntry *task, bool fetchContext) { static int _n_wave = 0; VectorMask init_mask; init_mask.reset(); for (int k = 0; k < wfSize(); ++k) { if (k + waveId * wfSize() < w->actualWgSzTotal) init_mask[k] = 1; } w->execMask() = init_mask; w->kernId = task->dispatchId(); w->wfId = waveId; w->initMask = init_mask.to_ullong(); for (int k = 0; k < wfSize(); ++k) { w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0]; w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) % w->actualWgSz[1]; w->workItemId[2][k] = (k + waveId * wfSize()) / (w->actualWgSz[0] * w->actualWgSz[1]); w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] * w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] + w->workItemId[0][k]; } w->barrierSlots = divCeil(w->actualWgSzTotal, wfSize()); w->barCnt.resize(wfSize(), 0); w->maxBarCnt = 0; w->oldBarrierCnt = 0; w->barrierCnt = 0; // WG state w->wgId = task->globalWgId(); w->dispatchId = task->dispatchId(); w->workGroupId[0] = w->wgId % task->numWg(0); w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1); w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1)); w->barrierId = barrier_id; w->stalledAtBarrier = (w->oldBarrierCnt == w->barrierCnt) ? false : true; // set the wavefront context to have a pointer to this section of the LDS w->ldsChunk = ldsChunk; int32_t refCount M5_VAR_USED = lds.increaseRefCounter(w->dispatchId, w->wgId); DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n", cu_id, w->wgId, refCount); w->instructionBuffer.clear(); if (w->pendingFetch) w->dropFetch = true; DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " "WF[%d][%d]\n", _n_wave, w->barrierId, cu_id, w->simdId, w->wfSlotId); w->initRegState(task, w->actualWgSzTotal); w->start(_n_wave++, task->codeAddr()); waveLevelParallelism.sample(activeWaves); activeWaves++; } /** * trigger invalidate operation in the cu * * req: request initialized in shader, carrying the invlidate flags */ void ComputeUnit::doInvalidate(RequestPtr req, int kernId){ GPUDynInstPtr gpuDynInst = std::make_shared(this, nullptr, new KernelLaunchStaticInst(), getAndIncSeqNum()); // kern_id will be used in inv responses gpuDynInst->kern_id = kernId; // update contextId field req->setContext(gpuDynInst->wfDynId); injectGlobalMemFence(gpuDynInst, true, req); } /** * trigger flush operation in the cu * * gpuDynInst: inst passed to the request */ void ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) { injectGlobalMemFence(gpuDynInst, true); } void ComputeUnit::dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler) { // If we aren't ticking, start it up! if (!tickEvent.scheduled()) { DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id); schedule(tickEvent, nextCycle()); } // the kernel's invalidate must have finished before any wg dispatch assert(task->isInvDone()); // reserve the LDS capacity allocated to the work group // disambiguated by the dispatch ID and workgroup ID, which should be // globally unique LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(), task->globalWgId(), task->ldsSize()); panic_if(!ldsChunk, "was not able to reserve space for this WG"); // calculate the number of 32-bit vector registers required // by each work item int vregDemand = task->numVectorRegs(); int sregDemand = task->numScalarRegs(); int wave_id = 0; // Assign WFs according to numWfsToSched vector, which is computed by // hasDispResources() for (int j = 0; j < shader->n_wf; ++j) { for (int i = 0; i < numVectorALUs; ++i) { Wavefront *w = wfList[i][j]; // Check if this wavefront slot is available and there are WFs // remaining to be dispatched to current SIMD: // WF slot must be stopped and not waiting // for a release to complete S_RETURNING if (w->getStatus() == Wavefront::S_STOPPED && numWfsToSched[i] > 0) { // decrement number of WFs awaiting dispatch to current SIMD numWfsToSched[i] -= 1; fillKernelState(w, task); DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] " "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId, vregDemand, sregDemand); registerManager->allocateRegisters(w, vregDemand, sregDemand); startWavefront(w, wave_id, ldsChunk, task); ++wave_id; } } } ++barrier_id; } void ComputeUnit::insertInPipeMap(Wavefront *w) { panic_if(w->instructionBuffer.empty(), "Instruction Buffer of WF%d can't be empty", w->wgId); GPUDynInstPtr ii = w->instructionBuffer.front(); pipeMap.emplace(ii->seqNum()); } void ComputeUnit::deleteFromPipeMap(Wavefront *w) { panic_if(w->instructionBuffer.empty(), "Instruction Buffer of WF%d can't be empty", w->wgId); GPUDynInstPtr ii = w->instructionBuffer.front(); // delete the dynamic instruction from the pipeline map auto it = pipeMap.find(ii->seqNum()); panic_if(it == pipeMap.end(), "Pipeline Map is empty\n"); pipeMap.erase(it); } bool ComputeUnit::hasDispResources(HSAQueueEntry *task) { // compute true size of workgroup (after clamping to grid size) int trueWgSize[HSAQueueEntry::MAX_DIM]; int trueWgSizeTotal = 1; for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) { trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) - task->wgId(d) * task->wgSize(d)); trueWgSizeTotal *= trueWgSize[d]; DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]); } DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal); // calculate the number of WFs in this WG int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize(); // calculate the number of 32-bit vector registers required by each // work item of the work group int vregDemandPerWI = task->numVectorRegs(); // calculate the number of 32-bit scalar registers required by each // work item of the work group int sregDemandPerWI = task->numScalarRegs(); // check if the total number of VGPRs snd SGPRs required by all WFs // of the WG fit in the VRFs of all SIMD units and the CU's SRF panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd), "WG with %d WFs and %d VGPRs per WI can not be allocated to CU " "that has %d VGPRs\n", numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd); panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd, "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU " "with %d SGPRs\n", numWfs, sregDemandPerWI, numScalarRegsPerSimd); // number of WF slots that are not occupied int freeWfSlots = 0; // number of Wfs from WG that were successfully mapped to a SIMD int numMappedWfs = 0; numWfsToSched.clear(); numWfsToSched.resize(numVectorALUs, 0); // attempt to map WFs to the SIMDs, based on WF slot availability // and register file availability for (int j = 0; j < shader->n_wf; ++j) { for (int i = 0; i < numVectorALUs; ++i) { if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) { ++freeWfSlots; // check if current WF will fit onto current SIMD/VRF // if all WFs have not yet been mapped to the SIMDs if (numMappedWfs < numWfs && registerManager->canAllocateSgprs(i, numWfsToSched[i] + 1, sregDemandPerWI) && registerManager->canAllocateVgprs(i, numWfsToSched[i] + 1, vregDemandPerWI)) { numWfsToSched[i]++; numMappedWfs++; } } } } // check that the number of mapped WFs is not greater // than the actual number of WFs assert(numMappedWfs <= numWfs); bool vregAvail = true; bool sregAvail = true; // if a WF to SIMD mapping was not found, find the limiting resource if (numMappedWfs < numWfs) { for (int j = 0; j < numVectorALUs; ++j) { // find if there are enough free VGPRs in the SIMD's VRF // to accomodate the WFs of the new WG that would be mapped // to this SIMD unit vregAvail &= registerManager-> canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI); // find if there are enough free SGPRs in the SIMD's SRF // to accomodate the WFs of the new WG that would be mapped // to this SIMD unit sregAvail &= registerManager-> canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI); } } DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \ VGPR Availability = %d, SGPR Availability = %d\n", freeWfSlots, numMappedWfs, vregAvail, sregAvail); if (!vregAvail) { ++numTimesWgBlockedDueVgprAlloc; } if (!sregAvail) { ++numTimesWgBlockedDueSgprAlloc; } // Return true if enough WF slots to submit workgroup and if there are // enough VGPRs to schedule all WFs to their SIMD units bool ldsAvail = lds.canReserve(task->ldsSize()); if (!ldsAvail) { wgBlockedDueLdsAllocation++; } // Return true if the following are all true: // (a) all WFs of the WG were mapped to free WF slots // (b) there are enough VGPRs to schedule all WFs to their SIMD units // (c) there are enough SGPRs on the CU to schedule all WFs // (d) there is enough space in LDS to allocate for all WFs bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail && ldsAvail; return can_dispatch; } int ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots) { DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id); int ccnt = 0; for (int i_simd = 0; i_simd < numVectorALUs; ++i_simd) { for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) { Wavefront *w = wfList[i_simd][i_wf]; if (w->getStatus() == Wavefront::S_RUNNING) { DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf); DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n", w->barrierId, _barrier_id); DPRINTF(GPUSync, "wf->barrierCnt %d, bcnt = %d\n", w->barrierCnt, bcnt); DPRINTF(GPUSync, "outstanding Reqs = %d\n", w->outstandingReqs); } if (w->getStatus() == Wavefront::S_RUNNING && w->barrierId == _barrier_id && w->barrierCnt == bcnt && !w->outstandingReqs) { ++ccnt; DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to " "%d\n", i_simd, i_wf, ccnt); } } } DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n", cu_id, ccnt, bslots); return ccnt == bslots; } // Execute one clock worth of work on the ComputeUnit. void ComputeUnit::exec() { // process reads and writes in the RFs for (auto &vecRegFile : vrf) { vecRegFile->exec(); } for (auto &scRegFile : srf) { scRegFile->exec(); } // Execute pipeline stages in reverse order to simulate // the pipeline latency scalarMemoryPipe.exec(); globalMemoryPipe.exec(); localMemoryPipe.exec(); execStage.exec(); scheduleStage.exec(); scoreboardCheckStage.exec(); fetchStage.exec(); totalCycles++; // Put this CU to sleep if there is no more work to be done. if (!isDone()) { schedule(tickEvent, nextCycle()); } else { shader->notifyCuSleep(); DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id); } } void ComputeUnit::init() { // Initialize CU Bus models and execution resources // Vector ALUs vectorALUs.clear(); for (int i = 0; i < numVectorALUs; i++) { vectorALUs.emplace_back(this, clockPeriod()); } // Scalar ALUs scalarALUs.clear(); for (int i = 0; i < numScalarALUs; i++) { scalarALUs.emplace_back(this, clockPeriod()); } // Vector Global Memory fatal_if(numVectorGlobalMemUnits > 1, "No support for multiple Global Memory Pipelines exists!!!"); vectorGlobalMemUnit.init(this, clockPeriod()); vrfToGlobalMemPipeBus.init(this, clockPeriod()); glbMemToVrfBus.init(this, clockPeriod()); // Vector Local/Shared Memory fatal_if(numVectorSharedMemUnits > 1, "No support for multiple Local Memory Pipelines exists!!!"); vectorSharedMemUnit.init(this, clockPeriod()); vrfToLocalMemPipeBus.init(this, clockPeriod()); locMemToVrfBus.init(this, clockPeriod()); // Scalar Memory fatal_if(numScalarMemUnits > 1, "No support for multiple Scalar Memory Pipelines exists!!!"); scalarMemUnit.init(this, clockPeriod()); srfToScalarMemPipeBus.init(this, clockPeriod()); scalarMemToSrfBus.init(this, clockPeriod()); vectorRegsReserved.resize(numVectorALUs, 0); scalarRegsReserved.resize(numVectorALUs, 0); // Initializing pipeline resources readyList.resize(numExeUnits()); for (int j = 0; j < numExeUnits(); ++j) { dispatchList.push_back(std::make_pair(nullptr, EMPTY)); } fetchStage.init(this); scoreboardCheckStage.init(this); scheduleStage.init(this); execStage.init(this); globalMemoryPipe.init(this); localMemoryPipe.init(this); scalarMemoryPipe.init(this); gmTokenPort.setTokenManager(memPortTokens); } bool ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) { // Ruby has completed the memory op. Schedule the mem_resp_event at the // appropriate cycle to process the timing memory response // This delay represents the pipeline delay SenderState *sender_state = safe_cast(pkt->senderState); int index = sender_state->port_index; GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; GPUDispatcher &dispatcher = computeUnit->shader->dispatcher(); // MemSyncResp + WriteAckResp are handled completely here and we don't // schedule a MemRespEvent to process the responses further if (pkt->cmd == MemCmd::MemSyncResp) { // This response is for 1 of the following request types: // - kernel launch // - kernel end // - non-kernel mem sync // Kernel Launch // wavefront was nullptr when launching kernel, so it is meaningless // here (simdId=-1, wfSlotId=-1) if (gpuDynInst->isKernelLaunch()) { // for kernel launch, the original request must be both kernel-type // and acquire assert(pkt->req->isKernel()); assert(pkt->req->isAcquire()); // one D-Cache inv is done, decrement counter dispatcher.updateInvCounter(gpuDynInst->kern_id); delete pkt->senderState; delete pkt; return true; } // retrieve wavefront from inst Wavefront *w = gpuDynInst->wavefront(); // Check if we are waiting on Kernel End Release if (w->getStatus() == Wavefront::S_RETURNING && gpuDynInst->isEndOfKernel()) { // for kernel end, the original request must be both kernel-type // and release assert(pkt->req->isKernel()); assert(pkt->req->isRelease()); // one wb done, decrement counter, and return whether all wbs are // done for the kernel bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id); // not all wbs are done for the kernel, just release pkt // resources if (!isWbDone) { delete pkt->senderState; delete pkt; return true; } // all wbs are completed for the kernel, do retirement work // for the workgroup DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n", computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId, w->wgId); dispatcher.notifyWgCompl(w); w->setStatus(Wavefront::S_STOPPED); } if (!pkt->req->isKernel()) { w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId]; DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing " "outstanding reqs %d => %d\n", gpuDynInst->simdId, gpuDynInst->wfSlotId, gpuDynInst->wfDynId, gpuDynInst->disassemble(), w->outstandingReqs, w->outstandingReqs - 1); computeUnit->globalMemoryPipe.handleResponse(gpuDynInst); } DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrierCnt = %d\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, w->barrierCnt); delete pkt->senderState; delete pkt; return true; } else if (pkt->cmd == MemCmd::WriteCompleteResp) { // this is for writeComplete callback // we simply get decrement write-related wait counters assert(gpuDynInst); Wavefront *w M5_VAR_USED = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId]; assert(w); DPRINTF(GPUExec, "WriteCompleteResp: WF[%d][%d] WV%d %s decrementing " "outstanding reqs %d => %d\n", gpuDynInst->simdId, gpuDynInst->wfSlotId, gpuDynInst->wfDynId, gpuDynInst->disassemble(), w->outstandingReqs, w->outstandingReqs - 1); if (gpuDynInst->allLanesZero()) { // ask gm pipe to decrement request counters, instead of directly // performing here, to avoid asynchronous counter update and // instruction retirement (which may hurt waincnt effects) computeUnit->globalMemoryPipe.handleResponse(gpuDynInst); DPRINTF(GPUMem, "CU%d: WF[%d][%d]: write totally complete\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId); } delete pkt->senderState; delete pkt; return true; } EventFunctionWrapper *mem_resp_event = computeUnit->memPort[index]->createMemRespEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, gpuDynInst->seqNum(), index, pkt->req->getPaddr()); computeUnit->schedule(mem_resp_event, curTick() + computeUnit->resp_tick_latency); return true; } bool ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt) { assert(!pkt->req->isKernel()); // retrieve sender state SenderState *sender_state = safe_cast(pkt->senderState); GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; assert(pkt->isRead() || pkt->isWrite()); assert(gpuDynInst->numScalarReqs > 0); gpuDynInst->numScalarReqs--; /** * for each returned scalar request we decrement the * numScalarReqs counter that is associated with this * gpuDynInst, which should have been set to correspond * to the number of packets sent for the memory op. * once all packets return, the memory op is finished * and we can push it into the response queue. */ if (!gpuDynInst->numScalarReqs) { if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) { computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push( gpuDynInst); } else { computeUnit->scalarMemoryPipe.getGMStRespFIFO().push( gpuDynInst); } } delete pkt->senderState; delete pkt; return true; } void ComputeUnit::ScalarDataPort::recvReqRetry() { for (const auto &pkt : retries) { if (!sendTimingReq(pkt)) { break; } else { retries.pop_front(); } } } void ComputeUnit::DataPort::recvReqRetry() { int len = retries.size(); assert(len > 0); for (int i = 0; i < len; ++i) { PacketPtr pkt = retries.front().first; GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second; DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, pkt->req->getPaddr()); /** Currently Ruby can return false due to conflicts for the particular * cache block or address. Thus other requests should be allowed to * pass and the data port should expect multiple retries. */ if (!sendTimingReq(pkt)) { DPRINTF(GPUMem, "failed again!\n"); break; } else { DPRINTF(GPUMem, "successful!\n"); retries.pop_front(); } } } bool ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt) { computeUnit->fetchStage.processFetchReturn(pkt); return true; } void ComputeUnit::SQCPort::recvReqRetry() { int len = retries.size(); assert(len > 0); for (int i = 0; i < len; ++i) { PacketPtr pkt = retries.front().first; Wavefront *wavefront M5_VAR_USED = retries.front().second; DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n", computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr()); if (!sendTimingReq(pkt)) { DPRINTF(GPUFetch, "failed again!\n"); break; } else { DPRINTF(GPUFetch, "successful!\n"); retries.pop_front(); } } } void ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt) { // There must be a way around this check to do the globalMemStart... Addr tmp_vaddr = pkt->req->getVaddr(); updatePageDivergenceDist(tmp_vaddr); // set PC in request pkt->req->setPC(gpuDynInst->wavefront()->pc()); pkt->req->setReqInstSeqNum(gpuDynInst->seqNum()); // figure out the type of the request to set read/write BaseTLB::Mode TLB_mode; assert(pkt->isRead() || pkt->isWrite()); // only do some things if actually accessing data bool isDataAccess = pkt->isWrite() || pkt->isRead(); // Check write before read for atomic operations // since atomic operations should use BaseTLB::Write if (pkt->isWrite()) { TLB_mode = BaseTLB::Write; } else if (pkt->isRead()) { TLB_mode = BaseTLB::Read; } else { fatal("pkt is not a read nor a write\n"); } tlbCycles -= curTick(); ++tlbRequests; int tlbPort_index = perLaneTLB ? index : 0; if (shader->timingSim) { if (debugSegFault) { Process *p = shader->gpuTc->getProcessPtr(); Addr vaddr = pkt->req->getVaddr(); unsigned size = pkt->getSize(); if ((vaddr + size - 1) % 64 < vaddr % 64) { panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr); } Addr paddr; if (!p->pTable->translate(vaddr, paddr)) { if (!p->fixupFault(vaddr)) { panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr); } } } // This is the SenderState needed upon return pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index); // This is the senderState needed by the TLB hierarchy to function TheISA::GpuTLB::TranslationState *translation_state = new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false, pkt->senderState); pkt->senderState = translation_state; if (functionalTLB) { tlbPort[tlbPort_index]->sendFunctional(pkt); // update the hitLevel distribution int hit_level = translation_state->hitLevel; assert(hit_level != -1); hitsPerTLBLevel[hit_level]++; // New SenderState for the memory access X86ISA::GpuTLB::TranslationState *sender_state = safe_cast(pkt->senderState); delete sender_state->tlbEntry; delete sender_state->saved; delete sender_state; assert(pkt->req->hasPaddr()); assert(pkt->req->hasSize()); // this is necessary because the GPU TLB receives packets instead // of requests. when the translation is complete, all relevent // fields in the request will be populated, but not in the packet. // here we create the new packet so we can set the size, addr, // and proper flags. PacketPtr oldPkt = pkt; pkt = new Packet(oldPkt->req, oldPkt->cmd); if (isDataAccess) { uint8_t *tmpData = oldPkt->getPtr(); pkt->dataStatic(tmpData); } delete oldPkt; // New SenderState for the memory access pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index, nullptr); gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index); gpuDynInst->tlbHitLevel[index] = hit_level; // translation is done. Schedule the mem_req_event at the // appropriate cycle to send the timing memory request to ruby EventFunctionWrapper *mem_req_event = memPort[index]->createMemReqEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data " "scheduled\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index, pkt->req->getPaddr()); schedule(mem_req_event, curTick() + req_tick_latency); } else if (tlbPort[tlbPort_index]->isStalled()) { assert(tlbPort[tlbPort_index]->retries.size() > 0); DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr); tlbPort[tlbPort_index]->retries.push_back(pkt); } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) { // Stall the data port; // No more packet will be issued till // ruby indicates resources are freed by // a recvReqRetry() call back on this port. tlbPort[tlbPort_index]->stallPort(); DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr); tlbPort[tlbPort_index]->retries.push_back(pkt); } else { DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr); } } else { if (pkt->cmd == MemCmd::MemSyncReq) { gpuDynInst->resetEntireStatusVector(); } else { gpuDynInst->decrementStatusVector(index); } // New SenderState for the memory access delete pkt->senderState; // Because it's atomic operation, only need TLB translation state pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc); tlbPort[tlbPort_index]->sendFunctional(pkt); // the addr of the packet is not modified, so we need to create a new // packet, or otherwise the memory access will have the old virtual // address sent in the translation packet, instead of the physical // address returned by the translation. PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd); new_pkt->dataStatic(pkt->getPtr()); // Translation is done. It is safe to send the packet to memory. memPort[0]->sendFunctional(new_pkt); DPRINTF(GPUMem, "Functional sendRequest\n"); DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index, new_pkt->req->getPaddr()); // safe_cast the senderState TheISA::GpuTLB::TranslationState *sender_state = safe_cast(pkt->senderState); delete sender_state->tlbEntry; delete new_pkt; delete pkt->senderState; delete pkt; } } void ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt) { assert(pkt->isWrite() || pkt->isRead()); BaseTLB::Mode tlb_mode = pkt->isRead() ? BaseTLB::Read : BaseTLB::Write; pkt->senderState = new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst); pkt->senderState = new TheISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false, pkt->senderState); if (scalarDTLBPort->isStalled()) { assert(scalarDTLBPort->retries.size()); scalarDTLBPort->retries.push_back(pkt); } else if (!scalarDTLBPort->sendTimingReq(pkt)) { scalarDTLBPort->stallPort(); scalarDTLBPort->retries.push_back(pkt); } else { DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n", tlb_mode == BaseTLB::Read ? "read" : "write", pkt->req->getVaddr()); } } void ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelMemSync, RequestPtr req) { assert(gpuDynInst->isGlobalSeg() || gpuDynInst->executedAs() == Enums::SC_GLOBAL); if (!req) { req = std::make_shared( 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId); } // all mem sync requests have Paddr == 0 req->setPaddr(0); PacketPtr pkt = nullptr; if (kernelMemSync) { if (gpuDynInst->isKernelLaunch()) { req->setCacheCoherenceFlags(Request::ACQUIRE); req->setReqInstSeqNum(gpuDynInst->seqNum()); req->setFlags(Request::KERNEL); pkt = new Packet(req, MemCmd::MemSyncReq); pkt->pushSenderState( new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); EventFunctionWrapper *mem_req_event = memPort[0]->createMemReqEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling " "an acquire\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0, pkt->req->getPaddr()); schedule(mem_req_event, curTick() + req_tick_latency); } else { assert(gpuDynInst->isEndOfKernel()); req->setCacheCoherenceFlags(Request::RELEASE); req->setReqInstSeqNum(gpuDynInst->seqNum()); req->setFlags(Request::KERNEL); pkt = new Packet(req, MemCmd::MemSyncReq); pkt->pushSenderState( new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); EventFunctionWrapper *mem_req_event = memPort[0]->createMemReqEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling " "a release\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0, pkt->req->getPaddr()); schedule(mem_req_event, curTick() + req_tick_latency); } } else { gpuDynInst->setRequestFlags(req); req->setReqInstSeqNum(gpuDynInst->seqNum()); pkt = new Packet(req, MemCmd::MemSyncReq); pkt->pushSenderState( new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); EventFunctionWrapper *mem_req_event = memPort[0]->createMemReqEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0, pkt->req->getPaddr()); schedule(mem_req_event, curTick() + req_tick_latency); } } void ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt) { DataPort::SenderState *sender_state = safe_cast(pkt->senderState); GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; ComputeUnit *compute_unit = computeUnit; assert(gpuDynInst); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n", compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, pkt->req->getPaddr(), index); Addr paddr = pkt->req->getPaddr(); // mem sync resp and write-complete callback must be handled already in // DataPort::recvTimingResp assert(pkt->cmd != MemCmd::MemSyncResp); assert(pkt->cmd != MemCmd::WriteCompleteResp); // this is for read, write and atomic int index = gpuDynInst->memStatusVector[paddr].back(); DPRINTF(GPUMem, "Response for addr %#x, index %d\n", pkt->req->getPaddr(), index); gpuDynInst->memStatusVector[paddr].pop_back(); gpuDynInst->pAddr = pkt->req->getPaddr(); gpuDynInst->decrementStatusVector(index); DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector()); if (gpuDynInst->allLanesZero()) { auto iter = gpuDynInst->memStatusVector.begin(); auto end = gpuDynInst->memStatusVector.end(); while (iter != end) { assert(iter->second.empty()); ++iter; } // Calculate the difference between the arrival of the first cache // block and the last cache block to arrive if we have the time // for the first cache block. if (compute_unit->headTailMap.count(gpuDynInst)) { Tick headTick = compute_unit->headTailMap.at(gpuDynInst); compute_unit->headTailLatency.sample(curTick() - headTick); compute_unit->headTailMap.erase(gpuDynInst); } gpuDynInst->memStatusVector.clear(); // note: only handle read response here; for write, the response // is separately handled when writeComplete callback is received if (pkt->isRead()) { gpuDynInst-> profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue); compute_unit->globalMemoryPipe.handleResponse(gpuDynInst); DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n", compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId); } } else { if (!compute_unit->headTailMap.count(gpuDynInst)) { compute_unit->headTailMap.insert( std::make_pair(gpuDynInst, curTick())); } } delete pkt->senderState; delete pkt; } ComputeUnit* ComputeUnitParams::create() { return new ComputeUnit(this); } bool ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) { Addr line = pkt->req->getPaddr(); DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id, pkt->req->getVaddr(), line); assert(pkt->senderState); computeUnit->tlbCycles += curTick(); // pop off the TLB translation state TheISA::GpuTLB::TranslationState *translation_state = safe_cast(pkt->senderState); // no PageFaults are permitted for data accesses if (!translation_state->tlbEntry) { DTLBPort::SenderState *sender_state = safe_cast(translation_state->saved); Wavefront *w M5_VAR_USED = computeUnit->wfList[sender_state->_gpuDynInst->simdId] [sender_state->_gpuDynInst->wfSlotId]; DPRINTFN("Wave %d couldn't tranlate vaddr %#x\n", w->wfDynId, pkt->req->getVaddr()); } // update the hitLevel distribution int hit_level = translation_state->hitLevel; computeUnit->hitsPerTLBLevel[hit_level]++; delete translation_state->tlbEntry; assert(!translation_state->ports.size()); pkt->senderState = translation_state->saved; // for prefetch pkt BaseTLB::Mode TLB_mode = translation_state->tlbMode; delete translation_state; // use the original sender state to know how to close this transaction DTLBPort::SenderState *sender_state = safe_cast(pkt->senderState); GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; int mp_index = sender_state->portIndex; Addr vaddr = pkt->req->getVaddr(); gpuDynInst->memStatusVector[line].push_back(mp_index); gpuDynInst->tlbHitLevel[mp_index] = hit_level; MemCmd requestCmd; if (pkt->cmd == MemCmd::ReadResp) { requestCmd = MemCmd::ReadReq; } else if (pkt->cmd == MemCmd::WriteResp) { requestCmd = MemCmd::WriteReq; } else if (pkt->cmd == MemCmd::SwapResp) { requestCmd = MemCmd::SwapReq; } else { panic("unsupported response to request conversion %s\n", pkt->cmd.toString()); } if (computeUnit->prefetchDepth) { int simdId = gpuDynInst->simdId; int wfSlotId = gpuDynInst->wfSlotId; Addr last = 0; switch(computeUnit->prefetchType) { case Enums::PF_CU: last = computeUnit->lastVaddrCU[mp_index]; break; case Enums::PF_PHASE: last = computeUnit->lastVaddrSimd[simdId][mp_index]; break; case Enums::PF_WF: last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index]; default: break; } DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n", computeUnit->cu_id, simdId, wfSlotId, mp_index, last); int stride = last ? (roundDown(vaddr, TheISA::PageBytes) - roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift : 0; DPRINTF(GPUPrefetch, "Stride is %d\n", stride); computeUnit->lastVaddrCU[mp_index] = vaddr; computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr; computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr; stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ? computeUnit->prefetchStride: stride; DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr, computeUnit->cu_id, simdId, wfSlotId, mp_index); DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr); // Prefetch Next few pages atomically for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) { DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride, vaddr+stride*pf*TheISA::PageBytes); if (!stride) break; RequestPtr prefetch_req = std::make_shared( vaddr + stride * pf * TheISA::PageBytes, sizeof(uint8_t), 0, computeUnit->masterId(), 0, 0, nullptr); PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd); uint8_t foo = 0; prefetch_pkt->dataStatic(&foo); // Because it's atomic operation, only need TLB translation state prefetch_pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode, computeUnit->shader->gpuTc, true); // Currently prefetches are zero-latency, hence the sendFunctional sendFunctional(prefetch_pkt); /* safe_cast the senderState */ TheISA::GpuTLB::TranslationState *tlb_state = safe_cast( prefetch_pkt->senderState); delete tlb_state->tlbEntry; delete tlb_state; delete prefetch_pkt; } } // First we must convert the response cmd back to a request cmd so that // the request can be sent through the cu's master port PacketPtr new_pkt = new Packet(pkt->req, requestCmd); new_pkt->dataStatic(pkt->getPtr()); delete pkt->senderState; delete pkt; // New SenderState for the memory access new_pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index, nullptr); // translation is done. Schedule the mem_req_event at the appropriate // cycle to send the timing memory request to ruby EventFunctionWrapper *mem_req_event = computeUnit->memPort[mp_index]->createMemReqEvent(new_pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr()); computeUnit->schedule(mem_req_event, curTick() + computeUnit->req_tick_latency); return true; } EventFunctionWrapper* ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt) { return new EventFunctionWrapper( [this, pkt]{ processMemReqEvent(pkt); }, "ComputeUnit memory request event", true); } EventFunctionWrapper* ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt) { return new EventFunctionWrapper( [this, pkt]{ processMemRespEvent(pkt); }, "ComputeUnit memory response event", true); } void ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt) { SenderState *sender_state = safe_cast(pkt->senderState); GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; ComputeUnit *compute_unit M5_VAR_USED = computeUnit; if (!(sendTimingReq(pkt))) { retries.push_back(std::make_pair(pkt, gpuDynInst)); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index, pkt->req->getPaddr()); } else { DPRINTF(GPUPort, "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data " "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, gpuDynInst->seqNum(), index, pkt->req->getPaddr()); } } const char* ComputeUnit::ScalarDataPort::MemReqEvent::description() const { return "ComputeUnit scalar memory request event"; } void ComputeUnit::ScalarDataPort::MemReqEvent::process() { SenderState *sender_state = safe_cast(pkt->senderState); GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; ComputeUnit *compute_unit M5_VAR_USED = scalarDataPort->computeUnit; if (!(scalarDataPort->sendTimingReq(pkt))) { scalarDataPort->retries.push_back(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, scalarDataPort->index, pkt->req->getPaddr()); } else { DPRINTF(GPUPort, "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data " "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, gpuDynInst->seqNum(), scalarDataPort->index, pkt->req->getPaddr()); } } /* * The initial translation request could have been rejected, * if queue is not Retry sending the translation * request. sendRetry() is called from the peer port whenever * a translation completes. */ void ComputeUnit::DTLBPort::recvReqRetry() { int len = retries.size(); DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n", computeUnit->cu_id, len); assert(len > 0); assert(isStalled()); // recvReqRetry is an indication that the resource on which this // port was stalling on is freed. So, remove the stall first unstallPort(); for (int i = 0; i < len; ++i) { PacketPtr pkt = retries.front(); Addr vaddr M5_VAR_USED = pkt->req->getVaddr(); DPRINTF(GPUTLB, "CU%d: retrying D-translaton for address%#x", vaddr); if (!sendTimingReq(pkt)) { // Stall port stallPort(); DPRINTF(GPUTLB, ": failed again\n"); break; } else { DPRINTF(GPUTLB, ": successful\n"); retries.pop_front(); } } } bool ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt) { assert(pkt->senderState); TheISA::GpuTLB::TranslationState *translation_state = safe_cast(pkt->senderState); // Page faults are not allowed fatal_if(!translation_state->tlbEntry, "Translation of vaddr %#x failed\n", pkt->req->getVaddr()); delete translation_state->tlbEntry; assert(!translation_state->ports.size()); pkt->senderState = translation_state->saved; delete translation_state; ScalarDTLBPort::SenderState *sender_state = safe_cast(pkt->senderState); GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; delete pkt->senderState; Wavefront *w M5_VAR_USED = gpuDynInst->wavefront(); DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received " "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId, w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr()); MemCmd mem_cmd; if (pkt->cmd == MemCmd::ReadResp) { mem_cmd = MemCmd::ReadReq; } else if (pkt->cmd == MemCmd::WriteResp) { mem_cmd = MemCmd::WriteReq; } else { fatal("Scalar DTLB receieved unexpected MemCmd response %s\n", pkt->cmd.toString()); } PacketPtr req_pkt = new Packet(pkt->req, mem_cmd); req_pkt->dataStatic(pkt->getPtr()); delete pkt; req_pkt->senderState = new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst); if (!computeUnit->scalarDataPort->sendTimingReq(req_pkt)) { computeUnit->scalarDataPort->retries.push_back(req_pkt); DPRINTF(GPUMem, "send scalar req failed for: %s\n", gpuDynInst->disassemble()); } else { DPRINTF(GPUMem, "send scalar req for: %s\n", gpuDynInst->disassemble()); } return true; } bool ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt) { Addr line M5_VAR_USED = pkt->req->getPaddr(); DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n", computeUnit->cu_id, pkt->req->getVaddr(), line); assert(pkt->senderState); // pop off the TLB translation state TheISA::GpuTLB::TranslationState *translation_state = safe_cast(pkt->senderState); bool success = translation_state->tlbEntry != nullptr; delete translation_state->tlbEntry; assert(!translation_state->ports.size()); pkt->senderState = translation_state->saved; delete translation_state; // use the original sender state to know how to close this transaction ITLBPort::SenderState *sender_state = safe_cast(pkt->senderState); // get the wavefront associated with this translation request Wavefront *wavefront = sender_state->wavefront; delete pkt->senderState; if (success) { // pkt is reused in fetch(), don't delete it here. However, we must // reset the command to be a request so that it can be sent through // the cu's master port assert(pkt->cmd == MemCmd::ReadResp); pkt->cmd = MemCmd::ReadReq; computeUnit->fetchStage.fetch(pkt, wavefront); } else { if (wavefront->dropFetch) { assert(wavefront->instructionBuffer.empty()); wavefront->dropFetch = false; } wavefront->pendingFetch = 0; } return true; } /* * The initial translation request could have been rejected, if * queue is not empty. Retry sending the translation * request. sendRetry() is called from the peer port whenever * a translation completes. */ void ComputeUnit::ITLBPort::recvReqRetry() { int len = retries.size(); DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", len); assert(len > 0); assert(isStalled()); // recvReqRetry is an indication that the resource on which this // port was stalling on is freed. So, remove the stall first unstallPort(); for (int i = 0; i < len; ++i) { PacketPtr pkt = retries.front(); Addr vaddr M5_VAR_USED = pkt->req->getVaddr(); DPRINTF(GPUTLB, "CU%d: retrying I-translaton for address%#x", vaddr); if (!sendTimingReq(pkt)) { stallPort(); // Stall port DPRINTF(GPUTLB, ": failed again\n"); break; } else { DPRINTF(GPUTLB, ": successful\n"); retries.pop_front(); } } } void ComputeUnit::regStats() { ClockedObject::regStats(); vALUInsts .name(name() + ".valu_insts") .desc("Number of vector ALU insts issued.") ; vALUInstsPerWF .name(name() + ".valu_insts_per_wf") .desc("The avg. number of vector ALU insts issued per-wavefront.") ; sALUInsts .name(name() + ".salu_insts") .desc("Number of scalar ALU insts issued.") ; sALUInstsPerWF .name(name() + ".salu_insts_per_wf") .desc("The avg. number of scalar ALU insts issued per-wavefront.") ; instCyclesVALU .name(name() + ".inst_cycles_valu") .desc("Number of cycles needed to execute VALU insts.") ; instCyclesSALU .name(name() + ".inst_cycles_salu") .desc("Number of cycles needed to execute SALU insts.") ; threadCyclesVALU .name(name() + ".thread_cycles_valu") .desc("Number of thread cycles used to execute vector ALU ops. " "Similar to instCyclesVALU but multiplied by the number of " "active threads.") ; vALUUtilization .name(name() + ".valu_utilization") .desc("Percentage of active vector ALU threads in a wave.") ; ldsNoFlatInsts .name(name() + ".lds_no_flat_insts") .desc("Number of LDS insts issued, not including FLAT " "accesses that resolve to LDS.") ; ldsNoFlatInstsPerWF .name(name() + ".lds_no_flat_insts_per_wf") .desc("The avg. number of LDS insts (not including FLAT " "accesses that resolve to LDS) per-wavefront.") ; flatVMemInsts .name(name() + ".flat_vmem_insts") .desc("The number of FLAT insts that resolve to vmem issued.") ; flatVMemInstsPerWF .name(name() + ".flat_vmem_insts_per_wf") .desc("The average number of FLAT insts that resolve to vmem " "issued per-wavefront.") ; flatLDSInsts .name(name() + ".flat_lds_insts") .desc("The number of FLAT insts that resolve to LDS issued.") ; flatLDSInstsPerWF .name(name() + ".flat_lds_insts_per_wf") .desc("The average number of FLAT insts that resolve to LDS " "issued per-wavefront.") ; vectorMemWrites .name(name() + ".vector_mem_writes") .desc("Number of vector mem write insts (excluding FLAT insts).") ; vectorMemWritesPerWF .name(name() + ".vector_mem_writes_per_wf") .desc("The average number of vector mem write insts " "(excluding FLAT insts) per-wavefront.") ; vectorMemReads .name(name() + ".vector_mem_reads") .desc("Number of vector mem read insts (excluding FLAT insts).") ; vectorMemReadsPerWF .name(name() + ".vector_mem_reads_per_wf") .desc("The avg. number of vector mem read insts (excluding " "FLAT insts) per-wavefront.") ; scalarMemWrites .name(name() + ".scalar_mem_writes") .desc("Number of scalar mem write insts.") ; scalarMemWritesPerWF .name(name() + ".scalar_mem_writes_per_wf") .desc("The average number of scalar mem write insts per-wavefront.") ; scalarMemReads .name(name() + ".scalar_mem_reads") .desc("Number of scalar mem read insts.") ; scalarMemReadsPerWF .name(name() + ".scalar_mem_reads_per_wf") .desc("The average number of scalar mem read insts per-wavefront.") ; vALUInstsPerWF = vALUInsts / completedWfs; sALUInstsPerWF = sALUInsts / completedWfs; vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100; ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs; flatVMemInstsPerWF = flatVMemInsts / completedWfs; flatLDSInstsPerWF = flatLDSInsts / completedWfs; vectorMemWritesPerWF = vectorMemWrites / completedWfs; vectorMemReadsPerWF = vectorMemReads / completedWfs; scalarMemWritesPerWF = scalarMemWrites / completedWfs; scalarMemReadsPerWF = scalarMemReads / completedWfs; vectorMemReadsPerKiloInst .name(name() + ".vector_mem_reads_per_kilo_inst") .desc("Number of vector mem reads per kilo-instruction") ; vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000; vectorMemWritesPerKiloInst .name(name() + ".vector_mem_writes_per_kilo_inst") .desc("Number of vector mem writes per kilo-instruction") ; vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000; vectorMemInstsPerKiloInst .name(name() + ".vector_mem_insts_per_kilo_inst") .desc("Number of vector mem insts per kilo-instruction") ; vectorMemInstsPerKiloInst = ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000; scalarMemReadsPerKiloInst .name(name() + ".scalar_mem_reads_per_kilo_inst") .desc("Number of scalar mem reads per kilo-instruction") ; scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000; scalarMemWritesPerKiloInst .name(name() + ".scalar_mem_writes_per_kilo_inst") .desc("Number of scalar mem writes per kilo-instruction") ; scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000; scalarMemInstsPerKiloInst .name(name() + ".scalar_mem_insts_per_kilo_inst") .desc("Number of scalar mem insts per kilo-instruction") ; scalarMemInstsPerKiloInst = ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000; instCyclesVMemPerSimd .init(numVectorALUs) .name(name() + ".inst_cycles_vector_memory") .desc("Number of cycles to send address, command, data from VRF to " "vector memory unit, per SIMD") ; instCyclesScMemPerSimd .init(numVectorALUs) .name(name() + ".inst_cycles_scalar_memory") .desc("Number of cycles to send address, command, data from SRF to " "scalar memory unit, per SIMD") ; instCyclesLdsPerSimd .init(numVectorALUs) .name(name() + ".inst_cycles_lds") .desc("Number of cycles to send address, command, data from VRF to " "LDS unit, per SIMD") ; globalReads .name(name() + ".global_mem_reads") .desc("Number of reads to the global segment") ; globalWrites .name(name() + ".global_mem_writes") .desc("Number of writes to the global segment") ; globalMemInsts .name(name() + ".global_mem_insts") .desc("Number of memory instructions sent to the global segment") ; globalMemInsts = globalReads + globalWrites; argReads .name(name() + ".arg_reads") .desc("Number of reads to the arg segment") ; argWrites .name(name() + ".arg_writes") .desc("NUmber of writes to the arg segment") ; argMemInsts .name(name() + ".arg_mem_insts") .desc("Number of memory instructions sent to the arg segment") ; argMemInsts = argReads + argWrites; spillReads .name(name() + ".spill_reads") .desc("Number of reads to the spill segment") ; spillWrites .name(name() + ".spill_writes") .desc("Number of writes to the spill segment") ; spillMemInsts .name(name() + ".spill_mem_insts") .desc("Number of memory instructions sent to the spill segment") ; spillMemInsts = spillReads + spillWrites; groupReads .name(name() + ".group_reads") .desc("Number of reads to the group segment") ; groupWrites .name(name() + ".group_writes") .desc("Number of writes to the group segment") ; groupMemInsts .name(name() + ".group_mem_insts") .desc("Number of memory instructions sent to the group segment") ; groupMemInsts = groupReads + groupWrites; privReads .name(name() + ".private_reads") .desc("Number of reads to the private segment") ; privWrites .name(name() + ".private_writes") .desc("Number of writes to the private segment") ; privMemInsts .name(name() + ".private_mem_insts") .desc("Number of memory instructions sent to the private segment") ; privMemInsts = privReads + privWrites; readonlyReads .name(name() + ".readonly_reads") .desc("Number of reads to the readonly segment") ; readonlyWrites .name(name() + ".readonly_writes") .desc("Number of memory instructions sent to the readonly segment") ; readonlyMemInsts .name(name() + ".readonly_mem_insts") .desc("Number of memory instructions sent to the readonly segment") ; readonlyMemInsts = readonlyReads + readonlyWrites; kernargReads .name(name() + ".kernarg_reads") .desc("Number of reads sent to the kernarg segment") ; kernargWrites .name(name() + ".kernarg_writes") .desc("Number of memory instructions sent to the kernarg segment") ; kernargMemInsts .name(name() + ".kernarg_mem_insts") .desc("Number of memory instructions sent to the kernarg segment") ; kernargMemInsts = kernargReads + kernargWrites; tlbCycles .name(name() + ".tlb_cycles") .desc("total number of cycles for all uncoalesced requests") ; tlbRequests .name(name() + ".tlb_requests") .desc("number of uncoalesced requests") ; tlbLatency .name(name() + ".avg_translation_latency") .desc("Avg. translation latency for data translations") ; tlbLatency = tlbCycles / tlbRequests; hitsPerTLBLevel .init(4) .name(name() + ".TLB_hits_distribution") .desc("TLB hits distribution (0 for page table, x for Lx-TLB") ; // fixed number of TLB levels for (int i = 0; i < 4; ++i) { if (!i) hitsPerTLBLevel.subname(i,"page_table"); else hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i)); } execRateDist .init(0, 10, 2) .name(name() + ".inst_exec_rate") .desc("Instruction Execution Rate: Number of executed vector " "instructions per cycle") ; ldsBankConflictDist .init(0, wfSize(), 2) .name(name() + ".lds_bank_conflicts") .desc("Number of bank conflicts per LDS memory packet") ; ldsBankAccesses .name(name() + ".lds_bank_access_cnt") .desc("Total number of LDS bank accesses") ; pageDivergenceDist // A wavefront can touch up to N pages per memory instruction where // N is equal to the wavefront size // The number of pages per bin can be configured (here it's 4). .init(1, wfSize(), 4) .name(name() + ".page_divergence_dist") .desc("pages touched per wf (over all mem. instr.)") ; controlFlowDivergenceDist .init(1, wfSize(), 4) .name(name() + ".warp_execution_dist") .desc("number of lanes active per instruction (oval all instructions)") ; activeLanesPerGMemInstrDist .init(1, wfSize(), 4) .name(name() + ".gmem_lanes_execution_dist") .desc("number of active lanes per global memory instruction") ; activeLanesPerLMemInstrDist .init(1, wfSize(), 4) .name(name() + ".lmem_lanes_execution_dist") .desc("number of active lanes per local memory instruction") ; numInstrExecuted .name(name() + ".num_instr_executed") .desc("number of instructions executed") ; numVecOpsExecuted .name(name() + ".num_vec_ops_executed") .desc("number of vec ops executed (e.g. WF size/inst)") ; numVecOpsExecutedF16 .name(name() + ".num_vec_ops_f16_executed") .desc("number of f16 vec ops executed (e.g. WF size/inst)") ; numVecOpsExecutedF32 .name(name() + ".num_vec_ops_f32_executed") .desc("number of f32 vec ops executed (e.g. WF size/inst)") ; numVecOpsExecutedF64 .name(name() + ".num_vec_ops_f64_executed") .desc("number of f64 vec ops executed (e.g. WF size/inst)") ; numVecOpsExecutedFMA16 .name(name() + ".num_vec_ops_fma16_executed") .desc("number of fma16 vec ops executed (e.g. WF size/inst)") ; numVecOpsExecutedFMA32 .name(name() + ".num_vec_ops_fma32_executed") .desc("number of fma32 vec ops executed (e.g. WF size/inst)") ; numVecOpsExecutedFMA64 .name(name() + ".num_vec_ops_fma64_executed") .desc("number of fma64 vec ops executed (e.g. WF size/inst)") ; numVecOpsExecutedMAD16 .name(name() + ".num_vec_ops_mad16_executed") .desc("number of mad16 vec ops executed (e.g. WF size/inst)") ; numVecOpsExecutedMAD32 .name(name() + ".num_vec_ops_mad32_executed") .desc("number of mad32 vec ops executed (e.g. WF size/inst)") ; numVecOpsExecutedMAD64 .name(name() + ".num_vec_ops_mad64_executed") .desc("number of mad64 vec ops executed (e.g. WF size/inst)") ; numVecOpsExecutedMAC16 .name(name() + ".num_vec_ops_mac16_executed") .desc("number of mac16 vec ops executed (e.g. WF size/inst)") ; numVecOpsExecutedMAC32 .name(name() + ".num_vec_ops_mac32_executed") .desc("number of mac32 vec ops executed (e.g. WF size/inst)") ; numVecOpsExecutedMAC64 .name(name() + ".num_vec_ops_mac64_executed") .desc("number of mac64 vec ops executed (e.g. WF size/inst)") ; numVecOpsExecutedTwoOpFP .name(name() + ".num_vec_ops_two_op_fp_executed") .desc("number of two op FP vec ops executed (e.g. WF size/inst)") ; totalCycles .name(name() + ".num_total_cycles") .desc("number of cycles the CU ran for") ; ipc .name(name() + ".ipc") .desc("Instructions per cycle (this CU only)") ; vpc .name(name() + ".vpc") .desc("Vector Operations per cycle (this CU only)") ; vpc_f16 .name(name() + ".vpc_f16") .desc("F16 Vector Operations per cycle (this CU only)") ; vpc_f32 .name(name() + ".vpc_f32") .desc("F32 Vector Operations per cycle (this CU only)") ; vpc_f64 .name(name() + ".vpc_f64") .desc("F64 Vector Operations per cycle (this CU only)") ; numALUInstsExecuted .name(name() + ".num_alu_insts_executed") .desc("Number of dynamic non-GM memory insts executed") ; wgBlockedDueLdsAllocation .name(name() + ".wg_blocked_due_lds_alloc") .desc("Workgroup blocked due to LDS capacity") ; ipc = numInstrExecuted / totalCycles; vpc = numVecOpsExecuted / totalCycles; vpc_f16 = numVecOpsExecutedF16 / totalCycles; vpc_f32 = numVecOpsExecutedF32 / totalCycles; vpc_f64 = numVecOpsExecutedF64 / totalCycles; numTimesWgBlockedDueVgprAlloc .name(name() + ".times_wg_blocked_due_vgpr_alloc") .desc("Number of times WGs are blocked due to VGPR allocation per " "SIMD") ; numTimesWgBlockedDueSgprAlloc .name(name() + ".times_wg_blocked_due_sgpr_alloc") .desc("Number of times WGs are blocked due to SGPR allocation per " "SIMD") ; dynamicGMemInstrCnt .name(name() + ".global_mem_instr_cnt") .desc("dynamic non-flat global memory instruction count") ; dynamicFlatMemInstrCnt .name(name() + ".flat_global_mem_instr_cnt") .desc("dynamic flat global memory instruction count") ; dynamicLMemInstrCnt .name(name() + ".local_mem_instr_cnt") .desc("dynamic local memory intruction count") ; numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt - dynamicLMemInstrCnt; completedWfs .name(name() + ".num_completed_wfs") .desc("number of completed wavefronts") ; completedWGs .name(name() + ".num_completed_wgs") .desc("number of completed workgroups") ; numCASOps .name(name() + ".num_CAS_ops") .desc("number of compare and swap operations") ; numFailedCASOps .name(name() + ".num_failed_CAS_ops") .desc("number of compare and swap operations that failed") ; headTailLatency .init(0, 1000000, 10000) .name(name() + ".head_tail_latency") .desc("ticks between first and last cache block arrival at coalescer") .flags(Stats::pdf | Stats::oneline) ; waveLevelParallelism .init(0, shader->n_wf * numVectorALUs, 1) .name(name() + ".wlp") .desc("wave level parallelism: count of active waves at wave launch") ; instInterleave .init(numVectorALUs, 0, 20, 1) .name(name() + ".interleaving") .desc("Measure of instruction interleaving per SIMD") ; // register stats of pipeline stages fetchStage.regStats(); scoreboardCheckStage.regStats(); scheduleStage.regStats(); execStage.regStats(); // register stats of memory pipelines globalMemoryPipe.regStats(); localMemoryPipe.regStats(); scalarMemoryPipe.regStats(); registerManager->regStats(); } void ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst) { if (gpuDynInst->isScalar()) { if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) { sALUInsts++; instCyclesSALU++; } else if (gpuDynInst->isLoad()) { scalarMemReads++; } else if (gpuDynInst->isStore()) { scalarMemWrites++; } } else { if (gpuDynInst->isALU()) { shader->total_valu_insts++; if (shader->total_valu_insts == shader->max_valu_insts) { exitSimLoop("max vALU insts"); } vALUInsts++; instCyclesVALU++; threadCyclesVALU += gpuDynInst->wavefront()->execMask().count(); } else if (gpuDynInst->isFlat()) { if (gpuDynInst->isLocalMem()) { flatLDSInsts++; } else { flatVMemInsts++; } } else if (gpuDynInst->isLocalMem()) { ldsNoFlatInsts++; } else if (gpuDynInst->isLoad()) { vectorMemReads++; } else if (gpuDynInst->isStore()) { vectorMemWrites++; } if (gpuDynInst->isLoad()) { switch (gpuDynInst->executedAs()) { case Enums::SC_SPILL: spillReads++; break; case Enums::SC_GLOBAL: globalReads++; break; case Enums::SC_GROUP: groupReads++; break; case Enums::SC_PRIVATE: privReads++; break; case Enums::SC_READONLY: readonlyReads++; break; case Enums::SC_KERNARG: kernargReads++; break; case Enums::SC_ARG: argReads++; break; case Enums::SC_NONE: /** * this case can occur for flat mem insts * who execute with EXEC = 0 */ break; default: fatal("%s has no valid segment\n", gpuDynInst->disassemble()); break; } } else if (gpuDynInst->isStore()) { switch (gpuDynInst->executedAs()) { case Enums::SC_SPILL: spillWrites++; break; case Enums::SC_GLOBAL: globalWrites++; break; case Enums::SC_GROUP: groupWrites++; break; case Enums::SC_PRIVATE: privWrites++; break; case Enums::SC_READONLY: readonlyWrites++; break; case Enums::SC_KERNARG: kernargWrites++; break; case Enums::SC_ARG: argWrites++; break; case Enums::SC_NONE: /** * this case can occur for flat mem insts * who execute with EXEC = 0 */ break; default: fatal("%s has no valid segment\n", gpuDynInst->disassemble()); break; } } } } void ComputeUnit::updatePageDivergenceDist(Addr addr) { Addr virt_page_addr = roundDown(addr, TheISA::PageBytes); if (!pagesTouched.count(virt_page_addr)) pagesTouched[virt_page_addr] = 1; else pagesTouched[virt_page_addr]++; } void ComputeUnit::CUExitCallback::process() { if (computeUnit->countPages) { std::ostream *page_stat_file = simout.create(computeUnit->name().c_str())->stream(); *page_stat_file << "page, wavefront accesses, workitem accesses" << std::endl; for (auto iter : computeUnit->pageAccesses) { *page_stat_file << std::hex << iter.first << ","; *page_stat_file << std::dec << iter.second.first << ","; *page_stat_file << std::dec << iter.second.second << std::endl; } } } bool ComputeUnit::isDone() const { for (int i = 0; i < numVectorALUs; ++i) { if (!isVectorAluIdle(i)) { return false; } } // TODO: FIXME if more than 1 of any memory pipe supported if (!srfToScalarMemPipeBus.rdy()) { return false; } if (!vrfToGlobalMemPipeBus.rdy()) { return false; } if (!vrfToLocalMemPipeBus.rdy()) { return false; } if (!globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy() || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() || !glbMemToVrfBus.rdy() || !scalarMemToSrfBus.rdy()) { return false; } return true; } int32_t ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const { return lds.getRefCounter(dispatchId, wgId); } bool ComputeUnit::isVectorAluIdle(uint32_t simdId) const { assert(simdId < numVectorALUs); for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){ if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) { return false; } } return true; } /** * send a general request to the LDS * make sure to look at the return value here as your request might be * NACK'd and returning false means that you have to have some backup plan */ bool ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst) { // this is just a request to carry the GPUDynInstPtr // back and forth RequestPtr newRequest = std::make_shared(); newRequest->setPaddr(0x0); // ReadReq is not evaluted by the LDS but the Packet ctor requires this PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq); // This is the SenderState needed upon return newPacket->senderState = new LDSPort::SenderState(gpuDynInst); return ldsPort->sendTimingReq(newPacket); } /** * get the result of packets sent to the LDS when they return */ bool ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet) { const ComputeUnit::LDSPort::SenderState *senderState = dynamic_cast(packet->senderState); fatal_if(!senderState, "did not get the right sort of sender state"); GPUDynInstPtr gpuDynInst = senderState->getMemInst(); delete packet->senderState; delete packet; computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst); return true; } /** * attempt to send this packet, either the port is already stalled, the request * is nack'd and must stall or the request goes through * when a request cannot be sent, add it to the retries queue */ bool ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt) { ComputeUnit::LDSPort::SenderState *sender_state = dynamic_cast(pkt->senderState); fatal_if(!sender_state, "packet without a valid sender state"); GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst(); if (isStalled()) { fatal_if(retries.empty(), "must have retries waiting to be stalled"); retries.push(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId); return false; } else if (!MasterPort::sendTimingReq(pkt)) { // need to stall the LDS port until a recvReqRetry() is received // this indicates that there is more space stallPort(); retries.push(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, pkt->req->getPaddr()); return false; } else { DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, pkt->req->getPaddr()); return true; } } /** * the bus is telling the port that there is now space so retrying stalled * requests should work now * this allows the port to have a request be nack'd and then have the receiver * say when there is space, rather than simply retrying the send every cycle */ void ComputeUnit::LDSPort::recvReqRetry() { auto queueSize = retries.size(); DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n", computeUnit->cu_id, queueSize); fatal_if(queueSize < 1, "why was there a recvReqRetry() with no pending reqs?"); fatal_if(!isStalled(), "recvReqRetry() happened when the port was not stalled"); unstallPort(); while (!retries.empty()) { PacketPtr packet = retries.front(); DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id); if (!MasterPort::sendTimingReq(packet)) { // Stall port stallPort(); DPRINTF(GPUPort, ": LDS send failed again\n"); break; } else { DPRINTF(GPUTLB, ": LDS send successful\n"); retries.pop(); } } }