/* * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "gpu-compute/compute_unit.hh" #include #include "arch/amdgpu/common/gpu_translation_state.hh" #include "arch/amdgpu/common/tlb.hh" #include "base/output.hh" #include "debug/GPUDisp.hh" #include "debug/GPUExec.hh" #include "debug/GPUFetch.hh" #include "debug/GPUMem.hh" #include "debug/GPUPort.hh" #include "debug/GPUPrefetch.hh" #include "debug/GPUReg.hh" #include "debug/GPURename.hh" #include "debug/GPUSync.hh" #include "debug/GPUTLB.hh" #include "gpu-compute/dispatcher.hh" #include "gpu-compute/gpu_command_processor.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/gpu_static_inst.hh" #include "gpu-compute/scalar_register_file.hh" #include "gpu-compute/shader.hh" #include "gpu-compute/simple_pool_manager.hh" #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" #include "mem/page_table.hh" #include "sim/process.hh" #include "sim/sim_exit.hh" namespace gem5 { ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p), numVectorGlobalMemUnits(p.num_global_mem_pipes), numVectorSharedMemUnits(p.num_shared_mem_pipes), numScalarMemUnits(p.num_scalar_mem_pipes), numVectorALUs(p.num_SIMDs), numScalarALUs(p.num_scalar_cores), vrfToCoalescerBusWidth(p.vrf_to_coalescer_bus_width), coalescerToVrfBusWidth(p.coalescer_to_vrf_bus_width), registerManager(p.register_manager), fetchStage(p, *this), scoreboardCheckStage(p, *this, scoreboardCheckToSchedule), scheduleStage(p, *this, scoreboardCheckToSchedule, scheduleToExecute), execStage(p, *this, scheduleToExecute), globalMemoryPipe(p, *this), localMemoryPipe(p, *this), scalarMemoryPipe(p, *this), tickEvent([this]{ exec(); }, "Compute unit tick event", false, Event::CPU_Tick_Pri), cu_id(p.cu_id), vrf(p.vector_register_file), srf(p.scalar_register_file), simdWidth(p.simd_width), spBypassPipeLength(p.spbypass_pipe_length), dpBypassPipeLength(p.dpbypass_pipe_length), scalarPipeStages(p.scalar_pipe_length), operandNetworkLength(p.operand_network_length), issuePeriod(p.issue_period), vrf_gm_bus_latency(p.vrf_gm_bus_latency), srf_scm_bus_latency(p.srf_scm_bus_latency), vrf_lm_bus_latency(p.vrf_lm_bus_latency), perLaneTLB(p.perLaneTLB), prefetchDepth(p.prefetch_depth), prefetchStride(p.prefetch_stride), prefetchType(p.prefetch_prev_type), debugSegFault(p.debugSegFault), functionalTLB(p.functionalTLB), localMemBarrier(p.localMemBarrier), countPages(p.countPages), req_tick_latency(p.mem_req_latency * p.clk_domain->clockPeriod()), resp_tick_latency(p.mem_resp_latency * p.clk_domain->clockPeriod()), _requestorId(p.system->getRequestorId(this, "ComputeUnit")), lds(*p.localDataStore), gmTokenPort(name() + ".gmTokenPort", this), ldsPort(csprintf("%s-port", name()), this), scalarDataPort(csprintf("%s-port", name()), this), scalarDTLBPort(csprintf("%s-port", name()), this), sqcPort(csprintf("%s-port", name()), this), sqcTLBPort(csprintf("%s-port", name()), this), _cacheLineSize(p.system->cacheLineSize()), _numBarrierSlots(p.num_barrier_slots), globalSeqNum(0), wavefrontSize(p.wf_size), scoreboardCheckToSchedule(p), scheduleToExecute(p), stats(this, p.n_wf) { // This is not currently supported and would require adding more handling // for system vs. device memory requests on the functional paths, so we // fatal immediately in the constructor if this configuration is seen. fatal_if(functionalTLB && FullSystem, "Functional TLB not supported in full-system GPU simulation"); /** * This check is necessary because std::bitset only provides conversion * to unsigned long or unsigned long long via to_ulong() or to_ullong(). * there are a few places in the code where to_ullong() is used, however * if wavefrontSize is larger than a value the host can support then * bitset will throw a runtime exception. We should remove all use of * to_long() or to_ullong() so we can have wavefrontSize greater than 64b, * however until that is done this assert is required. */ fatal_if(p.wf_size > std::numeric_limits::digits || p.wf_size <= 0, "WF size is larger than the host can support"); fatal_if(!isPowerOf2(wavefrontSize), "Wavefront size should be a power of 2"); // calculate how many cycles a vector load or store will need to transfer // its data over the corresponding buses numCyclesPerStoreTransfer = (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) / (double)vrfToCoalescerBusWidth); numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t)) / coalescerToVrfBusWidth; // Initialization: all WF slots are assumed STOPPED idleWfs = p.n_wf * numVectorALUs; lastVaddrWF.resize(numVectorALUs); wfList.resize(numVectorALUs); wfBarrierSlots.resize(p.num_barrier_slots, WFBarrier()); for (int i = 0; i < p.num_barrier_slots; ++i) { freeBarrierIds.insert(i); } for (int j = 0; j < numVectorALUs; ++j) { lastVaddrWF[j].resize(p.n_wf); for (int i = 0; i < p.n_wf; ++i) { lastVaddrWF[j][i].resize(wfSize()); wfList[j].push_back(p.wavefronts[j * p.n_wf + i]); wfList[j][i]->setParent(this); for (int k = 0; k < wfSize(); ++k) { lastVaddrWF[j][i][k] = 0; } } } lastVaddrSimd.resize(numVectorALUs); for (int i = 0; i < numVectorALUs; ++i) { lastVaddrSimd[i].resize(wfSize(), 0); } lastVaddrCU.resize(wfSize()); lds.setParent(this); if (p.execPolicy == "OLDEST-FIRST") { exec_policy = EXEC_POLICY::OLDEST; } else if (p.execPolicy == "ROUND-ROBIN") { exec_policy = EXEC_POLICY::RR; } else { fatal("Invalid WF execution policy (CU)\n"); } for (int i = 0; i < p.port_memory_port_connection_count; ++i) { memPort.emplace_back(csprintf("%s-port%d", name(), i), this, i); } for (int i = 0; i < p.port_translation_port_connection_count; ++i) { tlbPort.emplace_back(csprintf("%s-port%d", name(), i), this, i); } // Setup tokens for response ports. The number of tokens in memPortTokens // is the total token count for the entire vector port (i.e., this CU). memPortTokens = new TokenManager(p.max_cu_tokens); registerExitCallback([this]() { exitCallback(); }); lastExecCycle.resize(numVectorALUs, 0); for (int i = 0; i < vrf.size(); ++i) { vrf[i]->setParent(this); } for (int i = 0; i < srf.size(); ++i) { srf[i]->setParent(this); } numVecRegsPerSimd = vrf[0]->numRegs(); numScalarRegsPerSimd = srf[0]->numRegs(); registerManager->setParent(this); activeWaves = 0; instExecPerSimd.resize(numVectorALUs, 0); // Calculate the number of bits to address a cache line panic_if(!isPowerOf2(_cacheLineSize), "Cache line size should be a power of two."); cacheLineBits = floorLog2(_cacheLineSize); } ComputeUnit::~ComputeUnit() { // Delete wavefront slots for (int j = 0; j < numVectorALUs; ++j) { for (int i = 0; i < shader->n_wf; ++i) { delete wfList[j][i]; } lastVaddrSimd[j].clear(); } lastVaddrCU.clear(); } int ComputeUnit::numExeUnits() const { return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits + numVectorSharedMemUnits + numScalarMemUnits; } // index into readyList of the first memory unit int ComputeUnit::firstMemUnit() const { return numVectorALUs + numScalarALUs; } // index into readyList of the last memory unit int ComputeUnit::lastMemUnit() const { return numExeUnits() - 1; } // index into scalarALUs vector of SALU used by the wavefront int ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const { if (numScalarALUs == 1) { return 0; } else { return w->simdId % numScalarALUs; } } // index into readyList of Scalar ALU unit used by wavefront int ComputeUnit::mapWaveToScalarAluGlobalIdx(Wavefront *w) const { return numVectorALUs + mapWaveToScalarAlu(w); } // index into readyList of Global Memory unit used by wavefront int ComputeUnit::mapWaveToGlobalMem(Wavefront *w) const { // TODO: FIXME if more than 1 GM pipe supported return numVectorALUs + numScalarALUs; } // index into readyList of Local Memory unit used by wavefront int ComputeUnit::mapWaveToLocalMem(Wavefront *w) const { // TODO: FIXME if more than 1 LM pipe supported return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits; } // index into readyList of Scalar Memory unit used by wavefront int ComputeUnit::mapWaveToScalarMem(Wavefront *w) const { // TODO: FIXME if more than 1 ScM pipe supported return numVectorALUs + numScalarALUs + numVectorGlobalMemUnits + numVectorSharedMemUnits; } void ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task) { w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs()); w->workGroupSz[0] = task->wgSize(0); w->workGroupSz[1] = task->wgSize(1); w->workGroupSz[2] = task->wgSize(2); w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2]; w->gridSz[0] = task->gridSize(0); w->gridSz[1] = task->gridSize(1); w->gridSz[2] = task->gridSize(2); w->computeActualWgSz(task); } void ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, HSAQueueEntry *task, int bar_id, bool fetchContext) { static int _n_wave = 0; VectorMask init_mask; init_mask.reset(); for (int k = 0; k < wfSize(); ++k) { if (k + waveId * wfSize() < w->actualWgSzTotal) init_mask[k] = 1; } w->execMask() = init_mask; w->kernId = task->dispatchId(); w->wfId = waveId; w->initMask = init_mask.to_ullong(); if (bar_id > WFBarrier::InvalidID) { w->barrierId(bar_id); } else { assert(!w->hasBarrier()); } for (int k = 0; k < wfSize(); ++k) { w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0]; w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) % w->actualWgSz[1]; w->workItemId[2][k] = (k + waveId * wfSize()) / (w->actualWgSz[0] * w->actualWgSz[1]); w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] * w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] + w->workItemId[0][k]; } // WG state w->wgId = task->globalWgId(); w->dispatchId = task->dispatchId(); w->workGroupId[0] = w->wgId % task->numWg(0); w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1); w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1)); // set the wavefront context to have a pointer to this section of the LDS w->ldsChunk = ldsChunk; [[maybe_unused]] int32_t refCount = lds.increaseRefCounter(w->dispatchId, w->wgId); DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n", cu_id, w->wgId, refCount); w->instructionBuffer.clear(); if (w->pendingFetch) w->dropFetch = true; DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: " "WF[%d][%d]. Ref cnt:%d\n", _n_wave, w->barrierId(), cu_id, w->simdId, w->wfSlotId, refCount); w->initRegState(task, w->actualWgSzTotal); w->start(_n_wave++, task->codeAddr()); stats.waveLevelParallelism.sample(activeWaves); activeWaves++; } /** * trigger invalidate operation in the cu * * req: request initialized in shader, carrying the invlidate flags */ void ComputeUnit::doInvalidate(RequestPtr req, int kernId){ GPUDynInstPtr gpuDynInst = std::make_shared(this, nullptr, new KernelLaunchStaticInst(), getAndIncSeqNum()); // kern_id will be used in inv responses gpuDynInst->kern_id = kernId; // update contextId field req->setContext(gpuDynInst->wfDynId); injectGlobalMemFence(gpuDynInst, true, req); } /** * trigger flush operation in the cu * * gpuDynInst: inst passed to the request */ void ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst) { injectGlobalMemFence(gpuDynInst, true); } // reseting SIMD register pools // I couldn't think of any other place and // I think it is needed in my implementation void ComputeUnit::resetRegisterPool() { for (int i=0; ivrfPoolMgrs[i]->resetRegion(numVecRegsPerSimd); registerManager->srfPoolMgrs[i]->resetRegion(numScalarRegsPerSimd); } } void ComputeUnit::dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg) { // If we aren't ticking, start it up! if (!tickEvent.scheduled()) { DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id); schedule(tickEvent, nextCycle()); } // the kernel's invalidate must have finished before any wg dispatch assert(task->isInvDone()); // reserve the LDS capacity allocated to the work group // disambiguated by the dispatch ID and workgroup ID, which should be // globally unique LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(), task->globalWgId(), task->ldsSize()); panic_if(!ldsChunk, "was not able to reserve space for this WG"); // calculate the number of 32-bit vector registers required // by each work item int vregDemand = task->numVectorRegs(); int sregDemand = task->numScalarRegs(); int wave_id = 0; int barrier_id = WFBarrier::InvalidID; /** * If this WG only has one WF it will not consume any barrier * resources because it has no need of them. */ if (num_wfs_in_wg > 1) { /** * Find a free barrier slot for this WG. Each WF in the WG will * receive the same barrier ID. */ barrier_id = getFreeBarrierId(); auto &wf_barrier = barrierSlot(barrier_id); assert(!wf_barrier.maxBarrierCnt()); assert(!wf_barrier.numAtBarrier()); wf_barrier.setMaxBarrierCnt(num_wfs_in_wg); DPRINTF(GPUSync, "CU[%d] - Dispatching WG with barrier Id%d. " "%d waves using this barrier.\n", cu_id, barrier_id, num_wfs_in_wg); } // Assign WFs according to numWfsToSched vector, which is computed by // hasDispResources() for (int j = 0; j < shader->n_wf; ++j) { for (int i = 0; i < numVectorALUs; ++i) { Wavefront *w = wfList[i][j]; // Check if this wavefront slot is available and there are WFs // remaining to be dispatched to current SIMD: // WF slot must be stopped and not waiting // for a release to complete S_RETURNING if (w->getStatus() == Wavefront::S_STOPPED && numWfsToSched[i] > 0) { // decrement number of WFs awaiting dispatch to current SIMD numWfsToSched[i] -= 1; fillKernelState(w, task); DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] " "vregDemand[%d] sregDemand[%d]\n", i, j, w->wfDynId, vregDemand, sregDemand); registerManager->allocateRegisters(w, vregDemand, sregDemand); startWavefront(w, wave_id, ldsChunk, task, barrier_id); ++wave_id; } } } } void ComputeUnit::insertInPipeMap(Wavefront *w) { panic_if(w->instructionBuffer.empty(), "Instruction Buffer of WF%d can't be empty", w->wgId); GPUDynInstPtr ii = w->instructionBuffer.front(); pipeMap.emplace(ii->seqNum()); } void ComputeUnit::deleteFromPipeMap(Wavefront *w) { panic_if(w->instructionBuffer.empty(), "Instruction Buffer of WF%d can't be empty", w->wgId); GPUDynInstPtr ii = w->instructionBuffer.front(); // delete the dynamic instruction from the pipeline map auto it = pipeMap.find(ii->seqNum()); panic_if(it == pipeMap.end(), "Pipeline Map is empty\n"); pipeMap.erase(it); } bool ComputeUnit::hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg) { // compute true size of workgroup (after clamping to grid size) int trueWgSize[HSAQueueEntry::MAX_DIM]; int trueWgSizeTotal = 1; for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) { trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) - task->wgId(d) * task->wgSize(d)); trueWgSizeTotal *= trueWgSize[d]; DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]); } DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal); // calculate the number of WFs in this WG int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize(); num_wfs_in_wg = numWfs; bool barrier_avail = true; if (numWfs > 1 && !freeBarrierIds.size()) { barrier_avail = false; } // calculate the number of 32-bit vector registers required by each // work item of the work group int vregDemandPerWI = task->numVectorRegs(); // calculate the number of 32-bit scalar registers required by each // work item of the work group int sregDemandPerWI = task->numScalarRegs(); // check if the total number of VGPRs snd SGPRs required by all WFs // of the WG fit in the VRFs of all SIMD units and the CU's SRF panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd), "WG with %d WFs and %d VGPRs per WI can not be allocated to CU " "that has %d VGPRs\n", numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd); panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd, "WG with %d WFs and %d SGPRs per WI can not be scheduled to CU " "with %d SGPRs\n", numWfs, sregDemandPerWI, numScalarRegsPerSimd); // number of WF slots that are not occupied int freeWfSlots = 0; // number of Wfs from WG that were successfully mapped to a SIMD int numMappedWfs = 0; numWfsToSched.clear(); numWfsToSched.resize(numVectorALUs, 0); // attempt to map WFs to the SIMDs, based on WF slot availability // and register file availability for (int j = 0; j < shader->n_wf; ++j) { for (int i = 0; i < numVectorALUs; ++i) { if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) { ++freeWfSlots; // check if current WF will fit onto current SIMD/VRF // if all WFs have not yet been mapped to the SIMDs if (numMappedWfs < numWfs && registerManager->canAllocateSgprs(i, numWfsToSched[i] + 1, sregDemandPerWI) && registerManager->canAllocateVgprs(i, numWfsToSched[i] + 1, vregDemandPerWI)) { numWfsToSched[i]++; numMappedWfs++; } } } } // check that the number of mapped WFs is not greater // than the actual number of WFs assert(numMappedWfs <= numWfs); bool vregAvail = true; bool sregAvail = true; // if a WF to SIMD mapping was not found, find the limiting resource if (numMappedWfs < numWfs) { for (int j = 0; j < numVectorALUs; ++j) { // find if there are enough free VGPRs in the SIMD's VRF // to accomodate the WFs of the new WG that would be mapped // to this SIMD unit vregAvail &= registerManager-> canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI); // find if there are enough free SGPRs in the SIMD's SRF // to accomodate the WFs of the new WG that would be mapped // to this SIMD unit sregAvail &= registerManager-> canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI); } } DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \ VGPR Availability = %d, SGPR Availability = %d\n", freeWfSlots, numMappedWfs, vregAvail, sregAvail); if (!vregAvail) { ++stats.numTimesWgBlockedDueVgprAlloc; } if (!sregAvail) { ++stats.numTimesWgBlockedDueSgprAlloc; } // Return true if enough WF slots to submit workgroup and if there are // enough VGPRs to schedule all WFs to their SIMD units bool ldsAvail = lds.canReserve(task->ldsSize()); if (!ldsAvail) { stats.wgBlockedDueLdsAllocation++; } if (!barrier_avail) { stats.wgBlockedDueBarrierAllocation++; } // Return true if the following are all true: // (a) all WFs of the WG were mapped to free WF slots // (b) there are enough VGPRs to schedule all WFs to their SIMD units // (c) there are enough SGPRs on the CU to schedule all WFs // (d) there is enough space in LDS to allocate for all WFs bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail && ldsAvail && barrier_avail; return can_dispatch; } int ComputeUnit::numYetToReachBarrier(int bar_id) { auto &wf_barrier = barrierSlot(bar_id); return wf_barrier.numYetToReachBarrier(); } bool ComputeUnit::allAtBarrier(int bar_id) { auto &wf_barrier = barrierSlot(bar_id); return wf_barrier.allAtBarrier(); } void ComputeUnit::incNumAtBarrier(int bar_id) { auto &wf_barrier = barrierSlot(bar_id); wf_barrier.incNumAtBarrier(); } int ComputeUnit::numAtBarrier(int bar_id) { auto &wf_barrier = barrierSlot(bar_id); return wf_barrier.numAtBarrier(); } int ComputeUnit::maxBarrierCnt(int bar_id) { auto &wf_barrier = barrierSlot(bar_id); return wf_barrier.maxBarrierCnt(); } void ComputeUnit::resetBarrier(int bar_id) { auto &wf_barrier = barrierSlot(bar_id); wf_barrier.reset(); } void ComputeUnit::decMaxBarrierCnt(int bar_id) { auto &wf_barrier = barrierSlot(bar_id); wf_barrier.decMaxBarrierCnt(); } void ComputeUnit::releaseBarrier(int bar_id) { auto &wf_barrier = barrierSlot(bar_id); wf_barrier.release(); freeBarrierIds.insert(bar_id); } void ComputeUnit::releaseWFsFromBarrier(int bar_id) { for (int i = 0; i < numVectorALUs; ++i) { for (int j = 0; j < shader->n_wf; ++j) { Wavefront *wf = wfList[i][j]; if (wf->barrierId() == bar_id) { assert(wf->getStatus() == Wavefront::S_BARRIER); wf->setStatus(Wavefront::S_RUNNING); } } } } // Execute one clock worth of work on the ComputeUnit. void ComputeUnit::exec() { // process reads and writes in the RFs for (auto &vecRegFile : vrf) { vecRegFile->exec(); } for (auto &scRegFile : srf) { scRegFile->exec(); } // Execute pipeline stages in reverse order to simulate // the pipeline latency scalarMemoryPipe.exec(); globalMemoryPipe.exec(); localMemoryPipe.exec(); execStage.exec(); scheduleStage.exec(); scoreboardCheckStage.exec(); fetchStage.exec(); stats.totalCycles++; // Put this CU to sleep if there is no more work to be done. if (!isDone()) { schedule(tickEvent, nextCycle()); } else { shader->notifyCuSleep(); DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id); } } void ComputeUnit::init() { // Initialize CU Bus models and execution resources // Vector ALUs vectorALUs.clear(); for (int i = 0; i < numVectorALUs; i++) { vectorALUs.emplace_back(this, clockPeriod()); } // Scalar ALUs scalarALUs.clear(); for (int i = 0; i < numScalarALUs; i++) { scalarALUs.emplace_back(this, clockPeriod()); } // Vector Global Memory fatal_if(numVectorGlobalMemUnits > 1, "No support for multiple Global Memory Pipelines exists!!!"); vectorGlobalMemUnit.init(this, clockPeriod()); vrfToGlobalMemPipeBus.init(this, clockPeriod()); glbMemToVrfBus.init(this, clockPeriod()); // Vector Local/Shared Memory fatal_if(numVectorSharedMemUnits > 1, "No support for multiple Local Memory Pipelines exists!!!"); vectorSharedMemUnit.init(this, clockPeriod()); vrfToLocalMemPipeBus.init(this, clockPeriod()); locMemToVrfBus.init(this, clockPeriod()); // Scalar Memory fatal_if(numScalarMemUnits > 1, "No support for multiple Scalar Memory Pipelines exists!!!"); scalarMemUnit.init(this, clockPeriod()); srfToScalarMemPipeBus.init(this, clockPeriod()); scalarMemToSrfBus.init(this, clockPeriod()); vectorRegsReserved.resize(numVectorALUs, 0); scalarRegsReserved.resize(numVectorALUs, 0); fetchStage.init(); scheduleStage.init(); execStage.init(); globalMemoryPipe.init(); gmTokenPort.setTokenManager(memPortTokens); } bool ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) { return handleResponse(pkt); } bool ComputeUnit::DataPort::handleResponse(PacketPtr pkt) { // Ruby has completed the memory op. Schedule the mem_resp_event at the // appropriate cycle to process the timing memory response // This delay represents the pipeline delay SenderState *sender_state = safe_cast(pkt->senderState); PortID index = sender_state->port_index; GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; GPUDispatcher &dispatcher = computeUnit->shader->dispatcher(); // MemSyncResp + WriteAckResp are handled completely here and we don't // schedule a MemRespEvent to process the responses further if (pkt->cmd == MemCmd::MemSyncResp) { // This response is for 1 of the following request types: // - kernel launch // - kernel end // - non-kernel mem sync // Kernel Launch // wavefront was nullptr when launching kernel, so it is meaningless // here (simdId=-1, wfSlotId=-1) if (gpuDynInst->isKernelLaunch()) { // for kernel launch, the original request must be both kernel-type // and INV_L1 assert(pkt->req->isKernel()); assert(pkt->req->isInvL1()); // one D-Cache inv is done, decrement counter dispatcher.updateInvCounter(gpuDynInst->kern_id); delete pkt->senderState; delete pkt; return true; } // retrieve wavefront from inst Wavefront *w = gpuDynInst->wavefront(); // Check if we are waiting on Kernel End Flush if (w->getStatus() == Wavefront::S_RETURNING && gpuDynInst->isEndOfKernel()) { // for kernel end, the original request must be both kernel-type // and last-level GPU cache should be flushed if it contains // dirty data. This request may have been quiesced and // immediately responded to if the GL2 is a write-through / // read-only cache. assert(pkt->req->isKernel()); assert(pkt->req->isGL2CacheFlush()); // once flush done, decrement counter, and return whether all // dirty writeback operations are done for the kernel bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id); // not all wbs are done for the kernel, just release pkt // resources if (!isWbDone) { delete pkt->senderState; delete pkt; return true; } // all wbs are completed for the kernel, do retirement work // for the workgroup DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n", computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId, w->wgId); dispatcher.notifyWgCompl(w); w->setStatus(Wavefront::S_STOPPED); } if (!pkt->req->isKernel()) { w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId]; DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing " "outstanding reqs %d => %d\n", gpuDynInst->simdId, gpuDynInst->wfSlotId, gpuDynInst->wfDynId, gpuDynInst->disassemble(), w->outstandingReqs, w->outstandingReqs - 1); computeUnit->globalMemoryPipe.handleResponse(gpuDynInst); } delete pkt->senderState; delete pkt; return true; } EventFunctionWrapper *mem_resp_event = computeUnit->memPort[index].createMemRespEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, gpuDynInst->seqNum(), index, pkt->req->getPaddr()); computeUnit->schedule(mem_resp_event, curTick() + computeUnit->resp_tick_latency); return true; } bool ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt) { return handleResponse(pkt); } bool ComputeUnit::ScalarDataPort::handleResponse(PacketPtr pkt) { assert(!pkt->req->isKernel()); // retrieve sender state SenderState *sender_state = safe_cast(pkt->senderState); GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; assert(pkt->isRead() || pkt->isWrite()); assert(gpuDynInst->numScalarReqs > 0); gpuDynInst->numScalarReqs--; /** * for each returned scalar request we decrement the * numScalarReqs counter that is associated with this * gpuDynInst, which should have been set to correspond * to the number of packets sent for the memory op. * once all packets return, the memory op is finished * and we can push it into the response queue. */ if (!gpuDynInst->numScalarReqs) { if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) { computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push( gpuDynInst); } else { computeUnit->scalarMemoryPipe.getGMStRespFIFO().push( gpuDynInst); } } delete pkt->senderState; delete pkt; return true; } void ComputeUnit::ScalarDataPort::recvReqRetry() { for (const auto &pkt : retries) { if (!sendTimingReq(pkt)) { break; } else { retries.pop_front(); } } } void ComputeUnit::DataPort::recvReqRetry() { int len = retries.size(); assert(len > 0); for (int i = 0; i < len; ++i) { PacketPtr pkt = retries.front().first; [[maybe_unused]] GPUDynInstPtr gpuDynInst = retries.front().second; DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, pkt->req->getPaddr()); /** Currently Ruby can return false due to conflicts for the particular * cache block or address. Thus other requests should be allowed to * pass and the data port should expect multiple retries. */ if (!sendTimingReq(pkt)) { DPRINTF(GPUMem, "failed again!\n"); break; } else { DPRINTF(GPUMem, "successful!\n"); retries.pop_front(); } } } bool ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt) { computeUnit->handleSQCReturn(pkt); return true; } void ComputeUnit::handleSQCReturn(PacketPtr pkt) { fetchStage.processFetchReturn(pkt); } void ComputeUnit::SQCPort::recvReqRetry() { int len = retries.size(); assert(len > 0); for (int i = 0; i < len; ++i) { PacketPtr pkt = retries.front().first; [[maybe_unused]] Wavefront *wavefront = retries.front().second; DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n", computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr()); if (!sendTimingReq(pkt)) { DPRINTF(GPUFetch, "failed again!\n"); break; } else { DPRINTF(GPUFetch, "successful!\n"); retries.pop_front(); } } } void ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt) { // There must be a way around this check to do the globalMemStart... Addr tmp_vaddr = pkt->req->getVaddr(); updatePageDivergenceDist(tmp_vaddr); // set PC in request pkt->req->setPC(gpuDynInst->wavefront()->pc()); pkt->req->setReqInstSeqNum(gpuDynInst->seqNum()); // figure out the type of the request to set read/write BaseMMU::Mode TLB_mode; assert(pkt->isRead() || pkt->isWrite()); // only do some things if actually accessing data bool isDataAccess = pkt->isWrite() || pkt->isRead(); // For dGPUs, real hardware will extract MTYPE from the PTE. SE mode // uses x86 pagetables which don't have fields to track GPU MTYPEs. // Rather than hacking up the pagetable to add these bits in, we just // keep a structure local to our GPUs that are populated in our // emulated driver whenever memory is allocated. Consult that structure // here in case we need a memtype override. // // In full system mode these can be extracted from the PTE and assigned // after address translation takes place. if (!FullSystem) { shader->gpuCmdProc.driver()->setMtype(pkt->req); } // Check write before read for atomic operations // since atomic operations should use BaseMMU::Write if (pkt->isWrite()) { TLB_mode = BaseMMU::Write; } else if (pkt->isRead()) { TLB_mode = BaseMMU::Read; } else { fatal("pkt is not a read nor a write\n"); } stats.tlbCycles -= curTick(); ++stats.tlbRequests; PortID tlbPort_index = perLaneTLB ? index : 0; if (shader->timingSim) { if (!FullSystem && debugSegFault) { Process *p = shader->gpuTc->getProcessPtr(); Addr vaddr = pkt->req->getVaddr(); unsigned size = pkt->getSize(); if ((vaddr + size - 1) % 64 < vaddr % 64) { panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr); } Addr paddr; if (!p->pTable->translate(vaddr, paddr)) { if (!p->fixupFault(vaddr)) { panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr); } } } // This is the SenderState needed upon return pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index); // This is the senderState needed by the TLB hierarchy to function GpuTranslationState *translation_state = new GpuTranslationState(TLB_mode, shader->gpuTc, false, pkt->senderState); pkt->senderState = translation_state; if (functionalTLB) { tlbPort[tlbPort_index].sendFunctional(pkt); // update the hitLevel distribution int hit_level = translation_state->hitLevel; assert(hit_level != -1); stats.hitsPerTLBLevel[hit_level]++; // New SenderState for the memory access GpuTranslationState *sender_state = safe_cast(pkt->senderState); delete sender_state->tlbEntry; delete sender_state->saved; delete sender_state; assert(pkt->req->hasPaddr()); assert(pkt->req->hasSize()); // this is necessary because the GPU TLB receives packets instead // of requests. when the translation is complete, all relevent // fields in the request will be populated, but not in the packet. // here we create the new packet so we can set the size, addr, // and proper flags. PacketPtr oldPkt = pkt; pkt = new Packet(oldPkt->req, oldPkt->cmd); if (isDataAccess) { uint8_t *tmpData = oldPkt->getPtr(); pkt->dataStatic(tmpData); } delete oldPkt; // New SenderState for the memory access pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index, nullptr); gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index); gpuDynInst->tlbHitLevel[index] = hit_level; // translation is done. Schedule the mem_req_event at the // appropriate cycle to send the timing memory request to ruby EventFunctionWrapper *mem_req_event = memPort[index].createMemReqEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data " "scheduled\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index, pkt->req->getPaddr()); schedule(mem_req_event, curTick() + req_tick_latency); } else if (tlbPort[tlbPort_index].isStalled()) { assert(tlbPort[tlbPort_index].retries.size() > 0); DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr); tlbPort[tlbPort_index].retries.push_back(pkt); } else if (!tlbPort[tlbPort_index].sendTimingReq(pkt)) { // Stall the data port; // No more packet will be issued till // ruby indicates resources are freed by // a recvReqRetry() call back on this port. tlbPort[tlbPort_index].stallPort(); DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x " "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr); tlbPort[tlbPort_index].retries.push_back(pkt); } else { DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x from " "instruction %s sent!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr, gpuDynInst->disassemble().c_str()); } } else { if (pkt->cmd == MemCmd::MemSyncReq) { gpuDynInst->resetEntireStatusVector(); } else { gpuDynInst->decrementStatusVector(index); } // New SenderState for the memory access delete pkt->senderState; // Because it's atomic operation, only need TLB translation state pkt->senderState = new GpuTranslationState(TLB_mode, shader->gpuTc); tlbPort[tlbPort_index].sendFunctional(pkt); // the addr of the packet is not modified, so we need to create a new // packet, or otherwise the memory access will have the old virtual // address sent in the translation packet, instead of the physical // address returned by the translation. PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd); new_pkt->dataStatic(pkt->getPtr()); // Translation is done. It is safe to send the packet to memory. memPort[0].sendFunctional(new_pkt); DPRINTF(GPUMem, "Functional sendRequest\n"); DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index, new_pkt->req->getPaddr()); // safe_cast the senderState GpuTranslationState *sender_state = safe_cast(pkt->senderState); delete sender_state->tlbEntry; delete new_pkt; delete pkt->senderState; delete pkt; } } void ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt) { assert(pkt->isWrite() || pkt->isRead()); BaseMMU::Mode tlb_mode = pkt->isRead() ? BaseMMU::Read : BaseMMU::Write; pkt->senderState = new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst); pkt->senderState = new GpuTranslationState(tlb_mode, shader->gpuTc, false, pkt->senderState); if (scalarDTLBPort.isStalled()) { assert(scalarDTLBPort.retries.size()); scalarDTLBPort.retries.push_back(pkt); } else if (!scalarDTLBPort.sendTimingReq(pkt)) { scalarDTLBPort.stallPort(); scalarDTLBPort.retries.push_back(pkt); } else { DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n", tlb_mode == BaseMMU::Read ? "read" : "write", pkt->req->getVaddr()); } } void ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelMemSync, RequestPtr req) { assert(gpuDynInst->isGlobalSeg() || gpuDynInst->executedAs() == enums::SC_GLOBAL); // Fences will never be issued to system memory, so we can mark the // requestor as a device memory ID here. if (!req) { req = std::make_shared( 0, 0, 0, vramRequestorId(), 0, gpuDynInst->wfDynId); } else { req->requestorId(vramRequestorId()); } // all mem sync requests have Paddr == 0 req->setPaddr(0); PacketPtr pkt = nullptr; if (kernelMemSync) { if (gpuDynInst->isKernelLaunch()) { req->setCacheCoherenceFlags(Request::INV_L1); req->setReqInstSeqNum(gpuDynInst->seqNum()); req->setFlags(Request::KERNEL); pkt = new Packet(req, MemCmd::MemSyncReq); pkt->pushSenderState( new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); EventFunctionWrapper *mem_req_event = memPort[0].createMemReqEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling " "an acquire\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0, pkt->req->getPaddr()); schedule(mem_req_event, curTick() + req_tick_latency); } else { // kernel end flush of GL2 cache may be quiesced by Ruby if the // GL2 is a read-only cache assert(shader->impl_kern_end_rel); assert(gpuDynInst->isEndOfKernel()); req->setCacheCoherenceFlags(Request::FLUSH_L2); req->setReqInstSeqNum(gpuDynInst->seqNum()); req->setFlags(Request::KERNEL); pkt = new Packet(req, MemCmd::MemSyncReq); pkt->pushSenderState( new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); EventFunctionWrapper *mem_req_event = memPort[0].createMemReqEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling " "a release\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0, pkt->req->getPaddr()); schedule(mem_req_event, curTick() + req_tick_latency); } } else { gpuDynInst->setRequestFlags(req); req->setReqInstSeqNum(gpuDynInst->seqNum()); pkt = new Packet(req, MemCmd::MemSyncReq); pkt->pushSenderState( new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr)); EventFunctionWrapper *mem_req_event = memPort[0].createMemReqEvent(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0, pkt->req->getPaddr()); schedule(mem_req_event, curTick() + req_tick_latency); } } void ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt) { DataPort::SenderState *sender_state = safe_cast(pkt->senderState); GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; ComputeUnit *compute_unit = computeUnit; assert(gpuDynInst); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n", compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, pkt->req->getPaddr(), id); Addr paddr = pkt->req->getPaddr(); // mem sync resp callback must be handled already in // DataPort::recvTimingResp assert(pkt->cmd != MemCmd::MemSyncResp); // The status vector and global memory response for WriteResp packets get // handled by the WriteCompleteResp packets. if (pkt->cmd == MemCmd::WriteResp) { if (!FullSystem || !pkt->req->systemReq()) { delete pkt; return; } } // this is for read, write and atomic int index = gpuDynInst->memStatusVector[paddr].back(); DPRINTF(GPUMem, "Response for addr %#x, index %d\n", pkt->req->getPaddr(), id); gpuDynInst->memStatusVector[paddr].pop_back(); gpuDynInst->pAddr = pkt->req->getPaddr(); gpuDynInst->decrementStatusVector(index); DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector()); if (gpuDynInst->allLanesZero()) { auto iter = gpuDynInst->memStatusVector.begin(); auto end = gpuDynInst->memStatusVector.end(); while (iter != end) { assert(iter->second.empty()); ++iter; } // Calculate the difference between the arrival of the first cache // block and the last cache block to arrive if we have the time // for the first cache block. if (compute_unit->headTailMap.count(gpuDynInst)) { Tick headTick = compute_unit->headTailMap.at(gpuDynInst); compute_unit->stats.headTailLatency.sample(curTick() - headTick); compute_unit->headTailMap.erase(gpuDynInst); } gpuDynInst->memStatusVector.clear(); gpuDynInst-> profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue); compute_unit->globalMemoryPipe.handleResponse(gpuDynInst); DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n", compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId); } else { if (pkt->isRead()) { if (!compute_unit->headTailMap.count(gpuDynInst)) { compute_unit->headTailMap .insert(std::make_pair(gpuDynInst, curTick())); } } } delete pkt->senderState; delete pkt; } bool ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt) { Addr line = pkt->req->getPaddr(); DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id, pkt->req->getVaddr(), line); assert(pkt->senderState); computeUnit->stats.tlbCycles += curTick(); // pop off the TLB translation state GpuTranslationState *translation_state = safe_cast(pkt->senderState); // no PageFaults are permitted for data accesses if (!translation_state->tlbEntry) { DTLBPort::SenderState *sender_state = safe_cast(translation_state->saved); [[maybe_unused]] Wavefront *w = computeUnit->wfList[sender_state->_gpuDynInst->simdId] [sender_state->_gpuDynInst->wfSlotId]; DPRINTFN("Wave %d couldn't tranlate vaddr %#x\n", w->wfDynId, pkt->req->getVaddr()); } // update the hitLevel distribution int hit_level = translation_state->hitLevel; computeUnit->stats.hitsPerTLBLevel[hit_level]++; delete translation_state->tlbEntry; assert(!translation_state->ports.size()); pkt->senderState = translation_state->saved; // for prefetch pkt BaseMMU::Mode TLB_mode = translation_state->tlbMode; delete translation_state; // use the original sender state to know how to close this transaction DTLBPort::SenderState *sender_state = safe_cast(pkt->senderState); GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; PortID mp_index = sender_state->portIndex; Addr vaddr = pkt->req->getVaddr(); gpuDynInst->memStatusVector[line].push_back(mp_index); gpuDynInst->tlbHitLevel[mp_index] = hit_level; MemCmd requestCmd; if (pkt->cmd == MemCmd::ReadResp) { requestCmd = MemCmd::ReadReq; } else if (pkt->cmd == MemCmd::WriteResp) { requestCmd = MemCmd::WriteReq; } else if (pkt->cmd == MemCmd::SwapResp) { requestCmd = MemCmd::SwapReq; } else { panic("unsupported response to request conversion %s\n", pkt->cmd.toString()); } if (computeUnit->prefetchDepth) { int simdId = gpuDynInst->simdId; int wfSlotId = gpuDynInst->wfSlotId; Addr last = 0; switch(computeUnit->prefetchType) { case enums::PF_CU: last = computeUnit->lastVaddrCU[mp_index]; break; case enums::PF_PHASE: last = computeUnit->lastVaddrSimd[simdId][mp_index]; break; case enums::PF_WF: last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index]; default: break; } DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n", computeUnit->cu_id, simdId, wfSlotId, mp_index, last); int stride = last ? (roundDown(vaddr, X86ISA::PageBytes) - roundDown(last, X86ISA::PageBytes)) >> X86ISA::PageShift : 0; DPRINTF(GPUPrefetch, "Stride is %d\n", stride); computeUnit->lastVaddrCU[mp_index] = vaddr; computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr; computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr; stride = (computeUnit->prefetchType == enums::PF_STRIDE) ? computeUnit->prefetchStride: stride; DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr, computeUnit->cu_id, simdId, wfSlotId, mp_index); DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr); // Prefetch Next few pages atomically for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) { DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride, vaddr + stride * pf * X86ISA::PageBytes); if (!stride) break; RequestPtr prefetch_req = std::make_shared( vaddr + stride * pf * X86ISA::PageBytes, sizeof(uint8_t), 0, computeUnit->requestorId(), 0, 0, nullptr); PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd); uint8_t foo = 0; prefetch_pkt->dataStatic(&foo); // Because it's atomic operation, only need TLB translation state prefetch_pkt->senderState = new GpuTranslationState(TLB_mode, computeUnit->shader->gpuTc, true); // Currently prefetches are zero-latency, hence the sendFunctional sendFunctional(prefetch_pkt); /* safe_cast the senderState */ GpuTranslationState *tlb_state = safe_cast( prefetch_pkt->senderState); delete tlb_state->tlbEntry; delete tlb_state; delete prefetch_pkt; } } // First we must convert the response cmd back to a request cmd so that // the request can be sent through the cu's request port PacketPtr new_pkt = new Packet(pkt->req, requestCmd); new_pkt->dataStatic(pkt->getPtr()); delete pkt->senderState; delete pkt; // New SenderState for the memory access new_pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index, nullptr); // Set VRAM ID for device requests // For now, system vmem requests use functional reads. This is not that // critical to model as the region of interest should always be accessing // device memory. System vmem requests are used by blit kernels to do // memcpys and load code objects into device memory. if (new_pkt->req->systemReq()) { // There will be multiple packets returned for the same gpuDynInst, // so first check if systemReq is not already set and if so, return // the token acquired when the dispatch list is filled as system // requests do not require a GPU coalescer token. if (!gpuDynInst->isSystemReq()) { computeUnit->getTokenManager()->recvTokens(1); gpuDynInst->setSystemReq(); } } else { new_pkt->req->requestorId(computeUnit->vramRequestorId()); } // translation is done. Schedule the mem_req_event at the appropriate // cycle to send the timing memory request to ruby EventFunctionWrapper *mem_req_event = computeUnit->memPort[mp_index].createMemReqEvent(new_pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr()); computeUnit->schedule(mem_req_event, curTick() + computeUnit->req_tick_latency); return true; } EventFunctionWrapper* ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt) { return new EventFunctionWrapper( [this, pkt]{ processMemReqEvent(pkt); }, "ComputeUnit memory request event", true); } EventFunctionWrapper* ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt) { return new EventFunctionWrapper( [this, pkt]{ processMemRespEvent(pkt); }, "ComputeUnit memory response event", true); } void ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt) { SenderState *sender_state = safe_cast(pkt->senderState); GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; [[maybe_unused]] ComputeUnit *compute_unit = computeUnit; if (pkt->req->systemReq()) { assert(compute_unit->shader->systemHub); SystemHubEvent *resp_event = new SystemHubEvent(pkt, this); compute_unit->shader->systemHub->sendRequest(pkt, resp_event); } else if (!(sendTimingReq(pkt))) { retries.push_back(std::make_pair(pkt, gpuDynInst)); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n", compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, id, pkt->req->getPaddr()); } else { DPRINTF(GPUPort, "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data " "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, gpuDynInst->seqNum(), id, pkt->req->getPaddr()); } } const char* ComputeUnit::ScalarDataPort::MemReqEvent::description() const { return "ComputeUnit scalar memory request event"; } void ComputeUnit::ScalarDataPort::MemReqEvent::process() { SenderState *sender_state = safe_cast(pkt->senderState); GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; [[maybe_unused]] ComputeUnit *compute_unit = scalarDataPort.computeUnit; if (pkt->req->systemReq()) { assert(compute_unit->shader->systemHub); SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort); compute_unit->shader->systemHub->sendRequest(pkt, resp_event); } else if (!(scalarDataPort.sendTimingReq(pkt))) { scalarDataPort.retries.push_back(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x data req failed!\n", compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, pkt->req->getPaddr()); } else { DPRINTF(GPUPort, "CU%d: WF[%d][%d]: gpuDynInst: %d, addr %#x data " "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, gpuDynInst->seqNum(), pkt->req->getPaddr()); } } /* * The initial translation request could have been rejected, * if queue is not Retry sending the translation * request. sendRetry() is called from the peer port whenever * a translation completes. */ void ComputeUnit::DTLBPort::recvReqRetry() { int len = retries.size(); DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n", computeUnit->cu_id, len); assert(len > 0); assert(isStalled()); // recvReqRetry is an indication that the resource on which this // port was stalling on is freed. So, remove the stall first unstallPort(); for (int i = 0; i < len; ++i) { PacketPtr pkt = retries.front(); [[maybe_unused]] Addr vaddr = pkt->req->getVaddr(); DPRINTF(GPUTLB, "CU%d: retrying D-translaton for address%#x", vaddr); if (!sendTimingReq(pkt)) { // Stall port stallPort(); DPRINTF(GPUTLB, ": failed again\n"); break; } else { DPRINTF(GPUTLB, ": successful\n"); retries.pop_front(); } } } bool ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt) { assert(pkt->senderState); GpuTranslationState *translation_state = safe_cast(pkt->senderState); // Page faults are not allowed fatal_if(!translation_state->tlbEntry, "Translation of vaddr %#x failed\n", pkt->req->getVaddr()); delete translation_state->tlbEntry; assert(!translation_state->ports.size()); pkt->senderState = translation_state->saved; delete translation_state; ScalarDTLBPort::SenderState *sender_state = safe_cast(pkt->senderState); GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst; delete pkt->senderState; [[maybe_unused]] Wavefront *w = gpuDynInst->wavefront(); DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received " "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId, w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr()); MemCmd mem_cmd; if (pkt->cmd == MemCmd::ReadResp) { mem_cmd = MemCmd::ReadReq; } else if (pkt->cmd == MemCmd::WriteResp) { mem_cmd = MemCmd::WriteReq; } else { fatal("Scalar DTLB receieved unexpected MemCmd response %s\n", pkt->cmd.toString()); } PacketPtr req_pkt = new Packet(pkt->req, mem_cmd); req_pkt->dataStatic(pkt->getPtr()); delete pkt; req_pkt->senderState = new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst); // For a system request we want to mark the GPU instruction as a system // load/store so that after the request is issued to system memory we can // return any token acquired for the request. Since tokens are returned // by the coalescer and system requests do not take that path, this needs // to be tracked. // // Device requests change the requestor ID to something in the device // memory Ruby network. if (req_pkt->req->systemReq()) { gpuDynInst->setSystemReq(); } else { req_pkt->req->requestorId(computeUnit->vramRequestorId()); } ComputeUnit::ScalarDataPort::MemReqEvent *scalar_mem_req_event = new ComputeUnit::ScalarDataPort::MemReqEvent (computeUnit->scalarDataPort, req_pkt); computeUnit->schedule(scalar_mem_req_event, curTick() + computeUnit->req_tick_latency); return true; } bool ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt) { [[maybe_unused]] Addr line = pkt->req->getPaddr(); DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n", computeUnit->cu_id, pkt->req->getVaddr(), line); assert(pkt->senderState); // pop off the TLB translation state GpuTranslationState *translation_state = safe_cast(pkt->senderState); bool success = translation_state->tlbEntry != nullptr; delete translation_state->tlbEntry; assert(!translation_state->ports.size()); pkt->senderState = translation_state->saved; delete translation_state; // use the original sender state to know how to close this transaction ITLBPort::SenderState *sender_state = safe_cast(pkt->senderState); // get the wavefront associated with this translation request Wavefront *wavefront = sender_state->wavefront; delete pkt->senderState; if (success) { // pkt is reused in fetch(), don't delete it here. However, we must // reset the command to be a request so that it can be sent through // the cu's request port assert(pkt->cmd == MemCmd::ReadResp); pkt->cmd = MemCmd::ReadReq; computeUnit->fetchStage.fetch(pkt, wavefront); } else { if (wavefront->dropFetch) { assert(wavefront->instructionBuffer.empty()); wavefront->dropFetch = false; } wavefront->pendingFetch = 0; } return true; } /* * The initial translation request could have been rejected, if * queue is not empty. Retry sending the translation * request. sendRetry() is called from the peer port whenever * a translation completes. */ void ComputeUnit::ITLBPort::recvReqRetry() { int len = retries.size(); DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", len); assert(len > 0); assert(isStalled()); // recvReqRetry is an indication that the resource on which this // port was stalling on is freed. So, remove the stall first unstallPort(); for (int i = 0; i < len; ++i) { PacketPtr pkt = retries.front(); [[maybe_unused]] Addr vaddr = pkt->req->getVaddr(); DPRINTF(GPUTLB, "CU%d: retrying I-translaton for address%#x", vaddr); if (!sendTimingReq(pkt)) { stallPort(); // Stall port DPRINTF(GPUTLB, ": failed again\n"); break; } else { DPRINTF(GPUTLB, ": successful\n"); retries.pop_front(); } } } void ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst) { if (gpuDynInst->isScalar()) { if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) { stats.sALUInsts++; stats.instCyclesSALU++; } else if (gpuDynInst->isLoad()) { stats.scalarMemReads++; } else if (gpuDynInst->isStore()) { stats.scalarMemWrites++; } } else { if (gpuDynInst->isALU()) { shader->total_valu_insts++; if (shader->total_valu_insts == shader->max_valu_insts) { exitSimLoop("max vALU insts"); } stats.vALUInsts++; stats.instCyclesVALU++; stats.threadCyclesVALU += gpuDynInst->wavefront()->execMask().count(); } else if (gpuDynInst->isFlat()) { if (gpuDynInst->isLocalMem()) { stats.flatLDSInsts++; } else { stats.flatVMemInsts++; } } else if (gpuDynInst->isFlatGlobal()) { stats.flatVMemInsts++; } else if (gpuDynInst->isLocalMem()) { stats.ldsNoFlatInsts++; } else if (gpuDynInst->isLoad()) { stats.vectorMemReads++; } else if (gpuDynInst->isStore()) { stats.vectorMemWrites++; } if (gpuDynInst->isLoad()) { switch (gpuDynInst->executedAs()) { case enums::SC_SPILL: stats.spillReads++; break; case enums::SC_GLOBAL: stats.globalReads++; break; case enums::SC_GROUP: stats.groupReads++; break; case enums::SC_PRIVATE: stats.privReads++; break; case enums::SC_READONLY: stats.readonlyReads++; break; case enums::SC_KERNARG: stats.kernargReads++; break; case enums::SC_ARG: stats.argReads++; break; case enums::SC_NONE: /** * this case can occur for flat mem insts * who execute with EXEC = 0 */ break; default: fatal("%s has no valid segment\n", gpuDynInst->disassemble()); break; } } else if (gpuDynInst->isStore()) { switch (gpuDynInst->executedAs()) { case enums::SC_SPILL: stats.spillWrites++; break; case enums::SC_GLOBAL: stats.globalWrites++; break; case enums::SC_GROUP: stats.groupWrites++; break; case enums::SC_PRIVATE: stats.privWrites++; break; case enums::SC_READONLY: stats.readonlyWrites++; break; case enums::SC_KERNARG: stats.kernargWrites++; break; case enums::SC_ARG: stats.argWrites++; break; case enums::SC_NONE: /** * this case can occur for flat mem insts * who execute with EXEC = 0 */ break; default: fatal("%s has no valid segment\n", gpuDynInst->disassemble()); break; } } } } void ComputeUnit::updatePageDivergenceDist(Addr addr) { Addr virt_page_addr = roundDown(addr, X86ISA::PageBytes); if (!pagesTouched.count(virt_page_addr)) pagesTouched[virt_page_addr] = 1; else pagesTouched[virt_page_addr]++; } void ComputeUnit::exitCallback() { if (countPages) { std::ostream *page_stat_file = simout.create(name().c_str())->stream(); *page_stat_file << "page, wavefront accesses, workitem accesses" << std::endl; for (auto iter : pageAccesses) { *page_stat_file << std::hex << iter.first << ","; *page_stat_file << std::dec << iter.second.first << ","; *page_stat_file << std::dec << iter.second.second << std::endl; } } } bool ComputeUnit::isDone() const { for (int i = 0; i < numVectorALUs; ++i) { if (!isVectorAluIdle(i)) { return false; } } // TODO: FIXME if more than 1 of any memory pipe supported if (!srfToScalarMemPipeBus.rdy()) { return false; } if (!vrfToGlobalMemPipeBus.rdy()) { return false; } if (!vrfToLocalMemPipeBus.rdy()) { return false; } if (!globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy() || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() || !glbMemToVrfBus.rdy() || !scalarMemToSrfBus.rdy()) { return false; } return true; } int32_t ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const { return lds.getRefCounter(dispatchId, wgId); } bool ComputeUnit::isVectorAluIdle(uint32_t simdId) const { assert(simdId < numVectorALUs); for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){ if (wfList[simdId][i_wf]->getStatus() != Wavefront::S_STOPPED) { return false; } } return true; } /** * send a general request to the LDS * make sure to look at the return value here as your request might be * NACK'd and returning false means that you have to have some backup plan */ bool ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst) { // this is just a request to carry the GPUDynInstPtr // back and forth RequestPtr newRequest = std::make_shared(); newRequest->setPaddr(0x0); // ReadReq is not evaluted by the LDS but the Packet ctor requires this PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq); // This is the SenderState needed upon return newPacket->senderState = new LDSPort::SenderState(gpuDynInst); return ldsPort.sendTimingReq(newPacket); } /** * Forward the VRAM requestor ID needed for device memory from shader. */ RequestorID ComputeUnit::vramRequestorId() { return FullSystem ? shader->vramRequestorId() : requestorId(); } /** * get the result of packets sent to the LDS when they return */ bool ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet) { const ComputeUnit::LDSPort::SenderState *senderState = dynamic_cast(packet->senderState); fatal_if(!senderState, "did not get the right sort of sender state"); GPUDynInstPtr gpuDynInst = senderState->getMemInst(); delete packet->senderState; delete packet; computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst); return true; } /** * attempt to send this packet, either the port is already stalled, the request * is nack'd and must stall or the request goes through * when a request cannot be sent, add it to the retries queue */ bool ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt) { ComputeUnit::LDSPort::SenderState *sender_state = dynamic_cast(pkt->senderState); fatal_if(!sender_state, "packet without a valid sender state"); [[maybe_unused]] GPUDynInstPtr gpuDynInst = sender_state->getMemInst(); if (isStalled()) { fatal_if(retries.empty(), "must have retries waiting to be stalled"); retries.push(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId); return false; } else if (!RequestPort::sendTimingReq(pkt)) { // need to stall the LDS port until a recvReqRetry() is received // this indicates that there is more space stallPort(); retries.push(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, pkt->req->getPaddr()); return false; } else { DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n", computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, pkt->req->getPaddr()); return true; } } /** * the bus is telling the port that there is now space so retrying stalled * requests should work now * this allows the port to have a request be nack'd and then have the receiver * say when there is space, rather than simply retrying the send every cycle */ void ComputeUnit::LDSPort::recvReqRetry() { auto queueSize = retries.size(); DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n", computeUnit->cu_id, queueSize); fatal_if(queueSize < 1, "why was there a recvReqRetry() with no pending reqs?"); fatal_if(!isStalled(), "recvReqRetry() happened when the port was not stalled"); unstallPort(); while (!retries.empty()) { PacketPtr packet = retries.front(); DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id); if (!RequestPort::sendTimingReq(packet)) { // Stall port stallPort(); DPRINTF(GPUPort, ": LDS send failed again\n"); break; } else { DPRINTF(GPUTLB, ": LDS send successful\n"); retries.pop(); } } } ComputeUnit::ComputeUnitStats::ComputeUnitStats(statistics::Group *parent, int n_wf) : statistics::Group(parent), ADD_STAT(vALUInsts, "Number of vector ALU insts issued."), ADD_STAT(vALUInstsPerWF, "The avg. number of vector ALU insts issued " "per-wavefront."), ADD_STAT(sALUInsts, "Number of scalar ALU insts issued."), ADD_STAT(sALUInstsPerWF, "The avg. number of scalar ALU insts issued " "per-wavefront."), ADD_STAT(instCyclesVALU, "Number of cycles needed to execute VALU insts."), ADD_STAT(instCyclesSALU, "Number of cycles needed to execute SALU insts."), ADD_STAT(threadCyclesVALU, "Number of thread cycles used to execute " "vector ALU ops. Similar to instCyclesVALU but multiplied by " "the number of active threads."), ADD_STAT(vALUUtilization, "Percentage of active vector ALU threads in a wave."), ADD_STAT(ldsNoFlatInsts, "Number of LDS insts issued, not including FLAT" " accesses that resolve to LDS."), ADD_STAT(ldsNoFlatInstsPerWF, "The avg. number of LDS insts (not " "including FLAT accesses that resolve to LDS) per-wavefront."), ADD_STAT(flatVMemInsts, "The number of FLAT insts that resolve to vmem issued."), ADD_STAT(flatVMemInstsPerWF, "The average number of FLAT insts that " "resolve to vmem issued per-wavefront."), ADD_STAT(flatLDSInsts, "The number of FLAT insts that resolve to LDS issued."), ADD_STAT(flatLDSInstsPerWF, "The average number of FLAT insts that " "resolve to LDS issued per-wavefront."), ADD_STAT(vectorMemWrites, "Number of vector mem write insts (excluding FLAT insts)."), ADD_STAT(vectorMemWritesPerWF, "The average number of vector mem write " "insts (excluding FLAT insts) per-wavefront."), ADD_STAT(vectorMemReads, "Number of vector mem read insts (excluding FLAT insts)."), ADD_STAT(vectorMemReadsPerWF, "The avg. number of vector mem read insts " "(excluding FLAT insts) per-wavefront."), ADD_STAT(scalarMemWrites, "Number of scalar mem write insts."), ADD_STAT(scalarMemWritesPerWF, "The average number of scalar mem write insts per-wavefront."), ADD_STAT(scalarMemReads, "Number of scalar mem read insts."), ADD_STAT(scalarMemReadsPerWF, "The average number of scalar mem read insts per-wavefront."), ADD_STAT(vectorMemReadsPerKiloInst, "Number of vector mem reads per kilo-instruction"), ADD_STAT(vectorMemWritesPerKiloInst, "Number of vector mem writes per kilo-instruction"), ADD_STAT(vectorMemInstsPerKiloInst, "Number of vector mem insts per kilo-instruction"), ADD_STAT(scalarMemReadsPerKiloInst, "Number of scalar mem reads per kilo-instruction"), ADD_STAT(scalarMemWritesPerKiloInst, "Number of scalar mem writes per kilo-instruction"), ADD_STAT(scalarMemInstsPerKiloInst, "Number of scalar mem insts per kilo-instruction"), ADD_STAT(instCyclesVMemPerSimd, "Number of cycles to send address, " "command, data from VRF to vector memory unit, per SIMD"), ADD_STAT(instCyclesScMemPerSimd, "Number of cycles to send address, " "command, data from SRF to scalar memory unit, per SIMD"), ADD_STAT(instCyclesLdsPerSimd, "Number of cycles to send address, " "command, data from VRF to LDS unit, per SIMD"), ADD_STAT(globalReads, "Number of reads to the global segment"), ADD_STAT(globalWrites, "Number of writes to the global segment"), ADD_STAT(globalMemInsts, "Number of memory instructions sent to the global segment"), ADD_STAT(argReads, "Number of reads to the arg segment"), ADD_STAT(argWrites, "NUmber of writes to the arg segment"), ADD_STAT(argMemInsts, "Number of memory instructions sent to the arg segment"), ADD_STAT(spillReads, "Number of reads to the spill segment"), ADD_STAT(spillWrites, "Number of writes to the spill segment"), ADD_STAT(spillMemInsts, "Number of memory instructions sent to the spill segment"), ADD_STAT(groupReads, "Number of reads to the group segment"), ADD_STAT(groupWrites, "Number of writes to the group segment"), ADD_STAT(groupMemInsts, "Number of memory instructions sent to the group segment"), ADD_STAT(privReads, "Number of reads to the private segment"), ADD_STAT(privWrites, "Number of writes to the private segment"), ADD_STAT(privMemInsts, "Number of memory instructions sent to the private segment"), ADD_STAT(readonlyReads, "Number of reads to the readonly segment"), ADD_STAT(readonlyWrites, "Number of memory instructions sent to the readonly segment"), ADD_STAT(readonlyMemInsts, "Number of memory instructions sent to the readonly segment"), ADD_STAT(kernargReads, "Number of reads sent to the kernarg segment"), ADD_STAT(kernargWrites, "Number of memory instructions sent to the kernarg segment"), ADD_STAT(kernargMemInsts, "Number of memory instructions sent to the kernarg segment"), ADD_STAT(waveLevelParallelism, "wave level parallelism: count of active waves at wave launch"), ADD_STAT(tlbRequests, "number of uncoalesced requests"), ADD_STAT(tlbCycles, "total number of cycles for all uncoalesced requests"), ADD_STAT(tlbLatency, "Avg. translation latency for data translations"), ADD_STAT(hitsPerTLBLevel, "TLB hits distribution (0 for page table, x for Lx-TLB)"), ADD_STAT(ldsBankAccesses, "Total number of LDS bank accesses"), ADD_STAT(ldsBankConflictDist, "Number of bank conflicts per LDS memory packet"), ADD_STAT(pageDivergenceDist, "pages touched per wf (over all mem. instr.)"), ADD_STAT(dynamicGMemInstrCnt, "dynamic non-flat global memory instruction count"), ADD_STAT(dynamicFlatMemInstrCnt, "dynamic flat global memory instruction count"), ADD_STAT(dynamicLMemInstrCnt, "dynamic local memory intruction count"), ADD_STAT(wgBlockedDueBarrierAllocation, "WG dispatch was blocked due to lack of barrier resources"), ADD_STAT(wgBlockedDueLdsAllocation, "Workgroup blocked due to LDS capacity"), ADD_STAT(numInstrExecuted, "number of instructions executed"), ADD_STAT(execRateDist, "Instruction Execution Rate: Number of executed " "vector instructions per cycle"), ADD_STAT(numVecOpsExecuted, "number of vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedF16, "number of f16 vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedF32, "number of f32 vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedF64, "number of f64 vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedFMA16, "number of fma16 vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedFMA32, "number of fma32 vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedFMA64, "number of fma64 vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedMAC16, "number of mac16 vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedMAC32, "number of mac32 vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedMAC64, "number of mac64 vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedMAD16, "number of mad16 vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedMAD32, "number of mad32 vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedMAD64, "number of mad64 vec ops executed (e.g. WF size/inst)"), ADD_STAT(numVecOpsExecutedTwoOpFP, "number of two op FP vec ops executed (e.g. WF size/inst)"), ADD_STAT(totalCycles, "number of cycles the CU ran for"), ADD_STAT(vpc, "Vector Operations per cycle (this CU only)"), ADD_STAT(vpc_f16, "F16 Vector Operations per cycle (this CU only)"), ADD_STAT(vpc_f32, "F32 Vector Operations per cycle (this CU only)"), ADD_STAT(vpc_f64, "F64 Vector Operations per cycle (this CU only)"), ADD_STAT(ipc, "Instructions per cycle (this CU only)"), ADD_STAT(controlFlowDivergenceDist, "number of lanes active per " "instruction (over all instructions)"), ADD_STAT(activeLanesPerGMemInstrDist, "number of active lanes per global memory instruction"), ADD_STAT(activeLanesPerLMemInstrDist, "number of active lanes per local memory instruction"), ADD_STAT(numALUInstsExecuted, "Number of dynamic non-GM memory insts executed"), ADD_STAT(numTimesWgBlockedDueVgprAlloc, "Number of times WGs are " "blocked due to VGPR allocation per SIMD"), ADD_STAT(numTimesWgBlockedDueSgprAlloc, "Number of times WGs are " "blocked due to SGPR allocation per SIMD"), ADD_STAT(numCASOps, "number of compare and swap operations"), ADD_STAT(numFailedCASOps, "number of compare and swap operations that failed"), ADD_STAT(completedWfs, "number of completed wavefronts"), ADD_STAT(completedWGs, "number of completed workgroups"), ADD_STAT(headTailLatency, "ticks between first and last cache block " "arrival at coalescer"), ADD_STAT(instInterleave, "Measure of instruction interleaving per SIMD") { ComputeUnit *cu = static_cast(parent); instCyclesVMemPerSimd.init(cu->numVectorALUs); instCyclesScMemPerSimd.init(cu->numVectorALUs); instCyclesLdsPerSimd.init(cu->numVectorALUs); hitsPerTLBLevel.init(4); execRateDist.init(0, 10, 2); ldsBankConflictDist.init(0, cu->wfSize(), 2); pageDivergenceDist.init(1, cu->wfSize(), 4); controlFlowDivergenceDist.init(1, cu->wfSize(), 4); activeLanesPerGMemInstrDist.init(1, cu->wfSize(), 4); activeLanesPerLMemInstrDist.init(1, cu->wfSize(), 4); headTailLatency.init(0, 1000000, 10000).flags(statistics::pdf | statistics::oneline); waveLevelParallelism.init(0, n_wf * cu->numVectorALUs, 1); instInterleave.init(cu->numVectorALUs, 0, 20, 1); vALUInstsPerWF = vALUInsts / completedWfs; sALUInstsPerWF = sALUInsts / completedWfs; vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100; ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs; flatVMemInstsPerWF = flatVMemInsts / completedWfs; flatLDSInstsPerWF = flatLDSInsts / completedWfs; vectorMemWritesPerWF = vectorMemWrites / completedWfs; vectorMemReadsPerWF = vectorMemReads / completedWfs; scalarMemWritesPerWF = scalarMemWrites / completedWfs; scalarMemReadsPerWF = scalarMemReads / completedWfs; vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000; vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000; vectorMemInstsPerKiloInst = ((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000; scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000; scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000; scalarMemInstsPerKiloInst = ((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000; globalMemInsts = globalReads + globalWrites; argMemInsts = argReads + argWrites; spillMemInsts = spillReads + spillWrites; groupMemInsts = groupReads + groupWrites; privMemInsts = privReads + privWrites; readonlyMemInsts = readonlyReads + readonlyWrites; kernargMemInsts = kernargReads + kernargWrites; tlbLatency = tlbCycles / tlbRequests; // fixed number of TLB levels for (int i = 0; i < 4; ++i) { if (!i) hitsPerTLBLevel.subname(i,"page_table"); else hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i)); } ipc = numInstrExecuted / totalCycles; vpc = numVecOpsExecuted / totalCycles; vpc_f16 = numVecOpsExecutedF16 / totalCycles; vpc_f32 = numVecOpsExecutedF32 / totalCycles; vpc_f64 = numVecOpsExecutedF64 / totalCycles; numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt - dynamicLMemInstrCnt; } } // namespace gem5