Files
gem5/src/gpu-compute/compute_unit.cc
Matt Sinclair 8177fc4392 arch-gcn3: add support for unaligned accesses
Previously, with HSAIL, we were guaranteed by the HSA specification
that the GPU will never issue unaligned accesses.  However, now
that we are directly running GCN this is no longer true.
Accordingly, this commit adds support for unaligned accesses.
Moreover, to reduce the replication of nearly identical
code for the different request types, I also added new helper
functions that are called by all the different memory request
producing instruction types in op_encodings.hh.

Adding support for unaligned instructions requires changing
the statusBitVector used to track the status of the memory
requests for each lane from a bit per lane to an int per lane.
This is necessary because an unaligned access may span multiple
cache lines.  In the worst case, each lane may span multiple
cache lines.  There are corresponding changes in the files that
use the statusBitVector.

Change-Id: I319bf2f0f644083e98ca546d2bfe68cf87a5f967
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29920
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
2020-06-19 20:41:18 +00:00

2604 lines
86 KiB
C++

/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gpu-compute/compute_unit.hh"
#include <limits>
#include "base/output.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUFetch.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUPrefetch.hh"
#include "debug/GPUReg.hh"
#include "debug/GPURename.hh"
#include "debug/GPUSync.hh"
#include "debug/GPUTLB.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/page_table.hh"
#include "sim/process.hh"
#include "sim/sim_exit.hh"
/**
 * Construct a compute unit from its parameter object.
 *
 * The initializer list copies the configuration values out of p. The body
 * then validates the wavefront size, sizes the per-SIMD and per-wavefront
 * bookkeeping vectors, makes this CU the parent of its wavefronts,
 * register files, register manager and LDS, selects the execution policy,
 * and derives the number of bits needed to address a cache line.
 *
 * @param p parameter object supplying the CU configuration
 */
ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p),
numVectorGlobalMemUnits(p->num_global_mem_pipes),
numVectorSharedMemUnits(p->num_shared_mem_pipes),
numScalarMemUnits(p->num_scalar_mem_pipes),
numVectorALUs(p->num_SIMDs),
numScalarALUs(p->num_scalar_cores),
vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
registerManager(p->register_manager), fetchStage(p),
scoreboardCheckStage(p), scheduleStage(p, this), execStage(p),
globalMemoryPipe(p), localMemoryPipe(p), scalarMemoryPipe(p),
tickEvent([this]{ exec(); }, "Compute unit tick event",
false, Event::CPU_Tick_Pri),
cu_id(p->cu_id),
vrf(p->vector_register_file), srf(p->scalar_register_file),
simdWidth(p->simd_width),
spBypassPipeLength(p->spbypass_pipe_length),
dpBypassPipeLength(p->dpbypass_pipe_length),
scalarPipeStages(p->scalar_pipe_length),
operandNetworkLength(p->operand_network_length),
issuePeriod(p->issue_period),
vrf_gm_bus_latency(p->vrf_gm_bus_latency),
srf_scm_bus_latency(p->srf_scm_bus_latency),
vrf_lm_bus_latency(p->vrf_lm_bus_latency),
perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
debugSegFault(p->debugSegFault),
functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
countPages(p->countPages), barrier_id(0),
req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
_masterId(p->system->getMasterId(this, "ComputeUnit")),
lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
_cacheLineSize(p->system->cacheLineSize()), globalSeqNum(0),
wavefrontSize(p->wf_size)
{
/**
 * This check is necessary because std::bitset only provides conversion
 * to unsigned long or unsigned long long via to_ulong() or to_ullong().
 * There are a few places in the code where to_ullong() is used; however,
 * if wavefrontSize is larger than a value the host can support then
 * bitset will throw a runtime exception. We should remove all use of
 * to_ulong() or to_ullong() so we can have wavefrontSize greater than 64b,
 * however until that is done this check is required.
 *
 * Note that the check also rejects a non-positive wavefront size, even
 * though the message only mentions the "too large" case.
 */
fatal_if(p->wf_size > std::numeric_limits<unsigned long long>::digits ||
p->wf_size <= 0,
"WF size is larger than the host can support");
fatal_if(!isPowerOf2(wavefrontSize),
"Wavefront size should be a power of 2");
// calculate how many cycles a vector load or store will need to transfer
// its data over the corresponding buses
numCyclesPerStoreTransfer =
(uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
(double)vrfToCoalescerBusWidth);
// NOTE(review): unlike the store path above, the load path uses a
// truncating integer division instead of ceil() -- confirm this is intended
numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
/ coalescerToVrfBusWidth;
// Initialization: all WF slots are assumed STOPPED
idleWfs = p->n_wf * numVectorALUs;
// size the per-SIMD/per-WF/per-lane last-address tables, zero them, and
// take the wavefronts for each SIMD from the flat p->wavefronts list
lastVaddrWF.resize(numVectorALUs);
wfList.resize(numVectorALUs);
for (int j = 0; j < numVectorALUs; ++j) {
lastVaddrWF[j].resize(p->n_wf);
for (int i = 0; i < p->n_wf; ++i) {
lastVaddrWF[j][i].resize(wfSize());
// wavefront i of SIMD j lives at index j * n_wf + i of the flat list
wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
wfList[j][i]->setParent(this);
for (int k = 0; k < wfSize(); ++k) {
lastVaddrWF[j][i][k] = 0;
}
}
}
lastVaddrSimd.resize(numVectorALUs);
for (int i = 0; i < numVectorALUs; ++i) {
lastVaddrSimd[i].resize(wfSize(), 0);
}
lastVaddrCU.resize(wfSize());
lds.setParent(this);
// translate the execPolicy string parameter into the EXEC_POLICY enum;
// any other value is a fatal configuration error
if (p->execPolicy == "OLDEST-FIRST") {
exec_policy = EXEC_POLICY::OLDEST;
} else if (p->execPolicy == "ROUND-ROBIN") {
exec_policy = EXEC_POLICY::RR;
} else {
fatal("Invalid WF execution policy (CU)\n");
}
// one memory port entry per lane of a wavefront
memPort.resize(wfSize());
// Setup tokens for slave ports. The number of tokens in memPortTokens
// is the total token count for the entire vector port (i.e., this CU).
memPortTokens = new TokenManager(p->max_cu_tokens);
// resize the tlbPort vectorArray: one port per lane when perLaneTLB is
// set, otherwise a single port
int tlbPort_width = perLaneTLB ? wfSize() : 1;
tlbPort.resize(tlbPort_width);
// allocate the exit callback object and hand it to registerExitCallback()
cuExitCallback = new CUExitCallback(this);
registerExitCallback(cuExitCallback);
lastExecCycle.resize(numVectorALUs, 0);
// make this CU the parent of every vector and scalar register file
for (int i = 0; i < vrf.size(); ++i) {
vrf[i]->setParent(this);
}
for (int i = 0; i < srf.size(); ++i) {
srf[i]->setParent(this);
}
// the register counts are read from the first register file of each kind
numVecRegsPerSimd = vrf[0]->numRegs();
numScalarRegsPerSimd = srf[0]->numRegs();
registerManager->setParent(this);
activeWaves = 0;
instExecPerSimd.resize(numVectorALUs, 0);
// Calculate the number of bits to address a cache line
panic_if(!isPowerOf2(_cacheLineSize),
"Cache line size should be a power of two.");
cacheLineBits = floorLog2(_cacheLineSize);
}
/**
 * Tear down the compute unit.
 *
 * Frees the wavefront slots, clears the address-tracking and scheduling
 * containers, and releases the objects this CU holds through raw pointers.
 */
ComputeUnit::~ComputeUnit()
{
    // Delete wavefront slots
    for (int simd = 0; simd < numVectorALUs; ++simd) {
        for (int slot = 0; slot < shader->n_wf; ++slot) {
            delete wfList[simd][slot];
        }
        lastVaddrSimd[simd].clear();
    }
    lastVaddrCU.clear();
    readyList.clear();
    dispatchList.clear();
    delete cuExitCallback;
    // the token manager is allocated with new in the constructor and has
    // no other release point in this class, so free it here
    delete memPortTokens;
    delete ldsPort;
}
/**
 * Total number of execution resources in this CU: the vector ALUs, the
 * scalar ALUs, and the vector global, vector shared and scalar memory
 * units.
 */
int
ComputeUnit::numExeUnits() const
{
    const int aluUnits = numVectorALUs + numScalarALUs;
    const int memUnits = numVectorGlobalMemUnits + numVectorSharedMemUnits +
        numScalarMemUnits;
    return aluUnits + memUnits;
}
// readyList index of the first memory unit, i.e. the slot immediately
// following the vector and scalar ALU entries
int
ComputeUnit::firstMemUnit() const
{
    const int aluUnits = numVectorALUs + numScalarALUs;
    return aluUnits;
}
// readyList index of the last memory unit, which is also the last
// execution unit overall
int
ComputeUnit::lastMemUnit() const
{
    const int unitCount = numExeUnits();
    return unitCount - 1;
}
// index into the scalarALUs vector of the SALU used by the wavefront:
// with a single SALU every wavefront maps to index 0, otherwise the index
// is the wavefront's simdId modulo the number of SALUs
int
ComputeUnit::mapWaveToScalarAlu(Wavefront *w) const
{
    return (numScalarALUs == 1) ? 0 : w->simdId % numScalarALUs;
}
// readyList index of the scalar ALU used by the wavefront; scalar ALU
// entries follow the vector ALU entries
int
ComputeUnit::mapWaveToScalarAluGlobalIdx(Wavefront *w) const
{
    const int saluIdx = mapWaveToScalarAlu(w);
    return numVectorALUs + saluIdx;
}
// readyList index of the global memory unit used by the wavefront; the
// wavefront argument is currently not consulted
int
ComputeUnit::mapWaveToGlobalMem(Wavefront *w) const
{
    // TODO: FIXME if more than 1 GM pipe supported
    const int globalMemIdx = numVectorALUs + numScalarALUs;
    return globalMemIdx;
}
// readyList index of the local memory unit used by the wavefront; the
// wavefront argument is currently not consulted
int
ComputeUnit::mapWaveToLocalMem(Wavefront *w) const
{
    // TODO: FIXME if more than 1 LM pipe supported
    const int aluUnits = numVectorALUs + numScalarALUs;
    return aluUnits + numVectorGlobalMemUnits;
}
// readyList index of the scalar memory unit used by the wavefront; the
// wavefront argument is currently not consulted
int
ComputeUnit::mapWaveToScalarMem(Wavefront *w) const
{
    // TODO: FIXME if more than 1 ScM pipe supported
    const int aluUnits = numVectorALUs + numScalarALUs;
    const int vectorMemUnits = numVectorGlobalMemUnits +
        numVectorSharedMemUnits;
    return aluUnits + vectorMemUnits;
}
/**
 * Copy the kernel-level state of a task into a wavefront: register file
 * sizes, work-group dimensions (and their product) and grid dimensions.
 * Finally asks the wavefront to compute its actual work-group size.
 *
 * @param w wavefront to initialise
 * @param task queue entry describing the kernel being dispatched
 */
void
ComputeUnit::fillKernelState(Wavefront *w, HSAQueueEntry *task)
{
    w->resizeRegFiles(task->numVectorRegs(), task->numScalarRegs());

    // work-group extent in each of the three dimensions
    for (int dim = 0; dim < 3; ++dim) {
        w->workGroupSz[dim] = task->wgSize(dim);
    }
    w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];

    // grid extent in each of the three dimensions
    for (int dim = 0; dim < 3; ++dim) {
        w->gridSz[dim] = task->gridSize(dim);
    }

    w->computeActualWgSz(task);
}
// Prune the ready list of execution unit unitId: remove every wavefront
// that was marked as ready at the SCB stage but is found to have an empty
// instruction buffer at the SCH stage. The wavefronts themselves are not
// destroyed, only their entries in the list.
void
ComputeUnit::updateReadyList(int unitId)
{
    auto &ready = readyList[unitId];
    auto wfIt = ready.begin();
    while (wfIt != ready.end()) {
        if ((*wfIt)->instructionBuffer.empty()) {
            // erase() hands back the iterator to the next element
            wfIt = ready.erase(wfIt);
        } else {
            ++wfIt;
        }
    }
}
/**
 * Initialise a wavefront slot for a new wavefront of a work-group and
 * start it.
 *
 * Sets the execution mask and work-item ids from the wavefront's position
 * in the work-group, resets the barrier bookkeeping, records the
 * work-group/dispatch ids, attaches the LDS chunk, discards any stale
 * fetch state and finally starts the wavefront at the task's code address.
 *
 * @param w wavefront slot to initialise
 * @param waveId index of this wavefront within its work-group
 * @param ldsChunk LDS allocation of the work-group
 * @param task queue entry describing the kernel being dispatched
 * @param fetchContext not used by this function
 */
void
ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
                            HSAQueueEntry *task, bool fetchContext)
{
    // function-local static: the counter is shared by every call, and its
    // current value is passed to each wavefront's start()
    static int _n_wave = 0;

    // flat work-item index of lane 0 of this wavefront
    const auto laneBase = waveId * wfSize();

    // enable only the lanes that fall inside the actual work-group size
    VectorMask activeLanes;
    activeLanes.reset();
    for (int lane = 0; lane < wfSize(); ++lane) {
        if (lane + laneBase < w->actualWgSzTotal) {
            activeLanes[lane] = 1;
        }
    }

    w->execMask() = activeLanes;
    w->kernId = task->dispatchId();
    w->wfId = waveId;
    w->initMask = activeLanes.to_ullong();

    // derive the 3D and flattened work-item id of every lane
    for (int lane = 0; lane < wfSize(); ++lane) {
        w->workItemId[0][lane] = (lane + laneBase) % w->actualWgSz[0];
        w->workItemId[1][lane] =
            ((lane + laneBase) / w->actualWgSz[0]) % w->actualWgSz[1];
        w->workItemId[2][lane] =
            (lane + laneBase) / (w->actualWgSz[0] * w->actualWgSz[1]);

        w->workItemFlatId[lane] =
            w->workItemId[2][lane] * w->actualWgSz[0] * w->actualWgSz[1] +
            w->workItemId[1][lane] * w->actualWgSz[0] +
            w->workItemId[0][lane];
    }

    // barrier bookkeeping starts from a clean slate
    w->barrierSlots = divCeil(w->actualWgSzTotal, wfSize());
    w->barCnt.resize(wfSize(), 0);
    w->maxBarCnt = 0;
    w->oldBarrierCnt = 0;
    w->barrierCnt = 0;

    // WG state
    w->wgId = task->globalWgId();
    w->dispatchId = task->dispatchId();
    w->workGroupId[0] = w->wgId % task->numWg(0);
    w->workGroupId[1] = (w->wgId / task->numWg(0)) % task->numWg(1);
    w->workGroupId[2] = w->wgId / (task->numWg(0) * task->numWg(1));
    w->barrierId = barrier_id;
    w->stalledAtBarrier = (w->oldBarrierCnt != w->barrierCnt);

    // set the wavefront context to have a pointer to this section of the LDS
    w->ldsChunk = ldsChunk;
    int32_t refCount M5_VAR_USED =
        lds.increaseRefCounter(w->dispatchId, w->wgId);
    DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
            cu_id, w->wgId, refCount);

    // drop whatever was buffered for the slot's previous occupant; a fetch
    // that is still pending is flagged to be dropped
    w->instructionBuffer.clear();
    if (w->pendingFetch) {
        w->dropFetch = true;
    }

    DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
            "WF[%d][%d]\n", _n_wave, w->barrierId, cu_id, w->simdId,
            w->wfSlotId);

    w->initRegState(task, w->actualWgSzTotal);
    w->start(_n_wave++, task->codeAddr());

    // sample the statistic before counting the newly started wavefront
    waveLevelParallelism.sample(activeWaves);
    activeWaves++;
}
/**
 * Trigger an invalidate operation in the CU.
 *
 * A dynamic instruction wrapping a KernelLaunchStaticInst (with no
 * wavefront attached) is created to carry the request through
 * injectGlobalMemFence().
 *
 * @param req request initialized in the shader, carrying the invalidate
 *            flags
 * @param kernId kernel id recorded on the instruction for use by the
 *               invalidate responses
 */
void
ComputeUnit::doInvalidate(RequestPtr req, int kernId)
{
    GPUDynInstPtr invInst =
        std::make_shared<GPUDynInst>(this, nullptr,
                                     new KernelLaunchStaticInst(),
                                     getAndIncSeqNum());

    // kern_id will be used in inv responses
    invInst->kern_id = kernId;

    // update contextId field
    req->setContext(invInst->wfDynId);

    injectGlobalMemFence(invInst, true, req);
}
/**
 * Trigger a flush operation in the CU by injecting a global memory fence
 * for the given instruction.
 *
 * @param gpuDynInst instruction passed along with the fence request
 */
void
ComputeUnit::doFlush(GPUDynInstPtr gpuDynInst)
{
    injectGlobalMemFence(gpuDynInst, true);
}
/**
 * Dispatch the work-group described by task onto this CU.
 *
 * Wakes the CU if it is not ticking, reserves LDS space for the
 * work-group, and then starts one wavefront in every stopped wavefront
 * slot for which numWfsToSched (computed by hasDispResources()) still
 * has wavefronts pending on that SIMD.
 *
 * @param task queue entry describing the work-group to dispatch
 * @param startFromScheduler not used by this function
 */
void
ComputeUnit::dispWorkgroup(HSAQueueEntry *task, bool startFromScheduler)
{
    // If we aren't ticking, start it up!
    if (!tickEvent.scheduled()) {
        DPRINTF(GPUDisp, "CU%d: Scheduling wakeup next cycle\n", cu_id);
        schedule(tickEvent, nextCycle());
    }

    // the kernel's invalidate must have finished before any wg dispatch
    assert(task->isInvDone());

    // reserve the LDS capacity allocated to the work group
    // disambiguated by the dispatch ID and workgroup ID, which should be
    // globally unique
    LdsChunk *ldsChunk = lds.reserveSpace(task->dispatchId(),
                                          task->globalWgId(),
                                          task->ldsSize());
    panic_if(!ldsChunk, "was not able to reserve space for this WG");

    // register demand as reported by the task
    const int vregDemand = task->numVectorRegs();
    const int sregDemand = task->numScalarRegs();

    // id handed to the next wavefront that gets started
    int nextWaveId = 0;

    // Assign WFs according to numWfsToSched vector, which is computed by
    // hasDispResources()
    for (int slot = 0; slot < shader->n_wf; ++slot) {
        for (int simd = 0; simd < numVectorALUs; ++simd) {
            Wavefront *w = wfList[simd][slot];

            // The WF slot must be stopped (and not waiting for a release
            // to complete, S_RETURNING) and there must be WFs remaining to
            // be dispatched to the current SIMD.
            if (w->getStatus() != Wavefront::S_STOPPED ||
                numWfsToSched[simd] <= 0) {
                continue;
            }

            // one fewer WF awaiting dispatch to the current SIMD
            --numWfsToSched[simd];

            fillKernelState(w, task);

            DPRINTF(GPURename, "SIMD[%d] wfSlotId[%d] WF[%d] "
                    "vregDemand[%d] sregDemand[%d]\n", simd, slot,
                    w->wfDynId, vregDemand, sregDemand);

            registerManager->allocateRegisters(w, vregDemand, sregDemand);

            startWavefront(w, nextWaveId, ldsChunk, task);
            ++nextWaveId;
        }
    }

    ++barrier_id;
}
/**
 * Record the sequence number of the instruction at the head of the
 * wavefront's instruction buffer in the pipeline map. Panics if the
 * buffer is empty.
 *
 * @param w wavefront whose oldest buffered instruction is recorded
 */
void
ComputeUnit::insertInPipeMap(Wavefront *w)
{
    panic_if(w->instructionBuffer.empty(),
             "Instruction Buffer of WF%d can't be empty", w->wgId);

    const GPUDynInstPtr headInst = w->instructionBuffer.front();
    pipeMap.emplace(headInst->seqNum());
}
/**
 * Remove the sequence number of the instruction at the head of the
 * wavefront's instruction buffer from the pipeline map. Panics if the
 * buffer is empty or if the sequence number is not in the map.
 *
 * @param w wavefront whose oldest buffered instruction is removed
 */
void
ComputeUnit::deleteFromPipeMap(Wavefront *w)
{
    panic_if(w->instructionBuffer.empty(),
             "Instruction Buffer of WF%d can't be empty", w->wgId);

    const GPUDynInstPtr headInst = w->instructionBuffer.front();

    // the entry must exist before it can be erased
    auto entry = pipeMap.find(headInst->seqNum());
    panic_if(entry == pipeMap.end(), "Pipeline Map is empty\n");
    pipeMap.erase(entry);
}
/**
 * Check whether this CU can currently accept the work-group described by
 * task.
 *
 * As a side effect this recomputes numWfsToSched (the number of wavefronts
 * of the work-group mapped to each SIMD) and increments the
 * blocked-dispatch counters when VGPRs, SGPRs or LDS space are found to be
 * unavailable.
 *
 * @param task queue entry describing the work-group to dispatch
 * @return true if every wavefront of the work-group was mapped to a free
 *         wavefront slot and VGPR, SGPR and LDS resources are available
 */
bool
ComputeUnit::hasDispResources(HSAQueueEntry *task)
{
// compute true size of workgroup (after clamping to grid size)
int trueWgSize[HSAQueueEntry::MAX_DIM];
int trueWgSizeTotal = 1;
for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
// the work-group is clipped to what remains of the grid in dimension d
trueWgSize[d] = std::min(task->wgSize(d), task->gridSize(d) -
task->wgId(d) * task->wgSize(d));
trueWgSizeTotal *= trueWgSize[d];
DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
}
DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
// calculate the number of WFs in this WG (rounding up)
int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
// calculate the number of 32-bit vector registers required by each
// work item of the work group
int vregDemandPerWI = task->numVectorRegs();
// calculate the number of 32-bit scalar registers required by each
// work item of the work group
int sregDemandPerWI = task->numScalarRegs();
// check if the total number of VGPRs and SGPRs required by all WFs
// of the WG fit in the VRFs of all SIMD units and the CU's SRF
panic_if((numWfs * vregDemandPerWI) > (numVectorALUs * numVecRegsPerSimd),
"WG with %d WFs and %d VGPRs per WI can not be allocated to CU "
"that has %d VGPRs\n",
numWfs, vregDemandPerWI, numVectorALUs * numVecRegsPerSimd);
panic_if((numWfs * sregDemandPerWI) > numScalarRegsPerSimd,
"WG with %d WFs and %d SGPRs per WI can not be scheduled to CU "
"with %d SGPRs\n",
numWfs, sregDemandPerWI, numScalarRegsPerSimd);
// number of WF slots that are not occupied
int freeWfSlots = 0;
// number of Wfs from WG that were successfully mapped to a SIMD
int numMappedWfs = 0;
// start from a fresh per-SIMD mapping; dispWorkgroup() consumes it later
numWfsToSched.clear();
numWfsToSched.resize(numVectorALUs, 0);
// attempt to map WFs to the SIMDs, based on WF slot availability
// and register file availability
for (int j = 0; j < shader->n_wf; ++j) {
for (int i = 0; i < numVectorALUs; ++i) {
if (wfList[i][j]->getStatus() == Wavefront::S_STOPPED) {
++freeWfSlots;
// check if current WF will fit onto current SIMD/VRF
// if all WFs have not yet been mapped to the SIMDs
if (numMappedWfs < numWfs &&
registerManager->canAllocateSgprs(i, numWfsToSched[i] + 1,
sregDemandPerWI) &&
registerManager->canAllocateVgprs(i, numWfsToSched[i] + 1,
vregDemandPerWI)) {
numWfsToSched[i]++;
numMappedWfs++;
}
}
}
}
// check that the number of mapped WFs is not greater
// than the actual number of WFs
assert(numMappedWfs <= numWfs);
bool vregAvail = true;
bool sregAvail = true;
// if a WF to SIMD mapping was not found, find the limiting resource
if (numMappedWfs < numWfs) {
for (int j = 0; j < numVectorALUs; ++j) {
// find if there are enough free VGPRs in the SIMD's VRF
// to accommodate the WFs of the new WG that would be mapped
// to this SIMD unit
vregAvail &= registerManager->
canAllocateVgprs(j, numWfsToSched[j], vregDemandPerWI);
// find if there are enough free SGPRs in the SIMD's SRF
// to accommodate the WFs of the new WG that would be mapped
// to this SIMD unit
sregAvail &= registerManager->
canAllocateSgprs(j, numWfsToSched[j], sregDemandPerWI);
}
}
DPRINTF(GPUDisp, "Free WF slots = %d, Mapped WFs = %d, \
VGPR Availability = %d, SGPR Availability = %d\n",
freeWfSlots, numMappedWfs, vregAvail, sregAvail);
if (!vregAvail) {
++numTimesWgBlockedDueVgprAlloc;
}
if (!sregAvail) {
++numTimesWgBlockedDueSgprAlloc;
}
// check whether the LDS can hold the allocation requested by this
// work-group, and count the dispatch as blocked if it cannot
bool ldsAvail = lds.canReserve(task->ldsSize());
if (!ldsAvail) {
wgBlockedDueLdsAllocation++;
}
// Return true if the following are all true:
// (a) all WFs of the WG were mapped to free WF slots
// (b) there are enough VGPRs to schedule all WFs to their SIMD units
// (c) there are enough SGPRs on the CU to schedule all WFs
// (d) there is enough space in LDS to allocate for all WFs
bool can_dispatch = numMappedWfs == numWfs && vregAvail && sregAvail
&& ldsAvail;
return can_dispatch;
}
/**
 * Check whether all wavefronts of a barrier have arrived.
 *
 * Counts the running wavefronts on this CU whose barrier id and barrier
 * count match the arguments and that have no outstanding requests.
 *
 * @param _barrier_id barrier id the wavefronts must carry
 * @param bcnt barrier count the wavefronts must have reached
 * @param bslots number of wavefronts expected at the barrier
 * @return non-zero if the number of matching wavefronts equals bslots
 */
int
ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
{
    DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);

    int arrived = 0;

    for (int simd = 0; simd < numVectorALUs; ++simd) {
        for (int slot = 0; slot < shader->n_wf; ++slot) {
            Wavefront *w = wfList[simd][slot];

            // only running wavefronts are candidates
            if (w->getStatus() != Wavefront::S_RUNNING) {
                continue;
            }

            DPRINTF(GPUSync, "Checking WF[%d][%d]\n", simd, slot);
            DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
                    w->barrierId, _barrier_id);
            DPRINTF(GPUSync, "wf->barrierCnt %d, bcnt = %d\n",
                    w->barrierCnt, bcnt);
            DPRINTF(GPUSync, "outstanding Reqs = %d\n",
                    w->outstandingReqs);

            const bool atBarrier = w->barrierId == _barrier_id &&
                w->barrierCnt == bcnt && !w->outstandingReqs;
            if (atBarrier) {
                ++arrived;
                DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
                        "%d\n", simd, slot, arrived);
            }
        }
    }

    DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
            cu_id, arrived, bslots);

    return arrived == bslots;
}
// Advance the ComputeUnit by one clock: run the register files, then the
// pipeline stages, and finally either reschedule the tick event or put
// the CU to sleep.
void
ComputeUnit::exec()
{
    // process reads and writes in the RFs
    for (auto &vectorRf : vrf) {
        vectorRf->exec();
    }
    for (auto &scalarRf : srf) {
        scalarRf->exec();
    }

    // Execute pipeline stages in reverse order to simulate
    // the pipeline latency
    scalarMemoryPipe.exec();
    globalMemoryPipe.exec();
    localMemoryPipe.exec();
    execStage.exec();
    scheduleStage.exec();
    scoreboardCheckStage.exec();
    fetchStage.exec();

    totalCycles++;

    if (isDone()) {
        // Put this CU to sleep if there is no more work to be done.
        shader->notifyCuSleep();
        DPRINTF(GPUDisp, "CU%d: Going to sleep\n", cu_id);
    } else {
        schedule(tickEvent, nextCycle());
    }
}
void
ComputeUnit::init()
{
// Initialize CU Bus models and execution resources
// Vector ALUs
vectorALUs.clear();
for (int i = 0; i < numVectorALUs; i++) {
vectorALUs.emplace_back(this, clockPeriod());
}
// Scalar ALUs
scalarALUs.clear();
for (int i = 0; i < numScalarALUs; i++) {
scalarALUs.emplace_back(this, clockPeriod());
}
// Vector Global Memory
fatal_if(numVectorGlobalMemUnits > 1,
"No support for multiple Global Memory Pipelines exists!!!");
vectorGlobalMemUnit.init(this, clockPeriod());
vrfToGlobalMemPipeBus.init(this, clockPeriod());
glbMemToVrfBus.init(this, clockPeriod());
// Vector Local/Shared Memory
fatal_if(numVectorSharedMemUnits > 1,
"No support for multiple Local Memory Pipelines exists!!!");
vectorSharedMemUnit.init(this, clockPeriod());
vrfToLocalMemPipeBus.init(this, clockPeriod());
locMemToVrfBus.init(this, clockPeriod());
// Scalar Memory
fatal_if(numScalarMemUnits > 1,
"No support for multiple Scalar Memory Pipelines exists!!!");
scalarMemUnit.init(this, clockPeriod());
srfToScalarMemPipeBus.init(this, clockPeriod());
scalarMemToSrfBus.init(this, clockPeriod());
vectorRegsReserved.resize(numVectorALUs, 0);
scalarRegsReserved.resize(numVectorALUs, 0);
// Initializing pipeline resources
readyList.resize(numExeUnits());
for (int j = 0; j < numExeUnits(); ++j) {
dispatchList.push_back(std::make_pair(nullptr, EMPTY));
}
fetchStage.init(this);
scoreboardCheckStage.init(this);
scheduleStage.init(this);
execStage.init(this);
globalMemoryPipe.init(this);
localMemoryPipe.init(this);
scalarMemoryPipe.init(this);
gmTokenPort.setTokenManager(memPortTokens);
}
/**
 * Handle a timing response arriving on a vector data port.
 *
 * MemSyncResp and WriteCompleteResp packets are fully consumed here: the
 * relevant counters are updated, the packet and its sender state are
 * freed, and no further event is scheduled. Any other response is handed
 * to a memory response event that is scheduled resp_tick_latency ticks
 * from the current tick.
 *
 * @param pkt the response packet; its senderState carries the port index
 *            and the dynamic instruction that issued the request
 * @return always true
 */
bool
ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
{
// Ruby has completed the memory op. Schedule the mem_resp_event at the
// appropriate cycle to process the timing memory response
// This delay represents the pipeline delay
SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
int index = sender_state->port_index;
GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
GPUDispatcher &dispatcher = computeUnit->shader->dispatcher();
// MemSyncResp + WriteCompleteResp are handled completely here and we
// don't schedule a MemRespEvent to process the responses further
if (pkt->cmd == MemCmd::MemSyncResp) {
// This response is for 1 of the following request types:
// - kernel launch
// - kernel end
// - non-kernel mem sync
// Kernel Launch
// wavefront was nullptr when launching kernel, so it is meaningless
// here (simdId=-1, wfSlotId=-1)
if (gpuDynInst->isKernelLaunch()) {
// for kernel launch, the original request must be both kernel-type
// and acquire
assert(pkt->req->isKernel());
assert(pkt->req->isAcquire());
// one D-Cache inv is done, decrement counter
dispatcher.updateInvCounter(gpuDynInst->kern_id);
delete pkt->senderState;
delete pkt;
return true;
}
// retrieve wavefront from inst
Wavefront *w = gpuDynInst->wavefront();
// Check if we are waiting on Kernel End Release
if (w->getStatus() == Wavefront::S_RETURNING
&& gpuDynInst->isEndOfKernel()) {
// for kernel end, the original request must be both kernel-type
// and release
assert(pkt->req->isKernel());
assert(pkt->req->isRelease());
// one wb done, decrement counter, and return whether all wbs are
// done for the kernel
bool isWbDone = dispatcher.updateWbCounter(gpuDynInst->kern_id);
// not all wbs are done for the kernel, just release pkt
// resources
if (!isWbDone) {
delete pkt->senderState;
delete pkt;
return true;
}
// all wbs are completed for the kernel, do retirement work
// for the workgroup
DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG %d completed\n",
computeUnit->cu_id, w->simdId, w->wfSlotId,
w->wfDynId, w->wgId);
dispatcher.notifyWgCompl(w);
w->setStatus(Wavefront::S_STOPPED);
}
// non-kernel mem sync: look the wavefront up by the instruction's SIMD
// and slot ids and let the global memory pipe handle the response
if (!pkt->req->isKernel()) {
w = computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
DPRINTF(GPUExec, "MemSyncResp: WF[%d][%d] WV%d %s decrementing "
"outstanding reqs %d => %d\n", gpuDynInst->simdId,
gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
gpuDynInst->disassemble(), w->outstandingReqs,
w->outstandingReqs - 1);
computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
}
DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrierCnt = %d\n",
computeUnit->cu_id, gpuDynInst->simdId,
gpuDynInst->wfSlotId, w->barrierCnt);
// the sync response is fully handled; release the packet resources
delete pkt->senderState;
delete pkt;
return true;
} else if (pkt->cmd == MemCmd::WriteCompleteResp) {
// this is for the writeComplete callback
// we simply decrement the write-related wait counters
assert(gpuDynInst);
Wavefront *w M5_VAR_USED =
computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
assert(w);
DPRINTF(GPUExec, "WriteCompleteResp: WF[%d][%d] WV%d %s decrementing "
"outstanding reqs %d => %d\n", gpuDynInst->simdId,
gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
gpuDynInst->disassemble(), w->outstandingReqs,
w->outstandingReqs - 1);
if (gpuDynInst->allLanesZero()) {
// ask gm pipe to decrement request counters, instead of directly
// performing here, to avoid asynchronous counter update and
// instruction retirement (which may hurt waitcnt effects)
computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: write totally complete\n",
computeUnit->cu_id, gpuDynInst->simdId,
gpuDynInst->wfSlotId);
}
delete pkt->senderState;
delete pkt;
return true;
}
// every other response type: defer the processing to a memory response
// event created by the port the response arrived on
EventFunctionWrapper *mem_resp_event =
computeUnit->memPort[index]->createMemRespEvent(pkt);
DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x received!\n",
computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
gpuDynInst->seqNum(), index, pkt->req->getPaddr());
computeUnit->schedule(mem_resp_event,
curTick() + computeUnit->resp_tick_latency);
return true;
}
/**
 * Handle a timing response arriving on the scalar data port.
 *
 * Every returned packet decrements the numScalarReqs counter of the
 * instruction that issued it. That counter should have been set to the
 * number of packets sent for the memory op, so once it reaches zero the
 * op is finished and the instruction is pushed into a response FIFO of
 * the scalar memory pipe. The packet and its sender state are freed in
 * all cases.
 *
 * @param pkt the response packet
 * @return always true
 */
bool
ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
{
    assert(!pkt->req->isKernel());

    // retrieve sender state
    SenderState *state = safe_cast<SenderState*>(pkt->senderState);
    GPUDynInstPtr inst = state->_gpuDynInst;

    assert(pkt->isRead() || pkt->isWrite());
    assert(inst->numScalarReqs > 0);

    inst->numScalarReqs--;

    if (inst->numScalarReqs == 0) {
        // loads and atomics go to the load response FIFO, everything else
        // to the store response FIFO
        const bool useLoadFifo = inst->isLoad() || inst->isAtomic();
        if (useLoadFifo) {
            computeUnit->scalarMemoryPipe.getGMLdRespFIFO().push(inst);
        } else {
            computeUnit->scalarMemoryPipe.getGMStRespFIFO().push(inst);
        }
    }

    delete pkt->senderState;
    delete pkt;

    return true;
}
/**
 * Retry callback for the scalar data port.
 *
 * Resends the queued packets starting from the front of the retry queue,
 * removing each one once it has been accepted, and stops at the first
 * packet that is rejected again.
 */
void
ComputeUnit::ScalarDataPort::recvReqRetry()
{
    // Do not range-iterate over retries here: popping the front element
    // while a range-for is positioned on it invalidates the loop's
    // iterator, which is undefined behaviour.
    while (!retries.empty()) {
        auto pkt = retries.front();
        if (!sendTimingReq(pkt)) {
            break;
        }
        retries.pop_front();
    }
}
/**
 * Retry callback for a vector data port.
 *
 * Attempts to resend at most as many packets as were queued when the
 * callback was entered, starting from the front of the retry queue. A
 * packet is removed from the queue once it has been accepted; the first
 * rejection ends the attempt.
 */
void
ComputeUnit::DataPort::recvReqRetry()
{
    const int pending = retries.size();
    assert(pending > 0);

    for (int attempt = 0; attempt < pending; ++attempt) {
        PacketPtr pkt = retries.front().first;
        GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
                computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
                pkt->req->getPaddr());

        /** Currently Ruby can return false due to conflicts for the
         * particular cache block or address. Thus other requests should
         * be allowed to pass and the data port should expect multiple
         * retries. */
        if (sendTimingReq(pkt)) {
            DPRINTF(GPUMem, "successful!\n");
            retries.pop_front();
        } else {
            DPRINTF(GPUMem, "failed again!\n");
            break;
        }
    }
}
/**
 * Handle a timing response arriving on the SQC port by forwarding the
 * packet to the fetch stage.
 *
 * @param pkt the response packet
 * @return always true
 */
bool
ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
{
    auto &fetch = computeUnit->fetchStage;
    fetch.processFetchReturn(pkt);

    return true;
}
/**
 * Retry callback for the SQC port.
 *
 * Attempts to resend at most as many fetch packets as were queued when
 * the callback was entered, starting from the front of the retry queue.
 * A packet is removed from the queue once it has been accepted; the first
 * rejection ends the attempt.
 */
void
ComputeUnit::SQCPort::recvReqRetry()
{
    const int pending = retries.size();
    assert(pending > 0);

    for (int attempt = 0; attempt < pending; ++attempt) {
        PacketPtr pkt = retries.front().first;
        Wavefront *wavefront M5_VAR_USED = retries.front().second;

        DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
                computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
                pkt->req->getPaddr());

        if (sendTimingReq(pkt)) {
            DPRINTF(GPUFetch, "successful!\n");
            retries.pop_front();
        } else {
            DPRINTF(GPUFetch, "failed again!\n");
            break;
        }
    }
}
void
ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
{
// There must be a way around this check to do the globalMemStart...
Addr tmp_vaddr = pkt->req->getVaddr();
updatePageDivergenceDist(tmp_vaddr);
// set PC in request
pkt->req->setPC(gpuDynInst->wavefront()->pc());
pkt->req->setReqInstSeqNum(gpuDynInst->seqNum());
// figure out the type of the request to set read/write
BaseTLB::Mode TLB_mode;
assert(pkt->isRead() || pkt->isWrite());
// only do some things if actually accessing data
bool isDataAccess = pkt->isWrite() || pkt->isRead();
// Check write before read for atomic operations
// since atomic operations should use BaseTLB::Write
if (pkt->isWrite()) {
TLB_mode = BaseTLB::Write;
} else if (pkt->isRead()) {
TLB_mode = BaseTLB::Read;
} else {
fatal("pkt is not a read nor a write\n");
}
tlbCycles -= curTick();
++tlbRequests;
int tlbPort_index = perLaneTLB ? index : 0;
if (shader->timingSim) {
if (debugSegFault) {
Process *p = shader->gpuTc->getProcessPtr();
Addr vaddr = pkt->req->getVaddr();
unsigned size = pkt->getSize();
if ((vaddr + size - 1) % 64 < vaddr % 64) {
panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
}
Addr paddr;
if (!p->pTable->translate(vaddr, paddr)) {
if (!p->fixupFault(vaddr)) {
panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
vaddr);
}
}
}
// This is the SenderState needed upon return
pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
// This is the senderState needed by the TLB hierarchy to function
TheISA::GpuTLB::TranslationState *translation_state =
new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
pkt->senderState);
pkt->senderState = translation_state;
if (functionalTLB) {
tlbPort[tlbPort_index]->sendFunctional(pkt);
// update the hitLevel distribution
int hit_level = translation_state->hitLevel;
assert(hit_level != -1);
hitsPerTLBLevel[hit_level]++;
// New SenderState for the memory access
X86ISA::GpuTLB::TranslationState *sender_state =
safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
delete sender_state->tlbEntry;
delete sender_state->saved;
delete sender_state;
assert(pkt->req->hasPaddr());
assert(pkt->req->hasSize());
// this is necessary because the GPU TLB receives packets instead
// of requests. when the translation is complete, all relevent
// fields in the request will be populated, but not in the packet.
// here we create the new packet so we can set the size, addr,
// and proper flags.
PacketPtr oldPkt = pkt;
pkt = new Packet(oldPkt->req, oldPkt->cmd);
if (isDataAccess) {
uint8_t *tmpData = oldPkt->getPtr<uint8_t>();
pkt->dataStatic(tmpData);
}
delete oldPkt;
// New SenderState for the memory access
pkt->senderState =
new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
nullptr);
gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
gpuDynInst->tlbHitLevel[index] = hit_level;
// translation is done. Schedule the mem_req_event at the
// appropriate cycle to send the timing memory request to ruby
EventFunctionWrapper *mem_req_event =
memPort[index]->createMemReqEvent(pkt);
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
"scheduled\n", cu_id, gpuDynInst->simdId,
gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
schedule(mem_req_event, curTick() + req_tick_latency);
} else if (tlbPort[tlbPort_index]->isStalled()) {
assert(tlbPort[tlbPort_index]->retries.size() > 0);
DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
"failed!\n", cu_id, gpuDynInst->simdId,
gpuDynInst->wfSlotId, tmp_vaddr);
tlbPort[tlbPort_index]->retries.push_back(pkt);
} else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
// Stall the data port;
// No more packet will be issued till
// ruby indicates resources are freed by
// a recvReqRetry() call back on this port.
tlbPort[tlbPort_index]->stallPort();
DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
"failed!\n", cu_id, gpuDynInst->simdId,
gpuDynInst->wfSlotId, tmp_vaddr);
tlbPort[tlbPort_index]->retries.push_back(pkt);
} else {
DPRINTF(GPUTLB,
"CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
}
} else {
if (pkt->cmd == MemCmd::MemSyncReq) {
gpuDynInst->resetEntireStatusVector();
} else {
gpuDynInst->decrementStatusVector(index);
}
// New SenderState for the memory access
delete pkt->senderState;
// Because it's atomic operation, only need TLB translation state
pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
shader->gpuTc);
tlbPort[tlbPort_index]->sendFunctional(pkt);
// the addr of the packet is not modified, so we need to create a new
// packet, or otherwise the memory access will have the old virtual
// address sent in the translation packet, instead of the physical
// address returned by the translation.
PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
new_pkt->dataStatic(pkt->getPtr<uint8_t>());
// Translation is done. It is safe to send the packet to memory.
memPort[0]->sendFunctional(new_pkt);
DPRINTF(GPUMem, "Functional sendRequest\n");
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
new_pkt->req->getPaddr());
// safe_cast the senderState
TheISA::GpuTLB::TranslationState *sender_state =
safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
delete sender_state->tlbEntry;
delete new_pkt;
delete pkt->senderState;
delete pkt;
}
}
/**
 * Start the address translation for a scalar memory access.
 *
 * The packet is tagged with a scalar-DTLB sender state (which carries
 * the instruction) wrapped inside a TLB translation state, and is then
 * handed to the scalar DTLB port. If the port is stalled, or refuses
 * the packet, the packet is parked on the port's retry queue.
 *
 * @param gpuDynInst instruction that generated the access
 * @param pkt read or write packet to translate
 */
void
ComputeUnit::sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt)
{
    assert(pkt->isWrite() || pkt->isRead());

    BaseTLB::Mode tlb_mode;
    if (pkt->isRead()) {
        tlb_mode = BaseTLB::Read;
    } else {
        tlb_mode = BaseTLB::Write;
    }

    // sender state needed once the translation response comes back
    pkt->senderState =
        new ComputeUnit::ScalarDTLBPort::SenderState(gpuDynInst);

    // sender state needed by the TLB hierarchy; it saves the one above
    TheISA::GpuTLB::TranslationState *tlb_state =
        new TheISA::GpuTLB::TranslationState(tlb_mode, shader->gpuTc, false,
                                             pkt->senderState);
    pkt->senderState = tlb_state;

    if (!scalarDTLBPort->isStalled()) {
        if (scalarDTLBPort->sendTimingReq(pkt)) {
            DPRINTF(GPUTLB, "sent scalar %s translation request for addr %#x\n",
                    tlb_mode == BaseTLB::Read ? "read" : "write",
                    pkt->req->getVaddr());
            return;
        }

        // the port refused the packet: stall it before queueing the retry
        scalarDTLBPort->stallPort();
    } else {
        // a stalled port must already have something waiting to be retried
        assert(scalarDTLBPort->retries.size());
    }

    scalarDTLBPort->retries.push_back(pkt);
}
/**
 * Build a MemSyncReq packet for the given instruction and schedule its
 * injection through memPort[0] after req_tick_latency.
 *
 * @param gpuDynInst instruction requesting the fence; it must target the
 *                   global segment
 * @param kernelMemSync when true the fence is a kernel-boundary acquire
 *                      (kernel launch) or release (end of kernel);
 *                      otherwise the request flags are taken from the
 *                      instruction itself
 * @param req request to reuse; a fresh one is allocated when it is null
 */
void
ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
                                  bool kernelMemSync,
                                  RequestPtr req)
{
    assert(gpuDynInst->isGlobalSeg() ||
           gpuDynInst->executedAs() == Enums::SC_GLOBAL);

    if (!req) {
        req = std::make_shared<Request>(
            0, 0, 0, masterId(), 0, gpuDynInst->wfDynId);
    }

    // all mem sync requests have Paddr == 0
    req->setPaddr(0);

    // Decide which flavour of fence this is. isKernelLaunch() is only
    // consulted for kernel-boundary syncs.
    const bool kernelAcquire =
        kernelMemSync && gpuDynInst->isKernelLaunch();

    // Decorate the request according to the fence flavour.
    if (kernelAcquire) {
        req->setCacheCoherenceFlags(Request::ACQUIRE);
        req->setReqInstSeqNum(gpuDynInst->seqNum());
        req->setFlags(Request::KERNEL);
    } else if (kernelMemSync) {
        assert(gpuDynInst->isEndOfKernel());

        req->setCacheCoherenceFlags(Request::RELEASE);
        req->setReqInstSeqNum(gpuDynInst->seqNum());
        req->setFlags(Request::KERNEL);
    } else {
        gpuDynInst->setRequestFlags(req);
        req->setReqInstSeqNum(gpuDynInst->seqNum());
    }

    // The packet, its sender state and the request event are built the
    // same way for every flavour.
    PacketPtr pkt = new Packet(req, MemCmd::MemSyncReq);
    pkt->pushSenderState(
        new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr));

    EventFunctionWrapper *mem_req_event =
        memPort[0]->createMemReqEvent(pkt);

    if (kernelAcquire) {
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                "an acquire\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
    } else if (kernelMemSync) {
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x scheduling "
                "a release\n", cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, 0, pkt->req->getPaddr());
    } else {
        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
                cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, 0,
                pkt->req->getPaddr());
    }

    schedule(mem_req_event, curTick() + req_tick_latency);
}
/**
 * Handle a read/write/atomic memory response for one request of a
 * vector memory instruction.
 *
 * The entry recorded for the response's physical address is popped from
 * the instruction's memStatusVector and the matching status counter is
 * decremented. Once allLanesZero() reports true, the head-to-tail
 * latency is sampled and, for reads, the instruction is handed back to
 * the global memory pipeline. The packet and its sender state are freed
 * before returning.
 *
 * @param pkt response packet; must not be a MemSyncResp or a
 *            WriteCompleteResp
 */
void
ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
{
    DataPort::SenderState *resp_state =
        safe_cast<DataPort::SenderState*>(pkt->senderState);

    GPUDynInstPtr gpuDynInst = resp_state->_gpuDynInst;
    ComputeUnit *cu = computeUnit;

    assert(gpuDynInst);

    // "index" here is the port's own member; the per-request index is
    // only known after the lookup below
    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
            cu->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
            pkt->req->getPaddr(), index);

    const Addr paddr = pkt->req->getPaddr();

    // mem sync resp and write-complete callback must be handled already in
    // DataPort::recvTimingResp
    assert(pkt->cmd != MemCmd::MemSyncResp);
    assert(pkt->cmd != MemCmd::WriteCompleteResp);

    // this is for read, write and atomic
    auto &pending = gpuDynInst->memStatusVector[paddr];
    const int status_idx = pending.back();

    DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
            pkt->req->getPaddr(), status_idx);

    pending.pop_back();
    gpuDynInst->pAddr = pkt->req->getPaddr();

    gpuDynInst->decrementStatusVector(status_idx);
    DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());

    if (!gpuDynInst->allLanesZero()) {
        // More responses are outstanding: remember when the first one
        // arrived so the head-to-tail latency can be sampled later.
        if (cu->headTailMap.find(gpuDynInst) == cu->headTailMap.end()) {
            cu->headTailMap.insert(std::make_pair(gpuDynInst, curTick()));
        }
    } else {
        // Every per-address list must have been drained by now.
        for (auto it = gpuDynInst->memStatusVector.begin(),
                  last = gpuDynInst->memStatusVector.end();
             it != last; ++it) {
            assert(it->second.empty());
        }

        // Calculate the difference between the arrival of the first cache
        // block and the last cache block to arrive if we have the time
        // for the first cache block.
        auto head = cu->headTailMap.find(gpuDynInst);
        if (head != cu->headTailMap.end()) {
            Tick headTick = head->second;
            cu->headTailLatency.sample(curTick() - headTick);
            cu->headTailMap.erase(head);
        }

        gpuDynInst->memStatusVector.clear();

        // note: only handle read response here; for write, the response
        // is separately handled when writeComplete callback is received
        if (pkt->isRead()) {
            gpuDynInst->
                profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
            cu->globalMemoryPipe.handleResponse(gpuDynInst);

            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
                    cu->cu_id, gpuDynInst->simdId,
                    gpuDynInst->wfSlotId);
        }
    }

    delete pkt->senderState;
    delete pkt;
}
/**
 * Build a ComputeUnit from this parameter object.
 *
 * @return pointer to the newly allocated ComputeUnit
 */
ComputeUnit*
ComputeUnitParams::create()
{
    ComputeUnit *cu = new ComputeUnit(this);
    return cu;
}
/**
 * Receive a data-TLB translation response.
 *
 * The TLB translation state is popped off the packet, the TLB hit level
 * is recorded, optional page prefetches are issued functionally, and a
 * new request packet (carrying the same request and data pointer) is
 * scheduled on the memory port recorded in the original sender state.
 *
 * @param pkt translation response; it is freed here
 * @return always true
 */
bool
ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
{
    Addr line = pkt->req->getPaddr();

    DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
            pkt->req->getVaddr(), line);

    assert(pkt->senderState);
    computeUnit->tlbCycles += curTick();

    // pop off the TLB translation state
    TheISA::GpuTLB::TranslationState *tlb_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // no PageFaults are permitted for data accesses; a missing TLB entry
    // is only reported here
    if (!tlb_state->tlbEntry) {
        DTLBPort::SenderState *faulting_state =
            safe_cast<DTLBPort::SenderState*>(tlb_state->saved);

        Wavefront *w M5_VAR_USED =
            computeUnit->wfList[faulting_state->_gpuDynInst->simdId]
            [faulting_state->_gpuDynInst->wfSlotId];

        DPRINTFN("Wave %d couldn't tranlate vaddr %#x\n", w->wfDynId,
                 pkt->req->getVaddr());
    }

    // update the hitLevel distribution
    const int hit_level = tlb_state->hitLevel;
    computeUnit->hitsPerTLBLevel[hit_level]++;

    delete tlb_state->tlbEntry;
    assert(!tlb_state->ports.size());
    pkt->senderState = tlb_state->saved;

    // the translation mode is reused for prefetch packets below
    const BaseTLB::Mode TLB_mode = tlb_state->tlbMode;
    delete tlb_state;

    // use the original sender state to know how to close this transaction
    DTLBPort::SenderState *orig_state =
        safe_cast<DTLBPort::SenderState*>(pkt->senderState);

    GPUDynInstPtr gpuDynInst = orig_state->_gpuDynInst;
    const int mp_index = orig_state->portIndex;
    const Addr vaddr = pkt->req->getVaddr();

    gpuDynInst->memStatusVector[line].push_back(mp_index);
    gpuDynInst->tlbHitLevel[mp_index] = hit_level;

    // map the response command back onto the matching request command
    MemCmd requestCmd;
    if (pkt->cmd == MemCmd::ReadResp) {
        requestCmd = MemCmd::ReadReq;
    } else if (pkt->cmd == MemCmd::WriteResp) {
        requestCmd = MemCmd::WriteReq;
    } else if (pkt->cmd == MemCmd::SwapResp) {
        requestCmd = MemCmd::SwapReq;
    } else {
        panic("unsupported response to request conversion %s\n",
              pkt->cmd.toString());
    }

    if (computeUnit->prefetchDepth) {
        const int simdId = gpuDynInst->simdId;
        const int wfSlotId = gpuDynInst->wfSlotId;

        // previous vaddr seen at the granularity the prefetcher tracks
        Addr last = 0;
        switch (computeUnit->prefetchType) {
          case Enums::PF_CU:
            last = computeUnit->lastVaddrCU[mp_index];
            break;
          case Enums::PF_PHASE:
            last = computeUnit->lastVaddrSimd[simdId][mp_index];
            break;
          case Enums::PF_WF:
            last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
            break;
          default:
            break;
        }

        DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
                computeUnit->cu_id, simdId, wfSlotId, mp_index, last);

        // stride, in pages, between this access and the previous one
        int stride = 0;
        if (last) {
            stride = (roundDown(vaddr, TheISA::PageBytes) -
                      roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift;
        }

        DPRINTF(GPUPrefetch, "Stride is %d\n", stride);

        computeUnit->lastVaddrCU[mp_index] = vaddr;
        computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
        computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;

        // a fixed-stride prefetcher overrides the observed stride
        if (computeUnit->prefetchType == Enums::PF_STRIDE) {
            stride = computeUnit->prefetchStride;
        }

        DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
                computeUnit->cu_id, simdId, wfSlotId, mp_index);

        DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);

        // Prefetch Next few pages atomically
        for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
            const Addr pf_vaddr = vaddr + stride * pf * TheISA::PageBytes;

            DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride, pf_vaddr);

            if (!stride) {
                break;
            }

            RequestPtr prefetch_req = std::make_shared<Request>(
                pf_vaddr,
                sizeof(uint8_t), 0,
                computeUnit->masterId(),
                0, 0, nullptr);

            PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);

            // one-byte dummy payload; it outlives the packet, which is
            // deleted before the end of this iteration
            uint8_t foo = 0;
            prefetch_pkt->dataStatic(&foo);

            // Because it's atomic operation, only need TLB translation state
            prefetch_pkt->senderState =
                new TheISA::GpuTLB::TranslationState(TLB_mode,
                    computeUnit->shader->gpuTc, true);

            // Currently prefetches are zero-latency, hence the sendFunctional
            sendFunctional(prefetch_pkt);

            /* safe_cast the senderState */
            TheISA::GpuTLB::TranslationState *pf_state =
                safe_cast<TheISA::GpuTLB::TranslationState*>(
                    prefetch_pkt->senderState);

            delete pf_state->tlbEntry;
            delete pf_state;
            delete prefetch_pkt;
        }
    }

    // First we must convert the response cmd back to a request cmd so that
    // the request can be sent through the cu's master port
    PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
    new_pkt->dataStatic(pkt->getPtr<uint8_t>());
    delete pkt->senderState;
    delete pkt;

    // New SenderState for the memory access
    new_pkt->senderState =
        new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
                                               nullptr);

    // translation is done. Schedule the mem_req_event at the appropriate
    // cycle to send the timing memory request to ruby
    EventFunctionWrapper *mem_req_event =
        computeUnit->memPort[mp_index]->createMemReqEvent(new_pkt);

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
            computeUnit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());

    computeUnit->schedule(mem_req_event, curTick() +
                          computeUnit->req_tick_latency);

    return true;
}
/**
 * Create an event that calls processMemReqEvent() on this port for the
 * given packet when it fires.
 *
 * @param pkt packet the event will pass to processMemReqEvent()
 * @return the newly allocated event; the caller schedules it
 */
EventFunctionWrapper*
ComputeUnit::DataPort::createMemReqEvent(PacketPtr pkt)
{
    auto issue_request = [this, pkt]{ processMemReqEvent(pkt); };

    // NOTE(review): the trailing 'true' presumably asks the wrapper to
    // delete itself after running -- confirm against EventFunctionWrapper.
    return new EventFunctionWrapper(issue_request,
                                    "ComputeUnit memory request event",
                                    true);
}
/**
 * Create an event that calls processMemRespEvent() on this port for the
 * given packet when it fires.
 *
 * @param pkt packet the event will pass to processMemRespEvent()
 * @return the newly allocated event; the caller schedules it
 */
EventFunctionWrapper*
ComputeUnit::DataPort::createMemRespEvent(PacketPtr pkt)
{
    auto handle_response = [this, pkt]{ processMemRespEvent(pkt); };

    // NOTE(review): the trailing 'true' presumably asks the wrapper to
    // delete itself after running -- confirm against EventFunctionWrapper.
    return new EventFunctionWrapper(handle_response,
                                    "ComputeUnit memory response event",
                                    true);
}
/**
 * Try to send a vector memory request through this port.
 *
 * If the port refuses the packet, the packet is queued on the retry
 * list together with the instruction that issued it.
 *
 * @param pkt request packet whose sender state is a DataPort::SenderState
 */
void
ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
{
    SenderState *req_state = safe_cast<SenderState*>(pkt->senderState);
    GPUDynInstPtr gpuDynInst = req_state->_gpuDynInst;
    ComputeUnit *compute_unit M5_VAR_USED = computeUnit;

    if (sendTimingReq(pkt)) {
        DPRINTF(GPUPort,
                "CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
                "req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, gpuDynInst->seqNum(), index,
                pkt->req->getPaddr());
        return;
    }

    // the port refused the packet: keep it, and its instruction, for retry
    retries.push_back(std::make_pair(pkt, gpuDynInst));

    DPRINTF(GPUPort,
            "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
            compute_unit->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, index,
            pkt->req->getPaddr());
}
/**
 * @return a fixed, human-readable name for this event type
 */
const char*
ComputeUnit::ScalarDataPort::MemReqEvent::description() const
{
    const char *const label = "ComputeUnit scalar memory request event";
    return label;
}
void
ComputeUnit::ScalarDataPort::MemReqEvent::process()
{
SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
ComputeUnit *compute_unit M5_VAR_USED = scalarDataPort->computeUnit;
if (!(scalarDataPort->sendTimingReq(pkt))) {
scalarDataPort->retries.push_back(pkt);
DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
compute_unit->cu_id, gpuDynInst->simdId,
gpuDynInst->wfSlotId, scalarDataPort->index,
pkt->req->getPaddr());
} else {
DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: gpuDynInst: %d, index %d, addr %#x data "
"req sent!\n", compute_unit->cu_id, gpuDynInst->simdId,
gpuDynInst->wfSlotId, gpuDynInst->seqNum(),
scalarDataPort->index, pkt->req->getPaddr());
}
}
/*
 * The initial translation request could have been rejected, if
 * <retries> queue is not empty. Retry sending the translation
 * request. sendRetry() is called from the peer port whenever
 * a translation completes.
 */
/**
 * Called when the peer signals that a previously rejected data
 * translation request may be retried.
 *
 * The port is un-stalled and the queued packets are re-sent in order.
 * Sending stops at the first packet that is rejected again; the port is
 * then stalled again and the rejected packet stays at the head of the
 * retry queue.
 */
void
ComputeUnit::DTLBPort::recvReqRetry()
{
    int len = retries.size();

    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);

    assert(len > 0);
    assert(isStalled());

    // recvReqRetry is an indication that the resource on which this
    // port was stalling on is freed. So, remove the stall first
    unstallPort();

    // only the packets queued on entry are retried
    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = retries.front();
        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();

        // the format string names the CU first, so cu_id has to be
        // passed ahead of the address
        DPRINTF(GPUTLB, "CU%d: retrying D-translaton for address%#x",
                computeUnit->cu_id, vaddr);

        if (!sendTimingReq(pkt)) {
            // Stall port
            stallPort();
            DPRINTF(GPUTLB, ": failed again\n");
            break;
        } else {
            DPRINTF(GPUTLB, ": successful\n");
            retries.pop_front();
        }
    }
}
/**
 * Receive a scalar data-TLB translation response.
 *
 * The translation state is unwrapped, the response is converted back
 * into a read or write request carrying the same request and data
 * pointer, and that request is sent through the scalar data port. If
 * the port refuses it, the request is put on the port's retry queue.
 *
 * @param pkt translation response; it is freed here
 * @return always true
 */
bool
ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
{
    assert(pkt->senderState);

    TheISA::GpuTLB::TranslationState *tlb_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // Page faults are not allowed
    fatal_if(!tlb_state->tlbEntry,
             "Translation of vaddr %#x failed\n", pkt->req->getVaddr());

    delete tlb_state->tlbEntry;
    assert(!tlb_state->ports.size());

    // restore the sender state that was saved when the translation
    // request was built
    pkt->senderState = tlb_state->saved;
    delete tlb_state;

    ScalarDTLBPort::SenderState *orig_state =
        safe_cast<ScalarDTLBPort::SenderState*>(pkt->senderState);

    GPUDynInstPtr gpuDynInst = orig_state->_gpuDynInst;
    delete pkt->senderState;

    Wavefront *w M5_VAR_USED = gpuDynInst->wavefront();

    DPRINTF(GPUTLB, "CU%d: WF[%d][%d][wv=%d]: scalar DTLB port received "
            "translation: PA %#x -> %#x\n", computeUnit->cu_id, w->simdId,
            w->wfSlotId, w->kernId, pkt->req->getVaddr(), pkt->req->getPaddr());

    // turn the response command back into the matching request command
    MemCmd mem_cmd;
    if (pkt->cmd == MemCmd::ReadResp) {
        mem_cmd = MemCmd::ReadReq;
    } else if (pkt->cmd == MemCmd::WriteResp) {
        mem_cmd = MemCmd::WriteReq;
    } else {
        fatal("Scalar DTLB receieved unexpected MemCmd response %s\n",
              pkt->cmd.toString());
    }

    // the memory request reuses the translated request and data pointer
    PacketPtr req_pkt = new Packet(pkt->req, mem_cmd);
    req_pkt->dataStatic(pkt->getPtr<uint8_t>());
    delete pkt;

    req_pkt->senderState =
        new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);

    const bool sent = computeUnit->scalarDataPort->sendTimingReq(req_pkt);
    if (sent) {
        DPRINTF(GPUMem, "send scalar req for: %s\n",
                gpuDynInst->disassemble());
    } else {
        computeUnit->scalarDataPort->retries.push_back(req_pkt);
        DPRINTF(GPUMem, "send scalar req failed for: %s\n",
                gpuDynInst->disassemble());
    }

    return true;
}
/**
 * Receive an instruction-TLB translation response.
 *
 * On a successful translation the packet is turned back into a read
 * request and handed to the fetch stage. On failure the wavefront's
 * pending-fetch state is cleared instead.
 *
 * @param pkt translation response
 * @return always true
 */
bool
ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
{
    Addr line M5_VAR_USED = pkt->req->getPaddr();
    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
            computeUnit->cu_id, pkt->req->getVaddr(), line);

    assert(pkt->senderState);

    // pop off the TLB translation state
    TheISA::GpuTLB::TranslationState *tlb_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // a missing TLB entry means the translation failed
    const bool translated = (tlb_state->tlbEntry != nullptr);

    delete tlb_state->tlbEntry;
    assert(!tlb_state->ports.size());
    pkt->senderState = tlb_state->saved;
    delete tlb_state;

    // use the original sender state to know how to close this transaction
    ITLBPort::SenderState *orig_state =
        safe_cast<ITLBPort::SenderState*>(pkt->senderState);

    // get the wavefront associated with this translation request
    Wavefront *wavefront = orig_state->wavefront;
    delete pkt->senderState;

    if (!translated) {
        // NOTE(review): pkt is not freed on this path -- confirm who
        // owns it when the translation fails.
        if (wavefront->dropFetch) {
            assert(wavefront->instructionBuffer.empty());
            wavefront->dropFetch = false;
        }

        wavefront->pendingFetch = 0;
        return true;
    }

    // pkt is reused in fetch(), don't delete it here. However, we must
    // reset the command to be a request so that it can be sent through
    // the cu's master port
    assert(pkt->cmd == MemCmd::ReadResp);
    pkt->cmd = MemCmd::ReadReq;

    computeUnit->fetchStage.fetch(pkt, wavefront);

    return true;
}
/*
* The initial translation request could have been rejected, if
* <retries> queue is not empty. Retry sending the translation
* request. sendRetry() is called from the peer port whenever
* a translation completes.
*/
void
ComputeUnit::ITLBPort::recvReqRetry()
{
int len = retries.size();
DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", len);
assert(len > 0);
assert(isStalled());
// recvReqRetry is an indication that the resource on which this
// port was stalling on is freed. So, remove the stall first
unstallPort();
for (int i = 0; i < len; ++i) {
PacketPtr pkt = retries.front();
Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
DPRINTF(GPUTLB, "CU%d: retrying I-translaton for address%#x", vaddr);
if (!sendTimingReq(pkt)) {
stallPort(); // Stall port
DPRINTF(GPUTLB, ": failed again\n");
break;
} else {
DPRINTF(GPUTLB, ": successful\n");
retries.pop_front();
}
}
}
void
ComputeUnit::regStats()
{
ClockedObject::regStats();
vALUInsts
.name(name() + ".valu_insts")
.desc("Number of vector ALU insts issued.")
;
vALUInstsPerWF
.name(name() + ".valu_insts_per_wf")
.desc("The avg. number of vector ALU insts issued per-wavefront.")
;
sALUInsts
.name(name() + ".salu_insts")
.desc("Number of scalar ALU insts issued.")
;
sALUInstsPerWF
.name(name() + ".salu_insts_per_wf")
.desc("The avg. number of scalar ALU insts issued per-wavefront.")
;
instCyclesVALU
.name(name() + ".inst_cycles_valu")
.desc("Number of cycles needed to execute VALU insts.")
;
instCyclesSALU
.name(name() + ".inst_cycles_salu")
.desc("Number of cycles needed to execute SALU insts.")
;
threadCyclesVALU
.name(name() + ".thread_cycles_valu")
.desc("Number of thread cycles used to execute vector ALU ops. "
"Similar to instCyclesVALU but multiplied by the number of "
"active threads.")
;
vALUUtilization
.name(name() + ".valu_utilization")
.desc("Percentage of active vector ALU threads in a wave.")
;
ldsNoFlatInsts
.name(name() + ".lds_no_flat_insts")
.desc("Number of LDS insts issued, not including FLAT "
"accesses that resolve to LDS.")
;
ldsNoFlatInstsPerWF
.name(name() + ".lds_no_flat_insts_per_wf")
.desc("The avg. number of LDS insts (not including FLAT "
"accesses that resolve to LDS) per-wavefront.")
;
flatVMemInsts
.name(name() + ".flat_vmem_insts")
.desc("The number of FLAT insts that resolve to vmem issued.")
;
flatVMemInstsPerWF
.name(name() + ".flat_vmem_insts_per_wf")
.desc("The average number of FLAT insts that resolve to vmem "
"issued per-wavefront.")
;
flatLDSInsts
.name(name() + ".flat_lds_insts")
.desc("The number of FLAT insts that resolve to LDS issued.")
;
flatLDSInstsPerWF
.name(name() + ".flat_lds_insts_per_wf")
.desc("The average number of FLAT insts that resolve to LDS "
"issued per-wavefront.")
;
vectorMemWrites
.name(name() + ".vector_mem_writes")
.desc("Number of vector mem write insts (excluding FLAT insts).")
;
vectorMemWritesPerWF
.name(name() + ".vector_mem_writes_per_wf")
.desc("The average number of vector mem write insts "
"(excluding FLAT insts) per-wavefront.")
;
vectorMemReads
.name(name() + ".vector_mem_reads")
.desc("Number of vector mem read insts (excluding FLAT insts).")
;
vectorMemReadsPerWF
.name(name() + ".vector_mem_reads_per_wf")
.desc("The avg. number of vector mem read insts (excluding "
"FLAT insts) per-wavefront.")
;
scalarMemWrites
.name(name() + ".scalar_mem_writes")
.desc("Number of scalar mem write insts.")
;
scalarMemWritesPerWF
.name(name() + ".scalar_mem_writes_per_wf")
.desc("The average number of scalar mem write insts per-wavefront.")
;
scalarMemReads
.name(name() + ".scalar_mem_reads")
.desc("Number of scalar mem read insts.")
;
scalarMemReadsPerWF
.name(name() + ".scalar_mem_reads_per_wf")
.desc("The average number of scalar mem read insts per-wavefront.")
;
vALUInstsPerWF = vALUInsts / completedWfs;
sALUInstsPerWF = sALUInsts / completedWfs;
vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
flatVMemInstsPerWF = flatVMemInsts / completedWfs;
flatLDSInstsPerWF = flatLDSInsts / completedWfs;
vectorMemWritesPerWF = vectorMemWrites / completedWfs;
vectorMemReadsPerWF = vectorMemReads / completedWfs;
scalarMemWritesPerWF = scalarMemWrites / completedWfs;
scalarMemReadsPerWF = scalarMemReads / completedWfs;
vectorMemReadsPerKiloInst
.name(name() + ".vector_mem_reads_per_kilo_inst")
.desc("Number of vector mem reads per kilo-instruction")
;
vectorMemReadsPerKiloInst = (vectorMemReads / numInstrExecuted) * 1000;
vectorMemWritesPerKiloInst
.name(name() + ".vector_mem_writes_per_kilo_inst")
.desc("Number of vector mem writes per kilo-instruction")
;
vectorMemWritesPerKiloInst = (vectorMemWrites / numInstrExecuted) * 1000;
vectorMemInstsPerKiloInst
.name(name() + ".vector_mem_insts_per_kilo_inst")
.desc("Number of vector mem insts per kilo-instruction")
;
vectorMemInstsPerKiloInst =
((vectorMemReads + vectorMemWrites) / numInstrExecuted) * 1000;
scalarMemReadsPerKiloInst
.name(name() + ".scalar_mem_reads_per_kilo_inst")
.desc("Number of scalar mem reads per kilo-instruction")
;
scalarMemReadsPerKiloInst = (scalarMemReads / numInstrExecuted) * 1000;
scalarMemWritesPerKiloInst
.name(name() + ".scalar_mem_writes_per_kilo_inst")
.desc("Number of scalar mem writes per kilo-instruction")
;
scalarMemWritesPerKiloInst = (scalarMemWrites / numInstrExecuted) * 1000;
scalarMemInstsPerKiloInst
.name(name() + ".scalar_mem_insts_per_kilo_inst")
.desc("Number of scalar mem insts per kilo-instruction")
;
scalarMemInstsPerKiloInst =
((scalarMemReads + scalarMemWrites) / numInstrExecuted) * 1000;
instCyclesVMemPerSimd
.init(numVectorALUs)
.name(name() + ".inst_cycles_vector_memory")
.desc("Number of cycles to send address, command, data from VRF to "
"vector memory unit, per SIMD")
;
instCyclesScMemPerSimd
.init(numVectorALUs)
.name(name() + ".inst_cycles_scalar_memory")
.desc("Number of cycles to send address, command, data from SRF to "
"scalar memory unit, per SIMD")
;
instCyclesLdsPerSimd
.init(numVectorALUs)
.name(name() + ".inst_cycles_lds")
.desc("Number of cycles to send address, command, data from VRF to "
"LDS unit, per SIMD")
;
globalReads
.name(name() + ".global_mem_reads")
.desc("Number of reads to the global segment")
;
globalWrites
.name(name() + ".global_mem_writes")
.desc("Number of writes to the global segment")
;
globalMemInsts
.name(name() + ".global_mem_insts")
.desc("Number of memory instructions sent to the global segment")
;
globalMemInsts = globalReads + globalWrites;
argReads
.name(name() + ".arg_reads")
.desc("Number of reads to the arg segment")
;
argWrites
.name(name() + ".arg_writes")
.desc("NUmber of writes to the arg segment")
;
argMemInsts
.name(name() + ".arg_mem_insts")
.desc("Number of memory instructions sent to the arg segment")
;
argMemInsts = argReads + argWrites;
spillReads
.name(name() + ".spill_reads")
.desc("Number of reads to the spill segment")
;
spillWrites
.name(name() + ".spill_writes")
.desc("Number of writes to the spill segment")
;
spillMemInsts
.name(name() + ".spill_mem_insts")
.desc("Number of memory instructions sent to the spill segment")
;
spillMemInsts = spillReads + spillWrites;
groupReads
.name(name() + ".group_reads")
.desc("Number of reads to the group segment")
;
groupWrites
.name(name() + ".group_writes")
.desc("Number of writes to the group segment")
;
groupMemInsts
.name(name() + ".group_mem_insts")
.desc("Number of memory instructions sent to the group segment")
;
groupMemInsts = groupReads + groupWrites;
privReads
.name(name() + ".private_reads")
.desc("Number of reads to the private segment")
;
privWrites
.name(name() + ".private_writes")
.desc("Number of writes to the private segment")
;
privMemInsts
.name(name() + ".private_mem_insts")
.desc("Number of memory instructions sent to the private segment")
;
privMemInsts = privReads + privWrites;
readonlyReads
.name(name() + ".readonly_reads")
.desc("Number of reads to the readonly segment")
;
readonlyWrites
.name(name() + ".readonly_writes")
.desc("Number of memory instructions sent to the readonly segment")
;
readonlyMemInsts
.name(name() + ".readonly_mem_insts")
.desc("Number of memory instructions sent to the readonly segment")
;
readonlyMemInsts = readonlyReads + readonlyWrites;
kernargReads
.name(name() + ".kernarg_reads")
.desc("Number of reads sent to the kernarg segment")
;
kernargWrites
.name(name() + ".kernarg_writes")
.desc("Number of memory instructions sent to the kernarg segment")
;
kernargMemInsts
.name(name() + ".kernarg_mem_insts")
.desc("Number of memory instructions sent to the kernarg segment")
;
kernargMemInsts = kernargReads + kernargWrites;
tlbCycles
.name(name() + ".tlb_cycles")
.desc("total number of cycles for all uncoalesced requests")
;
tlbRequests
.name(name() + ".tlb_requests")
.desc("number of uncoalesced requests")
;
tlbLatency
.name(name() + ".avg_translation_latency")
.desc("Avg. translation latency for data translations")
;
tlbLatency = tlbCycles / tlbRequests;
hitsPerTLBLevel
.init(4)
.name(name() + ".TLB_hits_distribution")
.desc("TLB hits distribution (0 for page table, x for Lx-TLB")
;
// fixed number of TLB levels
for (int i = 0; i < 4; ++i) {
if (!i)
hitsPerTLBLevel.subname(i,"page_table");
else
hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
}
execRateDist
.init(0, 10, 2)
.name(name() + ".inst_exec_rate")
.desc("Instruction Execution Rate: Number of executed vector "
"instructions per cycle")
;
ldsBankConflictDist
.init(0, wfSize(), 2)
.name(name() + ".lds_bank_conflicts")
.desc("Number of bank conflicts per LDS memory packet")
;
ldsBankAccesses
.name(name() + ".lds_bank_access_cnt")
.desc("Total number of LDS bank accesses")
;
pageDivergenceDist
// A wavefront can touch up to N pages per memory instruction where
// N is equal to the wavefront size
// The number of pages per bin can be configured (here it's 4).
.init(1, wfSize(), 4)
.name(name() + ".page_divergence_dist")
.desc("pages touched per wf (over all mem. instr.)")
;
controlFlowDivergenceDist
.init(1, wfSize(), 4)
.name(name() + ".warp_execution_dist")
.desc("number of lanes active per instruction (oval all instructions)")
;
activeLanesPerGMemInstrDist
.init(1, wfSize(), 4)
.name(name() + ".gmem_lanes_execution_dist")
.desc("number of active lanes per global memory instruction")
;
activeLanesPerLMemInstrDist
.init(1, wfSize(), 4)
.name(name() + ".lmem_lanes_execution_dist")
.desc("number of active lanes per local memory instruction")
;
numInstrExecuted
.name(name() + ".num_instr_executed")
.desc("number of instructions executed")
;
numVecOpsExecuted
.name(name() + ".num_vec_ops_executed")
.desc("number of vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedF16
.name(name() + ".num_vec_ops_f16_executed")
.desc("number of f16 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedF32
.name(name() + ".num_vec_ops_f32_executed")
.desc("number of f32 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedF64
.name(name() + ".num_vec_ops_f64_executed")
.desc("number of f64 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedFMA16
.name(name() + ".num_vec_ops_fma16_executed")
.desc("number of fma16 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedFMA32
.name(name() + ".num_vec_ops_fma32_executed")
.desc("number of fma32 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedFMA64
.name(name() + ".num_vec_ops_fma64_executed")
.desc("number of fma64 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedMAD16
.name(name() + ".num_vec_ops_mad16_executed")
.desc("number of mad16 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedMAD32
.name(name() + ".num_vec_ops_mad32_executed")
.desc("number of mad32 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedMAD64
.name(name() + ".num_vec_ops_mad64_executed")
.desc("number of mad64 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedMAC16
.name(name() + ".num_vec_ops_mac16_executed")
.desc("number of mac16 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedMAC32
.name(name() + ".num_vec_ops_mac32_executed")
.desc("number of mac32 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedMAC64
.name(name() + ".num_vec_ops_mac64_executed")
.desc("number of mac64 vec ops executed (e.g. WF size/inst)")
;
numVecOpsExecutedTwoOpFP
.name(name() + ".num_vec_ops_two_op_fp_executed")
.desc("number of two op FP vec ops executed (e.g. WF size/inst)")
;
totalCycles
.name(name() + ".num_total_cycles")
.desc("number of cycles the CU ran for")
;
ipc
.name(name() + ".ipc")
.desc("Instructions per cycle (this CU only)")
;
vpc
.name(name() + ".vpc")
.desc("Vector Operations per cycle (this CU only)")
;
vpc_f16
.name(name() + ".vpc_f16")
.desc("F16 Vector Operations per cycle (this CU only)")
;
vpc_f32
.name(name() + ".vpc_f32")
.desc("F32 Vector Operations per cycle (this CU only)")
;
vpc_f64
.name(name() + ".vpc_f64")
.desc("F64 Vector Operations per cycle (this CU only)")
;
numALUInstsExecuted
.name(name() + ".num_alu_insts_executed")
.desc("Number of dynamic non-GM memory insts executed")
;
wgBlockedDueLdsAllocation
.name(name() + ".wg_blocked_due_lds_alloc")
.desc("Workgroup blocked due to LDS capacity")
;
ipc = numInstrExecuted / totalCycles;
vpc = numVecOpsExecuted / totalCycles;
vpc_f16 = numVecOpsExecutedF16 / totalCycles;
vpc_f32 = numVecOpsExecutedF32 / totalCycles;
vpc_f64 = numVecOpsExecutedF64 / totalCycles;
numTimesWgBlockedDueVgprAlloc
.name(name() + ".times_wg_blocked_due_vgpr_alloc")
.desc("Number of times WGs are blocked due to VGPR allocation per "
"SIMD")
;
numTimesWgBlockedDueSgprAlloc
.name(name() + ".times_wg_blocked_due_sgpr_alloc")
.desc("Number of times WGs are blocked due to SGPR allocation per "
"SIMD")
;
dynamicGMemInstrCnt
.name(name() + ".global_mem_instr_cnt")
.desc("dynamic non-flat global memory instruction count")
;
dynamicFlatMemInstrCnt
.name(name() + ".flat_global_mem_instr_cnt")
.desc("dynamic flat global memory instruction count")
;
dynamicLMemInstrCnt
.name(name() + ".local_mem_instr_cnt")
.desc("dynamic local memory intruction count")
;
numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
dynamicLMemInstrCnt;
completedWfs
.name(name() + ".num_completed_wfs")
.desc("number of completed wavefronts")
;
completedWGs
.name(name() + ".num_completed_wgs")
.desc("number of completed workgroups")
;
numCASOps
.name(name() + ".num_CAS_ops")
.desc("number of compare and swap operations")
;
numFailedCASOps
.name(name() + ".num_failed_CAS_ops")
.desc("number of compare and swap operations that failed")
;
headTailLatency
.init(0, 1000000, 10000)
.name(name() + ".head_tail_latency")
.desc("ticks between first and last cache block arrival at coalescer")
.flags(Stats::pdf | Stats::oneline)
;
waveLevelParallelism
.init(0, shader->n_wf * numVectorALUs, 1)
.name(name() + ".wlp")
.desc("wave level parallelism: count of active waves at wave launch")
;
instInterleave
.init(numVectorALUs, 0, 20, 1)
.name(name() + ".interleaving")
.desc("Measure of instruction interleaving per SIMD")
;
// register stats of pipeline stages
fetchStage.regStats();
scoreboardCheckStage.regStats();
scheduleStage.regStats();
execStage.regStats();
// register stats of memory pipelines
globalMemoryPipe.regStats();
localMemoryPipe.regStats();
scalarMemoryPipe.regStats();
registerManager->regStats();
}
/**
 * Account for one dynamic instruction in the CU's instruction-mix stats.
 *
 * Scalar instructions update only the scalar ALU / scalar memory counters.
 * Vector instructions update the vector ALU, flat, LDS or vector memory
 * counters and, when they are loads or stores, the per-segment read or
 * write counter selected by executedAs().
 *
 * @param gpuDynInst the dynamic instruction to account for
 */
void
ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
{
    // scalar instructions are fully handled here
    if (gpuDynInst->isScalar()) {
        // waitcnt instructions are excluded from the scalar ALU counters
        const bool counts_as_salu =
            gpuDynInst->isALU() && !gpuDynInst->isWaitcnt();

        if (counts_as_salu) {
            sALUInsts++;
            instCyclesSALU++;
        } else if (gpuDynInst->isLoad()) {
            scalarMemReads++;
        } else if (gpuDynInst->isStore()) {
            scalarMemWrites++;
        }
        return;
    }

    // vector instruction: classify by execution resource
    if (gpuDynInst->isALU()) {
        shader->total_valu_insts++;
        // end the simulation once the configured vALU inst limit is hit
        if (shader->max_valu_insts == shader->total_valu_insts) {
            exitSimLoop("max vALU insts");
        }
        vALUInsts++;
        instCyclesVALU++;
        threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
    } else if (gpuDynInst->isFlat()) {
        if (gpuDynInst->isLocalMem()) {
            flatLDSInsts++;
        } else {
            flatVMemInsts++;
        }
    } else if (gpuDynInst->isLocalMem()) {
        ldsNoFlatInsts++;
    } else if (gpuDynInst->isLoad()) {
        vectorMemReads++;
    } else if (gpuDynInst->isStore()) {
        vectorMemWrites++;
    }

    // vector loads and stores are additionally counted per segment
    if (gpuDynInst->isLoad()) {
        const auto segment = gpuDynInst->executedAs();
        switch (segment) {
          case Enums::SC_SPILL:
            spillReads++;
            break;
          case Enums::SC_GLOBAL:
            globalReads++;
            break;
          case Enums::SC_GROUP:
            groupReads++;
            break;
          case Enums::SC_PRIVATE:
            privReads++;
            break;
          case Enums::SC_READONLY:
            readonlyReads++;
            break;
          case Enums::SC_KERNARG:
            kernargReads++;
            break;
          case Enums::SC_ARG:
            argReads++;
            break;
          case Enums::SC_NONE:
            // flat mem insts that execute with EXEC = 0 end up here;
            // nothing to count
            break;
          default:
            fatal("%s has no valid segment\n", gpuDynInst->disassemble());
            break;
        }
    } else if (gpuDynInst->isStore()) {
        const auto segment = gpuDynInst->executedAs();
        switch (segment) {
          case Enums::SC_SPILL:
            spillWrites++;
            break;
          case Enums::SC_GLOBAL:
            globalWrites++;
            break;
          case Enums::SC_GROUP:
            groupWrites++;
            break;
          case Enums::SC_PRIVATE:
            privWrites++;
            break;
          case Enums::SC_READONLY:
            readonlyWrites++;
            break;
          case Enums::SC_KERNARG:
            kernargWrites++;
            break;
          case Enums::SC_ARG:
            argWrites++;
            break;
          case Enums::SC_NONE:
            // flat mem insts that execute with EXEC = 0 end up here;
            // nothing to count
            break;
          default:
            fatal("%s has no valid segment\n", gpuDynInst->disassemble());
            break;
        }
    }
}
/**
 * Record one more access to the page that contains addr.
 * The key into pagesTouched is addr rounded down to TheISA::PageBytes.
 *
 * @param addr address whose page is being touched
 */
void
ComputeUnit::updatePageDivergenceDist(Addr addr)
{
    const Addr page_base = roundDown(addr, TheISA::PageBytes);

    auto entry = pagesTouched.find(page_base);
    if (entry != pagesTouched.end()) {
        // page seen before: bump its count
        entry->second++;
    } else {
        // first touch of this page
        pagesTouched[page_base] = 1;
    }
}
/**
 * Exit callback: when page counting is enabled on the compute unit, dump
 * its pageAccesses table as CSV to an output file named after the CU.
 * Each row holds the page (hex) followed by the two counters stored for
 * that page (decimal), matching the header line written first.
 */
void
ComputeUnit::CUExitCallback::process()
{
    if (computeUnit->countPages) {
        std::ostream *page_stat_file =
            simout.create(computeUnit->name().c_str())->stream();

        *page_stat_file << "page, wavefront accesses, workitem accesses" <<
            std::endl;

        // iterate by const reference so map entries are not copied
        for (const auto &iter : computeUnit->pageAccesses) {
            *page_stat_file << std::hex << iter.first << ",";
            *page_stat_file << std::dec << iter.second.first << ",";
            *page_stat_file << std::dec << iter.second.second << std::endl;
        }
    }
}
/**
 * Report whether the compute unit has nothing left in flight.
 *
 * @return true only if every vector ALU is idle and all of the register
 *         file <-> memory pipeline buses and memory pipeline FIFOs checked
 *         below report ready; false otherwise
 */
bool
ComputeUnit::isDone() const
{
    // any vector ALU that is still busy means the CU is not done
    for (int simd = 0; simd < numVectorALUs; ++simd) {
        if (!isVectorAluIdle(simd))
            return false;
    }

    // TODO: FIXME if more than 1 of any memory pipe supported
    const bool issue_buses_rdy = srfToScalarMemPipeBus.rdy() &&
        vrfToGlobalMemPipeBus.rdy() &&
        vrfToLocalMemPipeBus.rdy();
    if (!issue_buses_rdy)
        return false;

    // finally, the memory pipeline FIFOs and the return buses
    return globalMemoryPipe.isGMReqFIFOWrRdy() &&
        localMemoryPipe.isLMReqFIFOWrRdy() &&
        localMemoryPipe.isLMRespFIFOWrRdy() &&
        locMemToVrfBus.rdy() &&
        glbMemToVrfBus.rdy() &&
        scalarMemToSrfBus.rdy();
}
/**
 * Query the LDS for a reference counter.
 * This is a pure forwarding wrapper: both arguments are passed unchanged
 * to lds.getRefCounter() and its result is returned as-is.
 *
 * @param dispatchId dispatch identifier forwarded to the LDS
 * @param wgId presumably the workgroup identifier; forwarded to the LDS
 * @return whatever lds.getRefCounter(dispatchId, wgId) returns
 */
int32_t
ComputeUnit::getRefCounter(const uint32_t dispatchId,
const uint32_t wgId) const
{
return lds.getRefCounter(dispatchId, wgId);
}
/**
 * Check whether a vector ALU has no running wavefronts.
 *
 * @param simdId index of the vector ALU; must be below numVectorALUs
 * @return true if the first shader->n_wf wavefront slots of that ALU are
 *         all in the S_STOPPED state, false as soon as one is not
 */
bool
ComputeUnit::isVectorAluIdle(uint32_t simdId) const
{
    assert(simdId < numVectorALUs);

    const auto &wf_slots = wfList[simdId];
    for (int slot = 0; slot < shader->n_wf; ++slot) {
        const bool stopped =
            wf_slots[slot]->getStatus() == Wavefront::S_STOPPED;
        if (!stopped)
            return false;
    }

    return true;
}
/**
 * Hand a memory instruction to the LDS through the LDS port.
 *
 * The result is that of LDSPort::sendTimingReq(): callers must check it,
 * because false means the request was not accepted by the LDS right away.
 *
 * @param gpuDynInst the instruction to carry to the LDS
 * @return true if the request was sent, false otherwise
 */
bool
ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
{
    // the request exists only so that a packet can ferry the
    // GPUDynInstPtr to the LDS and back
    RequestPtr lds_req = std::make_shared<Request>();
    lds_req->setPaddr(0x0);

    // the Packet ctor requires a command; ReadReq is not evaluated by
    // the LDS
    PacketPtr lds_pkt = new Packet(lds_req, MemCmd::ReadReq);

    // the sender state is what gets read back when the response returns
    lds_pkt->senderState = new LDSPort::SenderState(gpuDynInst);

    return ldsPort->sendTimingReq(lds_pkt);
}
/**
 * Handle a packet coming back from the LDS: recover the memory
 * instruction carried in its sender state, free the packet and its sender
 * state, and queue the instruction on the local memory pipeline's
 * response FIFO.
 *
 * @param packet the response packet; it is deleted by this function
 * @return always true
 */
bool
ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
{
    using LdsSenderState = ComputeUnit::LDSPort::SenderState;

    const LdsSenderState *lds_state =
        dynamic_cast<LdsSenderState *>(packet->senderState);
    fatal_if(!lds_state, "did not get the right sort of sender state");

    // grab the instruction before the state holding it is freed
    GPUDynInstPtr mem_inst = lds_state->getMemInst();

    delete packet->senderState;
    delete packet;

    computeUnit->localMemoryPipe.getLMRespFIFO().push(mem_inst);

    return true;
}
/**
 * Try to send a packet to the LDS. There are three outcomes:
 *  - the port is already stalled: the packet is queued behind the
 *    pending retries and false is returned
 *  - the send is rejected: the port is stalled, the packet is queued for
 *    retry and false is returned
 *  - the send goes through: true is returned
 *
 * @param pkt packet to send; must carry an LDSPort::SenderState
 * @return true if the packet was sent, false if it was queued for retry
 */
bool
ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
{
    using LdsSenderState = ComputeUnit::LDSPort::SenderState;

    LdsSenderState *lds_state =
        dynamic_cast<LdsSenderState *>(pkt->senderState);
    fatal_if(!lds_state, "packet without a valid sender state");

    // only referenced by the DPRINTFs below
    GPUDynInstPtr mem_inst M5_VAR_USED = lds_state->getMemInst();

    if (isStalled()) {
        // a stalled port must already have something waiting to retry
        fatal_if(retries.empty(), "must have retries waiting to be stalled");

        retries.push(pkt);

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
                computeUnit->cu_id, mem_inst->simdId,
                mem_inst->wfSlotId);
        return false;
    }

    if (!MasterPort::sendTimingReq(pkt)) {
        // stall the LDS port until a recvReqRetry() signals that there
        // is space again
        stallPort();
        retries.push(pkt);

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
                computeUnit->cu_id, mem_inst->simdId,
                mem_inst->wfSlotId, pkt->req->getPaddr());
        return false;
    }

    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
            computeUnit->cu_id, mem_inst->simdId,
            mem_inst->wfSlotId, pkt->req->getPaddr());
    return true;
}
/**
 * The receiver is telling the port that there is space again, so the
 * stalled requests can be retried now. This lets a rejected request wait
 * for an explicit signal instead of being resent every cycle.
 *
 * The port is unstalled and the queued packets are resent in order. If a
 * send is rejected again, the port is re-stalled and the remaining
 * packets (including the rejected one) stay queued for the next retry.
 *
 * It is a fatal error to get here with no pending requests or while the
 * port is not stalled.
 */
void
ComputeUnit::LDSPort::recvReqRetry()
{
    auto queueSize = retries.size();

    DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, queueSize);

    fatal_if(queueSize < 1,
             "why was there a recvReqRetry() with no pending reqs?");
    fatal_if(!isStalled(),
             "recvReqRetry() happened when the port was not stalled");

    unstallPort();

    while (!retries.empty()) {
        PacketPtr packet = retries.front();

        DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);

        if (!MasterPort::sendTimingReq(packet)) {
            // rejected again: stall and leave the packet at the head of
            // the queue for the next retry
            stallPort();
            DPRINTF(GPUPort, ": LDS send failed again\n");
            break;
        } else {
            // report success under the same debug flag as the rest of
            // the LDS port messages so a GPUPort trace is complete
            DPRINTF(GPUPort, ": LDS send successful\n");
            retries.pop();
        }
    }
}