Files
gem5/src/gpu-compute/wavefront.cc
Daniel R. Carvalho 974a47dfb9 misc: Adopt the gem5 namespace
Apply the gem5 namespace to the codebase.

Some anonymous namespaces could theoretically be removed,
but since this change's main goal was to keep conflicts
at a minimum, it was decided not to modify much the
general shape of the files.

A few missing comments of the form "// namespace X" that
occurred before the newly added "} // namespace gem5"
have been added for consistency.

std out should not be included in the gem5 namespace, so
they weren't.

ProtoMessage has not been included in the gem5 namespace,
since I'm not familiar with how proto works.

Regarding the SystemC files, although they belong to gem5,
they actually perform integration between gem5 and SystemC;
therefore, it deserved its own separate namespace.

Files that are automatically generated have been included
in the gem5 namespace.

The .isa files currently are limited to a single namespace.
This limitation should be later removed to make it easier
to accommodate a better API.

Regarding the files in util, gem5:: was prepended where
suitable. Notice that this patch was tested as much as
possible given that most of these were already not
previously compiling.

Change-Id: Ia53d404ec79c46edaa98f654e23bc3b0e179fe2d
Signed-off-by: Daniel R. Carvalho <odanrc@yahoo.com.br>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/46323
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
2021-07-01 19:08:24 +00:00

1470 lines
51 KiB
C++

/*
* Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gpu-compute/wavefront.hh"
#include "base/bitfield.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/vector_register_file.hh"
namespace gem5
{
Wavefront::Wavefront(const Params &p)
    : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
      maxIbSize(p.max_ib_size), _gpuISA(*this),
      vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
      vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
      sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
{
    // Scheduling/status bookkeeping: the wave starts stopped with no
    // execution unit reserved.
    status = S_STOPPED;
    execUnitId = -1;
    lastTrace = 0;
    lastNonIdleTick = 0;
    lastInstExec = 0;
    memTraceBusy = 0;

    // Register allocation state; set up properly when a workgroup is
    // dispatched to this wave.
    reservedVectorRegs = 0;
    reservedScalarRegs = 0;
    startVgprIndex = 0;
    startSgprIndex = 0;
    maxVgprs = 0;
    maxSgprs = 0;

    // All memory request counters (outstanding and in-pipe, per category)
    // begin at zero.
    outstandingReqs = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;
    scalarRdGmReqsInPipe = 0;
    scalarWrGmReqsInPipe = 0;
    scalarOutstandingReqsRdGm = 0;
    scalarOutstandingReqsWrGm = 0;

    // Misc state: no LDS chunk assigned, no fetch in flight.
    ldsChunk = nullptr;
    pendingFetch = false;
    dropFetch = false;
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;

    // Per-lane structures are sized by the wavefront width.
    lastAddr.resize(p.wf_size);
    workItemFlatId.resize(p.wf_size);
    oldVgpr.resize(p.wf_size);
    oldDgpr.resize(p.wf_size);
    for (int dim = 0; dim < 3; ++dim) {
        workItemId[dim].resize(p.wf_size);
    }

    // Every lane starts out active.
    _execMask.set();
    rawDist.clear();
    vecReads.clear();
}
void
Wavefront::init()
{
    // Reset register-allocation bookkeeping for this wave.
    reservedVectorRegs = 0;
    reservedScalarRegs = 0;
    startVgprIndex = 0;
    startSgprIndex = 0;

    // Ask the compute unit which execution-unit ids this wave slot maps to.
    scalarAlu = computeUnit->mapWaveToScalarAlu(this);
    scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this);
    scalarMem = computeUnit->mapWaveToScalarMem(this);
    globalMem = computeUnit->mapWaveToGlobalMem(this);
    localMem = computeUnit->mapWaveToLocalMem(this);
}
/**
 * Initialize the architectural register state of this wave according to
 * the kernel's dispatch information. The task's enabled init bits select
 * which SGPRs (first loop) and VGPRs (second loop) receive ABI-defined
 * values (queue/dispatch pointers, kernarg base, workgroup ids, workitem
 * ids, etc.). Registers are consumed in order via regInitIdx.
 */
void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
    int regInitIdx = 0;

    // iterate over all the init fields and check which
    // bits are enabled
    for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) {

        if (task->sgprBitEnabled(en_bit)) {
            int physSgprIdx = 0;
            uint32_t wiCount = 0;
            uint32_t firstWave = 0;
            int orderedAppendTerm = 0;
            int numWfsInWg = 0;
            uint32_t finalValue = 0;
            Addr host_disp_pkt_addr = task->hostDispPktAddr();
            Addr kernarg_addr = task->kernargAddr();
            Addr hidden_priv_base(0);

            switch (en_bit) {
              case PrivateSegBuf:
                // The private segment (scratch) buffer descriptor occupies
                // four consecutive SGPRs.
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[0]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[0]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[1]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[1]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[2]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[2]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[3]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[3]);
                break;
              case DispatchPtr:
                // 64b dispatch packet address split across two 32b SGPRs,
                // low half first.
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(host_disp_pkt_addr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(host_disp_pkt_addr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(host_disp_pkt_addr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(host_disp_pkt_addr, 63, 32));
                ++regInitIdx;
                break;
              case QueuePtr:
                // 64b HSA queue address, low half then high half.
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(task->hostAMDQueueAddr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting QueuePtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(task->hostAMDQueueAddr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(task->hostAMDQueueAddr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting QueuePtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(task->hostAMDQueueAddr, 63, 32));
                ++regInitIdx;
                break;
              case KernargSegPtr:
                // 64b kernel argument base address, low half then high half.
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(kernarg_addr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting KernargSegPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(kernarg_addr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(kernarg_addr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting KernargSegPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(kernarg_addr, 63, 32));
                ++regInitIdx;
                break;
              case FlatScratchInit:
                // First SGPR: low 32b of the scratch backing memory
                // location.
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                    (TheGpuISA::ScalarRegU32)(task->amdQueue
                        .scratch_backing_memory_location & 0xffffffff));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting FlatScratch Addr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        (TheGpuISA::ScalarRegU32)(task->amdQueue
                            .scratch_backing_memory_location & 0xffffffff));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                // This value should be sizeof(DWORD) aligned, that is
                // 4 byte aligned
                computeUnit->srf[simdId]->write(physSgprIdx,
                    task->amdQueue.scratch_workitem_byte_size);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting FlatScratch size: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_workitem_byte_size);
                /**
                 * Since flat scratch init is needed for this kernel, this
                 * kernel is going to have flat memory instructions and we
                 * need to initialize the hidden private base for this queue.
                 * scratch_resource_descriptor[0] has this queue's scratch
                 * base address. scratch_backing_memory_location has the
                 * offset to this queue's scratch base address from the
                 * SH_HIDDEN_PRIVATE_BASE_VMID. Ideally, we only require this
                 * queue's scratch base address for address calculation
                 * (stored in scratch_resource_descriptor[0]). But that
                 * address calculation should be done by first finding the
                 * queue's scratch base address using the calculation
                 * "SH_HIDDEN_PRIVATE_BASE_VMID + offset". So, we initialize
                 * SH_HIDDEN_PRIVATE_BASE_VMID.
                 *
                 * For more details see:
                 *     http://rocm-documentation.readthedocs.io/en/latest/
                 *     ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
                 *
                 *     https://github.com/ROCm-Developer-Tools/
                 *     ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
                 *     #flat-addressing
                 */
                hidden_priv_base =
                    (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
                    (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
                    & 0x000000000000ffff) << 32);
                computeUnit->shader->initShHiddenPrivateBase(
                       hidden_priv_base,
                       task->amdQueue.scratch_backing_memory_location);
                break;
              case GridWorkgroupCountX:
                // Number of workgroups in the X dim (ceiling division of
                // grid size by workgroup size).
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                wiCount = ((task->gridSize(0) +
                            task->wgSize(0) - 1) /
                           task->wgSize(0));
                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting num WG X: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, wiCount);
                break;
              case GridWorkgroupCountY:
                // Number of workgroups in the Y dim.
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                wiCount = ((task->gridSize(1) +
                            task->wgSize(1) - 1) /
                           task->wgSize(1));
                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting num WG Y: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, wiCount);
                break;
              case GridWorkgroupCountZ:
                // Number of workgroups in the Z dim.
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                wiCount = ((task->gridSize(2) +
                            task->wgSize(2) - 1) /
                           task->wgSize(2));
                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting num WG Z: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, wiCount);
                break;
              case WorkgroupIdX:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                workGroupId[0]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID X: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[0]);
                break;
              case WorkgroupIdY:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                workGroupId[1]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID Y: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[1]);
                break;
              case WorkgroupIdZ:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                workGroupId[2]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID Z: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
                break;
              case PrivSegWaveByteOffset:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);

                /**
                 * the compute_tmpring_size_wavesize specifies the number of
                 * kB allocated per wavefront, hence the multiplication by
                 * 1024.
                 *
                 * to get the per wavefront offset into the scratch
                 * memory, we also multiply this by the wfId. the wfId stored
                 * in the Wavefront class, however, is the wave ID within the
                 * WG, whereas here we need the global WFID because the
                 * scratch space will be divided amongst all waves in the
                 * kernel. to get the global ID we multiply the WGID by
                 * the WG size, then add the WFID of the wave within its WG.
                 */
                computeUnit->srf[simdId]->write(physSgprIdx, 1024 *
                    (wgId * (wgSz / 64) + wfId) *
                    task->amdQueue.compute_tmpring_size_wavesize);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting Private Seg Offset: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        1024 * (wgId * (wgSz / 64) + wfId) *
                        task->amdQueue.compute_tmpring_size_wavesize);
                break;
              case WorkgroupInfo:
                // Pack the "first wave" flag, ordered append term, and the
                // number of waves in the WG into one SGPR.
                firstWave = (wfId == 0) ? 1 : 0;
                numWfsInWg = divCeil(wgSizeInWorkItems,
                                     computeUnit->wfSize());
                finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
                finalValue |= (orderedAppendTerm << 6);
                finalValue |= numWfsInWg;
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->
                    write(physSgprIdx, finalValue);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG Info: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, finalValue);
                break;
              default:
                fatal("SGPR enable bit %i not supported\n", en_bit);
                break;
            }
        }
    }

    regInitIdx = 0;

    // iterate over all the init fields and check which
    // bits are enabled
    for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
        if (task->vgprBitEnabled(en_bit)) {
            uint32_t physVgprIdx = 0;
            TheGpuISA::VecRegContainerU32 raw_vgpr;

            switch (en_bit) {
              case WorkitemIdX:
                {
                    // Write each lane's X work-item id into one VGPR.
                    physVgprIdx = computeUnit->registerManager
                        ->mapVgpr(this, regInitIdx);
                    TheGpuISA::VecElemU32 *vgpr_x
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[0].size(); ++lane) {
                        vgpr_x[lane] = workItemId[0][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
              case WorkitemIdY:
                {
                    // Write each lane's Y work-item id into one VGPR.
                    physVgprIdx = computeUnit->registerManager
                        ->mapVgpr(this, regInitIdx);
                    TheGpuISA::VecElemU32 *vgpr_y
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[1].size(); ++lane) {
                        vgpr_y[lane] = workItemId[1][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
              case WorkitemIdZ:
                {
                    // Write each lane's Z work-item id into one VGPR.
                    physVgprIdx = computeUnit->registerManager->
                        mapVgpr(this, regInitIdx);
                    TheGpuISA::VecElemU32 *vgpr_z
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[2].size(); ++lane) {
                        vgpr_z[lane] = workItemId[2][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
            }
        }
    }
}
void
Wavefront::resizeRegFiles(int num_vregs, int num_sregs)
{
    // Record the per-wave architectural register file limits.
    maxSgprs = num_sregs;
    maxVgprs = num_vregs;
}
// No explicit cleanup: nothing in this translation unit shows the wave
// owning dynamically allocated resources (ldsChunk appears to be managed
// elsewhere — see reserve/release paths outside this file).
Wavefront::~Wavefront()
{
}
/**
 * Transition this wave to newStatus, maintaining the CU-wide idle-wave
 * count used for the idle-CU timeout check (only when idleCUTimeout > 0).
 * When the last wave goes idle, lastNonIdleTick records the tick at which
 * the whole CU became idle; when a wave later wakes up, we panic if the CU
 * stayed fully idle for at least idleCUTimeout ticks.
 */
void
Wavefront::setStatus(status_e newStatus)
{
    if (computeUnit->idleCUTimeout > 0) {
        // Wavefront's status transitions to stalled or stopped
        if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
             newStatus == S_WAITCNT || newStatus == S_BARRIER) &&
            (status != newStatus)) {
            computeUnit->idleWfs++;
            assert(computeUnit->idleWfs <=
                   (computeUnit->shader->n_wf * computeUnit->numVectorALUs));
            // If this wave was the last active one, mark the tick at which
            // the entire CU became idle.
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                lastNonIdleTick = curTick();
            }
            // Wavefront's status transitions to an active state (from
            // a stopped or stalled state)
        } else if ((status == S_STOPPED || status == S_STALLED ||
                    status == S_WAITCNT || status == S_BARRIER) &&
                   (status != newStatus)) {
            // if all WFs in the CU were idle then check if the idleness
            // period exceeded the timeout threshold
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                // NOTE(review): the message prints the timeout threshold
                // (idleCUTimeout), not the actual elapsed idle time
                // (curTick() - lastNonIdleTick) — confirm this is intended.
                panic_if((curTick() - lastNonIdleTick) >=
                         computeUnit->idleCUTimeout,
                         "CU%d has been idle for %d ticks at tick %d",
                         computeUnit->cu_id, computeUnit->idleCUTimeout,
                         curTick());
            }
            computeUnit->idleWfs--;
            assert(computeUnit->idleWfs >= 0);
        }
    }
    status = newStatus;
}
void
Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc)
{
    // Assign the wave its dynamic id and entry PC, size the per-VGPR read
    // counters, and mark the wave runnable.
    wfDynId = _wf_dyn_id;
    _pc = init_pc;
    vecReads.resize(maxVgprs, 0);
    status = S_RUNNING;
}
bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    // Global-memory ops, plus flat ops that resolved to the global segment.
    return ii->isGlobalMem() ||
        (ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL);
}
bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    // Local-memory (LDS) ops, plus flat ops that resolved to the group
    // segment.
    return ii->isLocalMem() ||
        (ii->isFlat() && ii->executedAs() == enums::SC_GROUP);
}
bool
Wavefront::isOldestInstSleep()
{
    // The oldest instruction sits at the front of the instruction buffer.
    if (instructionBuffer.empty()) {
        return false;
    }
    return instructionBuffer.front()->isSleep();
}
bool
Wavefront::isOldestInstWaitcnt()
{
if (instructionBuffer.empty())
return false;
GPUDynInstPtr ii = instructionBuffer.front();
if (ii->isWaitcnt()) {
// waitcnt is a scalar
assert(ii->isScalar());
return true;
}
return false;
}
bool
Wavefront::isOldestInstScalarALU()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
|| ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
(ii->isKernArgSeg() && ii->isLoad()))) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstVectorALU()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
|| (ii->isKernArgSeg() && ii->isLoad()))) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstBarrier()
{
    assert(!instructionBuffer.empty());
    // True when the wave is not stopped and its oldest instruction is a
    // barrier.
    return status != S_STOPPED && instructionBuffer.front()->isBarrier();
}
bool
Wavefront::isOldestInstGMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstScalarMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstLMem()
{
    assert(!instructionBuffer.empty());
    // Local-memory (LDS) op at the head of a non-stopped wave.
    return status != S_STOPPED && instructionBuffer.front()->isLocalMem();
}
bool
Wavefront::isOldestInstPrivMem()
{
    assert(!instructionBuffer.empty());
    // Private-segment op at the head of a non-stopped wave.
    return status != S_STOPPED && instructionBuffer.front()->isPrivateSeg();
}
bool
Wavefront::isOldestInstFlatMem()
{
    assert(!instructionBuffer.empty());
    // Flat-address op at the head of a non-stopped wave.
    return status != S_STOPPED && instructionBuffer.front()->isFlat();
}
/**
 * Return true if fetch should stop for this wave, i.e. the instruction
 * buffer already holds a control-flow altering instruction (return,
 * branch, or end-of-kernel) whose outcome may redirect the PC.
 */
bool
Wavefront::stopFetch()
{
    // Iterate by const reference: the original `for (auto it : ...)`
    // copied a GPUDynInstPtr (a shared pointer) on every iteration,
    // paying an atomic refcount increment/decrement per element.
    for (const auto &ii : instructionBuffer) {
        if (ii->isReturn() || ii->isBranch() ||
            ii->isEndOfKernel()) {
            return true;
        }
    }

    return false;
}
// Release the execution unit reserved for this wave's current issue;
// -1 means no unit is reserved.
void
Wavefront::freeResources()
{
    execUnitId = -1;
}
// Sanity check on the memory request accounting: none of the in-pipe or
// outstanding request counters may ever go negative. A negative value
// indicates a decrement without a matching increment, so panic with the
// full counter state for debugging.
void Wavefront::validateRequestCounters()
{
    panic_if(wrGmReqsInPipe < 0 || rdGmReqsInPipe < 0 ||
             wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 ||
             outstandingReqs < 0,
             "Negative requests in pipe for WF%d for slot%d"
             " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
             " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
             " Outstanding Reqs=%d\n",
             wfDynId, wfSlotId, simdId, rdGmReqsInPipe, wrGmReqsInPipe,
             rdLmReqsInPipe, wrLmReqsInPipe, outstandingReqs);
}
void
Wavefront::reserveGmResource(GPUDynInstPtr ii)
{
    // Account for a global-memory operation in the appropriate in-pipe
    // counters and pick the execution unit: scalar ops go to the scalar
    // memory pipe, vector ops to the global memory pipe. Atomics and
    // memory syncs count as both a read and a write.
    if (ii->isScalar()) {
        if (ii->isLoad()) {
            scalarRdGmReqsInPipe++;
        } else if (ii->isStore()) {
            scalarWrGmReqsInPipe++;
        } else if (ii->isAtomic() || ii->isMemSync()) {
            scalarWrGmReqsInPipe++;
            scalarRdGmReqsInPipe++;
        } else {
            panic("Invalid memory operation!\n");
        }
        execUnitId = scalarMem;
    } else {
        if (ii->isLoad()) {
            rdGmReqsInPipe++;
        } else if (ii->isStore()) {
            wrGmReqsInPipe++;
        } else if (ii->isAtomic() || ii->isMemSync()) {
            rdGmReqsInPipe++;
            wrGmReqsInPipe++;
        } else {
            panic("Invalid memory operation!\n");
        }
        execUnitId = globalMem;
    }
}
void
Wavefront::reserveLmResource(GPUDynInstPtr ii)
{
    // Only vector instructions may target the LDS.
    fatal_if(ii->isScalar(),
             "Scalar instructions can not access Shared memory!!!");

    // Bump the local-memory in-pipe counters; atomics and memory syncs
    // count as both a write and a read.
    if (ii->isLoad()) {
        ++rdLmReqsInPipe;
    } else if (ii->isStore()) {
        ++wrLmReqsInPipe;
    } else if (ii->isAtomic() || ii->isMemSync()) {
        ++wrLmReqsInPipe;
        ++rdLmReqsInPipe;
    } else {
        panic("Invalid memory operation!\n");
    }

    // This wave will issue to the local memory pipe.
    execUnitId = localMem;
}
/**
 * Reserve the execution unit(s) needed by the wave's oldest instruction
 * and return their ids. Flat instructions reserve both the local- and
 * global-memory units (their segment is resolved at execute time), in
 * which case execUnitId is cleared to -1 afterwards.
 */
std::vector<int>
Wavefront::reserveResources()
{
    // vector of execution unit IDs to return to schedule stage
    // this return is only used for debugging and an assertion...
    std::vector<int> execUnitIds;

    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);

    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn() || ii->isEndOfKernel()) {
        if (!ii->isScalar()) {
            execUnitId = simdId;
        } else {
            execUnitId = scalarAluGlobalIdx;
        }
        // this is to enforce a fixed number of cycles per issue slot per SIMD
    } else if (ii->isBarrier()) {
        execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId;
    } else if (ii->isFlat()) {
        assert(!ii->isScalar());
        reserveLmResource(ii);
        // add execUnitId, reserved by reserveLmResource, to the list before
        // it is overwritten by reserveGmResource
        execUnitIds.push_back(execUnitId);
        flatLmUnitId = execUnitId;
        reserveGmResource(ii);
        flatGmUnitId = execUnitId;
        execUnitIds.push_back(flatGmUnitId);
        // both units are recorded in execUnitIds; clear the scalar slot so
        // the generic push below is skipped
        execUnitId = -1;
    } else if (ii->isGlobalMem()) {
        reserveGmResource(ii);
    } else if (ii->isLocalMem()) {
        reserveLmResource(ii);
    } else if (ii->isPrivateSeg()) {
        fatal_if(ii->isScalar(),
                 "Scalar instructions can not access Private memory!!!");
        reserveGmResource(ii);
    } else {
        panic("reserveResources -> Couldn't process op!\n");
    }

    if (execUnitId != -1) {
        execUnitIds.push_back(execUnitId);
    }
    assert(execUnitIds.size());
    return execUnitIds;
}
void
Wavefront::exec()
{
// ---- Exit if wavefront is inactive ----------------------------- //
if (status == S_STOPPED || status == S_RETURNING ||
status==S_STALLED || instructionBuffer.empty()) {
return;
}
if (status == S_WAITCNT) {
/**
* if this wave is in S_WAITCNT state, then
* it should enter exec() precisely one time
* before the waitcnts are satisfied, in order
* to execute the waitcnt instruction itself
* thus we assert that the waitcnt is the
* oldest instruction. if we enter exec() with
* active waitcnts, and we're not executing
* the waitcnt instruction, something must be
* wrong
*/
assert(isOldestInstWaitcnt());
}
// Get current instruction
GPUDynInstPtr ii = instructionBuffer.front();
const Addr old_pc = pc();
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
"(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
wfDynId, ii->disassemble(), old_pc, ii->seqNum());
ii->execute(ii);
// delete the dynamic instruction from the pipeline map
computeUnit->deleteFromPipeMap(this);
// update the instruction stats in the CU
computeUnit->updateInstStats(ii);
// inform VRF of instruction execution to schedule write-back
// and scoreboard ready for registers
if (!ii->isScalar()) {
computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
}
computeUnit->srf[simdId]->waveExecuteInst(this, ii);
computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecRegOperands());
computeUnit->shader->incVectorInstDstOperand(ii->numDstVecRegOperands());
computeUnit->stats.numInstrExecuted++;
stats.numInstrExecuted++;
computeUnit->instExecPerSimd[simdId]++;
computeUnit->stats.execRateDist.sample(
computeUnit->stats.totalCycles.value() -
computeUnit->lastExecCycle[simdId]);
computeUnit->lastExecCycle[simdId] =
computeUnit->stats.totalCycles.value();
if (lastInstExec) {
computeUnit->stats.instInterleave[simdId].
sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
}
lastInstExec = computeUnit->instExecPerSimd[simdId];
// want to track:
// number of reads that occur per value written
// vector RAW dependency tracking
for (const auto& srcVecOp : ii->srcVecRegOperands()) {
for (const auto& virtIdx : srcVecOp.virtIndices()) {
// This check should never fail, but to be safe we check
if (rawDist.find(virtIdx) != rawDist.end()) {
stats.vecRawDistance.sample(stats.numInstrExecuted.value() -
rawDist[virtIdx]);
}
// increment number of reads to this register
vecReads[virtIdx]++;
}
}
for (const auto& dstVecOp : ii->dstVecRegOperands()) {
for (const auto& virtIdx : dstVecOp.virtIndices()) {
// rawDist is set on writes, but will not be set for the first
// write to each physical register
if (rawDist.find(virtIdx) != rawDist.end()) {
// Sample the number of reads that were performed
stats.readsPerWrite.sample(vecReads[virtIdx]);
}
// on a write, reset count of reads to 0
vecReads[virtIdx] = 0;
rawDist[virtIdx] = stats.numInstrExecuted.value();
}
}
if (pc() == old_pc) {
// PC not modified by instruction, proceed to next
_gpuISA.advancePC(ii);
instructionBuffer.pop_front();
} else {
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
computeUnit->cu_id, simdId, wfSlotId, wfDynId,
ii->disassemble());
discardFetch();
}
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());
if (computeUnit->shader->hsail_mode==Shader::SIMT) {
const int num_active_lanes = execMask().count();
computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
computeUnit->stats.numVecOpsExecuted += num_active_lanes;
if (ii->isF16() && ii->isALU()) {
if (ii->isF32() || ii->isF64()) {
fatal("Instruction is tagged as both (1) F16, and (2)"
"either F32 or F64.");
}
computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
if (ii->isFMA()) {
computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAC()) {
computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAD()) {
computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
}
if (ii->isF32() && ii->isALU()) {
if (ii->isF16() || ii->isF64()) {
fatal("Instruction is tagged as both (1) F32, and (2)"
"either F16 or F64.");
}
computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
if (ii->isFMA()) {
computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAC()) {
computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAD()) {
computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
}
if (ii->isF64() && ii->isALU()) {
if (ii->isF16() || ii->isF32()) {
fatal("Instruction is tagged as both (1) F64, and (2)"
"either F16 or F32.");
}
computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
if (ii->isFMA()) {
computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAC()) {
computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAD()) {
computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
}
if (isGmInstruction(ii)) {
computeUnit->stats.activeLanesPerGMemInstrDist.sample(
num_active_lanes);
} else if (isLmInstruction(ii)) {
computeUnit->stats.activeLanesPerLMemInstrDist.sample(
num_active_lanes);
}
}
/**
* we return here to avoid spurious errors related to flat insts
* and their address segment resolution.
*/
if (execMask().none() && ii->isFlat()) {
computeUnit->getTokenManager()->recvTokens(1);
return;
}
// Update Vector ALU pipeline and other resources
bool flat_as_gm = false;
bool flat_as_lm = false;
if (ii->isFlat()) {
flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
(ii->executedAs() == enums::SC_PRIVATE);
flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
}
// Single precision ALU or Branch or Return or Special instruction
// Note, we use the same timing regardless of SP or DP ALU operation.
if (ii->isALU() || ii->isSpecialOp() ||
ii->isBranch() || ii->isNop() ||
(ii->isKernArgSeg() && ii->isLoad()) ||
ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
// this is to enforce a fixed number of cycles per issue slot per SIMD
if (!ii->isScalar()) {
computeUnit->vectorALUs[simdId].set(computeUnit->
cyclesToTicks(computeUnit->issuePeriod));
} else {
computeUnit->scalarALUs[scalarAlu].set(computeUnit->
cyclesToTicks(computeUnit->issuePeriod));
}
// Barrier on Scalar ALU
} else if (ii->isBarrier()) {
computeUnit->scalarALUs[scalarAlu].set(computeUnit->
cyclesToTicks(computeUnit->issuePeriod));
// GM or Flat as GM Load
} else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
if (!ii->isScalar()) {
computeUnit->vrfToGlobalMemPipeBus.set(
computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
computeUnit->vrf_gm_bus_latency;
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(computeUnit->srf_scm_bus_latency));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
computeUnit->srf_scm_bus_latency;
}
// GM or Flat as GM Store
} else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
if (!ii->isScalar()) {
computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
(2 * computeUnit->vrf_gm_bus_latency);
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
(2 * computeUnit->srf_scm_bus_latency);
}
} else if ((ii->isAtomic() || ii->isMemSync()) &&
(ii->isGlobalMem() || flat_as_gm)) {
if (!ii->isScalar()) {
computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
(2 * computeUnit->vrf_gm_bus_latency);
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
(2 * computeUnit->srf_scm_bus_latency);
}
// LM or Flat as LM Load
} else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
cyclesToTicks(computeUnit->vrf_lm_bus_latency));
computeUnit->vectorSharedMemUnit.
set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
computeUnit->vrf_lm_bus_latency;
// LM or Flat as LM Store
} else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
computeUnit->vectorSharedMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
(2 * computeUnit->vrf_lm_bus_latency);
// LM or Flat as LM, Atomic or MemFence
} else if ((ii->isAtomic() || ii->isMemSync()) &&
(ii->isLocalMem() || flat_as_lm)) {
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
computeUnit->vectorSharedMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
(2 * computeUnit->vrf_lm_bus_latency);
} else {
panic("Bad instruction type!\n");
}
}
GPUDynInstPtr
Wavefront::nextInstr()
{
    // Peek at the oldest instruction in this wave's instruction buffer.
    GPUDynInstPtr oldest_inst = instructionBuffer.front();

    // If that instruction was already dispatched in the schedule stage
    // (i.e., it appears in the CU's pipeline map), consider the
    // next-oldest instruction for readiness instead, if one exists.
    if (computeUnit->pipeMap.count(oldest_inst->seqNum())) {
        return (instructionBuffer.size() > 1) ?
            *(instructionBuffer.begin() + 1) : nullptr;
    }

    return oldest_inst;
}
void
Wavefront::discardFetch()
{
    // Drop any instructions already buffered for this wave.
    instructionBuffer.clear();

    // If a fetch is still in flight, mark it to be dropped on arrival.
    dropFetch |= pendingFetch;

    // Flush this wave's fetch buffer as well so no stale instruction
    // data remains.
    computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId);
}
bool
Wavefront::waitCntsSatisfied()
{
    /**
     * All counts uninitialized (-1) means an s_waitcnt instruction has
     * been dispatched but not yet executed; the following instruction
     * must remain blocked until the waitcnt executes.
     */
    if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) {
        return false;
    }

    /**
     * Reaching here means an s_waitcnt executed and set the counts via
     * the execute method. A count of -1 is inactive; an active count is
     * unsatisfied while more than that many instructions are outstanding.
     */
    auto unsatisfied = [](int wait_cnt, int insts_issued) {
        return wait_cnt != -1 && insts_issued > wait_cnt;
    };

    if (unsatisfied(vmWaitCnt, vmemInstsIssued) ||
        unsatisfied(expWaitCnt, expInstsIssued) ||
        unsatisfied(lgkmWaitCnt, lgkmInstsIssued)) {
        return false;
    }

    // All outstanding waitcnts are satisfied; resume normal operation.
    clearWaitCnts();
    return true;
}
bool
Wavefront::sleepDone()
{
    assert(status == S_STALLED_SLEEP);

    // A zero sleep count means the sleep instruction has not executed
    // yet; keep the wavefront stalled without changing its status.
    if (sleepCnt == 0) {
        return false;
    }

    // Count down one step; the wave wakes only when the count expires.
    if (--sleepCnt != 0) {
        return false;
    }

    status = S_RUNNING;
    return true;
}
void
Wavefront::setSleepTime(int sleep_time)
{
    // Any previous sleep must have fully drained before arming a new one.
    assert(sleepCnt == 0);
    sleepCnt = sleep_time;
}
void
Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
{
    // The scoreboard must have already moved this wave to S_WAITCNT when
    // it marked the waitcnt instruction as ready.
    assert(status == S_WAITCNT);

    // A waitcnt instruction never encodes negative counts.
    assert(vm_wait_cnt >= 0);
    assert(exp_wait_cnt >= 0);
    assert(lgkm_wait_cnt >= 0);

    // The instruction encoding bounds each counter: the vm count has
    // 4 bits, the exp count 3 bits, and the lgkm count 5 bits.
    assert(vm_wait_cnt <= 0xf);
    assert(exp_wait_cnt <= 0x7);
    assert(lgkm_wait_cnt <= 0x1f);

    /**
     * Any prior waitcnts must already have been satisfied, at which point
     * the WF reset them to -1, marking them inactive.
     */
    assert(vmWaitCnt == -1);
    assert(expWaitCnt == -1);
    assert(lgkmWaitCnt == -1);

    /**
     * A counter set to its all-ones encoding (0xf, 0x7, or 0x1f
     * respectively) means that counter is unused by this waitcnt, so it
     * is left inactive.
     */
    if (vm_wait_cnt != 0xf)
        vmWaitCnt = vm_wait_cnt;

    if (exp_wait_cnt != 0x7)
        expWaitCnt = exp_wait_cnt;

    if (lgkm_wait_cnt != 0x1f)
        lgkmWaitCnt = lgkm_wait_cnt;
}
void
Wavefront::clearWaitCnts()
{
    // Mark all waitcnts inactive (-1) ...
    vmWaitCnt = -1;
    expWaitCnt = -1;
    lgkmWaitCnt = -1;

    // ... and let the wave run normally again.
    status = S_RUNNING;
}
void
Wavefront::incVMemInstsIssued()
{
    // One more outstanding vector memory instruction.
    vmemInstsIssued++;
}
void
Wavefront::incExpInstsIssued()
{
    // One more outstanding export instruction.
    expInstsIssued++;
}
void
Wavefront::incLGKMInstsIssued()
{
    // One more outstanding LDS/GDS/constant/message instruction.
    lgkmInstsIssued++;
}
void
Wavefront::decVMemInstsIssued()
{
    // One vector memory instruction has completed.
    vmemInstsIssued--;
}
void
Wavefront::decExpInstsIssued()
{
    // One export instruction has completed.
    expInstsIssued--;
}
void
Wavefront::decLGKMInstsIssued()
{
    // One LDS/GDS/constant/message instruction has completed.
    lgkmInstsIssued--;
}
Addr
Wavefront::pc() const
{
    // Current program counter of this wavefront.
    return _pc;
}
void
Wavefront::pc(Addr new_pc)
{
    // Redirect this wavefront's program counter.
    _pc = new_pc;
}
VectorMask&
Wavefront::execMask()
{
    // Mutable reference to the wave's full execution mask.
    return _execMask;
}
bool
Wavefront::execMask(int lane) const
{
    // True if the given lane is active in the execution mask.
    return _execMask[lane];
}
void
Wavefront::freeRegisterFile()
{
    // Mark every VGPR mapped to this wavefront as no longer busy.
    for (int virt_reg = 0; virt_reg < maxVgprs; ++virt_reg) {
        int phys_reg = computeUnit->registerManager->mapVgpr(this, virt_reg);
        computeUnit->vrf[simdId]->markReg(phys_reg, false);
    }

    // Return this wave's reserved VGPR region to the pool manager. The
    // modulo accounts for a region that wraps around the register file.
    uint32_t end_index = (startVgprIndex + reservedVectorRegs - 1) %
        computeUnit->vrf[simdId]->numRegs();
    computeUnit->registerManager->vrfPoolMgrs[simdId]->
        freeRegion(startVgprIndex, end_index);
}
void
Wavefront::computeActualWgSz(HSAQueueEntry *task)
{
    // The last work-group along each dimension may be partial: clamp the
    // per-dimension size to whatever remains of the grid past this WG.
    actualWgSzTotal = 1;
    for (int dim = 0; dim < HSAQueueEntry::MAX_DIM; ++dim) {
        actualWgSz[dim] = std::min(workGroupSz[dim],
            gridSz[dim] - task->wgId(dim) * workGroupSz[dim]);
        actualWgSzTotal *= actualWgSz[dim];
    }
}
void
Wavefront::barrierId(int bar_id)
{
    // Valid values range from InvalidID (no barrier) up to, but not
    // including, the CU's number of barrier slots.
    assert(bar_id >= WFBarrier::InvalidID);
    assert(bar_id < computeUnit->numBarrierSlots());
    barId = bar_id;
}
int
Wavefront::barrierId() const
{
    // Barrier slot currently assigned to this wave (InvalidID if none).
    return barId;
}
bool
Wavefront::hasBarrier() const
{
    // A wave holds a barrier iff its ID is past the invalid sentinel.
    return barId > WFBarrier::InvalidID;
}
void
Wavefront::releaseBarrier()
{
    // Detach this wave from its barrier slot.
    barId = WFBarrier::InvalidID;
}
// Per-WF-slot statistics group; each stat is registered on the parent
// (the owning Wavefront) via ADD_STAT with its description string.
Wavefront::WavefrontStats::WavefrontStats(statistics::Group *parent)
    : statistics::Group(parent),
      ADD_STAT(numInstrExecuted,
               "number of instructions executed by this WF slot"),
      ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
      ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
      ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
               "RF denied adding instruction"),
      ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
               " not available"),
      ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
               "RF reads to complete"),
      ADD_STAT(schLdsArbStalls,
               "number of cycles wave stalled due to LDS-VRF arbitration"),
      // FIXME: the name of the WF needs to be unique
      ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
               "instructions are blocked due to WAW or WAR dependencies"),
      // FIXME: the name of the WF needs to be unique
      ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
               "instructions are blocked due to RAW dependencies"),
      ADD_STAT(vecRawDistance,
               "Count of RAW distance in dynamic instructions for this WF"),
      ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
{
    // Histogram buckets: range start, range end, bucket size.
    vecRawDistance.init(0, 20, 1);
    readsPerWrite.init(0, 4, 1);
}
} // namespace gem5