Files
gem5/src/gpu-compute/wavefront.cc
Matthew Poremba 9f4d334644 gpu-compute: Update tokens for flat global/scratch
Memory instructions acquire coalescer tokens in the schedule stage.
Currently this is only done for buffer and flat instructions, but not
flat global or flat scratch. This change now acquires tokens for flat
global and flat scratch instructions. This provides back-pressure to the
CUs and helps to avoid deadlocks in Ruby.

The change also handles returning tokens for buffer, flat global, and
flat scratch instructions. This was previously only being done for
normal flat instructions leading to deadlocks in some applications when
the tokens were exhausted.

To simplify the logic, added a needsToken() method to GPUDynInst which
returns whether the instruction is a buffer or any flat-segment instruction.

The waitcnts were also incorrect for flat global and flat scratch. We
should always decrement vmem and exp count for stores and only normal
flat instructions should decrement lgkm. Currently vmem/exp are not
decremented for flat global and flat scratch which can lead to deadlock.
This change set fixes this by always decrementing vmem/exp and lgkm only
for normal flat instructions.

Change-Id: I673f4ac6121e4b5a5e8491bc9130c6d825d95fc5
2023-10-10 09:48:16 -05:00

1500 lines
52 KiB
C++

/*
* Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gpu-compute/wavefront.hh"
#include "base/bitfield.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/vector_register_file.hh"
namespace gem5
{
Wavefront::Wavefront(const Params &p)
    : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
      maxIbSize(p.max_ib_size), _gpuISA(*this),
      vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
      vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
      sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
{
    // A newly constructed wave has no work assigned; it stays stopped
    // until the dispatcher gives it a workgroup and start() is called.
    lastTrace = 0;
    execUnitId = -1;
    status = S_STOPPED;
    reservedVectorRegs = 0;
    reservedScalarRegs = 0;
    startVgprIndex = 0;
    startSgprIndex = 0;
    // No memory requests are in flight or pending in any pipeline yet.
    outstandingReqs = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;
    scalarRdGmReqsInPipe = 0;
    scalarWrGmReqsInPipe = 0;
    scalarOutstandingReqsRdGm = 0;
    scalarOutstandingReqsWrGm = 0;
    lastNonIdleTick = 0;
    ldsChunk = nullptr;
    memTraceBusy = 0;
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;
    oldVgpr.resize(p.wf_size);

    pendingFetch = false;
    dropFetch = false;
    maxVgprs = 0;
    maxSgprs = 0;

    // Per-lane bookkeeping, sized by the configured wavefront width.
    lastAddr.resize(p.wf_size);
    workItemFlatId.resize(p.wf_size);
    oldDgpr.resize(p.wf_size);
    for (int i = 0; i < 3; ++i) {
        workItemId[i].resize(p.wf_size);
    }

    // All lanes start out active.
    _execMask.set();
    rawDist.clear();
    lastInstExec = 0;
    vecReads.clear();
}
void
Wavefront::init()
{
    reservedVectorRegs = 0;
    reservedScalarRegs = 0;
    startVgprIndex = 0;
    startSgprIndex = 0;

    // Resolve this wave's statically assigned execution pipelines on
    // the parent compute unit (set by the CU before init() is called).
    scalarAlu = computeUnit->mapWaveToScalarAlu(this);
    scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this);
    globalMem = computeUnit->mapWaveToGlobalMem(this);
    localMem = computeUnit->mapWaveToLocalMem(this);
    scalarMem = computeUnit->mapWaveToScalarMem(this);
}
void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
    // Initialize the wave's architectural register state from the
    // kernel's dispatch packet according to the AMDGPU kernel ABI:
    // each enabled "init field" bit claims the next SGPR (or VGPR in
    // the second loop), so the mapping order below must match the ABI.
    int regInitIdx = 0;

    // Iterate over all the init fields and check which
    // bits are enabled. Useful information can be found here:
    // https://github.com/ROCm-Developer-Tools/ROCm-ComputeABI-Doc/
    // blob/master/AMDGPU-ABI.md
    for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) {
        if (task->sgprBitEnabled(en_bit)) {
            int physSgprIdx = 0;
            uint32_t wiCount = 0;
            uint32_t firstWave = 0;
            int orderedAppendTerm = 0;
            int numWfsInWg = 0;
            uint32_t finalValue = 0;
            Addr host_disp_pkt_addr = task->hostDispPktAddr();
            Addr kernarg_addr = task->kernargAddr();
            Addr hidden_priv_base(0);

            switch (en_bit) {
              case PrivateSegBuf:
                // Four consecutive SGPRs hold the scratch (private
                // segment) buffer resource descriptor.
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[0]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[0]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[1]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[1]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[2]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[2]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[3]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[3]);
                break;
              case DispatchPtr:
                // 64-bit host address of the dispatch packet, split
                // across two SGPRs (low dword first).
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(host_disp_pkt_addr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(host_disp_pkt_addr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(host_disp_pkt_addr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(host_disp_pkt_addr, 63, 32));
                ++regInitIdx;
                break;
              case QueuePtr:
                // 64-bit host address of this kernel's HSA queue.
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(task->hostAMDQueueAddr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting QueuePtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(task->hostAMDQueueAddr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(task->hostAMDQueueAddr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting QueuePtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(task->hostAMDQueueAddr, 63, 32));
                ++regInitIdx;
                break;
              case KernargSegPtr:
                // 64-bit pointer to the kernel argument segment.
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(kernarg_addr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting KernargSegPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(kernarg_addr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(kernarg_addr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting KernargSegPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(kernarg_addr, 63, 32));
                ++regInitIdx;
                break;
              case DispatchId:
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->dispatchId());
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchId: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->dispatchId());

                // Dispatch ID in gem5 is an int. Set upper 32-bits to zero.
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx, 0);
                ++regInitIdx;
                break;
              case FlatScratchInit:
                // Low 32 bits of the scratch backing memory location,
                // followed by the per-work-item scratch size.
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                    (TheGpuISA::ScalarRegU32)(task->amdQueue
                        .scratch_backing_memory_location & 0xffffffff));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting FlatScratch Addr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        (TheGpuISA::ScalarRegU32)(task->amdQueue
                            .scratch_backing_memory_location & 0xffffffff));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                // This value should be sizeof(DWORD) aligned, that is
                // 4 byte aligned
                computeUnit->srf[simdId]->write(physSgprIdx,
                    task->amdQueue.scratch_workitem_byte_size);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting FlatScratch size: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_workitem_byte_size);
                /**
                 * Since flat scratch init is needed for this kernel, this
                 * kernel is going to have flat memory instructions and we
                 * need to initialize the hidden private base for this queue.
                 * scratch_resource_descriptor[0] has this queue's scratch
                 * base address. scratch_backing_memory_location has the
                 * offset to this queue's scratch base address from the
                 * SH_HIDDEN_PRIVATE_BASE_VMID. Ideally, we only require this
                 * queue's scratch base address for address calculation
                 * (stored in scratch_resource_descriptor[0]). But that
                 * address calculation should be done by first finding the
                 * queue's scratch base address using the calculation
                 * "SH_HIDDEN_PRIVATE_BASE_VMID + offset". So, we initialize
                 * SH_HIDDEN_PRIVATE_BASE_VMID.
                 *
                 * For more details see:
                 * http://rocm-documentation.readthedocs.io/en/latest/
                 * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
                 *
                 * https://github.com/ROCm-Developer-Tools/
                 * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
                 * #flat-addressing
                 */
                hidden_priv_base =
                    (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
                    (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
                        & 0x000000000000ffff) << 32);
                computeUnit->shader->initShHiddenPrivateBase(
                       hidden_priv_base,
                       task->amdQueue.scratch_backing_memory_location);
                break;
              case PrivateSegSize:
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->privMemPerItem());
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting private segment size: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->privMemPerItem());
                break;
              case GridWorkgroupCountX:
                // Number of workgroups along X: ceil(gridSize / wgSize).
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                wiCount = ((task->gridSize(0) +
                            task->wgSize(0) - 1) /
                           task->wgSize(0));
                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting num WG X: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, wiCount);
                break;
              case GridWorkgroupCountY:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                wiCount = ((task->gridSize(1) +
                            task->wgSize(1) - 1) /
                           task->wgSize(1));
                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting num WG Y: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, wiCount);
                break;
              case GridWorkgroupCountZ:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                wiCount = ((task->gridSize(2) +
                            task->wgSize(2) - 1) /
                           task->wgSize(2));
                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting num WG Z: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, wiCount);
                break;
              case WorkgroupIdX:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        workGroupId[0]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID X: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[0]);
                break;
              case WorkgroupIdY:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        workGroupId[1]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID Y: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[1]);
                break;
              case WorkgroupIdZ:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        workGroupId[2]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID Z: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
                break;
              case PrivSegWaveByteOffset:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);

                /**
                 * the compute_tmpring_size_wavesize specifies the number of
                 * kB allocated per wavefront, hence the multiplication by
                 * 1024.
                 *
                 * to get the per wavefront offset into the scratch
                 * memory, we also multiply this by the wfId. the wfId stored
                 * in the Wavefront class, however, is the wave ID within the
                 * WG, whereas here we need the global WFID because the
                 * scratch space will be divided amongst all waves in the
                 * kernel. to get the global ID we multiply the WGID by
                 * the WG size, then add the WFID of the wave within its WG.
                 */
                computeUnit->srf[simdId]->write(physSgprIdx, 1024 *
                    (wgId * (wgSz / 64) + wfId) *
                    task->amdQueue.compute_tmpring_size_wavesize);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting Private Seg Offset: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        1024 * (wgId * (wgSz / 64) + wfId) *
                        task->amdQueue.compute_tmpring_size_wavesize);
                break;
              case WorkgroupInfo:
                // Pack first-wave flag, ordered-append term, and the
                // number of waves in the WG into a single SGPR.
                firstWave = (wfId == 0) ? 1 : 0;
                numWfsInWg = divCeil(wgSizeInWorkItems,
                                     computeUnit->wfSize());
                finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
                finalValue |= (orderedAppendTerm << 6);
                finalValue |= numWfsInWg;
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->
                    write(physSgprIdx, finalValue);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG Info: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, finalValue);
                break;
              default:
                fatal("SGPR enable bit %i not supported\n", en_bit);
                break;
            }
        }
    }

    regInitIdx = 0;

    // iterate over all the init fields and check which
    // bits are enabled; each enabled bit claims the next VGPR and is
    // filled with the per-lane work-item ID for that dimension
    for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
        if (task->vgprBitEnabled(en_bit)) {
            uint32_t physVgprIdx = 0;
            TheGpuISA::VecRegContainerU32 raw_vgpr;

            switch (en_bit) {
              case WorkitemIdX:
                {
                    physVgprIdx = computeUnit->registerManager
                        ->mapVgpr(this, regInitIdx);
                    TheGpuISA::VecElemU32 *vgpr_x
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[0].size(); ++lane) {
                        vgpr_x[lane] = workItemId[0][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
              case WorkitemIdY:
                {
                    physVgprIdx = computeUnit->registerManager
                        ->mapVgpr(this, regInitIdx);
                    TheGpuISA::VecElemU32 *vgpr_y
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[1].size(); ++lane) {
                        vgpr_y[lane] = workItemId[1][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
              case WorkitemIdZ:
                {
                    physVgprIdx = computeUnit->registerManager->
                        mapVgpr(this, regInitIdx);
                    TheGpuISA::VecElemU32 *vgpr_z
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[2].size(); ++lane) {
                        vgpr_z[lane] = workItemId[2][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
            }
        }
    }
}
void
Wavefront::resizeRegFiles(int num_vregs, int num_sregs)
{
    // Record the architectural register demand reported by the kernel;
    // used when reserving physical registers for this wave.
    maxVgprs = num_vregs;
    maxSgprs = num_sregs;
}
// Nothing to release explicitly; members clean up via their own
// destructors (RAII), so the compiler-generated destructor suffices.
Wavefront::~Wavefront() = default;
/**
 * Transition this wave to a new status, maintaining the CU-wide idle
 * wave count used for the idle-CU timeout check.
 */
void
Wavefront::setStatus(status_e newStatus)
{
    if (computeUnit->idleCUTimeout > 0) {
        // Wavefront's status transitions to stalled or stopped
        if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
             newStatus == S_WAITCNT || newStatus == S_BARRIER) &&
            (status != newStatus)) {
            computeUnit->idleWfs++;
            assert(computeUnit->idleWfs <=
                   (computeUnit->shader->n_wf * computeUnit->numVectorALUs));
            // If this transition made every wave on the CU idle, start
            // the idle timer.
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                lastNonIdleTick = curTick();
            }
            // Wavefront's status transitions to an active state (from
            // a stopped or stalled state)
        } else if ((status == S_STOPPED || status == S_STALLED ||
                    status == S_WAITCNT || status == S_BARRIER) &&
                   (status != newStatus)) {
            // if all WFs in the CU were idle then check if the idleness
            // period exceeded the timeout threshold
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                // Report the actual idle duration rather than the
                // configured timeout so the message is meaningful.
                panic_if((curTick() - lastNonIdleTick) >=
                         computeUnit->idleCUTimeout,
                         "CU%d has been idle for %d ticks at tick %d",
                         computeUnit->cu_id, curTick() - lastNonIdleTick,
                         curTick());
            }
            computeUnit->idleWfs--;
            assert(computeUnit->idleWfs >= 0);
        }
    }
    status = newStatus;
}
void
Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc)
{
    // Record this wave's dynamic identity and entry point.
    wfDynId = _wf_dyn_id;
    _pc = init_pc;

    // Per-register read counters for RAW-distance statistics.
    vecReads.resize(maxVgprs, 0);

    // The wave may now fetch and execute.
    status = S_RUNNING;
}
bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    // Global memory ops, plus flat ops that resolved to the global
    // segment.
    return ii->isGlobalMem() ||
        (ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL);
}
bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    // LDS ops, plus flat ops that resolved to the group segment.
    return ii->isLocalMem() ||
        (ii->isFlat() && ii->executedAs() == enums::SC_GROUP);
}
bool
Wavefront::isOldestInstSleep()
{
if (instructionBuffer.empty())
return false;
GPUDynInstPtr ii = instructionBuffer.front();
if (ii->isSleep()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstWaitcnt()
{
if (instructionBuffer.empty())
return false;
GPUDynInstPtr ii = instructionBuffer.front();
if (ii->isWaitcnt()) {
// waitcnt is a scalar
assert(ii->isScalar());
return true;
}
return false;
}
bool
Wavefront::isOldestInstScalarALU()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
|| ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
(ii->isKernArgSeg() && ii->isLoad()))) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstVectorALU()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
|| (ii->isKernArgSeg() && ii->isLoad()))) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstBarrier()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isBarrier()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstGMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstScalarMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstLMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isLocalMem()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstPrivMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isPrivateSeg()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstFlatMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isFlat()) {
return true;
}
return false;
}
/**
 * Return true if fetch should stop for this wave: a control-flow
 * altering instruction (return, branch, or end-of-kernel) is already
 * buffered, so the PC may change when it executes and any further
 * fetch along the fall-through path could be wasted.
 */
bool
Wavefront::stopFetch()
{
    // Iterate by const reference: GPUDynInstPtr is a shared_ptr, so a
    // by-value loop variable would incur an atomic refcount bump per
    // buffered instruction.
    for (const auto &ii : instructionBuffer) {
        if (ii->isReturn() || ii->isBranch() ||
            ii->isEndOfKernel()) {
            return true;
        }
    }

    return false;
}
void
Wavefront::freeResources()
{
    // Relinquish the execution unit claimed in reserveResources().
    execUnitId = -1;
}
void Wavefront::validateRequestCounters()
{
    // Sanity check: none of the in-flight request counters may ever
    // drop below zero.
    const bool counter_underflow =
        wrGmReqsInPipe < 0 || rdGmReqsInPipe < 0 ||
        wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 ||
        outstandingReqs < 0;

    panic_if(counter_underflow,
             "Negative requests in pipe for WF%d for slot%d"
             " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
             " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
             " Outstanding Reqs=%d\n",
             wfDynId, wfSlotId, simdId, rdGmReqsInPipe, wrGmReqsInPipe,
             rdLmReqsInPipe, wrLmReqsInPipe, outstandingReqs);
}
void
Wavefront::reserveGmResource(GPUDynInstPtr ii)
{
    if (ii->isScalar()) {
        // Scalar accesses issue through the scalar memory pipeline.
        if (ii->isLoad()) {
            scalarRdGmReqsInPipe++;
        } else if (ii->isStore()) {
            scalarWrGmReqsInPipe++;
        } else if (ii->isAtomic() || ii->isMemSync()) {
            // Atomics and syncs occupy both a write and a read slot.
            scalarWrGmReqsInPipe++;
            scalarRdGmReqsInPipe++;
        } else {
            panic("Invalid memory operation!\n");
        }
        execUnitId = scalarMem;
    } else {
        // Vector accesses issue through the global memory pipeline.
        if (ii->isLoad()) {
            rdGmReqsInPipe++;
        } else if (ii->isStore()) {
            wrGmReqsInPipe++;
        } else if (ii->isAtomic() || ii->isMemSync()) {
            rdGmReqsInPipe++;
            wrGmReqsInPipe++;
        } else {
            panic("Invalid memory operation!\n");
        }
        execUnitId = globalMem;
    }
}
void
Wavefront::reserveLmResource(GPUDynInstPtr ii)
{
    // Only vector instructions may touch the LDS.
    fatal_if(ii->isScalar(),
             "Scalar instructions can not access Shared memory!!!");

    // Book-keep the LDS accesses this instruction will generate.
    if (ii->isLoad()) {
        ++rdLmReqsInPipe;
    } else if (ii->isStore()) {
        ++wrLmReqsInPipe;
    } else if (ii->isAtomic() || ii->isMemSync()) {
        // Atomics and syncs consume both a write and a read slot.
        ++wrLmReqsInPipe;
        ++rdLmReqsInPipe;
    } else {
        panic("Invalid memory operation!\n");
    }

    // The instruction will issue on the LDS pipeline.
    execUnitId = localMem;
}
std::vector<int>
Wavefront::reserveResources()
{
    // vector of execution unit IDs to return to schedule stage
    // this return is only used for debugging and an assertion...
    std::vector<int> execUnitIds;

    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);

    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn() || ii->isEndOfKernel()) {
        if (!ii->isScalar()) {
            execUnitId = simdId;
        } else {
            execUnitId = scalarAluGlobalIdx;
        }
        // this is to enforce a fixed number of cycles per issue slot per SIMD
    } else if (ii->isBarrier()) {
        execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId;
    } else if (ii->isFlat()) {
        assert(!ii->isScalar());
        // A flat instruction may resolve to either segment, so both the
        // LDS and global memory pipelines must be reserved.
        reserveLmResource(ii);
        // add execUnitId, reserved by reserveLmResource, to the list
        // before it is overwritten by reserveGmResource
        execUnitIds.push_back(execUnitId);
        flatLmUnitId = execUnitId;
        reserveGmResource(ii);
        flatGmUnitId = execUnitId;
        execUnitIds.push_back(flatGmUnitId);
        // reset so the generic push_back below does not add a duplicate
        execUnitId = -1;
    } else if (ii->isGlobalMem()) {
        reserveGmResource(ii);
    } else if (ii->isLocalMem()) {
        reserveLmResource(ii);
    } else if (ii->isPrivateSeg()) {
        fatal_if(ii->isScalar(),
                 "Scalar instructions can not access Private memory!!!");
        reserveGmResource(ii);
    } else {
        panic("reserveResources -> Couldn't process op!\n");
    }

    if (execUnitId != -1) {
        execUnitIds.push_back(execUnitId);
    }
    assert(execUnitIds.size());
    return execUnitIds;
}
void
Wavefront::exec()
{
// ---- Exit if wavefront is inactive ----------------------------- //
if (status == S_STOPPED || status == S_RETURNING ||
status==S_STALLED || instructionBuffer.empty()) {
return;
}
if (status == S_WAITCNT) {
/**
* if this wave is in S_WAITCNT state, then
* it should enter exec() precisely one time
* before the waitcnts are satisfied, in order
* to execute the waitcnt instruction itself
* thus we assert that the waitcnt is the
* oldest instruction. if we enter exec() with
* active waitcnts, and we're not executing
* the waitcnt instruction, something must be
* wrong
*/
assert(isOldestInstWaitcnt());
}
// Get current instruction
GPUDynInstPtr ii = instructionBuffer.front();
const Addr old_pc = pc();
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
"(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
wfDynId, ii->disassemble(), old_pc, ii->seqNum());
ii->execute(ii);
// delete the dynamic instruction from the pipeline map
computeUnit->deleteFromPipeMap(this);
// update the instruction stats in the CU
computeUnit->updateInstStats(ii);
// inform VRF of instruction execution to schedule write-back
// and scoreboard ready for registers
if (!ii->isScalar()) {
computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
}
computeUnit->srf[simdId]->waveExecuteInst(this, ii);
computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecRegOperands());
computeUnit->shader->incVectorInstDstOperand(ii->numDstVecRegOperands());
computeUnit->stats.numInstrExecuted++;
stats.numInstrExecuted++;
computeUnit->instExecPerSimd[simdId]++;
computeUnit->stats.execRateDist.sample(
computeUnit->stats.totalCycles.value() -
computeUnit->lastExecCycle[simdId]);
computeUnit->lastExecCycle[simdId] =
computeUnit->stats.totalCycles.value();
if (lastInstExec) {
computeUnit->stats.instInterleave[simdId].
sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
}
lastInstExec = computeUnit->instExecPerSimd[simdId];
// want to track:
// number of reads that occur per value written
// vector RAW dependency tracking
for (const auto& srcVecOp : ii->srcVecRegOperands()) {
for (const auto& virtIdx : srcVecOp.virtIndices()) {
// This check should never fail, but to be safe we check
if (rawDist.find(virtIdx) != rawDist.end()) {
stats.vecRawDistance.sample(stats.numInstrExecuted.value() -
rawDist[virtIdx]);
}
// increment number of reads to this register
vecReads[virtIdx]++;
}
}
for (const auto& dstVecOp : ii->dstVecRegOperands()) {
for (const auto& virtIdx : dstVecOp.virtIndices()) {
// rawDist is set on writes, but will not be set for the first
// write to each physical register
if (rawDist.find(virtIdx) != rawDist.end()) {
// Sample the number of reads that were performed
stats.readsPerWrite.sample(vecReads[virtIdx]);
}
// on a write, reset count of reads to 0
vecReads[virtIdx] = 0;
rawDist[virtIdx] = stats.numInstrExecuted.value();
}
}
if (pc() == old_pc) {
// PC not modified by instruction, proceed to next
_gpuISA.advancePC(ii);
instructionBuffer.pop_front();
} else {
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
computeUnit->cu_id, simdId, wfSlotId, wfDynId,
ii->disassemble());
discardFetch();
}
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());
if (computeUnit->shader->hsail_mode==Shader::SIMT) {
const int num_active_lanes = execMask().count();
computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
computeUnit->stats.numVecOpsExecuted += num_active_lanes;
if (ii->isF16() && ii->isALU()) {
if (ii->isF32() || ii->isF64()) {
fatal("Instruction is tagged as both (1) F16, and (2)"
"either F32 or F64.");
}
computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
if (ii->isFMA()) {
computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAC()) {
computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAD()) {
computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
}
if (ii->isF32() && ii->isALU()) {
if (ii->isF16() || ii->isF64()) {
fatal("Instruction is tagged as both (1) F32, and (2)"
"either F16 or F64.");
}
computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
if (ii->isFMA()) {
computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAC()) {
computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAD()) {
computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
}
if (ii->isF64() && ii->isALU()) {
if (ii->isF16() || ii->isF32()) {
fatal("Instruction is tagged as both (1) F64, and (2)"
"either F16 or F32.");
}
computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
if (ii->isFMA()) {
computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAC()) {
computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAD()) {
computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
}
if (isGmInstruction(ii)) {
computeUnit->stats.activeLanesPerGMemInstrDist.sample(
num_active_lanes);
} else if (isLmInstruction(ii)) {
computeUnit->stats.activeLanesPerLMemInstrDist.sample(
num_active_lanes);
}
}
/**
* we return here to avoid spurious errors related to flat insts
* and their address segment resolution.
*/
if (execMask().none() && ii->needsToken()) {
computeUnit->getTokenManager()->recvTokens(1);
return;
}
// Update Vector ALU pipeline and other resources
bool flat_as_gm = false;
bool flat_as_lm = false;
if (ii->isFlat()) {
flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
(ii->executedAs() == enums::SC_PRIVATE);
flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
}
// Single precision ALU or Branch or Return or Special instruction
// Note, we use the same timing regardless of SP or DP ALU operation.
if (ii->isALU() || ii->isSpecialOp() ||
ii->isBranch() || ii->isNop() ||
(ii->isKernArgSeg() && ii->isLoad()) ||
ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
// this is to enforce a fixed number of cycles per issue slot per SIMD
if (!ii->isScalar()) {
computeUnit->vectorALUs[simdId].set(computeUnit->
cyclesToTicks(computeUnit->issuePeriod));
} else {
computeUnit->scalarALUs[scalarAlu].set(computeUnit->
cyclesToTicks(computeUnit->issuePeriod));
}
// Barrier on Scalar ALU
} else if (ii->isBarrier()) {
computeUnit->scalarALUs[scalarAlu].set(computeUnit->
cyclesToTicks(computeUnit->issuePeriod));
// GM or Flat as GM Load
} else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
if (!ii->isScalar()) {
computeUnit->vrfToGlobalMemPipeBus.set(
computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
computeUnit->vrf_gm_bus_latency;
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(computeUnit->srf_scm_bus_latency));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
computeUnit->srf_scm_bus_latency;
}
// GM or Flat as GM Store
} else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
if (!ii->isScalar()) {
computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
(2 * computeUnit->vrf_gm_bus_latency);
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
(2 * computeUnit->srf_scm_bus_latency);
}
} else if ((ii->isAtomic() || ii->isMemSync()) &&
(ii->isGlobalMem() || flat_as_gm)) {
if (!ii->isScalar()) {
computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
(2 * computeUnit->vrf_gm_bus_latency);
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
(2 * computeUnit->srf_scm_bus_latency);
}
// LM or Flat as LM Load
} else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
cyclesToTicks(computeUnit->vrf_lm_bus_latency));
computeUnit->vectorSharedMemUnit.
set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
computeUnit->vrf_lm_bus_latency;
// LM or Flat as LM Store
} else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
computeUnit->vectorSharedMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
(2 * computeUnit->vrf_lm_bus_latency);
// LM or Flat as LM, Atomic or MemFence
} else if ((ii->isAtomic() || ii->isMemSync()) &&
(ii->isLocalMem() || flat_as_lm)) {
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
computeUnit->vectorSharedMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
(2 * computeUnit->vrf_lm_bus_latency);
} else {
panic("Bad instruction type!\n");
}
}
GPUDynInstPtr
Wavefront::nextInstr()
{
    // Peek at the oldest instruction sitting in the instruction buffer.
    GPUDynInstPtr oldest_inst = instructionBuffer.front();

    // If the oldest instruction has not yet been dispatched (it does not
    // appear in the CU's pipe map), it is the one to consider next.
    if (computeUnit->pipeMap.count(oldest_inst->seqNum()) == 0) {
        return oldest_inst;
    }

    // The wave was already dispatched in the schedule stage; check the
    // next-oldest buffered instruction for readiness instead.
    if (instructionBuffer.size() > 1) {
        return *(instructionBuffer.begin() + 1);
    }

    // No further instructions are available to check.
    return nullptr;
}
void
Wavefront::discardFetch()
{
    // Throw away any already-buffered instructions for this wave.
    instructionBuffer.clear();

    // If a fetch is still in flight, mark it so its data is dropped
    // when it returns.
    dropFetch |= pendingFetch;

    // Also flush this wave's fetch buffer so no stale instruction
    // bytes survive.
    computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId);
}
bool
Wavefront::waitCntsSatisfied()
{
    // All three counts uninitialized (-1) means an s_waitcnt has been
    // dispatched but not yet executed; the next instruction must stay
    // blocked until the s_waitcnt actually executes and sets the counts.
    if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) {
        return false;
    }

    /**
     * Reaching here means an s_waitcnt instruction executed and its
     * execute method set the counts. An active count (!= -1) is
     * satisfied once the outstanding instructions of that class have
     * drained down to the count's threshold.
     */
    const bool vm_ok = (vmWaitCnt == -1) || (vmemInstsIssued <= vmWaitCnt);
    const bool exp_ok = (expWaitCnt == -1) || (expInstsIssued <= expWaitCnt);
    const bool lgkm_ok = (lgkmWaitCnt == -1) || (lgkmInstsIssued <= lgkmWaitCnt);

    if (!vm_ok || !exp_ok || !lgkm_ok) {
        // at least one active waitcnt is still unsatisfied
        return false;
    }

    // All outstanding waitcnts are satisfied; clear them and resume
    // normal operation.
    clearWaitCnts();
    return true;
}
bool
Wavefront::sleepDone()
{
    assert(status == S_STALLED_SLEEP);

    // A zero sleep count means the sleep instruction has not executed
    // yet; keep the wave stalled without touching its status.
    if (sleepCnt == 0) {
        return false;
    }

    // Count down one tick of sleep; the wave wakes only when the count
    // reaches zero.
    --sleepCnt;
    if (sleepCnt != 0) {
        return false;
    }

    status = S_RUNNING;
    return true;
}
void
Wavefront::setSleepTime(int sleep_time)
{
    // Any prior sleep must have fully elapsed before a new one starts.
    assert(sleepCnt == 0);
    sleepCnt = sleep_time;
}
void
Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
{
    // The scoreboard sets the wave to S_WAITCNT once a waitcnt
    // instruction is marked ready, so that must hold on entry.
    assert(status == S_WAITCNT);

    // The waitcnt instruction cannot encode negative counts.
    assert(vm_wait_cnt >= 0);
    assert(exp_wait_cnt >= 0);
    assert(lgkm_wait_cnt >= 0);

    // The encoding's field widths bound the counts: vm uses 4 bits,
    // exp 3 bits, and lgkm 5 bits.
    assert(vm_wait_cnt <= 0xf);
    assert(exp_wait_cnt <= 0x7);
    assert(lgkm_wait_cnt <= 0x1f);

    /**
     * Any prior waitcnts must already have been satisfied, at which
     * point the WF resets them to -1 to mark them inactive.
     */
    assert(vmWaitCnt == -1);
    assert(expWaitCnt == -1);
    assert(lgkmWaitCnt == -1);

    /**
     * A count equal to its field's all-ones value means that class of
     * wait is unused by this instruction; leave it inactive.
     */
    if (vm_wait_cnt != 0xf) {
        vmWaitCnt = vm_wait_cnt;
    }

    if (exp_wait_cnt != 0x7) {
        expWaitCnt = exp_wait_cnt;
    }

    if (lgkm_wait_cnt != 0x1f) {
        lgkmWaitCnt = lgkm_wait_cnt;
    }
}
void
Wavefront::clearWaitCnts()
{
    // Mark all three counts inactive (-1) and let the wave resume
    // running normally.
    vmWaitCnt = -1;
    expWaitCnt = -1;
    lgkmWaitCnt = -1;

    status = S_RUNNING;
}
void
Wavefront::incVMemInstsIssued()
{
    // One more vector memory instruction is outstanding.
    vmemInstsIssued += 1;
}
void
Wavefront::incExpInstsIssued()
{
    // One more export-counted instruction is outstanding.
    expInstsIssued += 1;
}
void
Wavefront::incLGKMInstsIssued()
{
    // One more LGKM-counted instruction is outstanding.
    lgkmInstsIssued += 1;
}
void
Wavefront::decVMemInstsIssued()
{
    // A vector memory instruction completed; one fewer outstanding.
    vmemInstsIssued -= 1;
}
void
Wavefront::decExpInstsIssued()
{
    // An export-counted instruction completed; one fewer outstanding.
    expInstsIssued -= 1;
}
void
Wavefront::decLGKMInstsIssued()
{
    // An LGKM-counted instruction completed; one fewer outstanding.
    lgkmInstsIssued -= 1;
}
Addr
Wavefront::pc() const
{
    // Current program counter of this wavefront.
    return _pc;
}
void
Wavefront::pc(Addr new_pc)
{
    // Redirect this wavefront to a new program counter.
    _pc = new_pc;
}
VectorMask&
Wavefront::execMask()
{
    // Mutable reference: callers may update the per-lane execute mask.
    return _execMask;
}
bool
Wavefront::execMask(int lane) const
{
    // True when the given lane is active under the current execute mask.
    return _execMask[lane];
}
void
Wavefront::freeRegisterFile()
{
    // Mark every VGPR mapped to this wavefront as no longer busy.
    for (int vgpr = 0; vgpr < maxVgprs; ++vgpr) {
        int phys_idx = computeUnit->registerManager->mapVgpr(this, vgpr);
        computeUnit->vrf[simdId]->markReg(phys_idx, false);
    }

    // Return this wave's VGPR region to the pool manager. The end index
    // wraps modulo the register file size.
    uint32_t end_idx = (startVgprIndex + reservedVectorRegs - 1) %
        computeUnit->vrf[simdId]->numRegs();

    computeUnit->registerManager->vrfPoolMgrs[simdId]->
        freeRegion(startVgprIndex, end_idx);
}
void
Wavefront::computeActualWgSz(HSAQueueEntry *task)
{
    // The last work-group along a dimension may be partial: clamp each
    // dimension to whatever remains of the grid past this WG's origin,
    // and accumulate the product of the clamped sizes.
    actualWgSzTotal = 1;
    for (int dim = 0; dim < HSAQueueEntry::MAX_DIM; ++dim) {
        actualWgSz[dim] = std::min(workGroupSz[dim],
            gridSz[dim] - task->wgId(dim) * workGroupSz[dim]);
        actualWgSzTotal *= actualWgSz[dim];
    }
}
void
Wavefront::barrierId(int bar_id)
{
    // Valid IDs run from InvalidID (meaning "no barrier") up to the
    // number of barrier slots this CU provides.
    assert(bar_id >= WFBarrier::InvalidID);
    assert(bar_id < computeUnit->numBarrierSlots());

    barId = bar_id;
}
int
Wavefront::barrierId() const
{
    // WFBarrier::InvalidID when no barrier slot is assigned.
    return barId;
}
bool
Wavefront::hasBarrier() const
{
    // Any ID above WFBarrier::InvalidID denotes a real barrier slot.
    return barId > WFBarrier::InvalidID;
}
void
Wavefront::releaseBarrier()
{
    // Detach this wave from its barrier slot.
    barId = WFBarrier::InvalidID;
}
// Registers all per-wavefront-slot statistics with the parent stats group.
// NOTE: ADD_STAT entries are member initializers, so their order here must
// match the member declaration order in the WavefrontStats struct.
Wavefront::WavefrontStats::WavefrontStats(statistics::Group *parent)
    : statistics::Group(parent),
      ADD_STAT(numInstrExecuted,
               "number of instructions executed by this WF slot"),
      ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
      ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
      ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
               "RF denied adding instruction"),
      ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
               " not available"),
      ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
               "RF reads to complete"),
      ADD_STAT(schLdsArbStalls,
               "number of cycles wave stalled due to LDS-VRF arbitration"),
      // FIXME: the name of the WF needs to be unique
      ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
               "instructions are blocked due to WAW or WAR dependencies"),
      // FIXME: the name of the WF needs to be unique
      ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
               "instructions are blocked due to RAW dependencies"),
      ADD_STAT(vecRawDistance,
               "Count of RAW distance in dynamic instructions for this WF"),
      ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
{
    // Histogram buckets: RAW distances 0-20 and reads-per-write 0-4,
    // both with bucket size 1.
    vecRawDistance.init(0, 20, 1);
    readsPerWrite.init(0, 4, 1);
}
} // namespace gem5