gem5/src/gpu-compute/wavefront.cc
Matthew Poremba c1803eafac arch-vega: Architected flat scratch and scratch insts
Architected flat scratch is added in MI300, which stores the scratch
base address in dedicated registers rather than in SGPRs. These
registers are used by scratch_ instructions, which are flat
instructions that explicitly target the private memory aperture and
use a different address calculation than global_ instructions.

This change implements architected flat scratch support, fixes the
address calculation of scratch_ instructions, and implements decodings
for some scratch_ instructions. Previous flat_ instructions that
happen to access the private memory aperture see no change in address
calculation. Since scratch_ instructions are identical to flat_
instructions except for address calculation, the decodings simply
reuse the existing flat_ instruction definitions.
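
As a rough sketch of the difference (illustrative pseudocode only: the
names below are not this patch's identifiers, and per-lane swizzle
details are omitted), a global_ access treats its operand as a full
virtual address, while a scratch_ access treats it as an offset from
the architected scratch base:

    // global_: VGPR (plus optional SGPR base) already holds a vaddr
    Addr global_vaddr = vgpr_addr + inst_offset;
    // scratch_: the offset is rebased onto the FLAT_SCRATCH registers
    Addr scratch_vaddr = flat_scratch_base + scratch_offset + inst_offset;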

Change-Id: I1e1d15a2fbcc7a4a678157c35608f4f22b359e21
2024-05-16 09:23:03 -07:00

1525 lines
53 KiB
C++

/*
* Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gpu-compute/wavefront.hh"
#include "base/bitfield.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/register_file_cache.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/vector_register_file.hh"
namespace gem5
{
Wavefront::Wavefront(const Params &p)
: SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
maxIbSize(p.max_ib_size), _gpuISA(*this),
vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
{
lastTrace = 0;
execUnitId = -1;
status = S_STOPPED;
reservedVectorRegs = 0;
reservedScalarRegs = 0;
startVgprIndex = 0;
startSgprIndex = 0;
outstandingReqs = 0;
outstandingReqsWrGm = 0;
outstandingReqsWrLm = 0;
outstandingReqsRdGm = 0;
outstandingReqsRdLm = 0;
rdLmReqsInPipe = 0;
rdGmReqsInPipe = 0;
wrLmReqsInPipe = 0;
wrGmReqsInPipe = 0;
scalarRdGmReqsInPipe = 0;
scalarWrGmReqsInPipe = 0;
scalarOutstandingReqsRdGm = 0;
scalarOutstandingReqsWrGm = 0;
lastNonIdleTick = 0;
ldsChunk = nullptr;
memTraceBusy = 0;
oldVgprTcnt = 0xffffffffffffffffll;
oldDgprTcnt = 0xffffffffffffffffll;
oldVgpr.resize(p.wf_size);
pendingFetch = false;
dropFetch = false;
maxVgprs = 0;
maxSgprs = 0;
lastAddr.resize(p.wf_size);
workItemFlatId.resize(p.wf_size);
oldDgpr.resize(p.wf_size);
for (int i = 0; i < 3; ++i) {
workItemId[i].resize(p.wf_size);
}
_execMask.set();
rawDist.clear();
lastInstExec = 0;
vecReads.clear();
}
void
Wavefront::init()
{
reservedVectorRegs = 0;
reservedScalarRegs = 0;
startVgprIndex = 0;
startSgprIndex = 0;
scalarAlu = computeUnit->mapWaveToScalarAlu(this);
scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this);
globalMem = computeUnit->mapWaveToGlobalMem(this);
localMem = computeUnit->mapWaveToLocalMem(this);
scalarMem = computeUnit->mapWaveToScalarMem(this);
}
void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
int regInitIdx = 0;
gfxVersion = task->gfxVersion();
// Iterate over all the init fields and check which
// bits are enabled. Useful information can be found here:
// https://github.com/ROCm-Developer-Tools/ROCm-ComputeABI-Doc/
// blob/master/AMDGPU-ABI.md
for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) {
if (task->sgprBitEnabled(en_bit)) {
int physSgprIdx = 0;
uint32_t firstWave = 0;
int orderedAppendTerm = 0;
int numWfsInWg = 0;
uint32_t finalValue = 0;
Addr host_disp_pkt_addr = task->hostDispPktAddr();
Addr kernarg_addr = task->kernargAddr();
Addr hidden_priv_base(0);
switch (en_bit) {
case PrivateSegBuf:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
task->amdQueue.scratch_resource_descriptor[0]);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting PrivateSegBuffer: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
task->amdQueue.scratch_resource_descriptor[0]);
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
task->amdQueue.scratch_resource_descriptor[1]);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting PrivateSegBuffer: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
task->amdQueue.scratch_resource_descriptor[1]);
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
task->amdQueue.scratch_resource_descriptor[2]);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting PrivateSegBuffer: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
task->amdQueue.scratch_resource_descriptor[2]);
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
task->amdQueue.scratch_resource_descriptor[3]);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting PrivateSegBuffer: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
task->amdQueue.scratch_resource_descriptor[3]);
break;
case DispatchPtr:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
bits(host_disp_pkt_addr, 31, 0));
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting DispatchPtr: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
bits(host_disp_pkt_addr, 31, 0));
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
bits(host_disp_pkt_addr, 63, 32));
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting DispatchPtr: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
bits(host_disp_pkt_addr, 63, 32));
++regInitIdx;
break;
case QueuePtr:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
bits(task->hostAMDQueueAddr, 31, 0));
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting QueuePtr: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
bits(task->hostAMDQueueAddr, 31, 0));
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
bits(task->hostAMDQueueAddr, 63, 32));
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting QueuePtr: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
bits(task->hostAMDQueueAddr, 63, 32));
++regInitIdx;
break;
case KernargSegPtr:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
bits(kernarg_addr, 31, 0));
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting KernargSegPtr: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
bits(kernarg_addr, 31, 0));
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
bits(kernarg_addr, 63, 32));
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting KernargSegPtr: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
bits(kernarg_addr, 63, 32));
++regInitIdx;
break;
case DispatchId:
physSgprIdx
= computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
task->dispatchId());
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting DispatchId: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
task->dispatchId());
// Dispatch ID in gem5 is an int. Set upper 32-bits to zero.
physSgprIdx
= computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx, 0);
++regInitIdx;
break;
case FlatScratchInit:
physSgprIdx
= computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
(TheGpuISA::ScalarRegU32)(task->amdQueue
.scratch_backing_memory_location & 0xffffffff));
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting FlatScratch Addr: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
(TheGpuISA::ScalarRegU32)(task->amdQueue
.scratch_backing_memory_location & 0xffffffff));
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
// This value should be sizeof(DWORD) aligned, that is
// 4 byte aligned
computeUnit->srf[simdId]->write(physSgprIdx,
task->amdQueue.scratch_workitem_byte_size);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting FlatScratch size: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
task->amdQueue.scratch_workitem_byte_size);
/**
* Since flat scratch init is needed for this kernel, this
* kernel is going to have flat memory instructions and we
* need to initialize the hidden private base for this queue.
* scratch_resource_descriptor[0] has this queue's scratch
* base address. scratch_backing_memory_location has the
* offset to this queue's scratch base address from the
* SH_HIDDEN_PRIVATE_BASE_VMID. Ideally, we only require this
* queue's scratch base address for address calculation
* (stored in scratch_resource_descriptor[0]). But that
* address calculation should be done by first finding the
* queue's scratch base address using the calculation
* "SH_HIDDEN_PRIVATE_BASE_VMID + offset". So, we initialize
* SH_HIDDEN_PRIVATE_BASE_VMID.
*
* For more details see:
* http://rocm-documentation.readthedocs.io/en/latest/
* ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
*
* https://github.com/ROCm-Developer-Tools/
* ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
* #flat-addressing
*/
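// the queue's 48-bit scratch base address is held in the scratch
// resource descriptor: all 32 bits of word 0 plus the low 16 bits
// of word 1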
hidden_priv_base =
(uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
(((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
& 0x000000000000ffff) << 32);
computeUnit->shader->initShHiddenPrivateBase(
hidden_priv_base,
task->amdQueue.scratch_backing_memory_location);
break;
case PrivateSegSize:
physSgprIdx
= computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
task->privMemPerItem());
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting private segment size: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
task->privMemPerItem());
break;
case WorkgroupIdX:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
workGroupId[0]);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting WG ID X: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx, workGroupId[0]);
break;
case WorkgroupIdY:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
workGroupId[1]);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting WG ID Y: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx, workGroupId[1]);
break;
case WorkgroupIdZ:
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->write(physSgprIdx,
workGroupId[2]);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting WG ID Z: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
break;
case PrivSegWaveByteOffset:
// For architected flat scratch, this enable is reused to set
// the FLAT_SCRATCH register pair to the scratch backing
// memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
if (task->gfxVersion() == GfxVersion::gfx942) {
Addr arch_flat_scratch =
task->amdQueue.scratch_backing_memory_location;
computeUnit->srf[simdId]->write(
VegaISA::REG_FLAT_SCRATCH_HI,
bits(arch_flat_scratch, 63, 32));
computeUnit->srf[simdId]->write(
VegaISA::REG_FLAT_SCRATCH_LO,
bits(arch_flat_scratch, 31, 0));
break;
}
// Not architected flat scratch. Write the scratch wavefront
// offset: https://llvm.org/docs/AMDGPUUsage.html
// #amdgpu-amdhsa-initial-kernel-execution-state
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
/**
* the compute_tmpring_size_wavesize specifies the number of
* kB allocated per wavefront, hence the multiplication by
* 1024.
*
* to get the per wavefront offset into the scratch
* memory, we also multiply this by the wfId. the wfId stored
* in the Wavefront class, however, is the wave ID within the
* WG, whereas here we need the global WFID because the
* scratch space will be divided amongst all waves in the
* kernel. to get the global ID we multiply the WGID by
* the WG size, then add the WFID of the wave within its WG.
*/
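// worked example (hypothetical numbers): with 64-lane waves and
// wgSz = 256 (4 waves per WG), wgId = 2, wfId = 1, and
// compute_tmpring_size_wavesize = 1 (1 kB per wave), the offset is
// 1024 * (2 * 4 + 1) * 1 = 9216 bytes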
computeUnit->srf[simdId]->write(physSgprIdx, 1024 *
(wgId * (wgSz / 64) + wfId) *
task->amdQueue.compute_tmpring_size_wavesize);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting Private Seg Offset: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx,
1024 * (wgId * (wgSz / 64) + wfId) *
task->amdQueue.compute_tmpring_size_wavesize);
break;
case WorkgroupInfo:
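// the WG info word packs: bit 31 = first-wave-in-WG flag, bits
// starting at 6 = ordered append term (always 0 here), and the
// low bits = number of waves in the WG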
firstWave = (wfId == 0) ? 1 : 0;
numWfsInWg = divCeil(wgSizeInWorkItems,
computeUnit->wfSize());
finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
finalValue |= (orderedAppendTerm << 6);
finalValue |= numWfsInWg;
physSgprIdx =
computeUnit->registerManager->mapSgpr(this, regInitIdx);
computeUnit->srf[simdId]->
write(physSgprIdx, finalValue);
++regInitIdx;
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
"Setting WG Info: s[%d] = %x\n",
computeUnit->cu_id, simdId,
wfSlotId, wfDynId, physSgprIdx, finalValue);
break;
default:
fatal("SGPR enable bit %i not supported\n", en_bit);
break;
}
}
}
// Save the offset of the first accumulation VGPR, taken from the HSA task.
accumOffset = task->accumOffset();
regInitIdx = 0;
// VGPRs are initialized to the work item IDs for a given thread. There
// are two ways to initialize the IDs based on number of dimensions. ISAs
// will either have packed work-item IDs or not. LLVM lists them here:
// https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table
// Default to false and set to true for gem5 supported ISAs.
bool packed_work_item_id = false;
if (task->gfxVersion() == GfxVersion::gfx90a ||
task->gfxVersion() == GfxVersion::gfx942) {
packed_work_item_id = true;
}
// For ISAs with packed work item IDs, only one VGPR is used and the
// (X,Y,Z) dimensions are packed into a single 32-bit VGPR with 10-bits
// for each dimension
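// e.g., a lane with (X, Y, Z) = (5, 2, 1) is packed as
// 5 | (2 << 10) | (1 << 20) = 0x100805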
if (packed_work_item_id) {
TheGpuISA::VecRegContainerU32 raw_vgpr;
TheGpuISA::VecElemU32 *packed_vgpr
= raw_vgpr.as<TheGpuISA::VecElemU32>();
uint32_t physVgprIdx = computeUnit->registerManager
->mapVgpr(this, regInitIdx);
for (int lane = 0; lane < workItemId[0].size(); ++lane) {
packed_vgpr[lane] = workItemId[0][lane] & 0x3ff;
}
if (task->vgprBitEnabled(1)) {
for (int lane = 0; lane < workItemId[1].size(); ++lane) {
packed_vgpr[lane] |= ((workItemId[1][lane] & 0x3ff) << 10);
}
}
if (task->vgprBitEnabled(2)) {
for (int lane = 0; lane < workItemId[2].size(); ++lane) {
packed_vgpr[lane] |= ((workItemId[2][lane] & 0x3ff) << 20);
}
}
computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
return;
}
// For ISAs with non-packed work item IDs, map and initialize one VGPR
// per dimensions. Do this by iterating over all the init fields and
// checking which bits are enabled.
for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
if (task->vgprBitEnabled(en_bit)) {
uint32_t physVgprIdx = 0;
TheGpuISA::VecRegContainerU32 raw_vgpr;
switch (en_bit) {
case WorkitemIdX:
{
physVgprIdx = computeUnit->registerManager
->mapVgpr(this, regInitIdx);
TheGpuISA::VecElemU32 *vgpr_x
= raw_vgpr.as<TheGpuISA::VecElemU32>();
for (int lane = 0; lane < workItemId[0].size(); ++lane) {
vgpr_x[lane] = workItemId[0][lane];
}
computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
rawDist[regInitIdx] = 0;
++regInitIdx;
}
break;
case WorkitemIdY:
{
physVgprIdx = computeUnit->registerManager
->mapVgpr(this, regInitIdx);
TheGpuISA::VecElemU32 *vgpr_y
= raw_vgpr.as<TheGpuISA::VecElemU32>();
for (int lane = 0; lane < workItemId[1].size(); ++lane) {
vgpr_y[lane] = workItemId[1][lane];
}
computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
rawDist[regInitIdx] = 0;
++regInitIdx;
}
break;
case WorkitemIdZ:
{
physVgprIdx = computeUnit->registerManager->
mapVgpr(this, regInitIdx);
TheGpuISA::VecElemU32 *vgpr_z
= raw_vgpr.as<TheGpuISA::VecElemU32>();
for (int lane = 0; lane < workItemId[2].size(); ++lane) {
vgpr_z[lane] = workItemId[2][lane];
}
computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
rawDist[regInitIdx] = 0;
++regInitIdx;
}
break;
}
}
}
}
void
Wavefront::resizeRegFiles(int num_vregs, int num_sregs)
{
maxVgprs = num_vregs;
maxSgprs = num_sregs;
}
Wavefront::~Wavefront()
{
}
void
Wavefront::setStatus(status_e newStatus)
{
if (computeUnit->idleCUTimeout > 0) {
// Wavefront's status transitions to stalled or stopped
if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
newStatus == S_WAITCNT || newStatus == S_BARRIER) &&
(status != newStatus)) {
computeUnit->idleWfs++;
assert(computeUnit->idleWfs <=
(computeUnit->shader->n_wf * computeUnit->numVectorALUs));
if (computeUnit->idleWfs ==
(computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
lastNonIdleTick = curTick();
}
// Wavefront's status transitions to an active state (from
// a stopped or stalled state)
} else if ((status == S_STOPPED || status == S_STALLED ||
status == S_WAITCNT || status == S_BARRIER) &&
(status != newStatus)) {
// if all WFs in the CU were idle then check if the idleness
// period exceeded the timeout threshold
if (computeUnit->idleWfs ==
(computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
panic_if((curTick() - lastNonIdleTick) >=
computeUnit->idleCUTimeout,
"CU%d has been idle for %d ticks at tick %d",
computeUnit->cu_id, computeUnit->idleCUTimeout,
curTick());
}
computeUnit->idleWfs--;
assert(computeUnit->idleWfs >= 0);
}
}
status = newStatus;
}
void
Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc)
{
wfDynId = _wf_dyn_id;
_pc = init_pc;
status = S_RUNNING;
vecReads.resize(maxVgprs, 0);
}
bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
if (ii->isGlobalMem() ||
(ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
return true;
}
return false;
}
bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
if (ii->isLocalMem() ||
(ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstSleep()
{
if (instructionBuffer.empty())
return false;
GPUDynInstPtr ii = instructionBuffer.front();
if (ii->isSleep()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstWaitcnt()
{
if (instructionBuffer.empty())
return false;
GPUDynInstPtr ii = instructionBuffer.front();
if (ii->isWaitcnt()) {
// waitcnt is a scalar
assert(ii->isScalar());
return true;
}
return false;
}
bool
Wavefront::isOldestInstScalarALU()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
|| ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
(ii->isKernArgSeg() && ii->isLoad()))) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstVectorALU()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
|| (ii->isKernArgSeg() && ii->isLoad()))) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstBarrier()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isBarrier()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstGMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstScalarMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstLMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isLocalMem()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstPrivMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isPrivateSeg()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstFlatMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isFlat()) {
return true;
}
return false;
}
bool
Wavefront::stopFetch()
{
for (auto it : instructionBuffer) {
GPUDynInstPtr ii = it;
if (ii->isReturn() || ii->isBranch() ||
ii->isEndOfKernel()) {
return true;
}
}
return false;
}
void
Wavefront::freeResources()
{
execUnitId = -1;
}
void
Wavefront::validateRequestCounters()
{
panic_if(wrGmReqsInPipe < 0 || rdGmReqsInPipe < 0 ||
wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 ||
outstandingReqs < 0,
"Negative requests in pipe for WF%d for slot%d"
" and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
" Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
" Outstanding Reqs=%d\n",
wfDynId, wfSlotId, simdId, rdGmReqsInPipe, wrGmReqsInPipe,
rdLmReqsInPipe, wrLmReqsInPipe, outstandingReqs);
}
void
Wavefront::reserveGmResource(GPUDynInstPtr ii)
{
if (!ii->isScalar()) {
if (ii->isLoad()) {
rdGmReqsInPipe++;
} else if (ii->isStore()) {
wrGmReqsInPipe++;
} else if (ii->isAtomic() || ii->isMemSync()) {
rdGmReqsInPipe++;
wrGmReqsInPipe++;
} else {
panic("Invalid memory operation!\n");
}
execUnitId = globalMem;
} else {
if (ii->isLoad()) {
scalarRdGmReqsInPipe++;
} else if (ii->isStore()) {
scalarWrGmReqsInPipe++;
} else if (ii->isAtomic() || ii->isMemSync()) {
scalarWrGmReqsInPipe++;
scalarRdGmReqsInPipe++;
} else {
panic("Invalid memory operation!\n");
}
execUnitId = scalarMem;
}
}
void
Wavefront::reserveLmResource(GPUDynInstPtr ii)
{
fatal_if(ii->isScalar(),
"Scalar instructions can not access Shared memory!!!");
if (ii->isLoad()) {
rdLmReqsInPipe++;
} else if (ii->isStore()) {
wrLmReqsInPipe++;
} else if (ii->isAtomic() || ii->isMemSync()) {
wrLmReqsInPipe++;
rdLmReqsInPipe++;
} else {
panic("Invalid memory operation!\n");
}
execUnitId = localMem;
}
std::vector<int>
Wavefront::reserveResources()
{
// vector of execution unit IDs to return to schedule stage
// this return is only used for debugging and an assertion...
std::vector<int> execUnitIds;
// Get current instruction
GPUDynInstPtr ii = instructionBuffer.front();
assert(ii);
// Single precision ALU or Branch or Return or Special instruction
if (ii->isALU() || ii->isSpecialOp() ||
ii->isBranch() || ii->isNop() ||
(ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
ii->isReturn() || ii->isEndOfKernel()) {
if (!ii->isScalar()) {
execUnitId = simdId;
} else {
execUnitId = scalarAluGlobalIdx;
}
// this is to enforce a fixed number of cycles per issue slot per SIMD
} else if (ii->isBarrier()) {
execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId;
} else if (ii->isFlat()) {
assert(!ii->isScalar());
reserveLmResource(ii);
// add execUnitId, reserved by reserveLmResource, to the list before
// it is overwritten by reserveGmResource
execUnitIds.push_back(execUnitId);
flatLmUnitId = execUnitId;
reserveGmResource(ii);
flatGmUnitId = execUnitId;
execUnitIds.push_back(flatGmUnitId);
execUnitId = -1;
} else if (ii->isGlobalMem()) {
reserveGmResource(ii);
} else if (ii->isLocalMem()) {
reserveLmResource(ii);
} else if (ii->isPrivateSeg()) {
fatal_if(ii->isScalar(),
"Scalar instructions can not access Private memory!!!");
reserveGmResource(ii);
} else {
panic("reserveResources -> Couldn't process op!\n");
}
if (execUnitId != -1) {
execUnitIds.push_back(execUnitId);
}
assert(execUnitIds.size());
return execUnitIds;
}
void
Wavefront::exec()
{
// ---- Exit if wavefront is inactive ----------------------------- //
if (status == S_STOPPED || status == S_RETURNING ||
status==S_STALLED || instructionBuffer.empty()) {
return;
}
if (status == S_WAITCNT) {
/**
* if this wave is in S_WAITCNT state, then
* it should enter exec() precisely one time
* before the waitcnts are satisfied, in order
* to execute the waitcnt instruction itself;
* thus we assert that the waitcnt is the
* oldest instruction. if we enter exec() with
* active waitcnts, and we're not executing
* the waitcnt instruction, something must be
* wrong
*/
assert(isOldestInstWaitcnt());
}
// Get current instruction
GPUDynInstPtr ii = instructionBuffer.front();
const Addr old_pc = pc();
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
"(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
wfDynId, ii->disassemble(), old_pc, ii->seqNum());
ii->execute(ii);
// delete the dynamic instruction from the pipeline map
computeUnit->deleteFromPipeMap(this);
// update the instruction stats in the CU
computeUnit->updateInstStats(ii);
// inform VRF of instruction execution to schedule write-back
// and scoreboard ready for registers
if (!ii->isScalar()) {
computeUnit->rfc[simdId]->waveExecuteInst(this, ii);
computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
}
computeUnit->srf[simdId]->waveExecuteInst(this, ii);
computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecRegOperands());
computeUnit->shader->incVectorInstDstOperand(ii->numDstVecRegOperands());
computeUnit->stats.numInstrExecuted++;
stats.numInstrExecuted++;
computeUnit->instExecPerSimd[simdId]++;
computeUnit->stats.execRateDist.sample(
computeUnit->stats.totalCycles.value() -
computeUnit->lastExecCycle[simdId]);
computeUnit->lastExecCycle[simdId] =
computeUnit->stats.totalCycles.value();
if (lastInstExec) {
computeUnit->stats.instInterleave[simdId].
sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
}
lastInstExec = computeUnit->instExecPerSimd[simdId];
// want to track:
// number of reads that occur per value written
// vector RAW dependency tracking
for (const auto& srcVecOp : ii->srcVecRegOperands()) {
for (const auto& virtIdx : srcVecOp.virtIndices()) {
// This check should never fail, but to be safe we check
if (rawDist.find(virtIdx) != rawDist.end()) {
stats.vecRawDistance.sample(stats.numInstrExecuted.value() -
rawDist[virtIdx]);
}
// increment number of reads to this register
vecReads[virtIdx]++;
}
}
for (const auto& dstVecOp : ii->dstVecRegOperands()) {
for (const auto& virtIdx : dstVecOp.virtIndices()) {
// rawDist is set on writes, but will not be set for the first
// write to each physical register
if (rawDist.find(virtIdx) != rawDist.end()) {
// Sample the number of reads that were performed
stats.readsPerWrite.sample(vecReads[virtIdx]);
}
// on a write, reset count of reads to 0
vecReads[virtIdx] = 0;
rawDist[virtIdx] = stats.numInstrExecuted.value();
}
}
if (pc() == old_pc) {
// PC not modified by instruction, proceed to next
_gpuISA.advancePC(ii);
instructionBuffer.pop_front();
} else {
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
computeUnit->cu_id, simdId, wfSlotId, wfDynId,
ii->disassemble());
discardFetch();
}
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());
if (computeUnit->shader->hsail_mode==Shader::SIMT) {
const int num_active_lanes = execMask().count();
computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
computeUnit->stats.numVecOpsExecuted += num_active_lanes;
if (ii->isF16() && ii->isALU()) {
if (ii->isF32() || ii->isF64()) {
fatal("Instruction is tagged as both (1) F16, and (2)"
"either F32 or F64.");
}
computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
if (ii->isFMA()) {
computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAC()) {
computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAD()) {
computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
}
if (ii->isF32() && ii->isALU()) {
if (ii->isF16() || ii->isF64()) {
fatal("Instruction is tagged as both (1) F32, and (2)"
"either F16 or F64.");
}
computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
if (ii->isFMA()) {
computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAC()) {
computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAD()) {
computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
}
if (ii->isF64() && ii->isALU()) {
if (ii->isF16() || ii->isF32()) {
fatal("Instruction is tagged as both (1) F64, and (2)"
"either F16 or F32.");
}
computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
if (ii->isFMA()) {
computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAC()) {
computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
else if (ii->isMAD()) {
computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
computeUnit->stats.numVecOpsExecutedTwoOpFP
+= num_active_lanes;
}
}
if (isGmInstruction(ii)) {
computeUnit->stats.activeLanesPerGMemInstrDist.sample(
num_active_lanes);
} else if (isLmInstruction(ii)) {
computeUnit->stats.activeLanesPerLMemInstrDist.sample(
num_active_lanes);
}
}
/**
* we return here to avoid spurious errors related to flat insts
* and their address segment resolution.
*/
if (execMask().none() && ii->needsToken()) {
computeUnit->getTokenManager()->recvTokens(1);
return;
}
// Update Vector ALU pipeline and other resources
bool flat_as_gm = false;
bool flat_as_lm = false;
if (ii->isFlat()) {
flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
(ii->executedAs() == enums::SC_PRIVATE);
flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
}
// Single precision ALU or Branch or Return or Special instruction
// Note, we use the same timing regardless of SP or DP ALU operation.
if (ii->isALU() || ii->isSpecialOp() ||
ii->isBranch() || ii->isNop() ||
(ii->isKernArgSeg() && ii->isLoad()) ||
ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
// this is to enforce a fixed number of cycles per issue slot per SIMD
if (!ii->isScalar()) {
computeUnit->vectorALUs[simdId].set(computeUnit->
cyclesToTicks(computeUnit->issuePeriod));
} else {
computeUnit->scalarALUs[scalarAlu].set(computeUnit->
cyclesToTicks(computeUnit->issuePeriod));
}
// Barrier on Scalar ALU
} else if (ii->isBarrier()) {
computeUnit->scalarALUs[scalarAlu].set(computeUnit->
cyclesToTicks(computeUnit->issuePeriod));
// GM or Flat as GM Load
} else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
if (!ii->isScalar()) {
computeUnit->vrfToGlobalMemPipeBus.set(
computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
computeUnit->vrf_gm_bus_latency;
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(computeUnit->srf_scm_bus_latency));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
computeUnit->srf_scm_bus_latency;
}
// GM or Flat as GM Store
} else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
if (!ii->isScalar()) {
computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
(2 * computeUnit->vrf_gm_bus_latency);
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
(2 * computeUnit->srf_scm_bus_latency);
}
} else if ((ii->isAtomic() || ii->isMemSync()) &&
(ii->isGlobalMem() || flat_as_gm)) {
if (!ii->isScalar()) {
computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
computeUnit->vectorGlobalMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
(2 * computeUnit->vrf_gm_bus_latency);
} else {
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
computeUnit->scalarMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
(2 * computeUnit->srf_scm_bus_latency);
}
// LM or Flat as LM Load
} else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
cyclesToTicks(computeUnit->vrf_lm_bus_latency));
computeUnit->vectorSharedMemUnit.
set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
computeUnit->vrf_lm_bus_latency;
// LM or Flat as LM Store
} else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
computeUnit->vectorSharedMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
(2 * computeUnit->vrf_lm_bus_latency);
// LM or Flat as LM, Atomic or MemFence
} else if ((ii->isAtomic() || ii->isMemSync()) &&
(ii->isLocalMem() || flat_as_lm)) {
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
computeUnit->vectorSharedMemUnit.
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
(2 * computeUnit->vrf_lm_bus_latency);
} else {
panic("Bad instruction type!\n");
}
}
GPUDynInstPtr
Wavefront::nextInstr()
{
// Read next instruction from instruction buffer
GPUDynInstPtr ii = instructionBuffer.front();
// if the WF has been dispatched in the schedule stage then
// check the next oldest instruction for readiness
if (computeUnit->pipeMap.find(ii->seqNum()) !=
computeUnit->pipeMap.end()) {
if (instructionBuffer.size() > 1) {
auto it = instructionBuffer.begin() + 1;
return *it;
} else { // No new instructions to check
return nullptr;
}
}
return ii;
}
void
Wavefront::discardFetch()
{
instructionBuffer.clear();
dropFetch |= pendingFetch;
/**
* clear the fetch buffer for this wave in order to
* remove any stale inst data
*/
computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId);
}
bool
Wavefront::waitCntsSatisfied()
{
// Both vmWaitCnt && lgkmWaitCnt uninitialized means
// waitCnt instruction has been dispatched but not executed yet: next
// instruction should be blocked until waitCnt is executed.
if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) {
return false;
}
/**
* If we reach here, that means an s_waitcnt instruction was executed
* and the waitcnts are set by the execute method. Check if waitcnts
* are satisfied.
*/
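// a count is satisfied once the number of outstanding instructions
// of its class has drained to at most the value given by the
// s_waitcnt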
if (vmWaitCnt != -1) {
if (vmemInstsIssued > vmWaitCnt) {
// vmWaitCnt not satisfied
return false;
}
}
if (expWaitCnt != -1) {
if (expInstsIssued > expWaitCnt) {
// expWaitCnt not satisfied
return false;
}
}
if (lgkmWaitCnt != -1) {
if (lgkmInstsIssued > lgkmWaitCnt) {
// lgkmWaitCnt not satisfied
return false;
}
}
// if we get here all outstanding waitcnts must
// be satisfied, so we resume normal operation
clearWaitCnts();
return true;
}
bool
Wavefront::sleepDone()
{
assert(status == S_STALLED_SLEEP);
// if the sleep count has not been set, then the sleep instruction has
// not been executed yet, so we return false without changing the
// wavefront status
if (sleepCnt == 0)
return false;
sleepCnt--;
if (sleepCnt != 0)
return false;
status = S_RUNNING;
return true;
}
void
Wavefront::setSleepTime(int sleep_time)
{
assert(sleepCnt == 0);
sleepCnt = sleep_time;
}
void
Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
{
// the scoreboard should have set the status
// to S_WAITCNT once a waitcnt instruction
// was marked as ready
assert(status == S_WAITCNT);
// waitcnt instruction shouldn't be sending
// negative counts
assert(vm_wait_cnt >= 0);
assert(exp_wait_cnt >= 0);
assert(lgkm_wait_cnt >= 0);
// the s_waitcnt encoding allots 4 bits to vm_cnt, 3 bits to exp_cnt,
// and 5 bits to lgkm_cnt, so the maximum counts are 0xf, 0x7, and
// 0x1f, respectively
assert(vm_wait_cnt <= 0xf);
assert(exp_wait_cnt <= 0x7);
assert(lgkm_wait_cnt <= 0x1f);
/**
* prior waitcnts should be satisfied,
* at which time the WF resets them
* back to -1, indicating they are no
* longer active
*/
assert(vmWaitCnt == -1);
assert(expWaitCnt == -1);
assert(lgkmWaitCnt == -1);
/**
* if the instruction encoding
* indicates a waitcnt of 0xf,
* that means the waitcnt is
* not being used
*/
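// e.g., "s_waitcnt vmcnt(0) lgkmcnt(0)" arrives as vm_wait_cnt = 0,
// exp_wait_cnt = 0x7, and lgkm_wait_cnt = 0x1f, so only vmWaitCnt
// and lgkmWaitCnt become active below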
if (vm_wait_cnt != 0xf)
vmWaitCnt = vm_wait_cnt;
if (exp_wait_cnt != 0x7)
expWaitCnt = exp_wait_cnt;
if (lgkm_wait_cnt != 0x1f)
lgkmWaitCnt = lgkm_wait_cnt;
}
void
Wavefront::clearWaitCnts()
{
// reset the waitcnts back to
// -1, indicating they are no
// longer valid
vmWaitCnt = -1;
expWaitCnt = -1;
lgkmWaitCnt = -1;
// resume running normally
status = S_RUNNING;
}
void
Wavefront::incVMemInstsIssued()
{
++vmemInstsIssued;
}
void
Wavefront::incExpInstsIssued()
{
++expInstsIssued;
}
void
Wavefront::incLGKMInstsIssued()
{
++lgkmInstsIssued;
}
void
Wavefront::decVMemInstsIssued()
{
--vmemInstsIssued;
}
void
Wavefront::decExpInstsIssued()
{
--expInstsIssued;
}
void
Wavefront::decLGKMInstsIssued()
{
--lgkmInstsIssued;
}
Addr
Wavefront::pc() const
{
return _pc;
}
void
Wavefront::pc(Addr new_pc)
{
_pc = new_pc;
}
VectorMask&
Wavefront::execMask()
{
return _execMask;
}
bool
Wavefront::execMask(int lane) const
{
return _execMask[lane];
}
void
Wavefront::freeRegisterFile()
{
/* clear busy registers */
for (int i=0; i < maxVgprs; i++) {
int vgprIdx = computeUnit->registerManager->mapVgpr(this, i);
computeUnit->vrf[simdId]->markReg(vgprIdx, false);
}
/* Free registers used by this wavefront */
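// the reserved region may wrap around the end of the register file,
// hence the modulo when computing the region's end index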
uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) %
computeUnit->vrf[simdId]->numRegs();
computeUnit->registerManager->vrfPoolMgrs[simdId]->
freeRegion(startVgprIndex, endIndex);
}
void
Wavefront::computeActualWgSz(HSAQueueEntry *task)
{
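// clamp each dimension so that a partial WG at the edge of the grid
// does not extend past the grid boundary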
actualWgSzTotal = 1;
for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
actualWgSz[d] = std::min(workGroupSz[d], gridSz[d]
- task->wgId(d) * workGroupSz[d]);
actualWgSzTotal *= actualWgSz[d];
}
}
void
Wavefront::barrierId(int bar_id)
{
assert(bar_id >= WFBarrier::InvalidID);
assert(bar_id < computeUnit->numBarrierSlots());
barId = bar_id;
}
int
Wavefront::barrierId() const
{
return barId;
}
bool
Wavefront::hasBarrier() const
{
return barId > WFBarrier::InvalidID;
}
void
Wavefront::releaseBarrier()
{
barId = WFBarrier::InvalidID;
}
Wavefront::WavefrontStats::WavefrontStats(statistics::Group *parent)
: statistics::Group(parent),
ADD_STAT(numInstrExecuted,
"number of instructions executed by this WF slot"),
ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
"RF denied adding instruction"),
ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
" not available"),
ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
"RF reads to complete"),
ADD_STAT(schLdsArbStalls,
"number of cycles wave stalled due to LDS-VRF arbitration"),
// FIXME: the name of the WF needs to be unique
ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
"instructions are blocked due to WAW or WAR dependencies"),
// FIXME: the name of the WF needs to be unique
ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
"instructions are blocked due to RAW dependencies"),
ADD_STAT(vecRawDistance,
"Count of RAW distance in dynamic instructions for this WF"),
ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
{
vecRawDistance.init(0, 20, 1);
readsPerWrite.init(0, 4, 1);
}
} // namespace gem5