Architected flat scratch is added in MI300, which stores the scratch base address in dedicated registers rather than in SGPRs. These registers are used by scratch_ instructions, which are flat instructions that explicitly target the private memory aperture. These instructions have a different address calculation than global_ instructions. This change implements architected flat scratch support, fixes the address calculation of scratch_ instructions, and implements decodings for some scratch_ instructions. Previous flat_ instructions that happen to access the private memory aperture see no change in address calculation. Since scratch_ instructions are identical to flat_ instructions except for address calculation, the decodings simply reuse existing flat_ instruction definitions. Change-Id: I1e1d15a2fbcc7a4a678157c35608f4f22b359e21
1525 lines
53 KiB
C++
1525 lines
53 KiB
C++
/*
|
|
* Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from this
|
|
* software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "gpu-compute/wavefront.hh"
|
|
|
|
#include "base/bitfield.hh"
|
|
#include "debug/GPUExec.hh"
|
|
#include "debug/GPUInitAbi.hh"
|
|
#include "debug/WavefrontStack.hh"
|
|
#include "gpu-compute/compute_unit.hh"
|
|
#include "gpu-compute/gpu_dyn_inst.hh"
|
|
#include "gpu-compute/register_file_cache.hh"
|
|
#include "gpu-compute/scalar_register_file.hh"
|
|
#include "gpu-compute/shader.hh"
|
|
#include "gpu-compute/simple_pool_manager.hh"
|
|
#include "gpu-compute/vector_register_file.hh"
|
|
|
|
namespace gem5
|
|
{
|
|
|
|
/**
 * Construct a wavefront in the S_STOPPED state. All outstanding/in-pipe
 * request counters start at zero and all per-lane bookkeeping vectors are
 * sized to the configured wavefront width (p.wf_size).
 */
Wavefront::Wavefront(const Params &p)
  : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
    maxIbSize(p.max_ib_size), _gpuISA(*this),
    vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
    vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
    sleepCnt(0), barId(WFBarrier::InvalidID), stats(this)
{
    lastTrace = 0;
    // No execution unit assigned until an instruction reserves one.
    execUnitId = -1;
    // Waves are created stopped; dispatch transitions them to running.
    status = S_STOPPED;
    reservedVectorRegs = 0;
    reservedScalarRegs = 0;
    startVgprIndex = 0;
    startSgprIndex = 0;
    // Counters for memory requests currently outstanding in the memory
    // system, split by read/write and global/local memory.
    outstandingReqs = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    // Counters for requests still in the CU pipeline (not yet issued).
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;
    scalarRdGmReqsInPipe = 0;
    scalarWrGmReqsInPipe = 0;
    scalarOutstandingReqsRdGm = 0;
    scalarOutstandingReqsWrGm = 0;
    lastNonIdleTick = 0;
    // LDS space is allocated at workgroup dispatch time.
    ldsChunk = nullptr;

    memTraceBusy = 0;
    // Sentinel "never written" timestamps for old register snapshots.
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;
    oldVgpr.resize(p.wf_size);

    pendingFetch = false;
    dropFetch = false;
    maxVgprs = 0;
    maxSgprs = 0;

    // Per-lane state sized to the wavefront width.
    lastAddr.resize(p.wf_size);
    workItemFlatId.resize(p.wf_size);
    oldDgpr.resize(p.wf_size);
    // One work-item ID vector per dimension (X, Y, Z).
    for (int i = 0; i < 3; ++i) {
        workItemId[i].resize(p.wf_size);
    }

    // All lanes start active.
    _execMask.set();
    rawDist.clear();
    lastInstExec = 0;
    vecReads.clear();
}
|
|
|
|
/**
 * Late initialization, run once the wavefront's parent compute unit is
 * known: reset register-reservation bookkeeping and look up the
 * execution-unit IDs this wave will use for each instruction class.
 */
void
Wavefront::init()
{
    reservedVectorRegs = 0;
    reservedScalarRegs = 0;
    startVgprIndex = 0;
    startSgprIndex = 0;

    // Cache the unit IDs assigned by the CU's static wave-to-unit mapping.
    scalarAlu = computeUnit->mapWaveToScalarAlu(this);
    scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this);
    globalMem = computeUnit->mapWaveToGlobalMem(this);
    localMem = computeUnit->mapWaveToLocalMem(this);
    scalarMem = computeUnit->mapWaveToScalarMem(this);
}
|
|
|
|
/**
 * Initialize the wavefront's architectural register state according to
 * the AMDGPU kernel ABI. SGPRs are seeded from the HSA task's enabled
 * init fields (private segment buffer, dispatch/queue/kernarg pointers,
 * flat scratch, workgroup IDs, ...), and VGPRs are seeded with the
 * per-lane work-item IDs (packed into one VGPR on gfx90a/gfx942,
 * one VGPR per dimension otherwise).
 *
 * @param task the HSA queue entry describing the dispatched kernel
 * @param wgSizeInWorkItems total work-items in this workgroup; used to
 *        derive the number of waves for the WorkgroupInfo field
 */
void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
    // Next virtual SGPR (later VGPR) to map; incremented once per
    // 32-bit register consumed by an init field.
    int regInitIdx = 0;
    gfxVersion = task->gfxVersion();

    // Iterate over all the init fields and check which
    // bits are enabled. Useful information can be found here:
    // https://github.com/ROCm-Developer-Tools/ROCm-ComputeABI-Doc/
    // blob/master/AMDGPU-ABI.md
    for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) {

        if (task->sgprBitEnabled(en_bit)) {
            int physSgprIdx = 0;
            uint32_t firstWave = 0;
            int orderedAppendTerm = 0;
            int numWfsInWg = 0;
            uint32_t finalValue = 0;
            Addr host_disp_pkt_addr = task->hostDispPktAddr();
            Addr kernarg_addr = task->kernargAddr();
            Addr hidden_priv_base(0);

            switch (en_bit) {
              case PrivateSegBuf:
                // Four consecutive SGPRs hold the 128-bit scratch
                // resource (buffer) descriptor, one dword at a time.
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[0]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[0]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[1]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[1]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[2]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[2]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[3]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[3]);
                break;
              case DispatchPtr:
                // 64-bit host dispatch packet address, low dword first.
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(host_disp_pkt_addr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(host_disp_pkt_addr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(host_disp_pkt_addr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(host_disp_pkt_addr, 63, 32));

                ++regInitIdx;
                break;
              case QueuePtr:
                // 64-bit host address of the HSA queue (AQL queue struct).
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(task->hostAMDQueueAddr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting QueuePtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(task->hostAMDQueueAddr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(task->hostAMDQueueAddr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting QueuePtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(task->hostAMDQueueAddr, 63, 32));

                ++regInitIdx;
                break;
              case KernargSegPtr:
                // 64-bit pointer to the kernel argument segment.
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(kernarg_addr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting KernargSegPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(kernarg_addr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(kernarg_addr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting KernargSegPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(kernarg_addr, 63, 32));

                ++regInitIdx;
                break;
              case DispatchId:
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                task->dispatchId());
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchId: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->dispatchId());

                // Dispatch ID in gem5 is an int. Set upper 32-bits to zero.
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx, 0);
                ++regInitIdx;
                break;
              case FlatScratchInit:
                // First SGPR: low 32 bits of the scratch backing memory
                // address; second SGPR: per-work-item scratch size.
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                    (TheGpuISA::ScalarRegU32)(task->amdQueue
                        .scratch_backing_memory_location & 0xffffffff));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting FlatScratch Addr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        (TheGpuISA::ScalarRegU32)(task->amdQueue
                            .scratch_backing_memory_location & 0xffffffff));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                // This value should be sizeof(DWORD) aligned, that is
                // 4 byte aligned
                computeUnit->srf[simdId]->write(physSgprIdx,
                    task->amdQueue.scratch_workitem_byte_size);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting FlatScratch size: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_workitem_byte_size);
                /**
                 * Since flat scratch init is needed for this kernel, this
                 * kernel is going to have flat memory instructions and we
                 * need to initialize the hidden private base for this queue.
                 * scratch_resource_descriptor[0] has this queue's scratch
                 * base address. scratch_backing_memory_location has the
                 * offset to this queue's scratch base address from the
                 * SH_HIDDEN_PRIVATE_BASE_VMID. Ideally, we only require this
                 * queue's scratch base address for address calculation
                 * (stored in scratch_resource_descriptor[0]). But that
                 * address calculation should be done by first finding the
                 * queue's scratch base address using the calculation
                 * "SH_HIDDEN_PRIVATE_BASE_VMID + offset". So, we initialize
                 * SH_HIDDEN_PRIVATE_BASE_VMID.
                 *
                 * For more details see:
                 * http://rocm-documentation.readthedocs.io/en/latest/
                 * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
                 *
                 * https://github.com/ROCm-Developer-Tools/
                 * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
                 * #flat-addressing
                 */
                // Base address is the low dword of descriptor word 0 plus
                // the low 16 bits of word 1 (bits 47:32 of the address).
                hidden_priv_base =
                    (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
                    (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
                    & 0x000000000000ffff) << 32);
                computeUnit->shader->initShHiddenPrivateBase(
                       hidden_priv_base,
                       task->amdQueue.scratch_backing_memory_location);
                break;
              case PrivateSegSize:
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                task->privMemPerItem());
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting private segment size: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->privMemPerItem());
                break;
              case WorkgroupIdX:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                workGroupId[0]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID X: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[0]);
                break;
              case WorkgroupIdY:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                workGroupId[1]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID Y: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[1]);
                break;
              case WorkgroupIdZ:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                workGroupId[2]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID Z: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
                break;
              case PrivSegWaveByteOffset:

                // For architected flat scratch, this enable is reused to set
                // the FLAT_SCRATCH register pair to the scratch backing
                // memory: https://llvm.org/docs/AMDGPUUsage.html#flat-scratch
                if (task->gfxVersion() == GfxVersion::gfx942) {
                    Addr arch_flat_scratch =
                        task->amdQueue.scratch_backing_memory_location;
                    computeUnit->srf[simdId]->write(
                        VegaISA::REG_FLAT_SCRATCH_HI,
                        bits(arch_flat_scratch, 63, 32));
                    computeUnit->srf[simdId]->write(
                        VegaISA::REG_FLAT_SCRATCH_LO,
                        bits(arch_flat_scratch, 31, 0));

                    // NOTE: no user SGPR is consumed in this mode, so
                    // regInitIdx is deliberately not incremented.
                    break;
                }

                // Not architected flat scratch. Write the scratch wavefront
                // offset: https://llvm.org/docs/AMDGPUUsage.html
                // #amdgpu-amdhsa-initial-kernel-execution-state
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);

                /**
                 * the compute_tmpring_size_wavesize specifies the number of
                 * kB allocated per wavefront, hence the multiplication by
                 * 1024.
                 *
                 * to get the per wavefront offset into the scratch
                 * memory, we also multiply this by the wfId. the wfId stored
                 * in the Wavefront class, however, is the wave ID within the
                 * WG, whereas here we need the global WFID because the
                 * scratch space will be divided amongst all waves in the
                 * kernel. to get the global ID we multiply the WGID by
                 * the WG size, then add the WFID of the wave within its WG.
                 */
                computeUnit->srf[simdId]->write(physSgprIdx, 1024 *
                    (wgId * (wgSz / 64) + wfId) *
                    task->amdQueue.compute_tmpring_size_wavesize);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting Private Seg Offset: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        1024 * (wgId * (wgSz / 64) + wfId) *
                        task->amdQueue.compute_tmpring_size_wavesize);
                break;
              case WorkgroupInfo:
                // Packed word: bit 31 = first wave in WG, bits 6+ =
                // ordered append term, low bits = wave count in WG.
                firstWave = (wfId == 0) ? 1 : 0;
                numWfsInWg = divCeil(wgSizeInWorkItems,
                                     computeUnit->wfSize());
                finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
                finalValue |= (orderedAppendTerm << 6);
                finalValue |= numWfsInWg;
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->
                    write(physSgprIdx, finalValue);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG Info: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, finalValue);
                break;
              default:
                fatal("SGPR enable bit %i not supported\n", en_bit);
                break;
            }
        }
    }

    // Save the offset to the first accumulation VGPR number from HSA task.
    accumOffset = task->accumOffset();

    regInitIdx = 0;

    // VGPRs are initialized to the work item IDs for a given thread. There
    // are two ways to initialize the IDs based on number of dimensions. ISAs
    // will either have packed work-item IDs or not. LLVM lists them here:
    // https://llvm.org/docs/AMDGPUUsage.html#amdgpu-processor-table
    // Default to false and set to true for gem5 supported ISAs.
    bool packed_work_item_id = false;

    if (task->gfxVersion() == GfxVersion::gfx90a ||
        task->gfxVersion() == GfxVersion::gfx942) {
        packed_work_item_id = true;
    }

    // For ISAs with packed work item IDs, only one VGPR is used and the
    // (X,Y,Z) dimensions are packed into a single 32-bit VGPR with 10-bits
    // for each dimension
    if (packed_work_item_id) {
        TheGpuISA::VecRegContainerU32 raw_vgpr;
        TheGpuISA::VecElemU32 *packed_vgpr
            = raw_vgpr.as<TheGpuISA::VecElemU32>();

        uint32_t physVgprIdx = computeUnit->registerManager
            ->mapVgpr(this, regInitIdx);
        // X always occupies bits 9:0; Y (19:10) and Z (29:20) only if
        // the corresponding enable bits are set.
        for (int lane = 0; lane < workItemId[0].size(); ++lane) {
            packed_vgpr[lane] = workItemId[0][lane] & 0x3ff;
        }
        if (task->vgprBitEnabled(1)) {
            for (int lane = 0; lane < workItemId[1].size(); ++lane) {
                packed_vgpr[lane] |= ((workItemId[1][lane] & 0x3ff) << 10);
            }
        }
        if (task->vgprBitEnabled(2)) {
            for (int lane = 0; lane < workItemId[2].size(); ++lane) {
                packed_vgpr[lane] |= ((workItemId[2][lane] & 0x3ff) << 20);
            }
        }
        computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);

        return;
    }

    // For ISAs with non-packed work item IDs, map and initialize one VGPR
    // per dimensions. Do this by iterating over all the init fields and
    // checking which bits are enabled.
    for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
        if (task->vgprBitEnabled(en_bit)) {
            uint32_t physVgprIdx = 0;
            TheGpuISA::VecRegContainerU32 raw_vgpr;

            switch (en_bit) {
              case WorkitemIdX:
                {
                    physVgprIdx = computeUnit->registerManager
                        ->mapVgpr(this, regInitIdx);
                    TheGpuISA::VecElemU32 *vgpr_x
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[0].size(); ++lane) {
                        vgpr_x[lane] = workItemId[0][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
              case WorkitemIdY:
                {
                    physVgprIdx = computeUnit->registerManager
                        ->mapVgpr(this, regInitIdx);
                    TheGpuISA::VecElemU32 *vgpr_y
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[1].size(); ++lane) {
                        vgpr_y[lane] = workItemId[1][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
              case WorkitemIdZ:
                {
                    physVgprIdx = computeUnit->registerManager->
                        mapVgpr(this, regInitIdx);
                    TheGpuISA::VecElemU32 *vgpr_z
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[2].size(); ++lane) {
                        vgpr_z[lane] = workItemId[2][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
            }
        }
    }
}
|
|
|
|
/**
 * Record the number of vector and scalar registers this wavefront may
 * use; consumed later (e.g. when sizing vecReads in start()).
 */
void
Wavefront::resizeRegFiles(int num_vregs, int num_sregs)
{
    maxVgprs = num_vregs;
    maxSgprs = num_sregs;
}
|
|
|
|
// Nothing to release explicitly; members clean themselves up.
Wavefront::~Wavefront() = default;
|
|
|
|
/**
 * Transition this wavefront to a new status, maintaining the compute
 * unit's idle-wave count when the idle-CU timeout is enabled. When the
 * last wave on the CU goes idle, record the tick; when a wave leaves an
 * idle state after the whole CU was idle, panic if the idle period
 * reached the configured timeout (used to detect deadlocks).
 */
void
Wavefront::setStatus(status_e newStatus)
{
    if (computeUnit->idleCUTimeout > 0) {
        // Wavefront's status transitions to stalled or stopped
        if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
             newStatus == S_WAITCNT || newStatus == S_BARRIER) &&
            (status != newStatus)) {
            computeUnit->idleWfs++;
            assert(computeUnit->idleWfs <=
                   (computeUnit->shader->n_wf * computeUnit->numVectorALUs));
            // All waves on this CU are now idle; start the idle clock.
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                lastNonIdleTick = curTick();
            }
            // Wavefront's status transitions to an active state (from
            // a stopped or stalled state)
        } else if ((status == S_STOPPED || status == S_STALLED ||
                    status == S_WAITCNT || status == S_BARRIER) &&
                   (status != newStatus)) {
            // if all WFs in the CU were idle then check if the idleness
            // period exceeded the timeout threshold
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                panic_if((curTick() - lastNonIdleTick) >=
                         computeUnit->idleCUTimeout,
                         "CU%d has been idle for %d ticks at tick %d",
                         computeUnit->cu_id, computeUnit->idleCUTimeout,
                         curTick());
            }
            computeUnit->idleWfs--;
            assert(computeUnit->idleWfs >= 0);
        }
    }
    status = newStatus;
}
|
|
|
|
/**
 * Launch the wavefront: assign its dynamic ID and starting PC, mark it
 * running, and size the per-VGPR read counters used for RAW-distance
 * statistics (requires resizeRegFiles() to have set maxVgprs).
 */
void
Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc)
{
    wfDynId = _wf_dyn_id;
    _pc = init_pc;

    status = S_RUNNING;

    vecReads.resize(maxVgprs, 0);
}
|
|
|
|
bool
|
|
Wavefront::isGmInstruction(GPUDynInstPtr ii)
|
|
{
|
|
if (ii->isGlobalMem() ||
|
|
(ii->isFlat() && ii->executedAs() == enums::SC_GLOBAL)) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isLmInstruction(GPUDynInstPtr ii)
|
|
{
|
|
if (ii->isLocalMem() ||
|
|
(ii->isFlat() && ii->executedAs() == enums::SC_GROUP)) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstSleep()
|
|
{
|
|
if (instructionBuffer.empty())
|
|
return false;
|
|
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (ii->isSleep()) {
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstWaitcnt()
|
|
{
|
|
if (instructionBuffer.empty())
|
|
return false;
|
|
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (ii->isWaitcnt()) {
|
|
// waitcnt is a scalar
|
|
assert(ii->isScalar());
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstScalarALU()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
|
|
|| ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
|
|
(ii->isKernArgSeg() && ii->isLoad()))) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstVectorALU()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
|
|
ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
|
|
|| (ii->isKernArgSeg() && ii->isLoad()))) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstBarrier()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isBarrier()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstGMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstScalarMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstLMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isLocalMem()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstPrivMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isPrivateSeg()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstFlatMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isFlat()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::stopFetch()
|
|
{
|
|
for (auto it : instructionBuffer) {
|
|
GPUDynInstPtr ii = it;
|
|
if (ii->isReturn() || ii->isBranch() ||
|
|
ii->isEndOfKernel()) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/**
 * Release the execution unit reserved for the current instruction by
 * clearing the unit ID (-1 means no unit is held).
 */
void
Wavefront::freeResources()
{
    execUnitId = -1;
}
|
|
|
|
/**
 * Sanity check: none of the in-pipe or outstanding memory request
 * counters may ever go negative; panic with a full dump if one does.
 */
void Wavefront::validateRequestCounters()
{
    panic_if(wrGmReqsInPipe < 0 || rdGmReqsInPipe < 0 ||
             wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 ||
             outstandingReqs < 0,
             "Negative requests in pipe for WF%d for slot%d"
             " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
             " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
             " Outstanding Reqs=%d\n",
             wfDynId, wfSlotId, simdId, rdGmReqsInPipe, wrGmReqsInPipe,
             rdLmReqsInPipe, wrLmReqsInPipe, outstandingReqs);
}
|
|
|
|
void
|
|
Wavefront::reserveGmResource(GPUDynInstPtr ii)
|
|
{
|
|
if (!ii->isScalar()) {
|
|
if (ii->isLoad()) {
|
|
rdGmReqsInPipe++;
|
|
} else if (ii->isStore()) {
|
|
wrGmReqsInPipe++;
|
|
} else if (ii->isAtomic() || ii->isMemSync()) {
|
|
rdGmReqsInPipe++;
|
|
wrGmReqsInPipe++;
|
|
} else {
|
|
panic("Invalid memory operation!\n");
|
|
}
|
|
execUnitId = globalMem;
|
|
} else {
|
|
if (ii->isLoad()) {
|
|
scalarRdGmReqsInPipe++;
|
|
} else if (ii->isStore()) {
|
|
scalarWrGmReqsInPipe++;
|
|
} else if (ii->isAtomic() || ii->isMemSync()) {
|
|
scalarWrGmReqsInPipe++;
|
|
scalarRdGmReqsInPipe++;
|
|
} else {
|
|
panic("Invalid memory operation!\n");
|
|
}
|
|
execUnitId = scalarMem;
|
|
}
|
|
}
|
|
|
|
void
|
|
Wavefront::reserveLmResource(GPUDynInstPtr ii)
|
|
{
|
|
fatal_if(ii->isScalar(),
|
|
"Scalar instructions can not access Shared memory!!!");
|
|
if (ii->isLoad()) {
|
|
rdLmReqsInPipe++;
|
|
} else if (ii->isStore()) {
|
|
wrLmReqsInPipe++;
|
|
} else if (ii->isAtomic() || ii->isMemSync()) {
|
|
wrLmReqsInPipe++;
|
|
rdLmReqsInPipe++;
|
|
} else {
|
|
panic("Invalid memory operation!\n");
|
|
}
|
|
execUnitId = localMem;
|
|
}
|
|
|
|
/**
 * Reserve the execution unit(s) needed by the oldest buffered
 * instruction and bump the matching request-in-pipe counters. Flat
 * instructions reserve BOTH the local and global memory pipes (the
 * target segment is not known until execution). Returns the list of
 * reserved unit IDs; callers use it for debugging/assertions only.
 */
std::vector<int>
Wavefront::reserveResources()
{
    // vector of execution unit IDs to return to schedule stage
    // this return is only used for debugging and an assertion...
    std::vector<int> execUnitIds;

    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);

    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn() || ii->isEndOfKernel()) {
        if (!ii->isScalar()) {
            execUnitId = simdId;
        } else {
            execUnitId = scalarAluGlobalIdx;
        }
        // this is to enforce a fixed number of cycles per issue slot per SIMD
    } else if (ii->isBarrier()) {
        execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId;
    } else if (ii->isFlat()) {
        assert(!ii->isScalar());
        reserveLmResource(ii);
        // add execUnitId, reserved by reserveLmResource, to the list before
        // it is overwritten by reserveGmResource
        execUnitIds.push_back(execUnitId);
        flatLmUnitId = execUnitId;
        reserveGmResource(ii);
        flatGmUnitId = execUnitId;
        execUnitIds.push_back(flatGmUnitId);
        // Both flat unit IDs are recorded above; clear execUnitId so the
        // generic push below does not add a duplicate.
        execUnitId = -1;
    } else if (ii->isGlobalMem()) {
        reserveGmResource(ii);
    } else if (ii->isLocalMem()) {
        reserveLmResource(ii);
    } else if (ii->isPrivateSeg()) {
        fatal_if(ii->isScalar(),
                 "Scalar instructions can not access Private memory!!!");
        reserveGmResource(ii);
    } else {
        panic("reserveResources -> Couldn't process op!\n");
    }

    if (execUnitId != -1) {
        execUnitIds.push_back(execUnitId);
    }
    assert(execUnitIds.size());
    return execUnitIds;
}
|
|
|
|
void
|
|
Wavefront::exec()
|
|
{
|
|
// ---- Exit if wavefront is inactive ----------------------------- //
|
|
|
|
if (status == S_STOPPED || status == S_RETURNING ||
|
|
status==S_STALLED || instructionBuffer.empty()) {
|
|
return;
|
|
}
|
|
|
|
if (status == S_WAITCNT) {
|
|
/**
|
|
* if this wave is in S_WAITCNT state, then
|
|
* it should enter exec() precisely one time
|
|
* before the waitcnts are satisfied, in order
|
|
* to execute the waitcnt instruction itself
|
|
* thus we assert that the waitcnt is the
|
|
* oldest instruction. if we enter exec() with
|
|
* active waitcnts, and we're not executing
|
|
* the waitcnt instruction, something must be
|
|
* wrong
|
|
*/
|
|
assert(isOldestInstWaitcnt());
|
|
}
|
|
|
|
// Get current instruction
|
|
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
const Addr old_pc = pc();
|
|
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
|
|
"(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
|
|
wfDynId, ii->disassemble(), old_pc, ii->seqNum());
|
|
|
|
ii->execute(ii);
|
|
// delete the dynamic instruction from the pipeline map
|
|
computeUnit->deleteFromPipeMap(this);
|
|
// update the instruction stats in the CU
|
|
computeUnit->updateInstStats(ii);
|
|
|
|
// inform VRF of instruction execution to schedule write-back
|
|
// and scoreboard ready for registers
|
|
if (!ii->isScalar()) {
|
|
computeUnit->rfc[simdId]->waveExecuteInst(this, ii);
|
|
computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
|
|
}
|
|
computeUnit->srf[simdId]->waveExecuteInst(this, ii);
|
|
|
|
computeUnit->shader->incVectorInstSrcOperand(ii->numSrcVecRegOperands());
|
|
computeUnit->shader->incVectorInstDstOperand(ii->numDstVecRegOperands());
|
|
computeUnit->stats.numInstrExecuted++;
|
|
stats.numInstrExecuted++;
|
|
computeUnit->instExecPerSimd[simdId]++;
|
|
computeUnit->stats.execRateDist.sample(
|
|
computeUnit->stats.totalCycles.value() -
|
|
computeUnit->lastExecCycle[simdId]);
|
|
computeUnit->lastExecCycle[simdId] =
|
|
computeUnit->stats.totalCycles.value();
|
|
|
|
if (lastInstExec) {
|
|
computeUnit->stats.instInterleave[simdId].
|
|
sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
|
|
}
|
|
lastInstExec = computeUnit->instExecPerSimd[simdId];
|
|
|
|
// want to track:
|
|
// number of reads that occur per value written
|
|
|
|
// vector RAW dependency tracking
|
|
for (const auto& srcVecOp : ii->srcVecRegOperands()) {
|
|
for (const auto& virtIdx : srcVecOp.virtIndices()) {
|
|
// This check should never fail, but to be safe we check
|
|
if (rawDist.find(virtIdx) != rawDist.end()) {
|
|
stats.vecRawDistance.sample(stats.numInstrExecuted.value() -
|
|
rawDist[virtIdx]);
|
|
}
|
|
// increment number of reads to this register
|
|
vecReads[virtIdx]++;
|
|
}
|
|
}
|
|
|
|
for (const auto& dstVecOp : ii->dstVecRegOperands()) {
|
|
for (const auto& virtIdx : dstVecOp.virtIndices()) {
|
|
// rawDist is set on writes, but will not be set for the first
|
|
// write to each physical register
|
|
if (rawDist.find(virtIdx) != rawDist.end()) {
|
|
// Sample the number of reads that were performed
|
|
stats.readsPerWrite.sample(vecReads[virtIdx]);
|
|
}
|
|
// on a write, reset count of reads to 0
|
|
vecReads[virtIdx] = 0;
|
|
|
|
rawDist[virtIdx] = stats.numInstrExecuted.value();
|
|
}
|
|
}
|
|
|
|
if (pc() == old_pc) {
|
|
// PC not modified by instruction, proceed to next
|
|
_gpuISA.advancePC(ii);
|
|
instructionBuffer.pop_front();
|
|
} else {
|
|
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
|
|
computeUnit->cu_id, simdId, wfSlotId, wfDynId,
|
|
ii->disassemble());
|
|
discardFetch();
|
|
}
|
|
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
|
|
computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());
|
|
|
|
if (computeUnit->shader->hsail_mode==Shader::SIMT) {
|
|
const int num_active_lanes = execMask().count();
|
|
computeUnit->stats.controlFlowDivergenceDist.sample(num_active_lanes);
|
|
computeUnit->stats.numVecOpsExecuted += num_active_lanes;
|
|
|
|
if (ii->isF16() && ii->isALU()) {
|
|
if (ii->isF32() || ii->isF64()) {
|
|
fatal("Instruction is tagged as both (1) F16, and (2)"
|
|
"either F32 or F64.");
|
|
}
|
|
computeUnit->stats.numVecOpsExecutedF16 += num_active_lanes;
|
|
if (ii->isFMA()) {
|
|
computeUnit->stats.numVecOpsExecutedFMA16 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
else if (ii->isMAC()) {
|
|
computeUnit->stats.numVecOpsExecutedMAC16 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
else if (ii->isMAD()) {
|
|
computeUnit->stats.numVecOpsExecutedMAD16 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
}
|
|
if (ii->isF32() && ii->isALU()) {
|
|
if (ii->isF16() || ii->isF64()) {
|
|
fatal("Instruction is tagged as both (1) F32, and (2)"
|
|
"either F16 or F64.");
|
|
}
|
|
computeUnit->stats.numVecOpsExecutedF32 += num_active_lanes;
|
|
if (ii->isFMA()) {
|
|
computeUnit->stats.numVecOpsExecutedFMA32 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
else if (ii->isMAC()) {
|
|
computeUnit->stats.numVecOpsExecutedMAC32 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
else if (ii->isMAD()) {
|
|
computeUnit->stats.numVecOpsExecutedMAD32 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
}
|
|
if (ii->isF64() && ii->isALU()) {
|
|
if (ii->isF16() || ii->isF32()) {
|
|
fatal("Instruction is tagged as both (1) F64, and (2)"
|
|
"either F16 or F32.");
|
|
}
|
|
computeUnit->stats.numVecOpsExecutedF64 += num_active_lanes;
|
|
if (ii->isFMA()) {
|
|
computeUnit->stats.numVecOpsExecutedFMA64 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
else if (ii->isMAC()) {
|
|
computeUnit->stats.numVecOpsExecutedMAC64 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
else if (ii->isMAD()) {
|
|
computeUnit->stats.numVecOpsExecutedMAD64 += num_active_lanes;
|
|
computeUnit->stats.numVecOpsExecutedTwoOpFP
|
|
+= num_active_lanes;
|
|
}
|
|
}
|
|
if (isGmInstruction(ii)) {
|
|
computeUnit->stats.activeLanesPerGMemInstrDist.sample(
|
|
num_active_lanes);
|
|
} else if (isLmInstruction(ii)) {
|
|
computeUnit->stats.activeLanesPerLMemInstrDist.sample(
|
|
num_active_lanes);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* we return here to avoid spurious errors related to flat insts
|
|
* and their address segment resolution.
|
|
*/
|
|
if (execMask().none() && ii->needsToken()) {
|
|
computeUnit->getTokenManager()->recvTokens(1);
|
|
return;
|
|
}
|
|
|
|
// Update Vector ALU pipeline and other resources
|
|
bool flat_as_gm = false;
|
|
bool flat_as_lm = false;
|
|
if (ii->isFlat()) {
|
|
flat_as_gm = (ii->executedAs() == enums::SC_GLOBAL) ||
|
|
(ii->executedAs() == enums::SC_PRIVATE);
|
|
flat_as_lm = (ii->executedAs() == enums::SC_GROUP);
|
|
}
|
|
|
|
// Single precision ALU or Branch or Return or Special instruction
|
|
// Note, we use the same timing regardless of SP or DP ALU operation.
|
|
if (ii->isALU() || ii->isSpecialOp() ||
|
|
ii->isBranch() || ii->isNop() ||
|
|
(ii->isKernArgSeg() && ii->isLoad()) ||
|
|
ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
|
|
// this is to enforce a fixed number of cycles per issue slot per SIMD
|
|
if (!ii->isScalar()) {
|
|
computeUnit->vectorALUs[simdId].set(computeUnit->
|
|
cyclesToTicks(computeUnit->issuePeriod));
|
|
} else {
|
|
computeUnit->scalarALUs[scalarAlu].set(computeUnit->
|
|
cyclesToTicks(computeUnit->issuePeriod));
|
|
}
|
|
// Barrier on Scalar ALU
|
|
} else if (ii->isBarrier()) {
|
|
computeUnit->scalarALUs[scalarAlu].set(computeUnit->
|
|
cyclesToTicks(computeUnit->issuePeriod));
|
|
// GM or Flat as GM Load
|
|
} else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
|
|
if (!ii->isScalar()) {
|
|
computeUnit->vrfToGlobalMemPipeBus.set(
|
|
computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
|
|
computeUnit->vectorGlobalMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
|
|
computeUnit->vrf_gm_bus_latency;
|
|
} else {
|
|
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(computeUnit->srf_scm_bus_latency));
|
|
computeUnit->scalarMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
|
|
computeUnit->srf_scm_bus_latency;
|
|
}
|
|
// GM or Flat as GM Store
|
|
} else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
|
|
if (!ii->isScalar()) {
|
|
computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
|
|
computeUnit->vectorGlobalMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
|
|
(2 * computeUnit->vrf_gm_bus_latency);
|
|
} else {
|
|
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
|
|
computeUnit->scalarMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
|
|
(2 * computeUnit->srf_scm_bus_latency);
|
|
}
|
|
} else if ((ii->isAtomic() || ii->isMemSync()) &&
|
|
(ii->isGlobalMem() || flat_as_gm)) {
|
|
if (!ii->isScalar()) {
|
|
computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
|
|
computeUnit->vectorGlobalMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesVMemPerSimd[simdId] +=
|
|
(2 * computeUnit->vrf_gm_bus_latency);
|
|
} else {
|
|
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
|
|
computeUnit->scalarMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesScMemPerSimd[simdId] +=
|
|
(2 * computeUnit->srf_scm_bus_latency);
|
|
}
|
|
// LM or Flat as LM Load
|
|
} else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
|
|
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(computeUnit->vrf_lm_bus_latency));
|
|
computeUnit->vectorSharedMemUnit.
|
|
set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
|
|
computeUnit->vrf_lm_bus_latency;
|
|
// LM or Flat as LM Store
|
|
} else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
|
|
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
|
|
computeUnit->vectorSharedMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
|
|
(2 * computeUnit->vrf_lm_bus_latency);
|
|
// LM or Flat as LM, Atomic or MemFence
|
|
} else if ((ii->isAtomic() || ii->isMemSync()) &&
|
|
(ii->isLocalMem() || flat_as_lm)) {
|
|
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
|
|
computeUnit->vectorSharedMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->stats.instCyclesLdsPerSimd[simdId] +=
|
|
(2 * computeUnit->vrf_lm_bus_latency);
|
|
} else {
|
|
panic("Bad instruction type!\n");
|
|
}
|
|
}
|
|
|
|
GPUDynInstPtr
Wavefront::nextInstr()
{
    // Peek at the oldest instruction in this wave's instruction buffer.
    GPUDynInstPtr oldest_inst = instructionBuffer.front();

    // If the oldest instruction has not yet been dispatched into the
    // pipeline (i.e., it is absent from the CU's pipeline map), it is the
    // one to consider for readiness.
    if (computeUnit->pipeMap.find(oldest_inst->seqNum()) ==
        computeUnit->pipeMap.end()) {
        return oldest_inst;
    }

    // The oldest instruction was already dispatched in the schedule stage;
    // check the next-oldest instead, if one exists.
    if (instructionBuffer.size() > 1) {
        return *(instructionBuffer.begin() + 1);
    }

    // No younger instruction is buffered to check.
    return nullptr;
}
|
|
|
|
void
|
|
Wavefront::discardFetch()
|
|
{
|
|
instructionBuffer.clear();
|
|
dropFetch |= pendingFetch;
|
|
|
|
/**
|
|
* clear the fetch buffer for this wave in order to
|
|
* remove any stale inst data
|
|
*/
|
|
computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId);
|
|
}
|
|
|
|
bool
|
|
Wavefront::waitCntsSatisfied()
|
|
{
|
|
// Both vmWaitCnt && lgkmWaitCnt uninitialized means
|
|
// waitCnt instruction has been dispatched but not executed yet: next
|
|
// instruction should be blocked until waitCnt is executed.
|
|
if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) {
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* If we reach here, that means an s_waitcnt instruction was executed
|
|
* and the waitcnts are set by the execute method. Check if waitcnts
|
|
* are satisfied.
|
|
*/
|
|
if (vmWaitCnt != -1) {
|
|
if (vmemInstsIssued > vmWaitCnt) {
|
|
// vmWaitCnt not satisfied
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (expWaitCnt != -1) {
|
|
if (expInstsIssued > expWaitCnt) {
|
|
// expWaitCnt not satisfied
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (lgkmWaitCnt != -1) {
|
|
if (lgkmInstsIssued > lgkmWaitCnt) {
|
|
// lgkmWaitCnt not satisfied
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// if we get here all outstanding waitcnts must
|
|
// be satisfied, so we resume normal operation
|
|
clearWaitCnts();
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
Wavefront::sleepDone()
{
    assert(status == S_STALLED_SLEEP);

    // if the sleep count has not been set, then the sleep instruction has
    // not been executed yet, so we report the sleep as not done (return
    // false) without setting the wavefront status
    if (sleepCnt == 0)
        return false;

    sleepCnt--;
    // still cycles left to sleep through
    if (sleepCnt != 0)
        return false;

    // countdown has expired: wake the wave
    status = S_RUNNING;
    return true;
}
|
|
|
|
void
Wavefront::setSleepTime(int sleep_time)
{
    // a new sleep may only be programmed once the previous one has fully
    // drained (sleepDone() counts sleepCnt back down to zero)
    assert(sleepCnt == 0);
    sleepCnt = sleep_time;
}
|
|
|
|
void
|
|
Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
|
|
{
|
|
// the scoreboard should have set the status
|
|
// to S_WAITCNT once a waitcnt instruction
|
|
// was marked as ready
|
|
assert(status == S_WAITCNT);
|
|
|
|
// waitcnt instruction shouldn't be sending
|
|
// negative counts
|
|
assert(vm_wait_cnt >= 0);
|
|
assert(exp_wait_cnt >= 0);
|
|
assert(lgkm_wait_cnt >= 0);
|
|
// waitcnts are a max of 15 because we have
|
|
// only 1 nibble (4 bits) to set the counts
|
|
assert(vm_wait_cnt <= 0xf);
|
|
assert(exp_wait_cnt <= 0x7);
|
|
assert(lgkm_wait_cnt <= 0x1f);
|
|
|
|
/**
|
|
* prior waitcnts should be satisfied,
|
|
* at which time the WF resets them
|
|
* back to -1, indicating they are no
|
|
* longer active
|
|
*/
|
|
assert(vmWaitCnt == -1);
|
|
assert(expWaitCnt == -1);
|
|
assert(lgkmWaitCnt == -1);
|
|
|
|
/**
|
|
* if the instruction encoding
|
|
* indicates a waitcnt of 0xf,
|
|
* that means the waitcnt is
|
|
* not being used
|
|
*/
|
|
if (vm_wait_cnt != 0xf)
|
|
vmWaitCnt = vm_wait_cnt;
|
|
|
|
if (exp_wait_cnt != 0x7)
|
|
expWaitCnt = exp_wait_cnt;
|
|
|
|
if (lgkm_wait_cnt != 0x1f)
|
|
lgkmWaitCnt = lgkm_wait_cnt;
|
|
}
|
|
|
|
void
|
|
Wavefront::clearWaitCnts()
|
|
{
|
|
// reset the waitcnts back to
|
|
// -1, indicating they are no
|
|
// longer valid
|
|
vmWaitCnt = -1;
|
|
expWaitCnt = -1;
|
|
lgkmWaitCnt = -1;
|
|
|
|
// resume running normally
|
|
status = S_RUNNING;
|
|
}
|
|
|
|
void
Wavefront::incVMemInstsIssued()
{
    // one more outstanding vector memory instruction; compared against
    // vmWaitCnt in waitCntsSatisfied()
    ++vmemInstsIssued;
}
|
|
|
|
void
Wavefront::incExpInstsIssued()
{
    // one more outstanding export instruction; compared against
    // expWaitCnt in waitCntsSatisfied()
    ++expInstsIssued;
}
|
|
|
|
void
Wavefront::incLGKMInstsIssued()
{
    // one more outstanding LGKM (LDS/GDS/Konstant/Message) instruction;
    // compared against lgkmWaitCnt in waitCntsSatisfied()
    ++lgkmInstsIssued;
}
|
|
|
|
void
Wavefront::decVMemInstsIssued()
{
    // one fewer outstanding vector memory instruction — presumably called
    // when such an instruction completes; confirm at call sites
    --vmemInstsIssued;
}
|
|
|
|
void
Wavefront::decExpInstsIssued()
{
    // one fewer outstanding export instruction — presumably called when
    // such an instruction completes; confirm at call sites
    --expInstsIssued;
}
|
|
|
|
void
Wavefront::decLGKMInstsIssued()
{
    // one fewer outstanding LGKM instruction — presumably called when
    // such an instruction completes; confirm at call sites
    --lgkmInstsIssued;
}
|
|
|
|
// Return this wavefront's current program counter.
Addr
Wavefront::pc() const
{
    return _pc;
}
|
|
|
|
// Overwrite this wavefront's program counter (e.g., after a taken branch).
void
Wavefront::pc(Addr new_pc)
{
    _pc = new_pc;
}
|
|
|
|
// Return a mutable reference to the wave's execution mask so callers can
// both inspect and modify the per-lane active bits.
VectorMask&
Wavefront::execMask()
{
    return _execMask;
}
|
|
|
|
// Return whether the given lane is active in the wave's execution mask.
bool
Wavefront::execMask(int lane) const
{
    return _execMask[lane];
}
|
|
|
|
void
|
|
Wavefront::freeRegisterFile()
|
|
{
|
|
/* clear busy registers */
|
|
for (int i=0; i < maxVgprs; i++) {
|
|
int vgprIdx = computeUnit->registerManager->mapVgpr(this, i);
|
|
computeUnit->vrf[simdId]->markReg(vgprIdx, false);
|
|
}
|
|
|
|
/* Free registers used by this wavefront */
|
|
uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) %
|
|
computeUnit->vrf[simdId]->numRegs();
|
|
computeUnit->registerManager->vrfPoolMgrs[simdId]->
|
|
freeRegion(startVgprIndex, endIndex);
|
|
}
|
|
|
|
void
|
|
Wavefront::computeActualWgSz(HSAQueueEntry *task)
|
|
{
|
|
actualWgSzTotal = 1;
|
|
for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
|
|
actualWgSz[d] = std::min(workGroupSz[d], gridSz[d]
|
|
- task->wgId(d) * workGroupSz[d]);
|
|
actualWgSzTotal *= actualWgSz[d];
|
|
}
|
|
}
|
|
|
|
// Assign this wave to a barrier slot. InvalidID is permitted to detach the
// wave from any barrier; otherwise the ID must be a valid slot on the CU.
void
Wavefront::barrierId(int bar_id)
{
    assert(bar_id >= WFBarrier::InvalidID);
    assert(bar_id < computeUnit->numBarrierSlots());
    barId = bar_id;
}
|
|
|
|
// Return the barrier slot this wave is assigned to (WFBarrier::InvalidID
// if none).
int
Wavefront::barrierId() const
{
    return barId;
}
|
|
|
|
// Return whether this wave currently holds a valid barrier slot.
bool
Wavefront::hasBarrier() const
{
    return barId > WFBarrier::InvalidID;
}
|
|
|
|
// Detach this wave from its barrier slot by resetting the ID to invalid.
void
Wavefront::releaseBarrier()
{
    barId = WFBarrier::InvalidID;
}
|
|
|
|
// Per-wavefront-slot statistics group. Scalar stats are registered via
// ADD_STAT in the initializer list; the two distribution stats get their
// bucket ranges (min, max, bucket size) in the constructor body.
Wavefront::WavefrontStats::WavefrontStats(statistics::Group *parent)
    : statistics::Group(parent),
      ADD_STAT(numInstrExecuted,
               "number of instructions executed by this WF slot"),
      ADD_STAT(schCycles, "number of cycles spent in schedule stage"),
      ADD_STAT(schStalls, "number of cycles WF is stalled in SCH stage"),
      ADD_STAT(schRfAccessStalls, "number of cycles wave selected in SCH but "
               "RF denied adding instruction"),
      ADD_STAT(schResourceStalls, "number of cycles stalled in sch by resource"
               " not available"),
      ADD_STAT(schOpdNrdyStalls, "number of cycles stalled in sch waiting for "
               "RF reads to complete"),
      ADD_STAT(schLdsArbStalls,
               "number of cycles wave stalled due to LDS-VRF arbitration"),
      // FIXME: the name of the WF needs to be unique
      ADD_STAT(numTimesBlockedDueWAXDependencies, "number of times the wf's "
               "instructions are blocked due to WAW or WAR dependencies"),
      // FIXME: the name of the WF needs to be unique
      ADD_STAT(numTimesBlockedDueRAWDependencies, "number of times the wf's "
               "instructions are blocked due to RAW dependencies"),
      ADD_STAT(vecRawDistance,
               "Count of RAW distance in dynamic instructions for this WF"),
      ADD_STAT(readsPerWrite, "Count of Vector reads per write for this WF")
{
    // RAW distances binned from 0 to 20 in steps of 1 instruction
    vecRawDistance.init(0, 20, 1);
    // reads-per-write binned from 0 to 4 in steps of 1
    readsPerWrite.init(0, 4, 1);
}
|
|
|
|
} // namespace gem5
|