Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912 Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com> Reviewed-by: Jason Lowe-Power <power.jg@gmail.com> Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com> Tested-by: kokoro <noreply+kokoro@google.com>
1417 lines
50 KiB
C++
1417 lines
50 KiB
C++
/*
|
|
* Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* For use for simulation and test purposes only
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from this
|
|
* software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "gpu-compute/wavefront.hh"
|
|
|
|
#include "debug/GPUExec.hh"
|
|
#include "debug/GPUInitAbi.hh"
|
|
#include "debug/WavefrontStack.hh"
|
|
#include "gpu-compute/compute_unit.hh"
|
|
#include "gpu-compute/gpu_dyn_inst.hh"
|
|
#include "gpu-compute/scalar_register_file.hh"
|
|
#include "gpu-compute/shader.hh"
|
|
#include "gpu-compute/simple_pool_manager.hh"
|
|
#include "gpu-compute/vector_register_file.hh"
|
|
|
|
Wavefront*
WavefrontParams::create()
{
    // Factory hook invoked by the python configuration system to
    // instantiate the Wavefront SimObject from its parameter struct.
    Wavefront *wf = new Wavefront(this);
    return wf;
}
|
|
|
|
Wavefront::Wavefront(const Params *p)
    : SimObject(p), wfSlotId(p->wf_slot_id), simdId(p->simdId),
      maxIbSize(p->max_ib_size), _gpuISA(*this),
      vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1)
{
    // execution/scheduling state: the wave starts stopped, bound to no
    // execution unit, and with no trace activity recorded
    status = S_STOPPED;
    execUnitId = -1;
    lastTrace = 0;
    lastNonIdleTick = 0;
    lastInstExec = 0;

    // no physical registers are reserved until the wave is dispatched
    reservedVectorRegs = 0;
    reservedScalarRegs = 0;
    startVgprIndex = 0;
    startSgprIndex = 0;
    maxVgprs = 0;
    maxSgprs = 0;

    // all memory request counters (outstanding and in-pipe, vector and
    // scalar, global and local) begin at zero
    outstandingReqs = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;
    scalarRdGmReqsInPipe = 0;
    scalarWrGmReqsInPipe = 0;
    scalarOutstandingReqsRdGm = 0;
    scalarOutstandingReqsWrGm = 0;

    // barrier state
    barrierCnt = 0;
    oldBarrierCnt = 0;
    stalledAtBarrier = false;

    // no LDS space is owned until the workgroup is dispatched
    ldsChunk = nullptr;

    // fetch state
    pendingFetch = false;
    dropFetch = false;

    // memory-trace bookkeeping
    memTraceBusy = 0;
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;

    // per-lane state is sized by the configured wavefront width
    oldVgpr.resize(p->wf_size);
    oldDgpr.resize(p->wf_size);
    lastAddr.resize(p->wf_size);
    workItemFlatId.resize(p->wf_size);
    barCnt.resize(p->wf_size);
    for (auto &dim_ids : workItemId) {
        dim_ids.resize(p->wf_size);
    }

    // all lanes start active
    _execMask.set();

    // RAW-distance / read-per-write statistics tracking starts empty
    rawDist.clear();
    vecReads.clear();
}
|
|
|
|
void
Wavefront::regStats()
{
    // Register all per-wavefront-slot statistics with the stats system.
    // Registration order here determines ordering in the stats dump, so
    // entries are kept in their original order.
    SimObject::regStats();

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(name() + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(name() + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    numInstrExecuted
        .name(name() + ".num_instr_executed")
        .desc("number of instructions executed by this WF slot")
        ;

    schCycles
        .name(name() + ".sch_cycles")
        .desc("number of cycles spent in schedule stage")
        ;

    schStalls
        .name(name() + ".sch_stalls")
        .desc("number of cycles WF is stalled in SCH stage")
        ;

    schRfAccessStalls
        .name(name() + ".sch_rf_access_stalls")
        .desc("number of cycles wave selected in SCH but RF denied adding "
              "instruction")
        ;

    schResourceStalls
        .name(name() + ".sch_resource_stalls")
        .desc("number of cycles stalled in sch by resource not available")
        ;

    schOpdNrdyStalls
        .name(name() + ".sch_opd_nrdy_stalls")
        .desc("number of cycles stalled in sch waiting for RF reads to "
              "complete")
        ;

    schLdsArbStalls
        .name(name() + ".sch_lds_arb_stalls")
        .desc("number of cycles wave stalled due to LDS-VRF arbitration")
        ;

    // distribution over buckets [0, 20) with bucket width 1
    vecRawDistance
        .init(0,20,1)
        .name(name() + ".vec_raw_distance")
        .desc("Count of RAW distance in dynamic instructions for this WF")
        ;

    // distribution over buckets [0, 4) with bucket width 1
    readsPerWrite
        .init(0,4,1)
        .name(name() + ".vec_reads_per_write")
        .desc("Count of Vector reads per write for this WF")
        ;
}
|
|
|
|
void
|
|
Wavefront::init()
|
|
{
|
|
reservedVectorRegs = 0;
|
|
reservedScalarRegs = 0;
|
|
startVgprIndex = 0;
|
|
startSgprIndex = 0;
|
|
|
|
scalarAlu = computeUnit->mapWaveToScalarAlu(this);
|
|
scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this);
|
|
globalMem = computeUnit->mapWaveToGlobalMem(this);
|
|
localMem = computeUnit->mapWaveToLocalMem(this);
|
|
scalarMem = computeUnit->mapWaveToScalarMem(this);
|
|
}
|
|
|
|
void
|
|
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
|
|
{
|
|
int regInitIdx = 0;
|
|
|
|
// iterate over all the init fields and check which
|
|
// bits are enabled
|
|
for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) {
|
|
|
|
if (task->sgprBitEnabled(en_bit)) {
|
|
int physSgprIdx = 0;
|
|
uint32_t wiCount = 0;
|
|
uint32_t firstWave = 0;
|
|
int orderedAppendTerm = 0;
|
|
int numWfsInWg = 0;
|
|
uint32_t finalValue = 0;
|
|
Addr host_disp_pkt_addr = task->hostDispPktAddr();
|
|
Addr kernarg_addr = task->kernargAddr();
|
|
Addr hidden_priv_base(0);
|
|
|
|
switch (en_bit) {
|
|
case PrivateSegBuf:
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
computeUnit->srf[simdId]->write(physSgprIdx,
|
|
task->amdQueue.scratch_resource_descriptor[0]);
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting PrivateSegBuffer: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx,
|
|
task->amdQueue.scratch_resource_descriptor[0]);
|
|
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
computeUnit->srf[simdId]->write(physSgprIdx,
|
|
task->amdQueue.scratch_resource_descriptor[1]);
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting PrivateSegBuffer: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx,
|
|
task->amdQueue.scratch_resource_descriptor[1]);
|
|
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
computeUnit->srf[simdId]->write(physSgprIdx,
|
|
task->amdQueue.scratch_resource_descriptor[2]);
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting PrivateSegBuffer: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx,
|
|
task->amdQueue.scratch_resource_descriptor[2]);
|
|
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
computeUnit->srf[simdId]->write(physSgprIdx,
|
|
task->amdQueue.scratch_resource_descriptor[3]);
|
|
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting PrivateSegBuffer: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx,
|
|
task->amdQueue.scratch_resource_descriptor[3]);
|
|
break;
|
|
case DispatchPtr:
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
computeUnit->srf[simdId]->write(physSgprIdx,
|
|
((uint32_t*)&host_disp_pkt_addr)[0]);
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting DispatchPtr: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx,
|
|
((uint32_t*)&host_disp_pkt_addr)[0]);
|
|
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
computeUnit->srf[simdId]->write(physSgprIdx,
|
|
((uint32_t*)&host_disp_pkt_addr)[1]);
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting DispatchPtr: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx,
|
|
((uint32_t*)&host_disp_pkt_addr)[1]);
|
|
|
|
++regInitIdx;
|
|
break;
|
|
case QueuePtr:
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
computeUnit->srf[simdId]->write(physSgprIdx,
|
|
((uint32_t*)&task->hostAMDQueueAddr)[0]);
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting QueuePtr: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx,
|
|
((uint32_t*)&task->hostAMDQueueAddr)[0]);
|
|
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
computeUnit->srf[simdId]->write(physSgprIdx,
|
|
((uint32_t*)&task->hostAMDQueueAddr)[1]);
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting QueuePtr: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx,
|
|
((uint32_t*)&task->hostAMDQueueAddr)[1]);
|
|
|
|
++regInitIdx;
|
|
break;
|
|
case KernargSegPtr:
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
computeUnit->srf[simdId]->write(physSgprIdx,
|
|
((uint32_t*)&kernarg_addr)[0]);
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting KernargSegPtr: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx,
|
|
((uint32_t*)kernarg_addr)[0]);
|
|
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
computeUnit->srf[simdId]->write(physSgprIdx,
|
|
((uint32_t*)&kernarg_addr)[1]);
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting KernargSegPtr: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx,
|
|
((uint32_t*)kernarg_addr)[1]);
|
|
|
|
++regInitIdx;
|
|
break;
|
|
case FlatScratchInit:
|
|
physSgprIdx
|
|
= computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
computeUnit->srf[simdId]->write(physSgprIdx,
|
|
(TheGpuISA::ScalarRegU32)(task->amdQueue
|
|
.scratch_backing_memory_location & 0xffffffff));
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting FlatScratch Addr: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx,
|
|
(TheGpuISA::ScalarRegU32)(task->amdQueue
|
|
.scratch_backing_memory_location & 0xffffffff));
|
|
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
// This vallue should be sizeof(DWORD) aligned, that is
|
|
// 4 byte aligned
|
|
computeUnit->srf[simdId]->write(physSgprIdx,
|
|
task->amdQueue.scratch_workitem_byte_size);
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting FlatScratch size: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx,
|
|
task->amdQueue.scratch_workitem_byte_size);
|
|
/**
|
|
* Since flat scratch init is needed for this kernel, this
|
|
* kernel is going to have flat memory instructions and we
|
|
* need to initialize the hidden private base for this queue.
|
|
* scratch_resource_descriptor[0] has this queue's scratch
|
|
* base address. scratch_backing_memory_location has the
|
|
* offset to this queue's scratch base address from the
|
|
* SH_HIDDEN_PRIVATE_BASE_VMID. Ideally, we only require this
|
|
* queue's scratch base address for address calculation
|
|
* (stored in scratch_resource_descriptor[0]). But that
|
|
* address calculation shoule be done by first finding the
|
|
* queue's scratch base address using the calculation
|
|
* "SH_HIDDEN_PRIVATE_BASE_VMID + offset". So, we initialize
|
|
* SH_HIDDEN_PRIVATE_BASE_VMID.
|
|
*
|
|
* For more details see:
|
|
* http://rocm-documentation.readthedocs.io/en/latest/
|
|
* ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
|
|
*
|
|
* https://github.com/ROCm-Developer-Tools/
|
|
* ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
|
|
* #flat-addressing
|
|
*/
|
|
hidden_priv_base =
|
|
(uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
|
|
(((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
|
|
& 0x000000000000ffff) << 32);
|
|
computeUnit->shader->initShHiddenPrivateBase(
|
|
hidden_priv_base,
|
|
task->amdQueue.scratch_backing_memory_location);
|
|
break;
|
|
case GridWorkgroupCountX:
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
wiCount = ((task->gridSize(0) +
|
|
task->wgSize(0) - 1) /
|
|
task->wgSize(0));
|
|
computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
|
|
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting num WG X: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx, wiCount);
|
|
break;
|
|
case GridWorkgroupCountY:
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
wiCount = ((task->gridSize(1) +
|
|
task->wgSize(1) - 1) /
|
|
task->wgSize(1));
|
|
computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
|
|
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting num WG Y: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx, wiCount);
|
|
break;
|
|
case GridWorkgroupCountZ:
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
wiCount = ((task->gridSize(2) +
|
|
task->wgSize(2) - 1) /
|
|
task->wgSize(2));
|
|
computeUnit->srf[simdId]->write(physSgprIdx, wiCount);
|
|
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting num WG Z: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx, wiCount);
|
|
break;
|
|
case WorkgroupIdX:
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
computeUnit->srf[simdId]->write(physSgprIdx,
|
|
workGroupId[0]);
|
|
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting WG ID X: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx, workGroupId[0]);
|
|
break;
|
|
case WorkgroupIdY:
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
computeUnit->srf[simdId]->write(physSgprIdx,
|
|
workGroupId[1]);
|
|
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting WG ID Y: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx, workGroupId[1]);
|
|
break;
|
|
case WorkgroupIdZ:
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
computeUnit->srf[simdId]->write(physSgprIdx,
|
|
workGroupId[2]);
|
|
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting WG ID Z: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
|
|
break;
|
|
case PrivSegWaveByteOffset:
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
/**
|
|
* the compute_tmpring_size_wavesize specifies the number of
|
|
* kB allocated per wavefront, hence the multiplication by
|
|
* 1024.
|
|
*
|
|
* to get the per wavefront offset into the scratch
|
|
* memory, we also multiply this by the wfId. the wfId stored
|
|
* in the Wavefront class, however, is the wave ID within the
|
|
* WG, whereas here we need the global WFID because the
|
|
* scratch space will be divided amongst all waves in the
|
|
* kernel. to get the global ID we multiply the WGID by
|
|
* the WG size, then add the WFID of the wave within its WG.
|
|
*/
|
|
computeUnit->srf[simdId]->write(physSgprIdx, 1024 *
|
|
(wgId * (wgSz / 64) + wfId) *
|
|
task->amdQueue.compute_tmpring_size_wavesize);
|
|
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting Private Seg Offset: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx,
|
|
1024 * (wgId * (wgSz / 64) + wfId) *
|
|
task->amdQueue.compute_tmpring_size_wavesize);
|
|
break;
|
|
case WorkgroupInfo:
|
|
firstWave = (wfId == 0) ? 1 : 0;
|
|
numWfsInWg = divCeil(wgSizeInWorkItems,
|
|
computeUnit->wfSize());
|
|
finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
|
|
finalValue |= (orderedAppendTerm << 6);
|
|
finalValue |= numWfsInWg;
|
|
physSgprIdx =
|
|
computeUnit->registerManager->mapSgpr(this, regInitIdx);
|
|
computeUnit->srf[simdId]->
|
|
write(physSgprIdx, finalValue);
|
|
|
|
++regInitIdx;
|
|
DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
|
|
"Setting WG Info: s[%d] = %x\n",
|
|
computeUnit->cu_id, simdId,
|
|
wfSlotId, wfDynId, physSgprIdx, finalValue);
|
|
break;
|
|
default:
|
|
fatal("SGPR enable bit %i not supported\n", en_bit);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
regInitIdx = 0;
|
|
|
|
// iterate over all the init fields and check which
|
|
// bits are enabled
|
|
for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
|
|
if (task->vgprBitEnabled(en_bit)) {
|
|
uint32_t physVgprIdx = 0;
|
|
TheGpuISA::VecRegContainerU32 raw_vgpr;
|
|
|
|
switch (en_bit) {
|
|
case WorkitemIdX:
|
|
{
|
|
physVgprIdx = computeUnit->registerManager
|
|
->mapVgpr(this, regInitIdx);
|
|
TheGpuISA::VecRegU32 vgpr_x
|
|
= raw_vgpr.as<TheGpuISA::VecElemU32>();
|
|
|
|
for (int lane = 0; lane < workItemId[0].size(); ++lane) {
|
|
vgpr_x[lane] = workItemId[0][lane];
|
|
}
|
|
|
|
computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
|
|
rawDist[regInitIdx] = 0;
|
|
++regInitIdx;
|
|
}
|
|
break;
|
|
case WorkitemIdY:
|
|
{
|
|
physVgprIdx = computeUnit->registerManager
|
|
->mapVgpr(this, regInitIdx);
|
|
TheGpuISA::VecRegU32 vgpr_y
|
|
= raw_vgpr.as<TheGpuISA::VecElemU32>();
|
|
|
|
for (int lane = 0; lane < workItemId[1].size(); ++lane) {
|
|
vgpr_y[lane] = workItemId[1][lane];
|
|
}
|
|
|
|
computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
|
|
rawDist[regInitIdx] = 0;
|
|
++regInitIdx;
|
|
}
|
|
break;
|
|
case WorkitemIdZ:
|
|
{
|
|
physVgprIdx = computeUnit->registerManager->
|
|
mapVgpr(this, regInitIdx);
|
|
TheGpuISA::VecRegU32 vgpr_z
|
|
= raw_vgpr.as<TheGpuISA::VecElemU32>();
|
|
|
|
for (int lane = 0; lane < workItemId[2].size(); ++lane) {
|
|
vgpr_z[lane] = workItemId[2][lane];
|
|
}
|
|
|
|
computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
|
|
rawDist[regInitIdx] = 0;
|
|
++regInitIdx;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
Wavefront::resizeRegFiles(int num_vregs, int num_sregs)
|
|
{
|
|
maxVgprs = num_vregs;
|
|
maxSgprs = num_sregs;
|
|
}
|
|
|
|
Wavefront::~Wavefront()
{
    // nothing to release: all members clean up via their own destructors
}
|
|
|
|
// Transition this wave to newStatus, maintaining the CU-wide idle-wave
// count used for the idle-CU timeout check. When the last wave goes idle
// we timestamp it; when a wave wakes while all waves were idle, we panic
// if the whole CU sat idle for at least idleCUTimeout ticks.
void
Wavefront::setStatus(status_e newStatus)
{
    if (computeUnit->idleCUTimeout > 0) {
        // Wavefront's status transitions to stalled or stopped
        if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
             newStatus == S_WAITCNT) &&
            (status != newStatus)) {
            computeUnit->idleWfs++;
            assert(computeUnit->idleWfs <=
                   (computeUnit->shader->n_wf * computeUnit->numVectorALUs));
            // if this was the last wave to go idle, record when the CU
            // became fully idle
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                lastNonIdleTick = curTick();
            }
            // Wavefront's status transitions to an active state (from
            // a stopped or stalled state)
        } else if ((status == S_STOPPED || status == S_STALLED ||
                    status == S_WAITCNT) &&
                   (status != newStatus)) {
            // if all WFs in the CU were idle then check if the idleness
            // period exceeded the timeout threshold
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                // NOTE(review): the message prints idleCUTimeout, not the
                // actual idle duration (curTick() - lastNonIdleTick) —
                // looks intentional as a threshold report, but confirm
                panic_if((curTick() - lastNonIdleTick) >=
                         computeUnit->idleCUTimeout,
                         "CU%d has been idle for %d ticks at tick %d",
                         computeUnit->cu_id, computeUnit->idleCUTimeout,
                         curTick());
            }
            computeUnit->idleWfs--;
            assert(computeUnit->idleWfs >= 0);
        }
    }
    status = newStatus;
}
|
|
|
|
void
|
|
Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc)
|
|
{
|
|
wfDynId = _wf_dyn_id;
|
|
_pc = init_pc;
|
|
|
|
status = S_RUNNING;
|
|
|
|
vecReads.resize(maxVgprs, 0);
|
|
}
|
|
|
|
bool
|
|
Wavefront::isGmInstruction(GPUDynInstPtr ii)
|
|
{
|
|
if (ii->isGlobalMem() ||
|
|
(ii->isFlat() && ii->executedAs() == Enums::SC_GLOBAL)) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isLmInstruction(GPUDynInstPtr ii)
|
|
{
|
|
if (ii->isLocalMem() ||
|
|
(ii->isFlat() && ii->executedAs() == Enums::SC_GROUP)) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstWaitcnt()
|
|
{
|
|
if (instructionBuffer.empty())
|
|
return false;
|
|
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (ii->isWaitcnt()) {
|
|
// waitcnt is a scalar
|
|
assert(ii->isScalar());
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstScalarALU()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
|
|
|| ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
|
|
(ii->isKernArgSeg() && ii->isLoad()))) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstVectorALU()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
|
|
ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
|
|
|| (ii->isKernArgSeg() && ii->isLoad()))) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstBarrier()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isBarrier()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstGMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstScalarMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstLMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isLocalMem()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstPrivMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isPrivateSeg()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstFlatMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isFlat()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::stopFetch()
|
|
{
|
|
for (auto it : instructionBuffer) {
|
|
GPUDynInstPtr ii = it;
|
|
if (ii->isReturn() || ii->isBranch() ||
|
|
ii->isEndOfKernel()) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
void
Wavefront::freeResources()
{
    // relinquish the execution unit reservation held by this wave
    execUnitId = -1;
}
|
|
|
|
// Sanity check: none of the per-wave memory request counters may ever go
// negative; a negative value indicates an accounting bug (a completion
// decremented a counter that was never incremented).
void Wavefront::validateRequestCounters()
{
    panic_if(wrGmReqsInPipe < 0 || rdGmReqsInPipe < 0 ||
             wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 ||
             outstandingReqs < 0,
             "Negative requests in pipe for WF%d for slot%d"
             " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
             " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
             " Outstanding Reqs=%d\n",
             wfDynId, wfSlotId, simdId, rdGmReqsInPipe, wrGmReqsInPipe,
             rdLmReqsInPipe, wrLmReqsInPipe, outstandingReqs);
}
|
|
|
|
void
|
|
Wavefront::reserveGmResource(GPUDynInstPtr ii)
|
|
{
|
|
if (!ii->isScalar()) {
|
|
if (ii->isLoad()) {
|
|
rdGmReqsInPipe++;
|
|
} else if (ii->isStore()) {
|
|
wrGmReqsInPipe++;
|
|
} else if (ii->isAtomic() || ii->isMemSync()) {
|
|
rdGmReqsInPipe++;
|
|
wrGmReqsInPipe++;
|
|
} else {
|
|
panic("Invalid memory operation!\n");
|
|
}
|
|
execUnitId = globalMem;
|
|
} else {
|
|
if (ii->isLoad()) {
|
|
scalarRdGmReqsInPipe++;
|
|
} else if (ii->isStore()) {
|
|
scalarWrGmReqsInPipe++;
|
|
} else if (ii->isAtomic() || ii->isMemSync()) {
|
|
scalarWrGmReqsInPipe++;
|
|
scalarRdGmReqsInPipe++;
|
|
} else {
|
|
panic("Invalid memory operation!\n");
|
|
}
|
|
execUnitId = scalarMem;
|
|
}
|
|
}
|
|
|
|
void
|
|
Wavefront::reserveLmResource(GPUDynInstPtr ii)
|
|
{
|
|
fatal_if(ii->isScalar(),
|
|
"Scalar instructions can not access Shared memory!!!");
|
|
if (ii->isLoad()) {
|
|
rdLmReqsInPipe++;
|
|
} else if (ii->isStore()) {
|
|
wrLmReqsInPipe++;
|
|
} else if (ii->isAtomic() || ii->isMemSync()) {
|
|
wrLmReqsInPipe++;
|
|
rdLmReqsInPipe++;
|
|
} else {
|
|
panic("Invalid memory operation!\n");
|
|
}
|
|
execUnitId = localMem;
|
|
}
|
|
|
|
// Decide which execution unit(s) the oldest buffered instruction needs and
// reserve them, returning the reserved unit IDs. Flat instructions reserve
// both the local and global memory pipes because their segment is not
// known until execute.
std::vector<int>
Wavefront::reserveResources()
{
    // vector of execution unit IDs to return to schedule stage
    // this return is only used for debugging and an assertion...
    std::vector<int> execUnitIds;

    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);

    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn() || ii->isEndOfKernel()) {
        if (!ii->isScalar()) {
            execUnitId = simdId;
        } else {
            execUnitId = scalarAluGlobalIdx;
        }
        // this is to enforce a fixed number of cycles per issue slot per SIMD
    } else if (ii->isBarrier()) {
        execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId;
    } else if (ii->isFlat()) {
        assert(!ii->isScalar());
        reserveLmResource(ii);
        // add execUnitId, reserved by reserveLmResource, to the list
        // before it is overwritten by reserveGmResource
        execUnitIds.push_back(execUnitId);
        flatLmUnitId = execUnitId;
        reserveGmResource(ii);
        flatGmUnitId = execUnitId;
        execUnitIds.push_back(flatGmUnitId);
        // sentinel: both units already pushed, skip the common push below
        execUnitId = -1;
    } else if (ii->isGlobalMem()) {
        reserveGmResource(ii);
    } else if (ii->isLocalMem()) {
        reserveLmResource(ii);
    } else if (ii->isPrivateSeg()) {
        fatal_if(ii->isScalar(),
                 "Scalar instructions can not access Private memory!!!");
        reserveGmResource(ii);
    } else {
        panic("reserveResources -> Couldn't process op!\n");
    }

    if (execUnitId != -1) {
        execUnitIds.push_back(execUnitId);
    }
    assert(execUnitIds.size());
    return execUnitIds;
}
|
|
|
|
void
|
|
Wavefront::exec()
|
|
{
|
|
// ---- Exit if wavefront is inactive ----------------------------- //
|
|
|
|
if (status == S_STOPPED || status == S_RETURNING ||
|
|
status==S_STALLED || instructionBuffer.empty()) {
|
|
return;
|
|
}
|
|
|
|
if (status == S_WAITCNT) {
|
|
/**
|
|
* if this wave is in S_WAITCNT state, then
|
|
* it should enter exec() precisely one time
|
|
* before the waitcnts are satisfied, in order
|
|
* to execute the waitcnt instruction itself
|
|
* thus we assert that the waitcnt is the
|
|
* oldest instruction. if we enter exec() with
|
|
* active waitcnts, and we're not executing
|
|
* the waitcnt instruction, something must be
|
|
* wrong
|
|
*/
|
|
assert(isOldestInstWaitcnt());
|
|
}
|
|
|
|
// Get current instruction
|
|
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
const Addr old_pc = pc();
|
|
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
|
|
"(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
|
|
wfDynId, ii->disassemble(), old_pc, ii->seqNum());
|
|
|
|
ii->execute(ii);
|
|
// delete the dynamic instruction from the pipeline map
|
|
computeUnit->deleteFromPipeMap(this);
|
|
// update the instruction stats in the CU
|
|
computeUnit->updateInstStats(ii);
|
|
|
|
// inform VRF of instruction execution to schedule write-back
|
|
// and scoreboard ready for registers
|
|
if (!ii->isScalar()) {
|
|
computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
|
|
}
|
|
computeUnit->srf[simdId]->waveExecuteInst(this, ii);
|
|
|
|
computeUnit->shader->vectorInstSrcOperand[ii->numSrcVecOperands()]++;
|
|
computeUnit->shader->vectorInstDstOperand[ii->numDstVecOperands()]++;
|
|
computeUnit->numInstrExecuted++;
|
|
numInstrExecuted++;
|
|
computeUnit->instExecPerSimd[simdId]++;
|
|
computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
|
|
computeUnit->lastExecCycle[simdId]);
|
|
computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
|
|
|
|
if (lastInstExec) {
|
|
computeUnit->instInterleave[simdId].
|
|
sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
|
|
}
|
|
lastInstExec = computeUnit->instExecPerSimd[simdId];
|
|
|
|
// want to track:
|
|
// number of reads that occur per value written
|
|
|
|
// vector RAW dependency tracking
|
|
for (int i = 0; i < ii->getNumOperands(); i++) {
|
|
if (ii->isVectorRegister(i)) {
|
|
int vgpr = ii->getRegisterIndex(i, ii);
|
|
int nReg = ii->getOperandSize(i) <= 4 ? 1 :
|
|
ii->getOperandSize(i) / 4;
|
|
for (int n = 0; n < nReg; n++) {
|
|
if (ii->isSrcOperand(i)) {
|
|
// This check should never fail, but to be safe we check
|
|
if (rawDist.find(vgpr+n) != rawDist.end()) {
|
|
vecRawDistance.
|
|
sample(numInstrExecuted.value() - rawDist[vgpr+n]);
|
|
}
|
|
// increment number of reads to this register
|
|
vecReads[vgpr+n]++;
|
|
} else if (ii->isDstOperand(i)) {
|
|
// rawDist is set on writes, but will not be set
|
|
// for the first write to each physical register
|
|
if (rawDist.find(vgpr+n) != rawDist.end()) {
|
|
// sample the number of reads that were performed
|
|
readsPerWrite.sample(vecReads[vgpr+n]);
|
|
}
|
|
// on a write, reset count of reads to 0
|
|
vecReads[vgpr+n] = 0;
|
|
|
|
rawDist[vgpr+n] = numInstrExecuted.value();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (pc() == old_pc) {
|
|
// PC not modified by instruction, proceed to next
|
|
_gpuISA.advancePC(ii);
|
|
instructionBuffer.pop_front();
|
|
} else {
|
|
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
|
|
computeUnit->cu_id, simdId, wfSlotId, wfDynId,
|
|
ii->disassemble());
|
|
discardFetch();
|
|
}
|
|
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
|
|
computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());
|
|
|
|
if (computeUnit->shader->hsail_mode==Shader::SIMT) {
|
|
const int num_active_lanes = execMask().count();
|
|
computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
|
|
computeUnit->numVecOpsExecuted += num_active_lanes;
|
|
|
|
if (ii->isF16() && ii->isALU()) {
|
|
if (ii->isF32() || ii->isF64()) {
|
|
fatal("Instruction is tagged as both (1) F16, and (2)"
|
|
"either F32 or F64.");
|
|
}
|
|
computeUnit->numVecOpsExecutedF16 += num_active_lanes;
|
|
if (ii->isFMA()) {
|
|
computeUnit->numVecOpsExecutedFMA16 += num_active_lanes;
|
|
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
|
}
|
|
else if (ii->isMAC()) {
|
|
computeUnit->numVecOpsExecutedMAC16 += num_active_lanes;
|
|
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
|
}
|
|
else if (ii->isMAD()) {
|
|
computeUnit->numVecOpsExecutedMAD16 += num_active_lanes;
|
|
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
|
}
|
|
}
|
|
if (ii->isF32() && ii->isALU()) {
|
|
if (ii->isF16() || ii->isF64()) {
|
|
fatal("Instruction is tagged as both (1) F32, and (2)"
|
|
"either F16 or F64.");
|
|
}
|
|
computeUnit->numVecOpsExecutedF32 += num_active_lanes;
|
|
if (ii->isFMA()) {
|
|
computeUnit->numVecOpsExecutedFMA32 += num_active_lanes;
|
|
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
|
}
|
|
else if (ii->isMAC()) {
|
|
computeUnit->numVecOpsExecutedMAC32 += num_active_lanes;
|
|
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
|
}
|
|
else if (ii->isMAD()) {
|
|
computeUnit->numVecOpsExecutedMAD32 += num_active_lanes;
|
|
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
|
}
|
|
}
|
|
if (ii->isF64() && ii->isALU()) {
|
|
if (ii->isF16() || ii->isF32()) {
|
|
fatal("Instruction is tagged as both (1) F64, and (2)"
|
|
"either F16 or F32.");
|
|
}
|
|
computeUnit->numVecOpsExecutedF64 += num_active_lanes;
|
|
if (ii->isFMA()) {
|
|
computeUnit->numVecOpsExecutedFMA64 += num_active_lanes;
|
|
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
|
}
|
|
else if (ii->isMAC()) {
|
|
computeUnit->numVecOpsExecutedMAC64 += num_active_lanes;
|
|
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
|
}
|
|
else if (ii->isMAD()) {
|
|
computeUnit->numVecOpsExecutedMAD64 += num_active_lanes;
|
|
computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
|
|
}
|
|
}
|
|
if (isGmInstruction(ii)) {
|
|
computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
|
|
} else if (isLmInstruction(ii)) {
|
|
computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* we return here to avoid spurious errors related to flat insts
|
|
* and their address segment resolution.
|
|
*/
|
|
if (execMask().none() && ii->isFlat()) {
|
|
computeUnit->getTokenManager()->recvTokens(1);
|
|
return;
|
|
}
|
|
|
|
// Update Vector ALU pipeline and other resources
|
|
bool flat_as_gm = false;
|
|
bool flat_as_lm = false;
|
|
if (ii->isFlat()) {
|
|
flat_as_gm = (ii->executedAs() == Enums::SC_GLOBAL) ||
|
|
(ii->executedAs() == Enums::SC_PRIVATE);
|
|
flat_as_lm = (ii->executedAs() == Enums::SC_GROUP);
|
|
}
|
|
|
|
// Single precision ALU or Branch or Return or Special instruction
|
|
// Note, we use the same timing regardless of SP or DP ALU operation.
|
|
if (ii->isALU() || ii->isSpecialOp() ||
|
|
ii->isBranch() || ii->isNop() ||
|
|
(ii->isKernArgSeg() && ii->isLoad()) ||
|
|
ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
|
|
// this is to enforce a fixed number of cycles per issue slot per SIMD
|
|
if (!ii->isScalar()) {
|
|
computeUnit->vectorALUs[simdId].set(computeUnit->
|
|
cyclesToTicks(computeUnit->issuePeriod));
|
|
} else {
|
|
computeUnit->scalarALUs[scalarAlu].set(computeUnit->
|
|
cyclesToTicks(computeUnit->issuePeriod));
|
|
}
|
|
// Barrier on Scalar ALU
|
|
} else if (ii->isBarrier()) {
|
|
computeUnit->scalarALUs[scalarAlu].set(computeUnit->
|
|
cyclesToTicks(computeUnit->issuePeriod));
|
|
// GM or Flat as GM Load
|
|
} else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
|
|
if (!ii->isScalar()) {
|
|
computeUnit->vrfToGlobalMemPipeBus.set(
|
|
computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
|
|
computeUnit->vectorGlobalMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->instCyclesVMemPerSimd[simdId] +=
|
|
computeUnit->vrf_gm_bus_latency;
|
|
} else {
|
|
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(computeUnit->srf_scm_bus_latency));
|
|
computeUnit->scalarMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->instCyclesScMemPerSimd[simdId] +=
|
|
computeUnit->srf_scm_bus_latency;
|
|
}
|
|
// GM or Flat as GM Store
|
|
} else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
|
|
if (!ii->isScalar()) {
|
|
computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
|
|
computeUnit->vectorGlobalMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->instCyclesVMemPerSimd[simdId] +=
|
|
(2 * computeUnit->vrf_gm_bus_latency);
|
|
} else {
|
|
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
|
|
computeUnit->scalarMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->instCyclesScMemPerSimd[simdId] +=
|
|
(2 * computeUnit->srf_scm_bus_latency);
|
|
}
|
|
} else if ((ii->isAtomic() || ii->isMemSync()) &&
|
|
(ii->isGlobalMem() || flat_as_gm)) {
|
|
if (!ii->isScalar()) {
|
|
computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
|
|
computeUnit->vectorGlobalMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->instCyclesVMemPerSimd[simdId] +=
|
|
(2 * computeUnit->vrf_gm_bus_latency);
|
|
} else {
|
|
computeUnit->srfToScalarMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
|
|
computeUnit->scalarMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->instCyclesScMemPerSimd[simdId] +=
|
|
(2 * computeUnit->srf_scm_bus_latency);
|
|
}
|
|
// LM or Flat as LM Load
|
|
} else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
|
|
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(computeUnit->vrf_lm_bus_latency));
|
|
computeUnit->vectorSharedMemUnit.
|
|
set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->instCyclesLdsPerSimd[simdId] +=
|
|
computeUnit->vrf_lm_bus_latency;
|
|
// LM or Flat as LM Store
|
|
} else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
|
|
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
|
|
computeUnit->vectorSharedMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->instCyclesLdsPerSimd[simdId] +=
|
|
(2 * computeUnit->vrf_lm_bus_latency);
|
|
// LM or Flat as LM, Atomic or MemFence
|
|
} else if ((ii->isAtomic() || ii->isMemSync()) &&
|
|
(ii->isLocalMem() || flat_as_lm)) {
|
|
computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
|
|
cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
|
|
computeUnit->vectorSharedMemUnit.
|
|
set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
|
|
computeUnit->instCyclesLdsPerSimd[simdId] +=
|
|
(2 * computeUnit->vrf_lm_bus_latency);
|
|
} else {
|
|
panic("Bad instruction type!\n");
|
|
}
|
|
}
|
|
|
|
bool
|
|
Wavefront::waitingAtBarrier(int lane)
|
|
{
|
|
return barCnt[lane] < maxBarCnt;
|
|
}
|
|
|
|
GPUDynInstPtr
Wavefront::nextInstr()
{
    // Peek at the oldest instruction in this wave's instruction buffer.
    GPUDynInstPtr oldest_inst = instructionBuffer.front();

    // If the oldest instruction has not been dispatched in the schedule
    // stage yet (i.e., it is absent from the CU's pipe map), it is the
    // instruction whose readiness should be checked next.
    if (computeUnit->pipeMap.find(oldest_inst->seqNum()) ==
        computeUnit->pipeMap.end()) {
        return oldest_inst;
    }

    // The oldest instruction is already in flight; fall back to the
    // next oldest instruction, if the buffer holds one.
    if (instructionBuffer.size() > 1) {
        return *(instructionBuffer.begin() + 1);
    }

    // No new instructions available to check.
    return nullptr;
}
|
|
|
|
void
|
|
Wavefront::discardFetch()
|
|
{
|
|
instructionBuffer.clear();
|
|
dropFetch |= pendingFetch;
|
|
|
|
/**
|
|
* clear the fetch buffer for this wave in order to
|
|
* remove any stale inst data
|
|
*/
|
|
computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId);
|
|
}
|
|
|
|
bool
|
|
Wavefront::waitCntsSatisfied()
|
|
{
|
|
// Both vmWaitCnt && lgkmWaitCnt uninitialized means
|
|
// waitCnt instruction has been dispatched but not executed yet: next
|
|
// instruction should be blocked until waitCnt is executed.
|
|
if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) {
|
|
return false;
|
|
}
|
|
|
|
// If we reach here, that means waitCnt instruction is executed and
|
|
// the waitcnts are set by the execute method. Check if waitcnts are
|
|
// satisfied.
|
|
|
|
// current number of vector memory ops in flight
|
|
int vm_cnt = outstandingReqsWrGm + outstandingReqsRdGm;
|
|
|
|
// current number of export insts or vector memory writes in flight
|
|
int exp_cnt = outstandingReqsWrGm;
|
|
|
|
// current number of scalar/LDS memory ops in flight
|
|
// we do not consider GDS/message ops
|
|
int lgkm_cnt = outstandingReqsWrLm + outstandingReqsRdLm +
|
|
scalarOutstandingReqsRdGm + scalarOutstandingReqsWrGm;
|
|
|
|
if (vmWaitCnt != -1) {
|
|
if (vm_cnt > vmWaitCnt) {
|
|
// vmWaitCnt not satisfied
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (expWaitCnt != -1) {
|
|
if (exp_cnt > expWaitCnt) {
|
|
// expWaitCnt not satisfied
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (lgkmWaitCnt != -1) {
|
|
if (lgkm_cnt > lgkmWaitCnt) {
|
|
// lgkmWaitCnt not satisfied
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// if we get here all outstanding waitcnts must
|
|
// be satisfied, so we resume normal operation
|
|
clearWaitCnts();
|
|
|
|
return true;
|
|
}
|
|
|
|
void
|
|
Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
|
|
{
|
|
// the scoreboard should have set the status
|
|
// to S_WAITCNT once a waitcnt instruction
|
|
// was marked as ready
|
|
assert(status == S_WAITCNT);
|
|
|
|
// waitcnt instruction shouldn't be sending
|
|
// negative counts
|
|
assert(vm_wait_cnt >= 0);
|
|
assert(exp_wait_cnt >= 0);
|
|
assert(lgkm_wait_cnt >= 0);
|
|
// waitcnts are a max of 15 because we have
|
|
// only 1 nibble (4 bits) to set the counts
|
|
assert(vm_wait_cnt <= 0xf);
|
|
assert(exp_wait_cnt <= 0x7);
|
|
assert(lgkm_wait_cnt <= 0x1f);
|
|
|
|
/**
|
|
* prior waitcnts should be satisfied,
|
|
* at which time the WF resets them
|
|
* back to -1, indicating they are no
|
|
* longer active
|
|
*/
|
|
assert(vmWaitCnt == -1);
|
|
assert(expWaitCnt == -1);
|
|
assert(lgkmWaitCnt == -1);
|
|
|
|
/**
|
|
* if the instruction encoding
|
|
* indicates a waitcnt of 0xf,
|
|
* that means the waitcnt is
|
|
* not being used
|
|
*/
|
|
if (vm_wait_cnt != 0xf)
|
|
vmWaitCnt = vm_wait_cnt;
|
|
|
|
if (exp_wait_cnt != 0x7)
|
|
expWaitCnt = exp_wait_cnt;
|
|
|
|
if (lgkm_wait_cnt != 0x1f)
|
|
lgkmWaitCnt = lgkm_wait_cnt;
|
|
}
|
|
|
|
void
Wavefront::clearWaitCnts()
{
    // Reset all three waitcnts back to -1, marking them as no longer
    // active/valid (see waitCntsSatisfied() / setWaitCnts()).
    vmWaitCnt = -1;
    expWaitCnt = -1;
    lgkmWaitCnt = -1;

    // Leave S_WAITCNT and resume running normally.
    status = S_RUNNING;
}
|
|
|
|
// Return this wavefront's current program counter.
Addr
Wavefront::pc() const
{
    return _pc;
}
|
|
|
|
// Set this wavefront's program counter (e.g., on a taken branch).
void
Wavefront::pc(Addr new_pc)
{
    _pc = new_pc;
}
|
|
|
|
// Return a mutable reference to this wavefront's execution mask.
VectorMask&
Wavefront::execMask()
{
    return _execMask;
}
|
|
|
|
// Return whether the given lane is active in the execution mask.
// NOTE(review): lane is not bounds-checked; callers are assumed to pass
// a valid lane index.
bool
Wavefront::execMask(int lane) const
{
    return _execMask[lane];
}
|
|
|
|
void
|
|
Wavefront::freeRegisterFile()
|
|
{
|
|
/* clear busy registers */
|
|
for (int i=0; i < maxVgprs; i++) {
|
|
int vgprIdx = computeUnit->registerManager->mapVgpr(this, i);
|
|
computeUnit->vrf[simdId]->markReg(vgprIdx, false);
|
|
}
|
|
|
|
/* Free registers used by this wavefront */
|
|
uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) %
|
|
computeUnit->vrf[simdId]->numRegs();
|
|
computeUnit->registerManager->vrfPoolMgrs[simdId]->
|
|
freeRegion(startVgprIndex, endIndex);
|
|
}
|
|
|
|
void
|
|
Wavefront::computeActualWgSz(HSAQueueEntry *task)
|
|
{
|
|
actualWgSzTotal = 1;
|
|
for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
|
|
actualWgSz[d] = std::min(workGroupSz[d], gridSz[d]
|
|
- task->wgId(d) * workGroupSz[d]);
|
|
actualWgSzTotal *= actualWgSz[d];
|
|
}
|
|
}
|