Files
gem5/src/gpu-compute/gpu_dyn_inst.cc
Matthew Poremba 9f4d334644 gpu-compute: Update tokens for flat global/scratch
Memory instructions acquire coalescer tokens in the schedule stage.
Currently this is only done for buffer and flat instructions, but not
flat global or flat scratch. This change now acquires tokens for flat
global and flat scratch instructions. This provides back-pressure to the
CUs and helps to avoid deadlocks in Ruby.

The change also handles returning tokens for buffer, flat global, and
flat scratch instructions. This was previously only being done for
normal flat instructions leading to deadlocks in some applications when
the tokens were exhausted.

To simplify the logic, this change adds a needsToken() method to GPUDynInst,
which returns whether the instruction is a buffer instruction or any flat
segment instruction.

The waitcnts were also incorrect for flat global and flat scratch. We
should always decrement vmem and exp count for stores and only normal
flat instructions should decrement lgkm. Currently vmem/exp are not
decremented for flat global and flat scratch which can lead to deadlock.
This change set fixes this by always decrementing vmem/exp and lgkm only
for normal flat instructions.

Change-Id: I673f4ac6121e4b5a5e8491bc9130c6d825d95fc5
2023-10-10 09:48:16 -05:00

1026 lines
24 KiB
C++

/*
* Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "gpu-compute/gpu_dyn_inst.hh"
#include "debug/GPUInst.hh"
#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
namespace gem5
{
/**
 * Build a dynamic instruction around a static instruction for the given
 * compute unit and (possibly null) wavefront.
 *
 * The static instruction's operand info is initialized, the per-element
 * status vector and per-lane TLB hit levels are sized, and the data
 * buffers (d_data, a_data, x_data, scalar_data) are allocated zero-filled.
 * When no wavefront is supplied, every wavefront-derived ID is set to -1.
 *
 * @param _cu compute unit this instruction belongs to
 * @param _wf wavefront issuing the instruction; may be null
 * @param static_inst static instruction this object wraps
 * @param instSeqNum sequence number reported by seqNum()
 */
GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
                       GPUStaticInst *static_inst, InstSeqNum instSeqNum)
    : GPUExecContext(_cu, _wf), scalarAddr(0),
      addr(computeUnit()->wfSize(), (Addr)0), numScalarReqs(0),
      isSaveRestore(false), _staticInst(static_inst), _seqNum(instSeqNum),
      maxSrcVecRegOpSize(-1), maxSrcScalarRegOpSize(-1)
{
    _staticInst->initOperandInfo();

    statusVector.assign(TheGpuISA::NumVecElemPerVecReg, 0);
    tlbHitLevel.assign(computeUnit()->wfSize(), -1);

    // The trailing "()" value-initializes each array, i.e. every byte
    // starts out as zero.

    // vector instructions can have up to 4 source/destination operands
    d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)]();
    a_data = new uint8_t[computeUnit()->wfSize() * 8]();
    x_data = new uint8_t[computeUnit()->wfSize() * 8]();
    // scalar loads can read up to 16 Dwords of data (see publicly
    // available GCN3 ISA manual)
    scalar_data = new uint8_t[16 * sizeof(uint32_t)]();

    time = 0;
    cu_id = _cu->cu_id;

    if (!_wf) {
        // no wavefront: mark all wavefront-derived IDs as invalid
        simdId = -1;
        wfDynId = -1;
        kern_id = -1;
        wg_id = -1;
        wfSlotId = -1;
    } else {
        simdId = _wf->simdId;
        wfDynId = _wf->wfDynId;
        kern_id = _wf->kernId;
        wg_id = _wf->wgId;
        wfSlotId = _wf->wfSlotId;
    }

    DPRINTF(GPUInst, "%s: generating operand info for %d operands\n",
            disassemble(), getNumOperands());

    _staticInst->initDynOperandInfo(wavefront(), computeUnit());
}
// Frees the four data buffers that the constructor allocates with new[]
// and deletes the static instruction held in _staticInst.
GPUDynInst::~GPUDynInst()
{
delete[] d_data;
delete[] a_data;
delete[] x_data;
delete[] scalar_data;
// the static instruction is deleted together with this object
delete _staticInst;
}
// Runs the instruction by forwarding gpuDynInst to the static
// instruction's execute().
void
GPUDynInst::execute(GPUDynInstPtr gpuDynInst)
{
_staticInst->execute(gpuDynInst);
}
// Operand-list accessors: each returns, by const reference, the
// corresponding operand list of the static instruction.

// Source vector register operands of the static instruction.
const std::vector<OperandInfo>&
GPUDynInst::srcVecRegOperands() const
{
return _staticInst->srcVecRegOperands();
}
// Destination vector register operands of the static instruction.
const std::vector<OperandInfo>&
GPUDynInst::dstVecRegOperands() const
{
return _staticInst->dstVecRegOperands();
}
// Source scalar register operands of the static instruction.
const std::vector<OperandInfo>&
GPUDynInst::srcScalarRegOperands() const
{
return _staticInst->srcScalarRegOperands();
}
// Destination scalar register operands of the static instruction.
const std::vector<OperandInfo>&
GPUDynInst::dstScalarRegOperands() const
{
return _staticInst->dstScalarRegOperands();
}
// Operand-count accessors: each forwards to the static instruction.

// Forwards numSrcRegOperands() from the static instruction.
int
GPUDynInst::numSrcRegOperands()
{
return _staticInst->numSrcRegOperands();
}
// Forwards numDstRegOperands() from the static instruction.
int
GPUDynInst::numDstRegOperands()
{
return _staticInst->numDstRegOperands();
}
// Forwards the static instruction's numSrcVecOperands().
int
GPUDynInst::numSrcVecRegOperands() const
{
return _staticInst->numSrcVecOperands();
}
// Forwards the static instruction's numDstVecOperands().
int
GPUDynInst::numDstVecRegOperands() const
{
return _staticInst->numDstVecOperands();
}
/**
 * Largest size, in dwords, among the source vector register operands.
 *
 * The value is computed on the first call and cached in
 * maxSrcVecRegOpSize, where -1 means "not computed yet". The result is 0
 * when there are no source vector register operands.
 */
int
GPUDynInst::maxSrcVecRegOperandSize()
{
    if (maxSrcVecRegOpSize == -1) {
        maxSrcVecRegOpSize = 0;

        for (const auto &operand : srcVecRegOperands()) {
            const auto dwords = operand.sizeInDWords();
            if (dwords > maxSrcVecRegOpSize) {
                maxSrcVecRegOpSize = dwords;
            }
        }
    }

    return maxSrcVecRegOpSize;
}
// Forwards numSrcVecDWords() from the static instruction.
int
GPUDynInst::numSrcVecDWords()
{
return _staticInst->numSrcVecDWords();
}
// Forwards numDstVecDWords() from the static instruction.
int
GPUDynInst::numDstVecDWords()
{
return _staticInst->numDstVecDWords();
}
// Forwards the static instruction's numSrcScalarOperands().
int
GPUDynInst::numSrcScalarRegOperands() const
{
return _staticInst->numSrcScalarOperands();
}
// Forwards the static instruction's numDstScalarOperands().
int
GPUDynInst::numDstScalarRegOperands() const
{
return _staticInst->numDstScalarOperands();
}
/**
 * Largest size, in dwords, among the source scalar register operands.
 *
 * The value is computed on the first call and cached in
 * maxSrcScalarRegOpSize, where -1 means "not computed yet". The result is
 * 0 when there are no source scalar register operands.
 */
int
GPUDynInst::maxSrcScalarRegOperandSize()
{
    if (maxSrcScalarRegOpSize == -1) {
        maxSrcScalarRegOpSize = 0;

        for (const auto &operand : srcScalarRegOperands()) {
            const auto dwords = operand.sizeInDWords();
            if (dwords > maxSrcScalarRegOpSize) {
                maxSrcScalarRegOpSize = dwords;
            }
        }
    }

    return maxSrcScalarRegOpSize;
}
// Forwards numSrcScalarDWords() from the static instruction.
int
GPUDynInst::numSrcScalarDWords()
{
return _staticInst->numSrcScalarDWords();
}
// Forwards numDstScalarDWords() from the static instruction.
int
GPUDynInst::numDstScalarDWords()
{
return _staticInst->numDstScalarDWords();
}
// Forwards maxOperandSize() from the static instruction.
int
GPUDynInst::maxOperandSize()
{
return _staticInst->maxOperandSize();
}
// Forwards getNumOperands() from the static instruction.
int
GPUDynInst::getNumOperands() const
{
return _staticInst->getNumOperands();
}
// True when the source vector register operand list is non-empty.
bool
GPUDynInst::hasSourceVgpr() const
{
return !srcVecRegOperands().empty();
}
// True when the destination vector register operand list is non-empty.
bool
GPUDynInst::hasDestinationVgpr() const
{
return !dstVecRegOperands().empty();
}
// True when the source scalar register operand list is non-empty.
bool
GPUDynInst::hasSourceSgpr() const
{
return !srcScalarRegOperands().empty();
}
// True when the destination scalar register operand list is non-empty.
bool
GPUDynInst::hasDestinationSgpr() const
{
return !dstScalarRegOperands().empty();
}
/**
 * Substring test against the static instruction's opcode string.
 *
 * @param opcodeStr text that must occur somewhere in the opcode
 * @param extStr second piece of text that must also occur in the opcode
 * @return true only if both strings are found; extStr is not searched
 *         for when opcodeStr is absent
 */
bool
GPUDynInst::isOpcode(const std::string& opcodeStr,
                     const std::string& extStr) const
{
    if (_staticInst->opcode().find(opcodeStr) == std::string::npos) {
        return false;
    }

    return _staticInst->opcode().find(extStr) != std::string::npos;
}
/**
 * Substring test against the static instruction's opcode string.
 *
 * @param opcodeStr text to look for
 * @return true if opcodeStr occurs anywhere in the opcode
 */
bool
GPUDynInst::isOpcode(const std::string& opcodeStr) const
{
    const auto position = _staticInst->opcode().find(opcodeStr);
    return position != std::string::npos;
}
// Disassembly string, forwarded from the static instruction.
const std::string&
GPUDynInst::disassemble() const
{
return _staticInst->disassemble();
}
// Sequence number given to this instruction at construction.
InstSeqNum
GPUDynInst::seqNum() const
{
return _seqNum;
}
// Reads the PC of the owning wavefront. Dereferences wavefront() without
// a null check.
Addr
GPUDynInst::pc()
{
return wavefront()->pc();
}
// Sets the PC of the owning wavefront to _pc. Dereferences wavefront()
// without a null check.
void
GPUDynInst::pc(Addr _pc)
{
wavefront()->pc(_pc);
}
// Storage class recorded in the static instruction's executed_as field
// (written by doApertureCheck() in this file).
enums::StorageClassType
GPUDynInst::executedAs()
{
return _staticInst->executed_as;
}
// Process a memory instruction and (if necessary) submit timing request.
// Logs the CU id, SIMD id, wavefront slot and exec_mask under the GPUMem
// debug flag, then forwards gpuDynInst to the static instruction's
// initiateAcc().
void
GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
{
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n",
cu->cu_id, simdId, wfSlotId, exec_mask);
_staticInst->initiateAcc(gpuDynInst);
}
/**
 * Complete a memory access.
 *
 * Logs the CU id, SIMD id, wavefront slot and exec_mask under the GPUMem
 * debug flag, then forwards gpuDynInst to the static instruction's
 * completeAcc().
 *
 * @param gpuDynInst instruction handle passed through to the static
 *                   instruction
 */
void
GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst)
{
    // Keep " complete" on the same line as the rest of the message and end
    // the record with the newline, so the trace line is not split in two.
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector="
            "%#x complete\n",
            cu->cu_id, simdId, wfSlotId, exec_mask);

    _staticInst->completeAcc(gpuDynInst);
}
/**
* accessor methods for the attributes of
* the underlying GPU static instruction
*
* Each predicate below returns the result of the identically named
* query on the static instruction.
*/
// Forwards isALU() from the static instruction.
bool
GPUDynInst::isALU() const
{
return _staticInst->isALU();
}
// Forwards isBranch() from the static instruction.
bool
GPUDynInst::isBranch() const
{
return _staticInst->isBranch();
}
// Forwards isCondBranch() from the static instruction.
bool
GPUDynInst::isCondBranch() const
{
return _staticInst->isCondBranch();
}
// Forwards isNop() from the static instruction.
bool
GPUDynInst::isNop() const
{
return _staticInst->isNop();
}
// Forwards isEndOfKernel() from the static instruction.
bool
GPUDynInst::isEndOfKernel() const
{
return _staticInst->isEndOfKernel();
}
// Forwards isKernelLaunch() from the static instruction.
bool
GPUDynInst::isKernelLaunch() const
{
return _staticInst->isKernelLaunch();
}
// Forwards isSDWAInst() from the static instruction.
bool
GPUDynInst::isSDWAInst() const
{
return _staticInst->isSDWAInst();
}
// Forwards isDPPInst() from the static instruction.
bool
GPUDynInst::isDPPInst() const
{
return _staticInst->isDPPInst();
}
// Forwards isReturn() from the static instruction.
bool
GPUDynInst::isReturn() const
{
return _staticInst->isReturn();
}
// Forwards isUnconditionalJump() from the static instruction.
bool
GPUDynInst::isUnconditionalJump() const
{
return _staticInst->isUnconditionalJump();
}
// Forwards isSpecialOp() from the static instruction.
bool
GPUDynInst::isSpecialOp() const
{
return _staticInst->isSpecialOp();
}
// Forwards isWaitcnt() from the static instruction.
bool
GPUDynInst::isWaitcnt() const
{
return _staticInst->isWaitcnt();
}
// Forwards isSleep() from the static instruction.
bool
GPUDynInst::isSleep() const
{
return _staticInst->isSleep();
}
// Forwards isBarrier() from the static instruction.
bool
GPUDynInst::isBarrier() const
{
return _staticInst->isBarrier();
}
// Forwards isMemSync() from the static instruction.
bool
GPUDynInst::isMemSync() const
{
return _staticInst->isMemSync();
}
// Memory-operation predicates; each forwards the identically named query
// from the static instruction.

// Forwards isMemRef() from the static instruction.
bool
GPUDynInst::isMemRef() const
{
return _staticInst->isMemRef();
}
// Forwards isFlat() from the static instruction.
bool
GPUDynInst::isFlat() const
{
return _staticInst->isFlat();
}
// Forwards isFlatGlobal() from the static instruction.
bool
GPUDynInst::isFlatGlobal() const
{
return _staticInst->isFlatGlobal();
}
// Forwards isFlatScratch() from the static instruction.
bool
GPUDynInst::isFlatScratch() const
{
return _staticInst->isFlatScratch();
}
// Forwards isLoad() from the static instruction.
bool
GPUDynInst::isLoad() const
{
return _staticInst->isLoad();
}
// Forwards isStore() from the static instruction.
bool
GPUDynInst::isStore() const
{
return _staticInst->isStore();
}
// Forwards isAtomic() from the static instruction.
bool
GPUDynInst::isAtomic() const
{
return _staticInst->isAtomic();
}
// Forwards isAtomicNoRet() from the static instruction.
bool
GPUDynInst::isAtomicNoRet() const
{
return _staticInst->isAtomicNoRet();
}
// Forwards isAtomicRet() from the static instruction.
bool
GPUDynInst::isAtomicRet() const
{
return _staticInst->isAtomicRet();
}
// True exactly when the static instruction's isScalar() is false.
bool
GPUDynInst::isVector() const
{
return !_staticInst->isScalar();
}
// Forwards isScalar() from the static instruction.
bool
GPUDynInst::isScalar() const
{
return _staticInst->isScalar();
}
// Forwards readsSCC() from the static instruction.
bool
GPUDynInst::readsSCC() const
{
return _staticInst->readsSCC();
}
// Forwards writesSCC() from the static instruction.
bool
GPUDynInst::writesSCC() const
{
return _staticInst->writesSCC();
}
/**
 * True if any source operand of the static instruction is VCC
 * (isVcc()); when none is, the static instruction's readsVCC() decides.
 */
bool
GPUDynInst::readsVCC() const
{
    for (const auto &operand : _staticInst->srcOperands()) {
        if (operand.isVcc()) {
            return true;
        }
    }

    // no explicit VCC source operand
    return _staticInst->readsVCC();
}
/**
 * True if any destination operand of the static instruction is VCC
 * (isVcc()); when none is, the static instruction's writesVCC() decides.
 */
bool
GPUDynInst::writesVCC() const
{
    for (const auto &operand : _staticInst->dstOperands()) {
        if (operand.isVcc()) {
            return true;
        }
    }

    // no explicit VCC destination operand
    return _staticInst->writesVCC();
}
// Forwards readsMode() from the static instruction.
bool
GPUDynInst::readsMode() const
{
return _staticInst->readsMode();
}
// Forwards writesMode() from the static instruction.
bool
GPUDynInst::writesMode() const
{
return _staticInst->writesMode();
}
// Forwards the static instruction's readsEXEC(). Unlike readsExecMask(),
// the operand list is not inspected.
bool
GPUDynInst::readsExec() const
{
return _staticInst->readsEXEC();
}
// Forwards the static instruction's writesEXEC(). Unlike
// writesExecMask(), the operand list is not inspected.
bool
GPUDynInst::writesExec() const
{
return _staticInst->writesEXEC();
}
// Forwards ignoreExec() from the static instruction.
bool
GPUDynInst::ignoreExec() const
{
return _staticInst->ignoreExec();
}
/**
 * True if any destination operand of the static instruction is EXEC
 * (isExec()); when none is, the static instruction's writesEXEC()
 * decides.
 */
bool
GPUDynInst::writesExecMask() const
{
    for (const auto &operand : _staticInst->dstOperands()) {
        if (operand.isExec()) {
            return true;
        }
    }

    // no explicit EXEC destination operand
    return _staticInst->writesEXEC();
}
/**
 * True if any source operand of the static instruction is EXEC
 * (isExec()); when none is, the static instruction's readsEXEC() decides.
 */
bool
GPUDynInst::readsExecMask() const
{
    for (const auto &operand : _staticInst->srcOperands()) {
        if (operand.isExec()) {
            return true;
        }
    }

    // no explicit EXEC source operand
    return _staticInst->readsEXEC();
}
/**
 * True if at least one destination scalar register operand reports
 * isFlatScratch().
 */
bool
GPUDynInst::writesFlatScratch() const
{
    bool found = false;

    for (const auto &operand : dstScalarRegOperands()) {
        if (operand.isFlatScratch()) {
            found = true;
            break;
        }
    }

    return found;
}
/**
 * True if at least one source scalar register operand reports
 * isFlatScratch().
 */
bool
GPUDynInst::readsFlatScratch() const
{
    bool found = false;

    for (const auto &operand : srcScalarRegOperands()) {
        if (operand.isFlatScratch()) {
            found = true;
            break;
        }
    }

    return found;
}
// True when any of isGlobalMem(), isFlat(), isFlatGlobal() or
// isFlatScratch() holds for this instruction.
// NOTE(review): presumably these are the instructions that must hold a
// coalescer token; the token handling itself is not in this file.
bool
GPUDynInst::needsToken() const
{
return isGlobalMem() || isFlat() || isFlatGlobal() || isFlatScratch();
}
// Atomic-operation predicates; each forwards the identically named query
// from the static instruction.

// Forwards isAtomicAnd() from the static instruction.
bool
GPUDynInst::isAtomicAnd() const
{
return _staticInst->isAtomicAnd();
}
// Forwards isAtomicOr() from the static instruction.
bool
GPUDynInst::isAtomicOr() const
{
return _staticInst->isAtomicOr();
}
// Forwards isAtomicXor() from the static instruction.
bool
GPUDynInst::isAtomicXor() const
{
return _staticInst->isAtomicXor();
}
// Forwards isAtomicCAS() from the static instruction.
bool
GPUDynInst::isAtomicCAS() const
{
return _staticInst->isAtomicCAS();
}
// Forwards isAtomicExch() from the static instruction.
bool GPUDynInst::isAtomicExch() const
{
return _staticInst->isAtomicExch();
}
// Forwards isAtomicAdd() from the static instruction.
bool
GPUDynInst::isAtomicAdd() const
{
return _staticInst->isAtomicAdd();
}
// Forwards isAtomicSub() from the static instruction.
bool
GPUDynInst::isAtomicSub() const
{
return _staticInst->isAtomicSub();
}
// Forwards isAtomicInc() from the static instruction.
bool
GPUDynInst::isAtomicInc() const
{
return _staticInst->isAtomicInc();
}
// Forwards isAtomicDec() from the static instruction.
bool
GPUDynInst::isAtomicDec() const
{
return _staticInst->isAtomicDec();
}
// Forwards isAtomicMax() from the static instruction.
bool
GPUDynInst::isAtomicMax() const
{
return _staticInst->isAtomicMax();
}
// Forwards isAtomicMin() from the static instruction.
bool
GPUDynInst::isAtomicMin() const
{
return _staticInst->isAtomicMin();
}
// Forwards isArgLoad() from the static instruction.
bool
GPUDynInst::isArgLoad() const
{
return _staticInst->isArgLoad();
}
// Memory-space and segment predicates; each forwards the identically
// named query from the static instruction.

// Forwards isGlobalMem() from the static instruction.
bool
GPUDynInst::isGlobalMem() const
{
return _staticInst->isGlobalMem();
}
// Forwards isLocalMem() from the static instruction.
bool
GPUDynInst::isLocalMem() const
{
return _staticInst->isLocalMem();
}
// Forwards isArgSeg() from the static instruction.
bool
GPUDynInst::isArgSeg() const
{
return _staticInst->isArgSeg();
}
// Forwards isGlobalSeg() from the static instruction.
bool
GPUDynInst::isGlobalSeg() const
{
return _staticInst->isGlobalSeg();
}
// Forwards isGroupSeg() from the static instruction.
bool
GPUDynInst::isGroupSeg() const
{
return _staticInst->isGroupSeg();
}
// Forwards isKernArgSeg() from the static instruction.
bool
GPUDynInst::isKernArgSeg() const
{
return _staticInst->isKernArgSeg();
}
// Forwards isPrivateSeg() from the static instruction.
bool
GPUDynInst::isPrivateSeg() const
{
return _staticInst->isPrivateSeg();
}
// Forwards isReadOnlySeg() from the static instruction.
bool
GPUDynInst::isReadOnlySeg() const
{
return _staticInst->isReadOnlySeg();
}
// Forwards isSpillSeg() from the static instruction.
bool
GPUDynInst::isSpillSeg() const
{
return _staticInst->isSpillSeg();
}
// Coherence and floating-point-type predicates; each forwards the
// identically named query from the static instruction.

// Forwards isGloballyCoherent() from the static instruction.
bool
GPUDynInst::isGloballyCoherent() const
{
return _staticInst->isGloballyCoherent();
}
// Forwards isSystemCoherent() from the static instruction.
bool
GPUDynInst::isSystemCoherent() const
{
return _staticInst->isSystemCoherent();
}
// Forwards isF16() from the static instruction.
bool
GPUDynInst::isF16() const
{
return _staticInst->isF16();
}
// Forwards isF32() from the static instruction.
bool
GPUDynInst::isF32() const
{
return _staticInst->isF32();
}
// Forwards isF64() from the static instruction.
bool
GPUDynInst::isF64() const
{
return _staticInst->isF64();
}
// Forwards isFMA() from the static instruction.
bool
GPUDynInst::isFMA() const
{
return _staticInst->isFMA();
}
// Forwards isMAC() from the static instruction.
bool
GPUDynInst::isMAC() const
{
return _staticInst->isMAC();
}
// Forwards isMAD() from the static instruction.
bool
GPUDynInst::isMAD() const
{
return _staticInst->isMAD();
}
/**
 * Determine which aperture (APE) a flat access falls into and record the
 * result in the static instruction's executed_as field.
 *
 * The first active lane decides the segment: an address in the LDS APE
 * gives SC_GROUP, one in the scratch APE gives SC_PRIVATE, and anything
 * else gives SC_GLOBAL. An address in the GPUVM APE, or one whose bits
 * 63:47 are neither all zeros nor all ones (the "hole"), is fatal.
 * Afterwards every active lane is asserted to be consistent with the
 * chosen segment.
 *
 * @param mask lanes to inspect; at least one bit must be set
 */
void
GPUDynInst::doApertureCheck(const VectorMask &mask)
{
    assert(mask.any());

    const auto numLanes = computeUnit()->wfSize();
    const auto &shader = computeUnit()->shader;

    // find the segment of the first active address, after
    // that we check that all other active addresses also
    // fall within the same APE
    for (int lane = 0; lane < numLanes; ++lane) {
        if (!mask[lane]) {
            continue;
        }

        if (shader->isLdsApe(addr[lane])) {
            // group segment
            staticInstruction()->executed_as = enums::SC_GROUP;
            break;
        }

        if (shader->isScratchApe(addr[lane])) {
            // private segment
            staticInstruction()->executed_as = enums::SC_PRIVATE;
            break;
        }

        if (shader->isGpuVmApe(addr[lane])) {
            // we won't support GPUVM
            fatal("flat access is in GPUVM APE\n");
        } else if (bits(addr[lane], 63, 47) != 0x1FFFF &&
                   bits(addr[lane], 63, 47)) {
            // we are in the "hole", this is a memory violation
            fatal("flat access at addr %#x has a memory violation\n",
                  addr[lane]);
        } else {
            // global memory segment
            staticInstruction()->executed_as = enums::SC_GLOBAL;
            break;
        }
    }

    // we should have found the segment
    assert(executedAs() != enums::SC_NONE);

    // flat accesses should not straddle multiple APEs so we
    // must check that all addresses fall within the same APE
    const auto segment = executedAs();

    for (int lane = 0; lane < numLanes; ++lane) {
        if (!mask[lane]) {
            continue;
        }

        if (segment == enums::SC_GROUP) {
            // if the first valid addr we found above was LDS,
            // all the rest should be
            assert(computeUnit()->shader->isLdsApe(addr[lane]));
        } else if (segment == enums::SC_PRIVATE) {
            // if the first valid addr we found above was private,
            // all the rest should be
            assert(computeUnit()->shader->isScratchApe(addr[lane]));
        } else {
            // if the first valid addr we found above was global,
            // all the rest should be. because we don't have an
            // explicit range of the global segment, we just make
            // sure that the address fall in no other APE and that
            // it is not a memory violation
            assert(!computeUnit()->shader->isLdsApe(addr[lane]));
            assert(!computeUnit()->shader->isScratchApe(addr[lane]));
            assert(!computeUnit()->shader->isGpuVmApe(addr[lane]));
            assert(!(bits(addr[lane], 63, 47) != 0x1FFFF
                   && bits(addr[lane], 63, 47)));
        }
    }
}
/**
* Resolve a flat access to a concrete segment and fix up the wavefront's
* bookkeeping accordingly.
*
* doApertureCheck(mask) is run first; the branch taken then depends on
* executedAs():
* - SC_GLOBAL: addresses are left untouched, execUnitId becomes the
* wavefront's flatGmUnitId, and the rd/wrLmReqsInPipe counters are
* decremented according to the operation type.
* - SC_GROUP: the LDS aperture base is subtracted from every active
* lane's address (asserted to lie within the LDS address range),
* execUnitId becomes flatLmUnitId, decVMemInstsIssued() is called, and
* the rd/wrGmReqsInPipe counters are decremented.
* - SC_PRIVATE: every active lane's address is rebased using two values
* read from the scalar register file plus the shader's hidden private
* base and scratch base, execUnitId becomes flatLmUnitId,
* decLGKMInstsIssued() is called, and the rd/wrLmReqsInPipe counters
* are decremented.
* - anything else: panics for the first active lane.
*
* In each of the first three branches, an instruction that is neither a
* load, a store, an atomic nor a memory sync triggers a panic.
*
* @param mask active lanes whose addresses are resolved
*/
void
GPUDynInst::resolveFlatSegment(const VectorMask &mask)
{
doApertureCheck(mask);
// Now that we know the aperture, do the following:
// 1. Transform the flat address to its segmented equivalent.
// 2. Set the execUnitId based on the aperture check.
// 3. Decrement any extra resources that were reserved. Other
// resources are released as normal, below.
if (executedAs() == enums::SC_GLOBAL) {
// no transformation for global segment
wavefront()->execUnitId = wavefront()->flatGmUnitId;
// the access goes to global memory, so drop the local-memory
// request counts
if (isLoad()) {
wavefront()->rdLmReqsInPipe--;
} else if (isStore()) {
wavefront()->wrLmReqsInPipe--;
} else if (isAtomic() || isMemSync()) {
wavefront()->wrLmReqsInPipe--;
wavefront()->rdLmReqsInPipe--;
} else {
panic("Invalid memory operation!\n");
}
} else if (executedAs() == enums::SC_GROUP) {
for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
// flat address calculation goes here.
// addr[lane] = segmented address
addr[lane] = addr[lane] -
wavefront()->computeUnit->shader->ldsApe().base;
assert(addr[lane] <
wavefront()->computeUnit->getLds().getAddrRange().size());
}
}
wavefront()->execUnitId = wavefront()->flatLmUnitId;
wavefront()->decVMemInstsIssued();
// the access goes to the LDS, so drop the global-memory request
// counts
if (isLoad()) {
wavefront()->rdGmReqsInPipe--;
} else if (isStore()) {
wavefront()->wrGmReqsInPipe--;
} else if (isAtomic() || isMemSync()) {
wavefront()->rdGmReqsInPipe--;
wavefront()->wrGmReqsInPipe--;
} else {
panic("Invalid memory operation!\n");
}
} else if (executedAs() == enums::SC_PRIVATE) {
/**
* Flat instructions may resolve to the private segment (scratch),
* which is backed by main memory and provides per-lane scratch
* memory. Flat addressing uses apertures - registers that specify
* the address range in the VA space where LDS/private memory is
* mapped. The value of which is set by the kernel mode driver.
* These apertures use addresses that are not used by x86 CPUs.
* When the address of a Flat operation falls into one of the
* apertures, the Flat operation is redirected to either LDS or
* to the private memory segment.
*
* For private memory the SW runtime will allocate some space in
* the VA space for each AQL queue. The base address of which is
* stored in scalar registers per the AMD GPU ABI. The amd_queue_t
* scratch_backing_memory_location provides the base address in
* memory for the queue's private segment. Various other fields
* loaded into register state during kernel launch specify per-WF
* and per-work-item offsets so that individual lanes may access
* their private segment allocation.
*
* For more details about flat addressing see:
* http://rocm-documentation.readthedocs.io/en/latest/
* ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
*
* https://github.com/ROCm-Developer-Tools/
* ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
* #flat-addressing
*/
// The offset and size are read from the SGPRs at maxSgprs - 4 and
// maxSgprs - 3 respectively.
// NOTE(review): presumably these two registers hold the flat scratch
// pair; confirm which half carries the offset and which the size.
uint32_t numSgprs = wavefront()->maxSgprs;
uint32_t physSgprIdx =
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
numSgprs - 4);
uint32_t offset =
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
physSgprIdx =
wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
numSgprs - 3);
uint32_t size =
wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
// each lane gets its own slice of `size` bytes, shifted by the
// per-wavefront offset and rebased from the scratch base onto
// the hidden private base
addr[lane] = addr[lane] + lane * size + offset +
wavefront()->computeUnit->shader->getHiddenPrivateBase() -
wavefront()->computeUnit->shader->getScratchBase();
}
}
// NOTE(review): this branch selects flatLmUnitId and releases the Lm
// counters even though the comment above describes scratch as backed
// by main memory; confirm this matches the resources reserved at issue.
wavefront()->execUnitId = wavefront()->flatLmUnitId;
wavefront()->decLGKMInstsIssued();
if (isLoad()) {
wavefront()->rdLmReqsInPipe--;
} else if (isStore()) {
wavefront()->wrLmReqsInPipe--;
} else if (isAtomic() || isMemSync()) {
wavefront()->wrLmReqsInPipe--;
wavefront()->rdLmReqsInPipe--;
} else {
panic("Invalid memory operation!\n");
}
} else {
// unexpected storage class: report the first active lane's address
for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
panic("flat addr %#llx maps to bad segment %d\n",
addr[lane], executedAs());
}
}
}
}
// Forwards srcLiteral() from the static instruction.
TheGpuISA::ScalarRegU32
GPUDynInst::srcLiteral() const
{
return _staticInst->srcLiteral();
}
void
GPUDynInst::updateStats()
{
if (_staticInst->isLocalMem()) {
// access to LDS (shared) memory
cu->stats.dynamicLMemInstrCnt++;
} else if (_staticInst->isFlat()) {
cu->stats.dynamicFlatMemInstrCnt++;
} else {
// access to global memory
// update PageDivergence histogram
int number_pages_touched = cu->pagesTouched.size();
assert(number_pages_touched);
cu->stats.pageDivergenceDist.sample(number_pages_touched);
std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;
for (auto it : cu->pagesTouched) {
// see if this page has been touched before. if not, this also
// inserts the page into the table.
ret = cu->pageAccesses
.insert(ComputeUnit::pageDataStruct::value_type(it.first,
std::make_pair(1, it.second)));
// if yes, then update the stats
if (!ret.second) {
ret.first->second.first++;
ret.first->second.second += it.second;
}
}
cu->pagesTouched.clear();
// total number of memory instructions (dynamic)
// Atomics are counted as a single memory instruction.
// this is # memory instructions per wavefronts, not per workitem
cu->stats.dynamicGMemInstrCnt++;
}
}
/**
 * Record a timestamp for a hop of this instruction's round trip.
 *
 * @param currentTime tick to append to roundTripTime
 * @param hopId index of the hop; nothing is recorded when roundTripTime
 *              already holds more than hopId entries
 */
void
GPUDynInst::profileRoundTripTime(Tick currentTime, int hopId)
{
    // Only take the first measurement in the case of coalescing
    if (roundTripTime.size() <= hopId) {
        roundTripTime.push_back(currentTime);
    }
}
/**
 * Record a per-address timestamp for a hop.
 *
 * If addr already has an entry in lineAddressTime, currentTime is appended
 * unless the entry already holds more than hopId timestamps. If addr has
 * no entry yet, one is created (holding only currentTime) only when hopId
 * is 0; for any other hopId nothing is recorded.
 *
 * @param addr address used as the key into lineAddressTime
 * @param currentTime tick to record
 * @param hopId index of the hop being recorded
 */
void
GPUDynInst::profileLineAddressTime(Addr addr, Tick currentTime, int hopId)
{
    auto entry = lineAddressTime.find(addr);

    if (entry != lineAddressTime.end()) {
        auto &hopTimes = entry->second;
        // keep only the first measurement for each hop
        if (hopTimes.size() <= hopId) {
            hopTimes.push_back(currentTime);
        }
        return;
    }

    if (hopId == 0) {
        lineAddressTime.insert(
            std::make_pair(addr, std::vector<Tick> { currentTime }));
    }
}
} // namespace gem5