This changeset implements the s_sleep instruction in a similar way to
s_waitcnt.

Change-Id: I4811c318ac2c76c485e2bfd9d93baa1205ecf183
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/39115
Maintainer: Matthew Poremba <matthew.poremba@amd.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>

/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/gpu_dyn_inst.hh"

#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"

GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
                       GPUStaticInst *static_inst, InstSeqNum instSeqNum)
    : GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(),
      (Addr)0), numScalarReqs(0), isSaveRestore(false),
      _staticInst(static_inst), _seqNum(instSeqNum)
{
    statusVector.assign(TheGpuISA::NumVecElemPerVecReg, 0);
    tlbHitLevel.assign(computeUnit()->wfSize(), -1);
    // vector instructions can have up to 4 source/destination operands
    d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)];
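    // a_data and x_data provide additional per-lane storage used by atomic
    // operations (e.g., the atomic source operand and the compare value of
    // a CAS); 8 bytes per lane accommodates 64-bit atomics.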
    a_data = new uint8_t[computeUnit()->wfSize() * 8];
    x_data = new uint8_t[computeUnit()->wfSize() * 8];
    // scalar loads can read up to 16 Dwords of data (see publicly
    // available GCN3 ISA manual)
    scalar_data = new uint8_t[16 * sizeof(uint32_t)];
    for (int i = 0; i < (16 * sizeof(uint32_t)); ++i) {
        scalar_data[i] = 0;
    }
    for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
        a_data[i] = 0;
        x_data[i] = 0;
    }
    for (int i = 0; i < (computeUnit()->wfSize() * 4 * sizeof(double)); ++i) {
        d_data[i] = 0;
    }
    time = 0;

    cu_id = _cu->cu_id;
    if (_wf) {
        simdId = _wf->simdId;
        wfDynId = _wf->wfDynId;
        kern_id = _wf->kernId;
        wg_id = _wf->wgId;
        wfSlotId = _wf->wfSlotId;
    } else {
        simdId = -1;
        wfDynId = -1;
        kern_id = -1;
        wg_id = -1;
        wfSlotId = -1;
    }
}

GPUDynInst::~GPUDynInst()
{
    delete[] d_data;
    delete[] a_data;
    delete[] x_data;
    delete[] scalar_data;
    delete _staticInst;
}

void
GPUDynInst::execute(GPUDynInstPtr gpuDynInst)
{
    _staticInst->execute(gpuDynInst);
}

int
GPUDynInst::numSrcRegOperands()
{
    return _staticInst->numSrcRegOperands();
}

int
GPUDynInst::numDstRegOperands()
{
    return _staticInst->numDstRegOperands();
}

int
GPUDynInst::numSrcVecOperands()
{
    return _staticInst->numSrcVecOperands();
}

int
GPUDynInst::numDstVecOperands()
{
    return _staticInst->numDstVecOperands();
}

int
GPUDynInst::numSrcVecDWORDs()
{
    return _staticInst->numSrcVecDWORDs();
}

int
GPUDynInst::numDstVecDWORDs()
{
    return _staticInst->numDstVecDWORDs();
}

int
GPUDynInst::numOpdDWORDs(int operandIdx)
{
    return _staticInst->numOpdDWORDs(operandIdx);
}

int
GPUDynInst::getNumOperands()
{
    return _staticInst->getNumOperands();
}

bool
GPUDynInst::isVectorRegister(int operandIdx)
{
    return _staticInst->isVectorRegister(operandIdx);
}

bool
GPUDynInst::isScalarRegister(int operandIdx)
{
    return _staticInst->isScalarRegister(operandIdx);
}

int
GPUDynInst::getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst)
{
    return _staticInst->getRegisterIndex(operandIdx, gpuDynInst);
}

int
GPUDynInst::getOperandSize(int operandIdx)
{
    return _staticInst->getOperandSize(operandIdx);
}

bool
GPUDynInst::isDstOperand(int operandIdx)
{
    return _staticInst->isDstOperand(operandIdx);
}

bool
GPUDynInst::isSrcOperand(int operandIdx)
{
    return _staticInst->isSrcOperand(operandIdx);
}

bool
GPUDynInst::hasSourceSgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isSrcOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::hasSourceVgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isVectorRegister(i) && _staticInst->isSrcOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::hasDestinationSgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isDstOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::srcIsVgpr(int index) const
{
    assert(index >= 0 && index < _staticInst->getNumOperands());
    if (_staticInst->isVectorRegister(index) &&
        _staticInst->isSrcOperand(index)) {
        return true;
    }
    return false;
}

bool
GPUDynInst::hasDestinationVgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isVectorRegister(i) && _staticInst->isDstOperand(i)) {
            return true;
        }
    }
    return false;
}

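// The two isOpcode() overloads below do simple substring matching on the
// opcode string. As a hypothetical example, isOpcode("s_sleep") matches an
// s_sleep instruction, while the two-argument form, e.g.
// isOpcode("v_add", "_u32"), requires both substrings to appear.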
bool
GPUDynInst::isOpcode(const std::string& opcodeStr,
                     const std::string& extStr) const
{
    return _staticInst->opcode().find(opcodeStr) != std::string::npos &&
           _staticInst->opcode().find(extStr) != std::string::npos;
}

bool
GPUDynInst::isOpcode(const std::string& opcodeStr) const
{
    return _staticInst->opcode().find(opcodeStr) != std::string::npos;
}

const std::string&
GPUDynInst::disassemble() const
{
    return _staticInst->disassemble();
}

InstSeqNum
GPUDynInst::seqNum() const
{
    return _seqNum;
}

Enums::StorageClassType
GPUDynInst::executedAs()
{
    return _staticInst->executed_as;
}

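// Note: these RAW checks are conservative approximations. They match a
// source operand of this instruction against a destination operand of s
// by operand slot index (i == j), not by the underlying register index.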
bool
GPUDynInst::hasVgprRawDependence(GPUDynInstPtr s)
{
    assert(s);
    for (int i = 0; i < getNumOperands(); ++i) {
        if (isVectorRegister(i) && isSrcOperand(i)) {
            for (int j = 0; j < s->getNumOperands(); ++j) {
                if (s->isVectorRegister(j) && s->isDstOperand(j)) {
                    if (i == j)
                        return true;
                }
            }
        }
    }
    return false;
}

bool
GPUDynInst::hasSgprRawDependence(GPUDynInstPtr s)
{
    assert(s);
    for (int i = 0; i < getNumOperands(); ++i) {
        if (isScalarRegister(i) && isSrcOperand(i)) {
            for (int j = 0; j < s->getNumOperands(); ++j) {
                if (s->isScalarRegister(j) && s->isDstOperand(j)) {
                    if (i == j)
                        return true;
                }
            }
        }
    }
    return false;
}

// Process a memory instruction and (if necessary) submit timing request
void
GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n",
            cu->cu_id, simdId, wfSlotId, exec_mask);

    _staticInst->initiateAcc(gpuDynInst);
}

void
GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst)
{
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector="
            "%#x complete\n",
            cu->cu_id, simdId, wfSlotId, exec_mask);

    _staticInst->completeAcc(gpuDynInst);
}

/**
 * Accessor methods for the attributes of
 * the underlying GPU static instruction.
 */
bool
GPUDynInst::isALU() const
{
    return _staticInst->isALU();
}

bool
GPUDynInst::isBranch() const
{
    return _staticInst->isBranch();
}

bool
GPUDynInst::isCondBranch() const
{
    return _staticInst->isCondBranch();
}

bool
GPUDynInst::isNop() const
{
    return _staticInst->isNop();
}

bool
GPUDynInst::isEndOfKernel() const
{
    return _staticInst->isEndOfKernel();
}

bool
GPUDynInst::isKernelLaunch() const
{
    return _staticInst->isKernelLaunch();
}

bool
GPUDynInst::isSDWAInst() const
{
    return _staticInst->isSDWAInst();
}

bool
GPUDynInst::isDPPInst() const
{
    return _staticInst->isDPPInst();
}

bool
GPUDynInst::isReturn() const
{
    return _staticInst->isReturn();
}

bool
GPUDynInst::isUnconditionalJump() const
{
    return _staticInst->isUnconditionalJump();
}

bool
GPUDynInst::isSpecialOp() const
{
    return _staticInst->isSpecialOp();
}

bool
GPUDynInst::isWaitcnt() const
{
    return _staticInst->isWaitcnt();
}

bool
GPUDynInst::isSleep() const
{
    return _staticInst->isSleep();
}

bool
GPUDynInst::isBarrier() const
{
    return _staticInst->isBarrier();
}

bool
GPUDynInst::isMemSync() const
{
    return _staticInst->isMemSync();
}

bool
GPUDynInst::isMemRef() const
{
    return _staticInst->isMemRef();
}

bool
GPUDynInst::isFlat() const
{
    return _staticInst->isFlat();
}

bool
GPUDynInst::isLoad() const
{
    return _staticInst->isLoad();
}

bool
GPUDynInst::isStore() const
{
    return _staticInst->isStore();
}

bool
GPUDynInst::isAtomic() const
{
    return _staticInst->isAtomic();
}

bool
GPUDynInst::isAtomicNoRet() const
{
    return _staticInst->isAtomicNoRet();
}

bool
GPUDynInst::isAtomicRet() const
{
    return _staticInst->isAtomicRet();
}

bool
GPUDynInst::isVector() const
{
    return !_staticInst->isScalar();
}

bool
GPUDynInst::isScalar() const
{
    return _staticInst->isScalar();
}

bool
GPUDynInst::readsSCC() const
{
    return _staticInst->readsSCC();
}

bool
GPUDynInst::writesSCC() const
{
    return _staticInst->writesSCC();
}

bool
GPUDynInst::readsVCC() const
{
    return _staticInst->readsVCC();
}

bool
GPUDynInst::writesVCC() const
{
    return _staticInst->writesVCC();
}

bool
GPUDynInst::readsMode() const
{
    return _staticInst->readsMode();
}

bool
GPUDynInst::writesMode() const
{
    return _staticInst->writesMode();
}

bool
GPUDynInst::readsEXEC() const
{
    return _staticInst->readsEXEC();
}

bool
GPUDynInst::writesEXEC() const
{
    return _staticInst->writesEXEC();
}

bool
GPUDynInst::ignoreExec() const
{
    return _staticInst->ignoreExec();
}

bool
GPUDynInst::writesExecMask() const
{
    // scan all operands; the original loop returned on the first
    // operand and therefore only ever inspected operand 0
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isDstOperand(i) &&
            _staticInst->isExecMaskRegister(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::readsExecMask() const
{
    // scan all operands; the original loop returned on the first
    // operand and therefore only ever inspected operand 0
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isSrcOperand(i) &&
            _staticInst->isExecMaskRegister(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::writesFlatScratch() const
{
    // scan all scalar destination operands rather than deciding based
    // on the first one found
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) &&
            _staticInst->isDstOperand(i) &&
            _staticInst->isFlatScratchRegister(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::readsFlatScratch() const
{
    // scan all scalar source operands rather than deciding based
    // on the first one found
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) &&
            _staticInst->isSrcOperand(i) &&
            _staticInst->isFlatScratchRegister(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::isAtomicAnd() const
{
    return _staticInst->isAtomicAnd();
}

bool
GPUDynInst::isAtomicOr() const
{
    return _staticInst->isAtomicOr();
}

bool
GPUDynInst::isAtomicXor() const
{
    return _staticInst->isAtomicXor();
}

bool
GPUDynInst::isAtomicCAS() const
{
    return _staticInst->isAtomicCAS();
}

bool
GPUDynInst::isAtomicExch() const
{
    return _staticInst->isAtomicExch();
}

bool
GPUDynInst::isAtomicAdd() const
{
    return _staticInst->isAtomicAdd();
}

bool
GPUDynInst::isAtomicSub() const
{
    return _staticInst->isAtomicSub();
}

bool
GPUDynInst::isAtomicInc() const
{
    return _staticInst->isAtomicInc();
}

bool
GPUDynInst::isAtomicDec() const
{
    return _staticInst->isAtomicDec();
}

bool
GPUDynInst::isAtomicMax() const
{
    return _staticInst->isAtomicMax();
}

bool
GPUDynInst::isAtomicMin() const
{
    return _staticInst->isAtomicMin();
}

bool
GPUDynInst::isArgLoad() const
{
    return _staticInst->isArgLoad();
}

bool
GPUDynInst::isGlobalMem() const
{
    return _staticInst->isGlobalMem();
}

bool
GPUDynInst::isLocalMem() const
{
    return _staticInst->isLocalMem();
}

bool
GPUDynInst::isArgSeg() const
{
    return _staticInst->isArgSeg();
}

bool
GPUDynInst::isGlobalSeg() const
{
    return _staticInst->isGlobalSeg();
}

bool
GPUDynInst::isGroupSeg() const
{
    return _staticInst->isGroupSeg();
}

bool
GPUDynInst::isKernArgSeg() const
{
    return _staticInst->isKernArgSeg();
}

bool
GPUDynInst::isPrivateSeg() const
{
    return _staticInst->isPrivateSeg();
}

bool
GPUDynInst::isReadOnlySeg() const
{
    return _staticInst->isReadOnlySeg();
}

bool
GPUDynInst::isSpillSeg() const
{
    return _staticInst->isSpillSeg();
}

bool
GPUDynInst::isGloballyCoherent() const
{
    return _staticInst->isGloballyCoherent();
}

bool
GPUDynInst::isSystemCoherent() const
{
    return _staticInst->isSystemCoherent();
}

bool
GPUDynInst::isF16() const
{
    return _staticInst->isF16();
}

bool
GPUDynInst::isF32() const
{
    return _staticInst->isF32();
}

bool
GPUDynInst::isF64() const
{
    return _staticInst->isF64();
}

bool
GPUDynInst::isFMA() const
{
    return _staticInst->isFMA();
}

bool
GPUDynInst::isMAC() const
{
    return _staticInst->isMAC();
}

bool
GPUDynInst::isMAD() const
{
    return _staticInst->isMAD();
}

void
GPUDynInst::doApertureCheck(const VectorMask &mask)
{
    assert(mask.any());
    // find the segment of the first active address; after that,
    // we check that all other active addresses also fall within
    // the same APE
    for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
        if (mask[lane]) {
            if (computeUnit()->shader->isLdsApe(addr[lane])) {
                // group segment
                staticInstruction()->executed_as = Enums::SC_GROUP;
                break;
            } else if (computeUnit()->shader->isScratchApe(addr[lane])) {
                // private segment
                staticInstruction()->executed_as = Enums::SC_PRIVATE;
                break;
            } else if (computeUnit()->shader->isGpuVmApe(addr[lane])) {
                // we won't support GPUVM
                fatal("flat access is in GPUVM APE\n");
            } else if (bits(addr[lane], 63, 47) != 0x1FFFF &&
                       bits(addr[lane], 63, 47)) {
                // we are in the non-canonical "hole" (bits 63:47 are
                // neither all 0s nor all 1s); this is a memory violation
                fatal("flat access at addr %#x has a memory violation\n",
                      addr[lane]);
            } else {
                // global memory segment
                staticInstruction()->executed_as = Enums::SC_GLOBAL;
                break;
            }
        }
    }

    // we should have found the segment
    assert(executedAs() != Enums::SC_NONE);

    // flat accesses should not straddle multiple APEs, so we
    // must check that all addresses fall within the same APE
    if (executedAs() == Enums::SC_GROUP) {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was LDS,
                // all the rest should be
                assert(computeUnit()->shader->isLdsApe(addr[lane]));
            }
        }
    } else if (executedAs() == Enums::SC_PRIVATE) {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was private,
                // all the rest should be
                assert(computeUnit()->shader->isScratchApe(addr[lane]));
            }
        }
    } else {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was global,
                // all the rest should be. Because we don't have an
                // explicit range for the global segment, we just make
                // sure that each address falls within no other APE and
                // that it is not a memory violation
                assert(!computeUnit()->shader->isLdsApe(addr[lane]));
                assert(!computeUnit()->shader->isScratchApe(addr[lane]));
                assert(!computeUnit()->shader->isGpuVmApe(addr[lane]));
                assert(!(bits(addr[lane], 63, 47) != 0x1FFFF
                         && bits(addr[lane], 63, 47)));
            }
        }
    }
}

void
GPUDynInst::resolveFlatSegment(const VectorMask &mask)
{
    doApertureCheck(mask);

    // Now that we know the aperture, do the following:
    // 1. Transform the flat address to its segmented equivalent.
    // 2. Set the execUnitId based on the aperture check.
    // 3. Decrement any extra resources that were reserved. Other
    //    resources are released as normal, below.
    if (executedAs() == Enums::SC_GLOBAL) {
        // no transformation for the global segment
        wavefront()->execUnitId = wavefront()->flatGmUnitId;
        if (isLoad()) {
            wavefront()->rdLmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrLmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->wrLmReqsInPipe--;
            wavefront()->rdLmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else if (executedAs() == Enums::SC_GROUP) {
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                // flat address calculation goes here.
                // addr[lane] = segmented address
                panic("Flat group memory operation is unimplemented!\n");
            }
        }
        wavefront()->execUnitId = wavefront()->flatLmUnitId;
        wavefront()->decVMemInstsIssued();
        if (isLoad()) {
            wavefront()->rdGmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrGmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->rdGmReqsInPipe--;
            wavefront()->wrGmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else if (executedAs() == Enums::SC_PRIVATE) {
        /**
         * Flat instructions may resolve to the private segment (scratch),
         * which is backed by main memory and provides per-lane scratch
         * memory. Flat addressing uses apertures - registers that specify
         * the address range in the VA space where LDS/private memory is
         * mapped - whose values are set by the kernel-mode driver.
         * These apertures use addresses that are not used by x86 CPUs.
         * When the address of a flat operation falls into one of the
         * apertures, the flat operation is redirected to either LDS or
         * to the private memory segment.
         *
         * For private memory, the SW runtime allocates some space in
         * the VA space for each AQL queue; the base address of that
         * space is stored in scalar registers per the AMD GPU ABI. The
         * amd_queue_t scratch_backing_memory_location provides the base
         * address in memory for the queue's private segment. Various
         * other fields loaded into register state during kernel launch
         * specify per-WF and per-work-item offsets so that individual
         * lanes may access their private segment allocation.
         *
         * For more details about flat addressing see:
         * http://rocm-documentation.readthedocs.io/en/latest/
         * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
         *
         * https://github.com/ROCm-Developer-Tools/
         * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
         * #flat-addressing
         */

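        // The rebasing below computes: lane address + (lane * per-lane
        // scratch size) + per-wave offset, moved from the scratch aperture
        // into the queue's scratch backing memory. As a purely hypothetical
        // example, with offset 0, size 0x400, and hiddenPrivateBase equal
        // to scratchBase, lane 2 of an access to scratch address A would
        // resolve to A + 2 * 0x400.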
        uint32_t numSgprs = wavefront()->maxSgprs;
        uint32_t physSgprIdx =
            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
                                                               numSgprs - 3);
        uint32_t offset =
            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
        physSgprIdx =
            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
                                                               numSgprs - 4);
        uint32_t size =
            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                addr[lane] = addr[lane] + lane * size + offset +
                    wavefront()->computeUnit->shader->getHiddenPrivateBase() -
                    wavefront()->computeUnit->shader->getScratchBase();
            }
        }
        wavefront()->execUnitId = wavefront()->flatLmUnitId;
        wavefront()->decLGKMInstsIssued();
        if (isLoad()) {
            wavefront()->rdGmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrGmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->rdGmReqsInPipe--;
            wavefront()->wrGmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else {
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                panic("flat addr %#llx maps to bad segment %d\n",
                      addr[lane], executedAs());
            }
        }
    }
}

TheGpuISA::ScalarRegU32
GPUDynInst::srcLiteral() const
{
    return _staticInst->srcLiteral();
}

void
GPUDynInst::updateStats()
{
    if (_staticInst->isLocalMem()) {
        // access to LDS (shared) memory
        cu->stats.dynamicLMemInstrCnt++;
    } else if (_staticInst->isFlat()) {
        cu->stats.dynamicFlatMemInstrCnt++;
    } else {
        // access to global memory

        // update PageDivergence histogram
        int number_pages_touched = cu->pagesTouched.size();
        assert(number_pages_touched);
        cu->stats.pageDivergenceDist.sample(number_pages_touched);

        std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;

        for (const auto &it : cu->pagesTouched) {
            // see if this page has been touched before. if not, this also
            // inserts the page into the table.
            ret = cu->pageAccesses
                .insert(ComputeUnit::pageDataStruct::value_type(it.first,
                        std::make_pair(1, it.second)));

            // if it has been touched before, update the stats
            if (!ret.second) {
                ret.first->second.first++;
                ret.first->second.second += it.second;
            }
        }

        cu->pagesTouched.clear();

        // total number of memory instructions (dynamic); atomics are
        // counted as a single memory instruction. This counts memory
        // instructions per wavefront, not per work-item.
        cu->stats.dynamicGMemInstrCnt++;
    }
}

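// The two profiling helpers below timestamp a memory request as it passes
// numbered hops (hopId) through the memory system; only the first timestamp
// per hop is kept, so coalesced requests are not recorded twice.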
void
GPUDynInst::profileRoundTripTime(Tick currentTime, int hopId)
{
    // Only take the first measurement in the case of coalescing
    if (roundTripTime.size() > hopId)
        return;

    roundTripTime.push_back(currentTime);
}

void
GPUDynInst::profileLineAddressTime(Addr addr, Tick currentTime, int hopId)
{
    if (lineAddressTime.count(addr)) {
        if (lineAddressTime[addr].size() > hopId) {
            return;
        }

        lineAddressTime[addr].push_back(currentTime);
    } else if (hopId == 0) {
        auto addressTimeVec = std::vector<Tick> { currentTime };
        lineAddressTime.insert(std::make_pair(addr, addressTimeVec));
    }
}