Previously, with HSAIL, the HSA specification guaranteed that the GPU would
never issue unaligned accesses. Now that we run GCN machine code directly,
that guarantee no longer holds, so this commit adds support for unaligned
accesses. To reduce the replication of nearly identical code across the
different request types, it also adds helper functions in op_encodings.hh
that are shared by all of the instruction types that produce memory requests.

Supporting unaligned accesses requires changing the statusBitVector, which
tracks the status of the memory requests for each lane, from one bit per
lane to one int per lane: an unaligned access may straddle a cache-line
boundary, so in the worst case every lane has multiple cache lines (and
therefore multiple outstanding requests) in flight. The files that use the
statusBitVector are updated accordingly.

Change-Id: I319bf2f0f644083e98ca546d2bfe68cf87a5f967
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29920
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
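As a rough illustration of the tracking change described above (a minimal
sketch only; the names, sizes, and types here are assumptions, not the
actual gem5 declarations):

#include <bitset>
#include <vector>

int main()
{
    const int wfSize = 64;                    // lanes per wavefront (assumed)

    std::bitset<64> statusBitVector;          // old: one in-flight bit per lane
    std::vector<int> statusVector(wfSize, 0); // new: outstanding-request count per lane

    int lane = 0;
    statusVector[lane] = 2; // an unaligned lane access split across two cache lines
    statusVector[lane]--;   // decrement as each cache-line response returns
    return 0;
}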
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/gpu_dyn_inst.hh"

#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"

GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
                       GPUStaticInst *static_inst, InstSeqNum instSeqNum)
    : GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(),
      (Addr)0), numScalarReqs(0), isSaveRestore(false),
      _staticInst(static_inst), _seqNum(instSeqNum)
{
    statusVector.assign(TheGpuISA::NumVecElemPerVecReg, 0);
    tlbHitLevel.assign(computeUnit()->wfSize(), -1);
    // vector instructions can have up to 4 source/destination operands
    d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)];
    a_data = new uint8_t[computeUnit()->wfSize() * 8];
    x_data = new uint8_t[computeUnit()->wfSize() * 8];
    // scalar loads can read up to 16 Dwords of data (see publicly
    // available GCN3 ISA manual)
    scalar_data = new uint8_t[16 * sizeof(uint32_t)];
    for (int i = 0; i < (16 * sizeof(uint32_t)); ++i) {
        scalar_data[i] = 0;
    }
    for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
        a_data[i] = 0;
        x_data[i] = 0;
    }
    for (int i = 0; i < (computeUnit()->wfSize() * 4 * sizeof(double)); ++i) {
        d_data[i] = 0;
    }
    time = 0;

    cu_id = _cu->cu_id;
    if (_wf) {
        simdId = _wf->simdId;
        wfDynId = _wf->wfDynId;
        kern_id = _wf->kernId;
        wg_id = _wf->wgId;
        wfSlotId = _wf->wfSlotId;
    } else {
        simdId = -1;
        wfDynId = -1;
        kern_id = -1;
        wg_id = -1;
        wfSlotId = -1;
    }
}

GPUDynInst::~GPUDynInst()
{
    delete[] d_data;
    delete[] a_data;
    delete[] x_data;
    delete[] scalar_data;
    delete _staticInst;
}

void
GPUDynInst::execute(GPUDynInstPtr gpuDynInst)
{
    _staticInst->execute(gpuDynInst);
}

int
GPUDynInst::numSrcRegOperands()
{
    return _staticInst->numSrcRegOperands();
}

int
GPUDynInst::numDstRegOperands()
{
    return _staticInst->numDstRegOperands();
}

int
GPUDynInst::numSrcVecOperands()
{
    return _staticInst->numSrcVecOperands();
}

int
GPUDynInst::numDstVecOperands()
{
    return _staticInst->numDstVecOperands();
}

int
GPUDynInst::numSrcVecDWORDs()
{
    return _staticInst->numSrcVecDWORDs();
}

int
GPUDynInst::numDstVecDWORDs()
{
    return _staticInst->numDstVecDWORDs();
}

int
GPUDynInst::numOpdDWORDs(int operandIdx)
{
    return _staticInst->numOpdDWORDs(operandIdx);
}

int
GPUDynInst::getNumOperands()
{
    return _staticInst->getNumOperands();
}

bool
GPUDynInst::isVectorRegister(int operandIdx)
{
    return _staticInst->isVectorRegister(operandIdx);
}

bool
GPUDynInst::isScalarRegister(int operandIdx)
{
    return _staticInst->isScalarRegister(operandIdx);
}

int
GPUDynInst::getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst)
{
    return _staticInst->getRegisterIndex(operandIdx, gpuDynInst);
}

int
GPUDynInst::getOperandSize(int operandIdx)
{
    return _staticInst->getOperandSize(operandIdx);
}

bool
GPUDynInst::isDstOperand(int operandIdx)
{
    return _staticInst->isDstOperand(operandIdx);
}

bool
GPUDynInst::isSrcOperand(int operandIdx)
{
    return _staticInst->isSrcOperand(operandIdx);
}

bool
GPUDynInst::hasSourceSgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isSrcOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::hasSourceVgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isVectorRegister(i) && _staticInst->isSrcOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::hasDestinationSgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isDstOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::srcIsVgpr(int index) const
{
    assert(index >= 0 && index < _staticInst->getNumOperands());
    if (_staticInst->isVectorRegister(index) &&
        _staticInst->isSrcOperand(index)) {
        return true;
    }
    return false;
}

bool
GPUDynInst::hasDestinationVgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isVectorRegister(i) && _staticInst->isDstOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::isOpcode(const std::string& opcodeStr,
                     const std::string& extStr) const
{
    return _staticInst->opcode().find(opcodeStr) != std::string::npos &&
           _staticInst->opcode().find(extStr) != std::string::npos;
}

bool
GPUDynInst::isOpcode(const std::string& opcodeStr) const
{
    return _staticInst->opcode().find(opcodeStr) != std::string::npos;
}

const std::string&
GPUDynInst::disassemble() const
{
    return _staticInst->disassemble();
}

InstSeqNum
GPUDynInst::seqNum() const
{
    return _seqNum;
}

Enums::StorageClassType
GPUDynInst::executedAs()
{
    return _staticInst->executed_as;
}

bool
GPUDynInst::hasVgprRawDependence(GPUDynInstPtr s)
{
    assert(s);
    for (int i = 0; i < getNumOperands(); ++i) {
        if (isVectorRegister(i) && isSrcOperand(i)) {
            for (int j = 0; j < s->getNumOperands(); ++j) {
                if (s->isVectorRegister(j) && s->isDstOperand(j)) {
                    if (i == j)
                        return true;
                }
            }
        }
    }
    return false;
}

bool
GPUDynInst::hasSgprRawDependence(GPUDynInstPtr s)
{
    assert(s);
    for (int i = 0; i < getNumOperands(); ++i) {
        if (isScalarRegister(i) && isSrcOperand(i)) {
            for (int j = 0; j < s->getNumOperands(); ++j) {
                if (s->isScalarRegister(j) && s->isDstOperand(j)) {
                    if (i == j)
                        return true;
                }
            }
        }
    }
    return false;
}

// Process a memory instruction and (if necessary) submit timing request
void
GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n",
            cu->cu_id, simdId, wfSlotId, exec_mask);

    _staticInst->initiateAcc(gpuDynInst);
}

void
GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst)
{
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector="
            "%#x\n complete",
            cu->cu_id, simdId, wfSlotId, exec_mask);

    _staticInst->completeAcc(gpuDynInst);
}

/**
 * accessor methods for the attributes of
 * the underlying GPU static instruction
 */
bool
GPUDynInst::isALU() const
{
    return _staticInst->isALU();
}

bool
GPUDynInst::isBranch() const
{
    return _staticInst->isBranch();
}

bool
GPUDynInst::isCondBranch() const
{
    return _staticInst->isCondBranch();
}

bool
GPUDynInst::isNop() const
{
    return _staticInst->isNop();
}

bool
GPUDynInst::isEndOfKernel() const
{
    return _staticInst->isEndOfKernel();
}

bool
GPUDynInst::isKernelLaunch() const
{
    return _staticInst->isKernelLaunch();
}

bool
GPUDynInst::isSDWAInst() const
{
    return _staticInst->isSDWAInst();
}

bool
GPUDynInst::isDPPInst() const
{
    return _staticInst->isDPPInst();
}

bool
GPUDynInst::isReturn() const
{
    return _staticInst->isReturn();
}

bool
GPUDynInst::isUnconditionalJump() const
{
    return _staticInst->isUnconditionalJump();
}

bool
GPUDynInst::isSpecialOp() const
{
    return _staticInst->isSpecialOp();
}

bool
GPUDynInst::isWaitcnt() const
{
    return _staticInst->isWaitcnt();
}

bool
GPUDynInst::isBarrier() const
{
    return _staticInst->isBarrier();
}

bool
GPUDynInst::isMemSync() const
{
    return _staticInst->isMemSync();
}

bool
GPUDynInst::isMemRef() const
{
    return _staticInst->isMemRef();
}

bool
GPUDynInst::isFlat() const
{
    return _staticInst->isFlat();
}

bool
GPUDynInst::isLoad() const
{
    return _staticInst->isLoad();
}

bool
GPUDynInst::isStore() const
{
    return _staticInst->isStore();
}

bool
GPUDynInst::isAtomic() const
{
    return _staticInst->isAtomic();
}

bool
GPUDynInst::isAtomicNoRet() const
{
    return _staticInst->isAtomicNoRet();
}

bool
GPUDynInst::isAtomicRet() const
{
    return _staticInst->isAtomicRet();
}

bool
GPUDynInst::isVector() const
{
    return !_staticInst->isScalar();
}

bool
GPUDynInst::isScalar() const
{
    return _staticInst->isScalar();
}

bool
GPUDynInst::readsSCC() const
{
    return _staticInst->readsSCC();
}

bool
GPUDynInst::writesSCC() const
{
    return _staticInst->writesSCC();
}

bool
GPUDynInst::readsVCC() const
{
    return _staticInst->readsVCC();
}

bool
GPUDynInst::writesVCC() const
{
    return _staticInst->writesVCC();
}

bool
GPUDynInst::readsMode() const
{
    return _staticInst->readsMode();
}

bool
GPUDynInst::writesMode() const
{
    return _staticInst->writesMode();
}

bool
GPUDynInst::readsEXEC() const
{
    return _staticInst->readsEXEC();
}

bool
GPUDynInst::writesEXEC() const
{
    return _staticInst->writesEXEC();
}

bool
GPUDynInst::ignoreExec() const
{
    return _staticInst->ignoreExec();
}

bool
GPUDynInst::writesExecMask() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        return _staticInst->isDstOperand(i) &&
               _staticInst->isExecMaskRegister(i);
    }
    return false;
}

bool
GPUDynInst::readsExecMask() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        return _staticInst->isSrcOperand(i) &&
               _staticInst->isExecMaskRegister(i);
    }
    return false;
}

bool
GPUDynInst::writesFlatScratch() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isDstOperand(i)) {
            return _staticInst->isFlatScratchRegister(i);
        }
    }
    return false;
}

bool
GPUDynInst::readsFlatScratch() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isSrcOperand(i)) {
            return _staticInst->isFlatScratchRegister(i);
        }
    }
    return false;
}

bool
GPUDynInst::isAtomicAnd() const
{
    return _staticInst->isAtomicAnd();
}

bool
GPUDynInst::isAtomicOr() const
{
    return _staticInst->isAtomicOr();
}

bool
GPUDynInst::isAtomicXor() const
{
    return _staticInst->isAtomicXor();
}

bool
GPUDynInst::isAtomicCAS() const
{
    return _staticInst->isAtomicCAS();
}

bool
GPUDynInst::isAtomicExch() const
{
    return _staticInst->isAtomicExch();
}

bool
GPUDynInst::isAtomicAdd() const
{
    return _staticInst->isAtomicAdd();
}

bool
GPUDynInst::isAtomicSub() const
{
    return _staticInst->isAtomicSub();
}

bool
GPUDynInst::isAtomicInc() const
{
    return _staticInst->isAtomicInc();
}

bool
GPUDynInst::isAtomicDec() const
{
    return _staticInst->isAtomicDec();
}

bool
GPUDynInst::isAtomicMax() const
{
    return _staticInst->isAtomicMax();
}

bool
GPUDynInst::isAtomicMin() const
{
    return _staticInst->isAtomicMin();
}

bool
GPUDynInst::isArgLoad() const
{
    return _staticInst->isArgLoad();
}

bool
GPUDynInst::isGlobalMem() const
{
    return _staticInst->isGlobalMem();
}

bool
GPUDynInst::isLocalMem() const
{
    return _staticInst->isLocalMem();
}

bool
GPUDynInst::isArgSeg() const
{
    return _staticInst->isArgSeg();
}

bool
GPUDynInst::isGlobalSeg() const
{
    return _staticInst->isGlobalSeg();
}

bool
GPUDynInst::isGroupSeg() const
{
    return _staticInst->isGroupSeg();
}

bool
GPUDynInst::isKernArgSeg() const
{
    return _staticInst->isKernArgSeg();
}

bool
GPUDynInst::isPrivateSeg() const
{
    return _staticInst->isPrivateSeg();
}

bool
GPUDynInst::isReadOnlySeg() const
{
    return _staticInst->isReadOnlySeg();
}

bool
GPUDynInst::isSpillSeg() const
{
    return _staticInst->isSpillSeg();
}

bool
GPUDynInst::isGloballyCoherent() const
{
    return _staticInst->isGloballyCoherent();
}

bool
GPUDynInst::isSystemCoherent() const
{
    return _staticInst->isSystemCoherent();
}

bool
GPUDynInst::isF16() const
{
    return _staticInst->isF16();
}

bool
GPUDynInst::isF32() const
{
    return _staticInst->isF32();
}

bool
GPUDynInst::isF64() const
{
    return _staticInst->isF64();
}

bool
GPUDynInst::isFMA() const
{
    return _staticInst->isFMA();
}

bool
GPUDynInst::isMAC() const
{
    return _staticInst->isMAC();
}

bool
GPUDynInst::isMAD() const
{
    return _staticInst->isMAD();
}

void
GPUDynInst::doApertureCheck(const VectorMask &mask)
{
    assert(mask.any());
    // find the segment of the first active address, after
    // that we check that all other active addresses also
    // fall within the same APE
    for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
        if (mask[lane]) {
            if (computeUnit()->shader->isLdsApe(addr[lane])) {
                // group segment
                staticInstruction()->executed_as = Enums::SC_GROUP;
                break;
            } else if (computeUnit()->shader->isScratchApe(addr[lane])) {
                // private segment
                staticInstruction()->executed_as = Enums::SC_PRIVATE;
                break;
            } else if (computeUnit()->shader->isGpuVmApe(addr[lane])) {
                // we won't support GPUVM
                fatal("flat access is in GPUVM APE\n");
            } else if (bits(addr[lane], 63, 47) != 0x1FFFF &&
                       bits(addr[lane], 63, 47)) {
                // we are in the "hole", this is a memory violation
                fatal("flat access at addr %#x has a memory violation\n",
                      addr[lane]);
            } else {
                // global memory segment
                staticInstruction()->executed_as = Enums::SC_GLOBAL;
                break;
            }
        }
    }

    // we should have found the segment
    assert(executedAs() != Enums::SC_NONE);

    // flat accesses should not straddle multiple APEs so we
    // must check that all addresses fall within the same APE
    if (executedAs() == Enums::SC_GROUP) {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was LDS,
                // all the rest should be
                assert(computeUnit()->shader->isLdsApe(addr[lane]));
            }
        }
    } else if (executedAs() == Enums::SC_PRIVATE) {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was private,
                // all the rest should be
                assert(computeUnit()->shader->isScratchApe(addr[lane]));
            }
        }
    } else {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was global,
                // all the rest should be. because we don't have an
                // explicit range of the global segment, we just make
                // sure that the address falls in no other APE and that
                // it is not a memory violation
                assert(!computeUnit()->shader->isLdsApe(addr[lane]));
                assert(!computeUnit()->shader->isScratchApe(addr[lane]));
                assert(!computeUnit()->shader->isGpuVmApe(addr[lane]));
                assert(!(bits(addr[lane], 63, 47) != 0x1FFFF
                       && bits(addr[lane], 63, 47)));
            }
        }
    }
}

void
GPUDynInst::resolveFlatSegment(const VectorMask &mask)
{
    doApertureCheck(mask);

    // Now that we know the aperture, do the following:
    // 1. Transform the flat address to its segmented equivalent.
    // 2. Set the execUnitId based on the aperture check.
    // 3. Decrement any extra resources that were reserved. Other
    //    resources are released as normal, below.
    if (executedAs() == Enums::SC_GLOBAL) {
        // no transformation for global segment
        wavefront()->execUnitId = wavefront()->flatGmUnitId;
        if (isLoad()) {
            wavefront()->rdLmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrLmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->wrLmReqsInPipe--;
            wavefront()->rdLmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else if (executedAs() == Enums::SC_GROUP) {
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                // flat address calculation goes here.
                // addr[lane] = segmented address
                panic("Flat group memory operation is unimplemented!\n");
            }
        }
        wavefront()->execUnitId = wavefront()->flatLmUnitId;
        if (isLoad()) {
            wavefront()->rdGmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrGmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->rdGmReqsInPipe--;
            wavefront()->wrGmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else if (executedAs() == Enums::SC_PRIVATE) {
        /**
         * Flat instructions may resolve to the private segment (scratch),
         * which is backed by main memory and provides per-lane scratch
         * memory. Flat addressing uses apertures - registers that specify
         * the address range in the VA space where LDS/private memory is
         * mapped; their values are set by the kernel mode driver.
         * These apertures use addresses that are not used by x86 CPUs.
         * When the address of a Flat operation falls into one of the
         * apertures, the Flat operation is redirected to either LDS or
         * to the private memory segment.
         *
         * For private memory the SW runtime allocates some space in
         * the VA space for each AQL queue; the base address is stored
         * in scalar registers per the AMD GPU ABI. The amd_queue_t
         * scratch_backing_memory_location provides the base address in
         * memory for the queue's private segment. Various other fields
         * loaded into register state during kernel launch specify per-WF
         * and per-work-item offsets so that individual lanes may access
         * their private segment allocation.
         *
         * For more details about flat addressing see:
         * http://rocm-documentation.readthedocs.io/en/latest/
         * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
         *
         * https://github.com/ROCm-Developer-Tools/
         * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
         * #flat-addressing
         */

        uint32_t numSgprs = wavefront()->maxSgprs;
        uint32_t physSgprIdx =
            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
                                                               numSgprs - 3);
        uint32_t offset =
            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
        physSgprIdx =
            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
                                                               numSgprs - 4);
        uint32_t size =
            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                addr[lane] = addr[lane] + lane * size + offset +
                    wavefront()->computeUnit->shader->getHiddenPrivateBase() -
                    wavefront()->computeUnit->shader->getScratchBase();
            }
        }
        wavefront()->execUnitId = wavefront()->flatLmUnitId;
        if (isLoad()) {
            wavefront()->rdGmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrGmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->rdGmReqsInPipe--;
            wavefront()->wrGmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else {
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                panic("flat addr %#llx maps to bad segment %d\n",
                      addr[lane], executedAs());
            }
        }
    }
}

TheGpuISA::ScalarRegU32
GPUDynInst::srcLiteral() const
{
    return _staticInst->srcLiteral();
}

void
GPUDynInst::updateStats()
{
    if (_staticInst->isLocalMem()) {
        // access to LDS (shared) memory
        cu->dynamicLMemInstrCnt++;
    } else if (_staticInst->isFlat()) {
        cu->dynamicFlatMemInstrCnt++;
    } else {
        // access to global memory

        // update PageDivergence histogram
        int number_pages_touched = cu->pagesTouched.size();
        assert(number_pages_touched);
        cu->pageDivergenceDist.sample(number_pages_touched);

        std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;

        for (auto it : cu->pagesTouched) {
            // see if this page has been touched before. if not, this also
            // inserts the page into the table.
            ret = cu->pageAccesses
                .insert(ComputeUnit::pageDataStruct::value_type(it.first,
                        std::make_pair(1, it.second)));

            // if yes, then update the stats
            if (!ret.second) {
                ret.first->second.first++;
                ret.first->second.second += it.second;
            }
        }

        cu->pagesTouched.clear();

        // total number of memory instructions (dynamic)
        // Atomics are counted as a single memory instruction.
        // this is # memory instructions per wavefront, not per work-item
        cu->dynamicGMemInstrCnt++;
    }
}

void
GPUDynInst::profileRoundTripTime(Tick currentTime, int hopId)
{
    // Only take the first measurement in the case of coalescing
    if (roundTripTime.size() > hopId)
        return;

    roundTripTime.push_back(currentTime);
}

void
GPUDynInst::profileLineAddressTime(Addr addr, Tick currentTime, int hopId)
{
    if (lineAddressTime.count(addr)) {
        if (lineAddressTime[addr].size() > hopId) {
            return;
        }

        lineAddressTime[addr].push_back(currentTime);
    } else if (hopId == 0) {
        auto addressTimeVec = std::vector<Tick> { currentTime };
        lineAddressTime.insert(std::make_pair(addr, addressTimeVec));
    }
}