gem5/src/gpu-compute/gpu_dyn_inst.hh
Matthew Poremba 9f4d334644 gpu-compute: Update tokens for flat global/scratch
Memory instructions acquire coalescer tokens in the schedule stage.
Currently this is only done for buffer and flat instructions, but not for
flat global or flat scratch. This change acquires tokens for flat global
and flat scratch instructions as well, which provides back-pressure to
the CUs and helps avoid deadlocks in Ruby.
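
A minimal sketch of the schedule-stage check this implies is shown
below. It assumes gem5's TokenManager interface (haveTokens() /
acquireTokens()) and a ComputeUnit reference named computeUnit; the
surrounding ScheduleStage logic is simplified, not the exact code:

    // Before dispatching a memory instruction to the memory pipeline,
    // check for a free coalescer token and acquire it. needsToken()
    // now also covers flat global and flat scratch instructions.
    if (gpuDynInst->needsToken()) {
        if (!computeUnit.getTokenManager()->haveTokens(1)) {
            return false; // stall: back-pressure toward the CU
        }
        computeUnit.getTokenManager()->acquireTokens(1);
    }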

The change also handles returning tokens for buffer, flat global, and
flat scratch instructions. This was previously only done for normal flat
instructions, leading to deadlocks in some applications once the tokens
were exhausted.

To simplify the logic, this change adds a needsToken() method to
GPUDynInst, which returns true if the instruction is a buffer instruction
or belongs to any flat segment.
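
A plausible implementation in terms of the predicate methods GPUDynInst
already exposes (a sketch, not necessarily the exact body added by this
change):

    // Coalescer tokens are needed by buffer (global memory) instructions
    // and by every flat variant: flat, flat global, and flat scratch.
    bool
    GPUDynInst::needsToken() const
    {
        return isGlobalMem() || isFlat() || isFlatGlobal()
            || isFlatScratch();
    }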

The waitcnts were also incorrect for flat global and flat scratch. The
vmem and exp counts should always be decremented for stores, and only
normal flat instructions should decrement the lgkm count. Currently
vmem/exp are not decremented for flat global and flat scratch, which can
lead to deadlock. This change fixes that by always decrementing vmem/exp
and decrementing lgkm only for normal flat instructions.
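
A sketch of the intended bookkeeping on the store path follows; the
wf->dec*Cnt() helpers are illustrative names, not necessarily the exact
gem5 API:

    // vmem and exp are decremented for every vector memory store,
    // whether buffer, flat, flat global, or flat scratch; lgkm is
    // decremented only for normal flat instructions.
    if (gpuDynInst->isStore()) {
        wf->decVMemCnt();
        wf->decExpCnt();
        if (gpuDynInst->isFlat() && !gpuDynInst->isFlatGlobal()
            && !gpuDynInst->isFlatScratch()) {
            wf->decLGKMCnt();
        }
    }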

Change-Id: I673f4ac6121e4b5a5e8491bc9130c6d825d95fc5
2023-10-10 09:48:16 -05:00

/*
* Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__
#include <cstdint>
#include <memory>
#include <string>
#include "base/amo.hh"
#include "base/logging.hh"
#include "base/trace.hh"
#include "debug/GPUMem.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"
#include "gpu-compute/operand_info.hh"
namespace gem5
{
class GPUStaticInst;
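// Atomic compare-and-swap functor: if the value in memory equals c it is
// replaced by s; the owning ComputeUnit's CAS statistics are updated on
// every attempt.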
template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
public:
T c;
T s;
ComputeUnit *computeUnit;
AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
: c(_c), s(_s), computeUnit(compute_unit) { }
void
execute(T *b)
{
computeUnit->stats.numCASOps++;
if (*b == c) {
*b = s;
} else {
computeUnit->stats.numFailedCASOps++;
}
}
AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
};
class RegisterOperandInfo
{
public:
RegisterOperandInfo() = delete;
RegisterOperandInfo(int op_idx, int num_dwords,
const std::vector<int> &virt_indices,
const std::vector<int> &phys_indices)
: opIdx(op_idx), numDWORDs(num_dwords), virtIndices(virt_indices),
physIndices(phys_indices)
{
}
/**
* The number of registers required to store this operand.
*/
int numRegisters() const { return numDWORDs / TheGpuISA::RegSizeDWords; }
int operandIdx() const { return opIdx; }
/**
* We typically only need the first virtual register for the operand
* regardless of its size.
*/
int virtIdx(int reg_num=0) const { return virtIndices.at(reg_num); }
private:
/**
* Index of this operand within the set of its parent instruction's
* operand list.
*/
const int opIdx;
/**
* Size of this operand in DWORDs.
*/
const int numDWORDs;
const std::vector<int> virtIndices;
const std::vector<int> physIndices;
};
class GPUDynInst : public GPUExecContext
{
public:
GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
uint64_t instSeqNum);
~GPUDynInst();
void execute(GPUDynInstPtr gpuDynInst);
const std::vector<OperandInfo>& srcVecRegOperands() const;
const std::vector<OperandInfo>& dstVecRegOperands() const;
const std::vector<OperandInfo>& srcScalarRegOperands() const;
const std::vector<OperandInfo>& dstScalarRegOperands() const;
int numSrcRegOperands();
int numDstRegOperands();
int numSrcVecRegOperands() const;
int numDstVecRegOperands() const;
int maxSrcVecRegOperandSize();
int numSrcVecDWords();
int numDstVecDWords();
int numSrcScalarRegOperands() const;
int numDstScalarRegOperands() const;
int maxSrcScalarRegOperandSize();
int numSrcScalarDWords();
int numDstScalarDWords();
int maxOperandSize();
int getNumOperands() const;
bool hasSourceSgpr() const;
bool hasDestinationSgpr() const;
bool hasSourceVgpr() const;
bool hasDestinationVgpr() const;
// returns true if the string "opcodeStr" is found in the
// opcode of the instruction
bool isOpcode(const std::string& opcodeStr) const;
bool isOpcode(const std::string& opcodeStr,
const std::string& extStr) const;
const std::string &disassemble() const;
InstSeqNum seqNum() const;
Addr pc();
void pc(Addr _pc);
enums::StorageClassType executedAs();
// virtual address for scalar memory operations
Addr scalarAddr;
// virtual addresses for vector memory operations
std::vector<Addr> addr;
Addr pAddr;
// vector data to be written
uint8_t *d_data;
// scalar data to be transferred
uint8_t *scalar_data;
// Additional data (for atomics)
uint8_t *a_data;
// Additional data (for atomics)
uint8_t *x_data;
// The execution mask
VectorMask exec_mask;
// SIMD unit to which the WF of the memory instruction is mapped
int simdId;
// unique id of the WF to which the memory instruction belongs
int wfDynId;
// The kernel id of the requesting wf
int kern_id;
// The CU id of the requesting wf
int cu_id;
// The workgroup id of the requesting wf
int wg_id;
// HW slot id to which the WF is mapped inside a SIMD unit
int wfSlotId;
// execution pipeline id where the memory instruction has been scheduled
int execUnitId;
// The execution time of this operation
Tick time;
// The latency of this operation
WaitClass latency;
// Initiate the specified memory operation, by creating a
// memory request and sending it off to the memory system.
void initiateAcc(GPUDynInstPtr gpuDynInst);
// Complete the specified memory operation by writing the
// value back to the RF in the case of a load or atomic
// return; in the case of a store, nothing needs to be done
void completeAcc(GPUDynInstPtr gpuDynInst);
void updateStats();
GPUStaticInst* staticInstruction() { return _staticInst; }
TheGpuISA::ScalarRegU32 srcLiteral() const;
bool isALU() const;
bool isBranch() const;
bool isCondBranch() const;
bool isNop() const;
bool isReturn() const;
bool isEndOfKernel() const;
bool isKernelLaunch() const;
bool isSDWAInst() const;
bool isDPPInst() const;
bool isUnconditionalJump() const;
bool isSpecialOp() const;
bool isWaitcnt() const;
bool isSleep() const;
bool isBarrier() const;
bool isMemSync() const;
bool isMemRef() const;
bool isFlat() const;
bool isFlatGlobal() const;
bool isFlatScratch() const;
bool isLoad() const;
bool isStore() const;
bool isAtomic() const;
bool isAtomicNoRet() const;
bool isAtomicRet() const;
bool isScalar() const;
bool isVector() const;
bool readsSCC() const;
bool writesSCC() const;
bool readsVCC() const;
bool writesVCC() const;
bool readsExec() const;
bool writesExec() const;
bool readsMode() const;
bool writesMode() const;
bool ignoreExec() const;
bool readsFlatScratch() const;
bool writesFlatScratch() const;
bool readsExecMask() const;
bool writesExecMask() const;
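// returns true if the instruction requires a coalescer token, i.e., it
// is a buffer instruction or any flat-segment instruction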
bool needsToken() const;
bool isAtomicAnd() const;
bool isAtomicOr() const;
bool isAtomicXor() const;
bool isAtomicCAS() const;
bool isAtomicExch() const;
bool isAtomicAdd() const;
bool isAtomicSub() const;
bool isAtomicInc() const;
bool isAtomicDec() const;
bool isAtomicMax() const;
bool isAtomicMin() const;
bool isArgLoad() const;
bool isGlobalMem() const;
bool isLocalMem() const;
bool isArgSeg() const;
bool isGlobalSeg() const;
bool isGroupSeg() const;
bool isKernArgSeg() const;
bool isPrivateSeg() const;
bool isReadOnlySeg() const;
bool isSpillSeg() const;
bool isGloballyCoherent() const;
bool isSystemCoherent() const;
bool isF16() const;
bool isF32() const;
bool isF64() const;
bool isFMA() const;
bool isMAC() const;
bool isMAD() const;
// For FLAT memory ops, check the segment address
// against the APE registers to see if it falls
// within one of the APE ranges for LDS/SCRATCH/GPUVM.
// If it does not fall into one of the three APEs, it
// is treated as a regular global access.
void doApertureCheck(const VectorMask &mask);
// Resolve flat accesses during the execution stage.
void resolveFlatSegment(const VectorMask &mask);
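// Build the AtomicOpFunctor implementing this instruction's atomic
// operation on values of type c0; reg1 is only used by CAS, where reg0
// is the compare value and reg1 the swap value.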
template<typename c0> AtomicOpFunctorPtr
makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
{
if (isAtomicAnd()) {
return std::make_unique<AtomicOpAnd<c0>>(*reg0);
} else if (isAtomicOr()) {
return std::make_unique<AtomicOpOr<c0>>(*reg0);
} else if (isAtomicXor()) {
return std::make_unique<AtomicOpXor<c0>>(*reg0);
} else if (isAtomicCAS()) {
return std::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
} else if (isAtomicExch()) {
return std::make_unique<AtomicOpExch<c0>>(*reg0);
} else if (isAtomicAdd()) {
return std::make_unique<AtomicOpAdd<c0>>(*reg0);
} else if (isAtomicSub()) {
return std::make_unique<AtomicOpSub<c0>>(*reg0);
} else if (isAtomicInc()) {
return std::make_unique<AtomicOpInc<c0>>();
} else if (isAtomicDec()) {
return std::make_unique<AtomicOpDec<c0>>();
} else if (isAtomicMax()) {
return std::make_unique<AtomicOpMax<c0>>(*reg0);
} else if (isAtomicMin()) {
return std::make_unique<AtomicOpMin<c0>>(*reg0);
} else {
fatal("Unrecognized atomic operation");
}
}
void
setRequestFlags(RequestPtr req) const
{
if (isGloballyCoherent()) {
req->setCacheCoherenceFlags(Request::GLC_BIT);
}
if (isSystemCoherent()) {
req->setCacheCoherenceFlags(Request::SLC_BIT);
}
if (isAtomicRet()) {
req->setFlags(Request::ATOMIC_RETURN_OP);
} else if (isAtomicNoRet()) {
req->setFlags(Request::ATOMIC_NO_RETURN_OP);
}
if (isMemSync()) {
// the path for kernel launch and kernel end is different
// from non-kernel mem sync.
assert(!isKernelLaunch());
assert(!isEndOfKernel());
// must be wbinv inst if not kernel launch/end
req->setCacheCoherenceFlags(Request::INV_L1);
}
}
// reset the number of pending memory requests for all lanes
void
resetEntireStatusVector()
{
assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg);
for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
resetStatusVector(lane);
}
}
// reset the number of pending memory requests for the inputted lane
void
resetStatusVector(int lane)
{
setStatusVector(lane, 0);
}
// set the number of pending memory requests for the inputted lane
void
setStatusVector(int lane, int newVal)
{
// currently we can have up to 2 memory requests per lane (if the
// lane's request goes across multiple cache lines)
assert((newVal >= 0) && (newVal <= 2));
statusVector[lane] = newVal;
}
// subtracts the number of pending memory requests for the inputted lane
// by 1
void
decrementStatusVector(int lane)
{
// this lane may have multiple requests, so only subtract one for
// this request
assert(statusVector[lane] >= 1);
statusVector[lane]--;
}
// return the current number of pending memory requests for the inputted
// lane
int
getLaneStatus(int lane) const
{
return statusVector[lane];
}
// returns true if all memory requests from all lanes have been received,
// else returns false
bool
allLanesZero() const
{
// local variables
bool allZero = true;
// iterate over all lanes, checking the number of pending memory
// requests they have
for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
// if any lane still has pending requests, return false
if (statusVector[lane] > 0) {
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending "
"request(s) for %#x\n", cu_id, simdId, wfSlotId, lane,
statusVector[lane], addr[lane]);
allZero = false;
}
}
if (allZero) {
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending"
" requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]);
}
return allZero;
}
// returns a string representing the current state of the statusVector
std::string
printStatusVector() const
{
std::string statusVec_str = "[";
// iterate over all lanes, adding the current number of pending
// requests for this lane to the string
for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
statusVec_str += std::to_string(statusVector[lane]);
}
statusVec_str += "]";
return statusVec_str;
}
// Map each address satisfied by returned packets to the lanes that
// requested it
typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
StatusVector memStatusVector;
// Track the number of pending memory requests per lane; an int per lane
// allows for unaligned accesses that span cache lines
std::vector<int> statusVector;
// for ld_v# or st_v#
std::vector<int> tlbHitLevel;
// for misaligned scalar ops we track the number
// of outstanding reqs here
int numScalarReqs;
Tick getAccessTime() const { return accessTime; }
void setAccessTime(Tick currentTime) { accessTime = currentTime; }
void profileRoundTripTime(Tick currentTime, int hopId);
std::vector<Tick> getRoundTripTime() const { return roundTripTime; }
void profileLineAddressTime(Addr addr, Tick currentTime, int hopId);
const std::map<Addr, std::vector<Tick>>& getLineAddressTime() const
{ return lineAddressTime; }
// inst used to save/restore a wavefront context
bool isSaveRestore;
bool isSystemReq() { return systemReq; }
void setSystemReq() { systemReq = true; }
private:
GPUStaticInst *_staticInst;
const InstSeqNum _seqNum;
int maxSrcVecRegOpSize;
int maxSrcScalarRegOpSize;
bool systemReq = false;
// the time the request was started
Tick accessTime = -1;
// hold the tick when the instruction arrives at certain hop points
// on its way to main memory
std::vector<Tick> roundTripTime;
// hold each cache block address for the instruction and a vector
// to hold the tick when the block arrives at certain hop points
std::map<Addr, std::vector<Tick>> lineAddressTime;
};
} // namespace gem5
#endif // __GPU_DYN_INST_HH__