Files
gem5/src/gpu-compute/gpu_dyn_inst.hh
Daniel R. Carvalho 974a47dfb9 misc: Adopt the gem5 namespace
Apply the gem5 namespace to the codebase.

Some anonymous namespaces could theoretically be removed,
but since this change's main goal was to keep conflicts
to a minimum, it was decided not to significantly modify
the general shape of the files.

A few missing comments of the form "// namespace X" that
occurred before the newly added "} // namespace gem5"
have been added for consistency.

std out should not be included in the gem5 namespace, so
they weren't.

ProtoMessage has not been included in the gem5 namespace,
since I'm not familiar with how proto works.

Regarding the SystemC files, although they belong to gem5,
they actually perform integration between gem5 and SystemC;
therefore, it deserved its own separate namespace.

Files that are automatically generated have been included
in the gem5 namespace.

The .isa files currently are limited to a single namespace.
This limitation should later be removed to make it easier
to accommodate a better API.

Regarding the files in util, gem5:: was prepended where
suitable. Notice that this patch was tested as much as
possible given that most of these were already not
previously compiling.

Change-Id: Ia53d404ec79c46edaa98f654e23bc3b0e179fe2d
Signed-off-by: Daniel R. Carvalho <odanrc@yahoo.com.br>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/46323
Maintainer: Bobby R. Bruce <bbruce@ucdavis.edu>
Reviewed-by: Bobby R. Bruce <bbruce@ucdavis.edu>
Reviewed-by: Matthew Poremba <matthew.poremba@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
2021-07-01 19:08:24 +00:00

498 lines
15 KiB
C++

/*
* Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__
#include <cstdint>
#include <memory>
#include <string>
#include "base/amo.hh"
#include "base/logging.hh"
#include "base/trace.hh"
#include "debug/GPUMem.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"
#include "gpu-compute/operand_info.hh"
namespace gem5
{
class GPUStaticInst;
/**
 * Atomic compare-and-swap functor. Compares the value at the target
 * location with the compare value c and, if they are equal, replaces
 * it with the swap value s; otherwise the location is left unchanged.
 * Success/failure counts are recorded in the compute unit's stats.
 */
template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c; // compare value
    T s; // swap value
    // used to record CAS success/failure statistics
    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
        : c(_c), s(_s), computeUnit(compute_unit) { }

    /**
     * Apply the CAS to the value pointed to by b, updating the CU's
     * numCASOps / numFailedCASOps counters accordingly.
     */
    void
    execute(T *b) override
    {
        computeUnit->stats.numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->stats.numFailedCASOps++;
        }
    }

    // polymorphic copy, required by the AtomicOpFunctor interface
    AtomicOpFunctor* clone() override
    {
        return new AtomicOpCAS(c, s, computeUnit);
    }
};
/**
 * Describes a single register operand of an instruction: its slot in
 * the instruction's operand list, its size, and the virtual/physical
 * register indices backing it.
 */
class RegisterOperandInfo
{
  public:
    RegisterOperandInfo() = delete;

    RegisterOperandInfo(int op_idx, int num_dwords,
                        const std::vector<int> &virt_indices,
                        const std::vector<int> &phys_indices)
        : position(op_idx), sizeInDWORDs(num_dwords),
          virtRegIndices(virt_indices), physRegIndices(phys_indices)
    {
    }

    /**
     * Number of registers required to store this operand.
     */
    int
    numRegisters() const
    {
        return sizeInDWORDs / TheGpuISA::RegSizeDWords;
    }

    /**
     * Position of this operand within its instruction's operand list.
     */
    int
    operandIdx() const
    {
        return position;
    }

    /**
     * Virtual register index backing the operand. We typically only
     * need the first virtual register for the operand regardless of
     * its size, so reg_num defaults to 0.
     */
    int
    virtIdx(int reg_num=0) const
    {
        return virtRegIndices.at(reg_num);
    }

  private:
    // index of this operand within its parent instruction's operand list
    const int position;
    // size of this operand in DWORDs
    const int sizeInDWORDs;
    // virtual register indices backing this operand
    const std::vector<int> virtRegIndices;
    // physical register indices backing this operand
    const std::vector<int> physRegIndices;
};
/**
 * A GPUDynInst is one dynamic execution of a GPU static instruction by
 * a wavefront. It carries the per-execution state (lane addresses, data
 * buffers, execution mask, WF/CU/kernel IDs, and per-lane memory request
 * status) the instruction needs as it flows through the compute unit's
 * pipelines and the memory system.
 */
class GPUDynInst : public GPUExecContext
{
public:
GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
uint64_t instSeqNum);
~GPUDynInst();
// execute this dynamic instance of the instruction
void execute(GPUDynInstPtr gpuDynInst);
// read-only views of the operand lists, split by vector/scalar
// and source/destination
const std::vector<OperandInfo>& srcVecRegOperands() const;
const std::vector<OperandInfo>& dstVecRegOperands() const;
const std::vector<OperandInfo>& srcScalarRegOperands() const;
const std::vector<OperandInfo>& dstScalarRegOperands() const;
// operand counts and sizes (sizes in registers or DWORDs, as named)
int numSrcRegOperands();
int numDstRegOperands();
int numSrcVecRegOperands() const;
int numDstVecRegOperands() const;
int maxSrcVecRegOperandSize();
int numSrcVecDWords();
int numDstVecDWords();
int numSrcScalarRegOperands() const;
int numDstScalarRegOperands() const;
int maxSrcScalarRegOperandSize();
int numSrcScalarDWords();
int numDstScalarDWords();
int maxOperandSize();
int getNumOperands() const;
// whether the instruction has scalar (SGPR) / vector (VGPR)
// source or destination register operands
bool hasSourceSgpr() const;
bool hasDestinationSgpr() const;
bool hasSourceVgpr() const;
bool hasDestinationVgpr() const;
// returns true if the string "opcodeStr" is found in the
// opcode of the instruction
bool isOpcode(const std::string& opcodeStr) const;
bool isOpcode(const std::string& opcodeStr,
const std::string& extStr) const;
// disassembly of the underlying instruction
const std::string &disassemble() const;
// globally unique sequence number assigned at construction
InstSeqNum seqNum() const;
// the storage class (segment) this instruction executed as
enums::StorageClassType executedAs();
// virtual address for scalar memory operations
Addr scalarAddr;
// virtual addresses for vector memory operations (one per lane)
std::vector<Addr> addr;
Addr pAddr;
// vector data to get written
uint8_t *d_data;
// scalar data to be transferred
uint8_t *scalar_data;
// Additional data (for atomics)
uint8_t *a_data;
// Additional data (for atomics)
uint8_t *x_data;
// The execution mask
VectorMask exec_mask;
// SIMD where the WF of the memory instruction has been mapped to
int simdId;
// unique id of the WF where the memory instruction belongs to
int wfDynId;
// The kernel id of the requesting wf
int kern_id;
// The CU id of the requesting wf
int cu_id;
// The workgroup id of the requesting wf
int wg_id;
// HW slot id where the WF is mapped to inside a SIMD unit
int wfSlotId;
// execution pipeline id where the memory instruction has been scheduled
int execUnitId;
// The execution time of this operation
Tick time;
// The latency of this operation
WaitClass latency;
// Initiate the specified memory operation, by creating a
// memory request and sending it off to the memory system.
void initiateAcc(GPUDynInstPtr gpuDynInst);
// Complete the specified memory operation, by writing
// value back to the RF in the case of a load or atomic
// return or, in the case of a store, we do nothing
void completeAcc(GPUDynInstPtr gpuDynInst);
void updateStats();
// the static instruction this dynamic instance executes
GPUStaticInst* staticInstruction() { return _staticInst; }
TheGpuISA::ScalarRegU32 srcLiteral() const;
// classification predicates describing the kind and properties
// of the underlying instruction
bool isALU() const;
bool isBranch() const;
bool isCondBranch() const;
bool isNop() const;
bool isReturn() const;
bool isEndOfKernel() const;
bool isKernelLaunch() const;
bool isSDWAInst() const;
bool isDPPInst() const;
bool isUnconditionalJump() const;
bool isSpecialOp() const;
bool isWaitcnt() const;
bool isSleep() const;
bool isBarrier() const;
bool isMemSync() const;
bool isMemRef() const;
bool isFlat() const;
bool isLoad() const;
bool isStore() const;
bool isAtomic() const;
bool isAtomicNoRet() const;
bool isAtomicRet() const;
bool isScalar() const;
bool isVector() const;
// special-register read/write predicates (SCC, VCC, EXEC, MODE,
// flat-scratch, execution mask)
bool readsSCC() const;
bool writesSCC() const;
bool readsVCC() const;
bool writesVCC() const;
bool readsExec() const;
bool writesExec() const;
bool readsMode() const;
bool writesMode() const;
bool ignoreExec() const;
bool readsFlatScratch() const;
bool writesFlatScratch() const;
bool readsExecMask() const;
bool writesExecMask() const;
// atomic-operation kind predicates (used by makeAtomicOpFunctor)
bool isAtomicAnd() const;
bool isAtomicOr() const;
bool isAtomicXor() const;
bool isAtomicCAS() const;
bool isAtomicExch() const;
bool isAtomicAdd() const;
bool isAtomicSub() const;
bool isAtomicInc() const;
bool isAtomicDec() const;
bool isAtomicMax() const;
bool isAtomicMin() const;
bool isArgLoad() const;
// memory-scope predicates
bool isGlobalMem() const;
bool isLocalMem() const;
// memory-segment predicates
bool isArgSeg() const;
bool isGlobalSeg() const;
bool isGroupSeg() const;
bool isKernArgSeg() const;
bool isPrivateSeg() const;
bool isReadOnlySeg() const;
bool isSpillSeg() const;
// coherence-scope predicates (see setRequestFlags)
bool isGloballyCoherent() const;
bool isSystemCoherent() const;
// floating-point width / fused-op predicates
bool isF16() const;
bool isF32() const;
bool isF64() const;
bool isFMA() const;
bool isMAC() const;
bool isMAD() const;
// for FLAT memory ops. check the segment address
// against the APE registers to see if it falls
// within one of the APE ranges for LDS/SCRATCH/GPUVM.
// if it does not fall into one of the three APEs, it
// will be a regular global access.
void doApertureCheck(const VectorMask &mask);
// Function to resolve a flat accesses during execution stage.
void resolveFlatSegment(const VectorMask &mask);
// build the atomic functor matching this instruction's atomic kind;
// *reg0 provides the operand value (the compare value for CAS) and
// *reg1 the swap value (used only for CAS). fatal()s if the
// instruction is not a recognized atomic.
template<typename c0> AtomicOpFunctorPtr
makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
{
if (isAtomicAnd()) {
return std::make_unique<AtomicOpAnd<c0>>(*reg0);
} else if (isAtomicOr()) {
return std::make_unique<AtomicOpOr<c0>>(*reg0);
} else if (isAtomicXor()) {
return std::make_unique<AtomicOpXor<c0>>(*reg0);
} else if (isAtomicCAS()) {
return std::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
} else if (isAtomicExch()) {
return std::make_unique<AtomicOpExch<c0>>(*reg0);
} else if (isAtomicAdd()) {
return std::make_unique<AtomicOpAdd<c0>>(*reg0);
} else if (isAtomicSub()) {
return std::make_unique<AtomicOpSub<c0>>(*reg0);
} else if (isAtomicInc()) {
return std::make_unique<AtomicOpInc<c0>>();
} else if (isAtomicDec()) {
return std::make_unique<AtomicOpDec<c0>>();
} else if (isAtomicMax()) {
return std::make_unique<AtomicOpMax<c0>>(*reg0);
} else if (isAtomicMin()) {
return std::make_unique<AtomicOpMin<c0>>(*reg0);
} else {
fatal("Unrecognized atomic operation");
}
}
// set the coherence and atomic flags on a memory request based on
// this instruction's coherence scope and atomic kind
void
setRequestFlags(RequestPtr req) const
{
if (isGloballyCoherent()) {
req->setCacheCoherenceFlags(Request::GLC_BIT);
}
if (isSystemCoherent()) {
req->setCacheCoherenceFlags(Request::SLC_BIT);
}
if (isAtomicRet()) {
req->setFlags(Request::ATOMIC_RETURN_OP);
} else if (isAtomicNoRet()) {
req->setFlags(Request::ATOMIC_NO_RETURN_OP);
}
if (isMemSync()) {
// the path for kernel launch and kernel end is different
// from non-kernel mem sync.
assert(!isKernelLaunch());
assert(!isEndOfKernel());
// must be wbinv inst if not kernel launch/end
req->setCacheCoherenceFlags(Request::INV_L1);
}
}
// reset the number of pending memory requests for all lanes
void
resetEntireStatusVector()
{
assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg);
for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
resetStatusVector(lane);
}
}
// reset the number of pending memory requests for the inputted lane
void
resetStatusVector(int lane)
{
setStatusVector(lane, 0);
}
// set the number of pending memory requests for the inputted lane
void
setStatusVector(int lane, int newVal)
{
// currently we can have up to 2 memory requests per lane (if the
// lane's request goes across multiple cache lines)
assert((newVal >= 0) && (newVal <= 2));
statusVector[lane] = newVal;
}
// subtracts the number of pending memory requests for the inputted lane
// by 1
void
decrementStatusVector(int lane)
{
// this lane may have multiple requests, so only subtract one for
// this request
assert(statusVector[lane] >= 1);
statusVector[lane]--;
}
// return the current number of pending memory requests for the inputted
// lane
int
getLaneStatus(int lane) const
{
return statusVector[lane];
}
// returns true if all memory requests from all lanes have been received,
// else returns false
bool
allLanesZero() const
{
// local variables
bool allZero = true;
// iterate over all lanes, checking the number of pending memory
// requests they have
for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
// if any lane still has pending requests, return false
if (statusVector[lane] > 0) {
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending "
"request(s) for %#x\n", cu_id, simdId, wfSlotId, lane,
statusVector[lane], addr[lane]);
allZero = false;
}
}
if (allZero) {
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending"
" requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]);
}
return allZero;
}
// returns a string representing the current state of the statusVector
// (one digit per lane, concatenated without separators)
std::string
printStatusVector() const
{
std::string statusVec_str = "[";
// iterate over all lanes, adding the current number of pending
// requests for this lane to the string
for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
statusVec_str += std::to_string(statusVector[lane]);
}
statusVec_str += "]";
return statusVec_str;
}
// Map returned packets and the addresses they satisfy with which lane they
// were requested from
typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
StatusVector memStatusVector;
// Track the status of memory requests per lane, an int per lane to allow
// unaligned accesses
std::vector<int> statusVector;
// for ld_v# or st_v#
std::vector<int> tlbHitLevel;
// for misaligned scalar ops we track the number
// of outstanding reqs here
int numScalarReqs;
// access-time bookkeeping for profiling memory requests
Tick getAccessTime() const { return accessTime; }
void setAccessTime(Tick currentTime) { accessTime = currentTime; }
// record when this instruction reached hop "hopId" on its round trip
void profileRoundTripTime(Tick currentTime, int hopId);
std::vector<Tick> getRoundTripTime() const { return roundTripTime; }
// record per-cache-line arrival times at hop "hopId"
void profileLineAddressTime(Addr addr, Tick currentTime, int hopId);
const std::map<Addr, std::vector<Tick>>& getLineAddressTime() const
{ return lineAddressTime; }
// inst used to save/restore a wavefront context
bool isSaveRestore;
private:
GPUStaticInst *_staticInst;
const InstSeqNum _seqNum;
int maxSrcVecRegOpSize;
int maxSrcScalarRegOpSize;
// the time the request was started (-1 means not yet set)
Tick accessTime = -1;
// hold the tick when the instruction arrives at certain hop points
// on its way to main memory
std::vector<Tick> roundTripTime;
// hold each cache block address for the instruction and a vector
// to hold the tick when the block arrives at certain hop points
std::map<Addr, std::vector<Tick>> lineAddressTime;
};
} // namespace gem5
#endif // __GPU_DYN_INST_HH__