The amdgpu driver supports reading and writing scalar and vector memory
addresses that reside in system memory. This is commonly used for blit
kernels that perform host-to-device or device-to-host copies using GPU
load/store instructions, and it is implemented here using the system hub
device added in a prior changeset. Memory packets translated by the
scalar or vector (VMEM) TLBs have the corresponding system-request field
set from the PTE in the TLB, which the compute unit can then use to
determine whether a request targets system memory.

Another important change is to return global memory tokens for system
requests. Since these requests do not flow through the GPU coalescer,
where the token would normally be returned, the token is instead
returned as soon as the request is known to be a system request.

Change-Id: I35030e0b3698f10c63a397f96b81267271e3130e
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/57711
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
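To illustrate the token handling described above, here is a minimal
sketch of the issue-side logic. The function and the token call site are
hypothetical (the real change lives in the compute unit, not in this
header); only isSystemReq() below is part of this file:

    void
    issueMemRequest(GPUDynInstPtr gpuDynInst)
    {
        if (gpuDynInst->isSystemReq()) {
            // System requests bypass the GPU coalescer, so release the
            // global memory token here instead of waiting for the
            // coalescer to return it. Hypothetical call site.
            cu->getTokenManager()->recvTokens(1);
        }
        // ... send the request on to the system hub / memory system ...
    }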
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "base/amo.hh"
#include "base/logging.hh"
#include "base/trace.hh"
#include "debug/GPUMem.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"
#include "gpu-compute/operand_info.hh"

namespace gem5
{

class GPUStaticInst;

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->stats.numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->stats.numFailedCASOps++;
        }
    }
    AtomicOpFunctor* clone() { return new AtomicOpCAS(c, s, computeUnit); }
};
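// A minimal usage sketch for the functor above (hypothetical values,
// not part of this header): the swap only happens when the compare
// value matches, and failed compare-and-swaps are counted separately
// in the CU stats.
//
//     uint32_t mem = 5;
//     AtomicOpCAS<uint32_t> cas(5, 9, compute_unit);
//     cas.execute(&mem);  // mem == 9; numCASOps incremented
//     cas.execute(&mem);  // mem still 9 (9 != 5); numFailedCASOps incremented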

class RegisterOperandInfo
{
  public:
    RegisterOperandInfo() = delete;
    RegisterOperandInfo(int op_idx, int num_dwords,
                        const std::vector<int> &virt_indices,
                        const std::vector<int> &phys_indices)
        : opIdx(op_idx), numDWORDs(num_dwords), virtIndices(virt_indices),
          physIndices(phys_indices)
    {
    }

    /**
     * The number of registers required to store this operand.
     */
    int numRegisters() const { return numDWORDs / TheGpuISA::RegSizeDWords; }
    int operandIdx() const { return opIdx; }
    /**
     * We typically only need the first virtual register for the operand,
     * regardless of its size.
     */
    int virtIdx(int reg_num=0) const { return virtIndices.at(reg_num); }

  private:
    /**
     * Index of this operand within its parent instruction's operand list.
     */
    const int opIdx;
    /**
     * Size of this operand in DWORDs.
     */
    const int numDWORDs;
    const std::vector<int> virtIndices;
    const std::vector<int> physIndices;
};

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);

    const std::vector<OperandInfo>& srcVecRegOperands() const;
    const std::vector<OperandInfo>& dstVecRegOperands() const;
    const std::vector<OperandInfo>& srcScalarRegOperands() const;
    const std::vector<OperandInfo>& dstScalarRegOperands() const;

    int numSrcRegOperands();
    int numDstRegOperands();

    int numSrcVecRegOperands() const;
    int numDstVecRegOperands() const;
    int maxSrcVecRegOperandSize();
    int numSrcVecDWords();
    int numDstVecDWords();

    int numSrcScalarRegOperands() const;
    int numDstScalarRegOperands() const;
    int maxSrcScalarRegOperandSize();
    int numSrcScalarDWords();
    int numDstScalarDWords();

    int maxOperandSize();

    int getNumOperands() const;

    bool hasSourceSgpr() const;
    bool hasDestinationSgpr() const;
    bool hasSourceVgpr() const;
    bool hasDestinationVgpr() const;

    // returns true if the string "opcodeStr" is found in the
    // opcode of the instruction
    bool isOpcode(const std::string& opcodeStr) const;
    bool isOpcode(const std::string& opcodeStr,
                  const std::string& extStr) const;

    const std::string &disassemble() const;

    InstSeqNum seqNum() const;

    Addr pc();
    void pc(Addr _pc);

    enums::StorageClassType executedAs();

    // virtual address for scalar memory operations
    Addr scalarAddr;
    // virtual addresses for vector memory operations
    std::vector<Addr> addr;
    Addr pAddr;

    // vector data to be written
    uint8_t *d_data;
    // scalar data to be transferred
    uint8_t *scalar_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // SIMD unit to which the WF issuing this memory instruction is mapped
    int simdId;
    // unique id of the WF to which this memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting wf
    int kern_id;
    // The CU id of the requesting wf
    int cu_id;
    // The workgroup id of the requesting wf
    int wg_id;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    // execution pipeline id where the memory instruction has been scheduled
    int execUnitId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;

    // Initiate the specified memory operation by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the value
    // back to the register file in the case of a load or an atomic
    // with return; stores require no writeback.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    TheGpuISA::ScalarRegU32 srcLiteral() const;

    bool isALU() const;
    bool isBranch() const;
    bool isCondBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isEndOfKernel() const;
    bool isKernelLaunch() const;
    bool isSDWAInst() const;
    bool isDPPInst() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;
    bool isSleep() const;

    bool isBarrier() const;
    bool isMemSync() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isFlatGlobal() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool isVector() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;
    bool readsExec() const;
    bool writesExec() const;
    bool readsMode() const;
    bool writesMode() const;
    bool ignoreExec() const;
    bool readsFlatScratch() const;
    bool writesFlatScratch() const;
    bool readsExecMask() const;
    bool writesExecMask() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    bool isF16() const;
    bool isF32() const;
    bool isF64() const;

    bool isFMA() const;
    bool isMAC() const;
    bool isMAD() const;

    // for FLAT memory ops, check the segment address against the APE
    // registers to see if it falls within one of the APE ranges for
    // LDS/SCRATCH/GPUVM. if it does not fall into one of the three
    // APEs, it is a regular global access.
    void doApertureCheck(const VectorMask &mask);
    // Function to resolve flat accesses during the execution stage.
    void resolveFlatSegment(const VectorMask &mask);
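    // A sketch of the aperture check per lane (hypothetical base/limit
    // names; the actual APE values come from driver/ISA state, not this
    // header):
    //
    //     if (addr >= ldsApeBase && addr <= ldsApeLimit) {
    //         // LDS APE hit: treat as a group-segment access
    //     } else if (addr >= scratchApeBase && addr <= scratchApeLimit) {
    //         // SCRATCH APE hit: treat as a private-segment access
    //     } else {
    //         // no APE matched: regular global access
    //     }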

    template<typename c0> AtomicOpFunctorPtr
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return std::make_unique<AtomicOpAnd<c0>>(*reg0);
        } else if (isAtomicOr()) {
            return std::make_unique<AtomicOpOr<c0>>(*reg0);
        } else if (isAtomicXor()) {
            return std::make_unique<AtomicOpXor<c0>>(*reg0);
        } else if (isAtomicCAS()) {
            return std::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return std::make_unique<AtomicOpExch<c0>>(*reg0);
        } else if (isAtomicAdd()) {
            return std::make_unique<AtomicOpAdd<c0>>(*reg0);
        } else if (isAtomicSub()) {
            return std::make_unique<AtomicOpSub<c0>>(*reg0);
        } else if (isAtomicInc()) {
            return std::make_unique<AtomicOpInc<c0>>();
        } else if (isAtomicDec()) {
            return std::make_unique<AtomicOpDec<c0>>();
        } else if (isAtomicMax()) {
            return std::make_unique<AtomicOpMax<c0>>(*reg0);
        } else if (isAtomicMin()) {
            return std::make_unique<AtomicOpMin<c0>>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }
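    // A minimal sketch of how the functor is typically consumed
    // (assuming the AtomicOpFunctor interface from base/amo.hh; the
    // attachment to the request is illustrative):
    //
    //     c0 src0 = ..., src1 = ...;  // operand values read from the RF
    //     AtomicOpFunctorPtr amo_op = makeAtomicOpFunctor<c0>(&src0, &src1);
    //     req->setAtomicOpFunctor(std::move(amo_op));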

    void
    setRequestFlags(RequestPtr req) const
    {
        if (isGloballyCoherent()) {
            req->setCacheCoherenceFlags(Request::GLC_BIT);
        }

        if (isSystemCoherent()) {
            req->setCacheCoherenceFlags(Request::SLC_BIT);
        }

        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }

        if (isMemSync()) {
            // the path for kernel launch and kernel end is different
            // from non-kernel mem sync.
            assert(!isKernelLaunch());
            assert(!isEndOfKernel());

            // must be wbinv inst if not kernel launch/end
            req->setCacheCoherenceFlags(Request::INV_L1);
        }
    }
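    // For example, a system-scope atomic-with-return leaves here with
    // both Request::SLC_BIT and Request::ATOMIC_RETURN_OP set: the
    // coherence scope and the atomic behavior are flagged independently,
    // so the flags compose.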

    // reset the number of pending memory requests for all lanes
    void
    resetEntireStatusVector()
    {
        assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg);
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            resetStatusVector(lane);
        }
    }

    // reset the number of pending memory requests for the given lane
    void
    resetStatusVector(int lane)
    {
        setStatusVector(lane, 0);
    }

    // set the number of pending memory requests for the given lane
    void
    setStatusVector(int lane, int newVal)
    {
        // currently we can have up to 2 memory requests per lane (if the
        // lane's request goes across multiple cache lines)
        assert((newVal >= 0) && (newVal <= 2));
        statusVector[lane] = newVal;
    }

    // decrement the number of pending memory requests for the given lane
    // by 1
    void
    decrementStatusVector(int lane)
    {
        // this lane may have multiple requests, so only subtract one for
        // this request
        assert(statusVector[lane] >= 1);
        statusVector[lane]--;
    }

    // return the current number of pending memory requests for the given
    // lane
    int
    getLaneStatus(int lane) const
    {
        return statusVector[lane];
    }

    // returns true if all memory requests from all lanes have been received,
    // else returns false
    bool
    allLanesZero() const
    {
        bool allZero = true;

        // iterate over all lanes, checking the number of pending memory
        // requests they have
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            // if any lane still has pending requests, return false
            if (statusVector[lane] > 0) {
                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending "
                        "request(s) for %#x\n", cu_id, simdId, wfSlotId, lane,
                        statusVector[lane], addr[lane]);
                allZero = false;
            }
        }

        if (allZero) {
            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending"
                    " requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]);
        }
        return allZero;
    }

    // returns a string representing the current state of the statusVector
    std::string
    printStatusVector() const
    {
        std::string statusVec_str = "[";

        // iterate over all lanes, appending the current number of pending
        // requests for each lane to the string
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            statusVec_str += std::to_string(statusVector[lane]);
        }
        statusVec_str += "]";

        return statusVec_str;
    }

    // Map returned packets, via the addresses they satisfy, to the lanes
    // that requested them
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, an int per lane to allow
    // unaligned accesses
    std::vector<int> statusVector;
    // for ld_v# or st_v#
    std::vector<int> tlbHitLevel;

    // for misaligned scalar ops we track the number
    // of outstanding reqs here
    int numScalarReqs;

    Tick getAccessTime() const { return accessTime; }

    void setAccessTime(Tick currentTime) { accessTime = currentTime; }

    void profileRoundTripTime(Tick currentTime, int hopId);
    std::vector<Tick> getRoundTripTime() const { return roundTripTime; }

    void profileLineAddressTime(Addr addr, Tick currentTime, int hopId);
    const std::map<Addr, std::vector<Tick>>& getLineAddressTime() const
    { return lineAddressTime; }

    // inst used to save/restore a wavefront context
    bool isSaveRestore;

    bool isSystemReq() { return systemReq; }
    void setSystemReq() { systemReq = true; }
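    // The system-request bit is set from the PTE's system bit once the
    // request has been translated by the scalar or vector TLB. System
    // requests do not flow through the GPU coalescer (where global memory
    // tokens are normally returned), so the pipeline returns the token as
    // soon as a request is known to be a system request.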

  private:
    GPUStaticInst *_staticInst;
    const InstSeqNum _seqNum;
    int maxSrcVecRegOpSize;
    int maxSrcScalarRegOpSize;
    bool systemReq = false;

    // the time the request was started
    Tick accessTime = -1;

    // hold the tick when the instruction arrives at certain hop points
    // on its way to main memory
    std::vector<Tick> roundTripTime;

    // hold each cache block address for the instruction and a vector
    // to hold the tick when the block arrives at certain hop points
    std::map<Addr, std::vector<Tick>> lineAddressTime;
};

} // namespace gem5

#endif // __GPU_DYN_INST_HH__