gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model

Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Tony Gutierrez
2018-05-01 16:59:35 -04:00
committed by Anthony Gutierrez
parent b0eac7857a
commit b8da9abba7
86 changed files with 10299 additions and 3734 deletions

View File

@@ -31,161 +31,116 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __WAVEFRONT_HH__
#define __WAVEFRONT_HH__
#ifndef __GPU_COMPUTE_WAVEFRONT_HH__
#define __GPU_COMPUTE_WAVEFRONT_HH__
#include <cassert>
#include <deque>
#include <list>
#include <memory>
#include <stack>
#include <unordered_map>
#include <vector>
#include "arch/gpu_isa.hh"
#include "base/logging.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/condition_register_state.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/misc.hh"
#include "gpu-compute/ndrange.hh"
#include "params/Wavefront.hh"
#include "sim/sim_object.hh"
static const int MAX_NUM_INSTS_PER_WF = 12;
/**
* A reconvergence stack entry conveys the necessary state to implement
* control flow divergence.
*/
/**
 * A reconvergence stack entry conveys the necessary state to implement
 * control flow divergence: where the wavefront currently is, where it
 * will reconverge, and which lanes are active on this path.
 */
struct ReconvergenceStackEntry {
/**
 * PC of current instruction.
 */
uint32_t pc;
/**
 * PC of the immediate post-dominator instruction, i.e., the value of
 * @a pc for the first instruction that will be executed by the wavefront
 * when a reconvergence point is reached.
 */
uint32_t rpc;
/**
 * Execution mask: the set of lanes that are active while executing
 * the control-flow path this entry represents.
 */
VectorMask execMask;
};
/*
* Arguments for the hsail opcode call, are user defined and variable length.
* The hardware/finalizer can support arguments in hardware or use memory to
* pass arguments. For now, let's assume that an unlimited number of arguments
* are supported in hardware (the compiler inlines functions whenever it can
* anyways, so unless someone is interested in the implications of linking/
* library functions, I think this is a reasonable assumption given the typical
* size of an OpenCL kernel).
*
* Note that call args are different than kernel arguments:
* * All work-items in a kernel refer the same set of kernel arguments
* * Each work-item has its own set of call args. So a call argument at
* address 0x4 is different for work-item 0 and work-item 1.
*
* Ok, the table below shows an example of how we organize the call arguments in
* the CallArgMem class.
*
* int foo(int arg1, double arg2)
* ___________________________________________________
* | 0: return.0 | 4: return.1 | ... | 252: return.63 |
* |---------------------------------------------------|
* | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
* |---------------------------------------------------|
* | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
* ___________________________________________________
*/
/**
 * Per-wavefront backing store for HSAIL call-instruction arguments.
 * Values are laid out structure-of-arrays style: for a given argument
 * address, all lanes' copies are contiguous (see getLaneOffset), so one
 * buffer of funcArgsSizePerItem * wfSize bytes holds every lane's args.
 */
class CallArgMem
{
  public:
    // pointer to buffer for storing function arguments (owned)
    uint8_t *mem;
    // number of lanes (work-items) in a wavefront
    int wfSize;
    // size of function args, per work-item
    int funcArgsSizePerItem;

    /**
     * Byte offset of @p lane's copy of the value at argument-space
     * address @p addr. The address is scaled by wfSize because each
     * per-item address maps to wfSize consecutive lane slots.
     */
    template<typename CType>
    int
    getLaneOffset(int lane, int addr)
    {
        return addr * wfSize + sizeof(CType) * lane;
    }

    CallArgMem(int func_args_size_per_item, int wf_size)
        : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
    {
        // new[] instead of malloc: throws on failure rather than
        // returning nullptr, and matches the delete[] in the dtor
        mem = new uint8_t[funcArgsSizePerItem * wfSize];
    }

    // Non-copyable: mem is an owning raw pointer, so the implicitly
    // generated copy operations would cause a double free (Rule of
    // Three). Deleting them turns a latent heap-corruption bug into a
    // compile error at any call site that tries to copy.
    CallArgMem(const CallArgMem&) = delete;
    CallArgMem& operator=(const CallArgMem&) = delete;

    ~CallArgMem()
    {
        delete[] mem;
    }

    /**
     * Raw address of @p lane's copy of the value at argument-space
     * address @p addr.
     */
    template<typename CType>
    uint8_t*
    getLaneAddr(int lane, int addr)
    {
        return mem + getLaneOffset<CType>(lane, addr);
    }

    /**
     * Store @p val as @p lane's copy of the value at argument-space
     * address @p addr.
     */
    template<typename CType>
    void
    setLaneAddr(int lane, int addr, CType val)
    {
        *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
    }
};
class Wavefront : public SimObject
{
public:
enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};
// Base pointer for array of instruction pointers
uint64_t basePtr;
enum status_e {
// wavefront is stalled
S_STOPPED,
// wavefront is returning from a kernel
S_RETURNING,
// wavefront is running normally
S_RUNNING,
// wavefront is stalled
S_STALLED,
/**
* wavefront has unsatisfied wait counts
*
* while in this state the WF will only execute if
* the oldest instruction is the waitcnt. while in
* S_WAITCNT, the wavefront will not be ready until
* all of its waitcnts have been satisfied. the
* scoreboard ready() function will check the status
* of the waitcnts whenever the WF is in S_WAITCNT,
* and once they are satisfied, it will resume normal
* operation.
*/
S_WAITCNT
};
uint32_t oldBarrierCnt;
uint32_t barrierCnt;
uint32_t barrierId;
uint32_t barrierSlots;
status_e status;
// HW slot id where the WF is mapped to inside a SIMD unit
int wfSlotId;
const int wfSlotId;
int kernId;
// SIMD unit where the WV has been scheduled
int simdId;
const int simdId;
// id of the execution unit (or pipeline) where the oldest instruction
// of the WF is scheduled
int execUnitId;
int flatLmUnitId;
int flatGmUnitId;
// pointer to parent CU
ComputeUnit *computeUnit;
int maxIbSize;
std::deque<GPUDynInstPtr> instructionBuffer;
bool pendingFetch;
bool dropFetch;
// last tick during which all WFs in the CU are not idle
Tick lastNonIdleTick;
// Condition Register State (for HSAIL simulations only)
class ConditionRegisterState *condRegState;
// number of single precision VGPRs required by WF
uint32_t maxSpVgprs;
// number of double precision VGPRs required by WF
uint32_t maxDpVgprs;
// map virtual to physical vector register
uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
// Execution unit resource ID's associated with this WF
// These are static mappings set at WF slot construction and
// based off of the simdId and wfSlotId.
// Index to scalarALUs resource vector in CU
int scalarAlu;
// Indices into readyList/dispatchList of resources used by this
// wavefront
int scalarAluGlobalIdx;
int globalMem;
int localMem;
int scalarMem;
// number of VGPRs required by WF
uint32_t maxVgprs;
// number of SGPRs required by WF
uint32_t maxSgprs;
void freeResources();
GPUDynInstPtr nextInstr();
void setStatus(status_e newStatus);
status_e getStatus() { return status; }
void resizeRegFiles(int num_vregs, int num_sregs);
bool isGmInstruction(GPUDynInstPtr ii);
bool isLmInstruction(GPUDynInstPtr ii);
bool isOldestInstWaitcnt();
bool isOldestInstGMem();
bool isOldestInstLMem();
bool isOldestInstPrivMem();
bool isOldestInstFlatMem();
bool isOldestInstALU();
bool isOldestInstVectorALU();
bool isOldestInstScalarALU();
bool isOldestInstScalarMem();
bool isOldestInstBarrier();
// used for passing spill address to DDInstGPU
std::vector<Addr> lastAddr;
std::vector<uint32_t> workItemId[3];
@@ -199,36 +154,44 @@ class Wavefront : public SimObject
/* the actual WG size can differ than the maximum size */
uint32_t actualWgSz[3];
uint32_t actualWgSzTotal;
void computeActualWgSz(NDRange *ndr);
void computeActualWgSz(HSAQueueEntry *task);
// wavefront id within a workgroup
uint32_t wfId;
uint32_t maxDynWaveId;
uint32_t dispatchId;
// outstanding global+local memory requests
uint32_t outstandingReqs;
// memory requests between scoreboard
// and execute stage not yet executed
uint32_t memReqsInPipe;
// vector and scalar memory requests pending in memory system
int outstandingReqs;
// outstanding global memory write requests
uint32_t outstandingReqsWrGm;
int outstandingReqsWrGm;
// outstanding local memory write requests
uint32_t outstandingReqsWrLm;
int outstandingReqsWrLm;
// outstanding global memory read requests
uint32_t outstandingReqsRdGm;
int outstandingReqsRdGm;
// outstanding local memory read requests
uint32_t outstandingReqsRdLm;
uint32_t rdLmReqsInPipe;
uint32_t rdGmReqsInPipe;
uint32_t wrLmReqsInPipe;
uint32_t wrGmReqsInPipe;
int outstandingReqsRdLm;
// outstanding scalar memory read requests
int scalarOutstandingReqsRdGm;
// outstanding scalar memory write requests
int scalarOutstandingReqsWrGm;
int rdLmReqsInPipe;
int rdGmReqsInPipe;
int wrLmReqsInPipe;
int wrGmReqsInPipe;
int scalarRdGmReqsInPipe;
int scalarWrGmReqsInPipe;
int memTraceBusy;
uint64_t lastTrace;
// number of vector registers reserved by WF
// number of virtual vector registers reserved by WF
int reservedVectorRegs;
// number of virtual scalar registers reserved by WF
int reservedScalarRegs;
// Index into the Vector Register File's namespace where the WF's registers
// will live while the WF is executed
uint32_t startVgprIndex;
// Index into the Scalar Register File's namespace where the WF's registers
// will live while the WF is executed
uint32_t startSgprIndex;
// Old value of destination gpr (for trace)
std::vector<uint32_t> oldVgpr;
@@ -257,64 +220,63 @@ class Wavefront : public SimObject
// to this workgroup (thus this wavefront)
LdsChunk *ldsChunk;
// A pointer to the spill area
Addr spillBase;
// The size of the spill area
uint32_t spillSizePerItem;
// The vector width of the spill area
uint32_t spillWidth;
// A pointer to the private memory area
Addr privBase;
// The size of the private memory area
uint32_t privSizePerItem;
// A pointer to the read-only memory area
Addr roBase;
// size of the read-only memory area
uint32_t roSize;
// pointer to buffer for storing kernel arguments
uint8_t *kernelArgs;
// unique WF id over all WFs executed across all CUs
uint64_t wfDynId;
// number of times instruction issue for this wavefront is blocked
// due to VRF port availability
Stats::Scalar numTimesBlockedDueVrfPortAvail;
// Wavefront slot stats
// Number of instructions executed by this wavefront slot across all
// dynamic wavefronts
Stats::Scalar numInstrExecuted;
// Number of cycles this WF spends in SCH stage
Stats::Scalar schCycles;
// Number of stall cycles encounterd by this WF in SCH stage
Stats::Scalar schStalls;
// The following stats sum to the value of schStalls, and record, per
// WF slot, what the cause of each stall was at a coarse granularity.
// Cycles WF is selected by scheduler, but RFs cannot support instruction
Stats::Scalar schRfAccessStalls;
// Cycles spent waiting for execution resources
Stats::Scalar schResourceStalls;
// cycles spent waiting for RF reads to complete in SCH stage
Stats::Scalar schOpdNrdyStalls;
// LDS arbitration stall cycles. WF attempts to execute LM instruction,
// but another wave is executing FLAT, which requires LM and GM and forces
// this WF to stall.
Stats::Scalar schLdsArbStalls;
// number of times an instruction of a WF is blocked from being issued
// due to WAR and WAW dependencies
Stats::Scalar numTimesBlockedDueWAXDependencies;
// number of times an instruction of a WF is blocked from being issued
// due to WAR and WAW dependencies
Stats::Scalar numTimesBlockedDueRAWDependencies;
// distribution of executed instructions based on their register
// operands; this is used to highlight the load on the VRF
Stats::Distribution srcRegOpDist;
Stats::Distribution dstRegOpDist;
// Functions to operate on call argument memory
// argument memory for hsail call instruction
CallArgMem *callArgMem;
void
initCallArgMem(int func_args_size_per_item, int wf_size)
{
callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
}
// dyn inst id (per SIMD) of last instruction exec from this wave
uint64_t lastInstExec;
template<typename CType>
CType
readCallArgMem(int lane, int addr)
{
return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
}
// Distribution to track the distance between producer and consumer
// for vector register values
Stats::Distribution vecRawDistance;
// Map to track the dyn instruction id of each vector register value
// produced, indexed by physical vector register ID
std::unordered_map<int,uint64_t> rawDist;
template<typename CType>
void
writeCallArgMem(int lane, int addr, CType val)
{
callArgMem->setLaneAddr<CType>(lane, addr, val);
}
// Distribution to track the number of times every vector register
// value produced is consumed.
Stats::Distribution readsPerWrite;
// Counts the number of reads performed to each physical register
// - counts are reset to 0 for each dynamic wavefront launched
std::vector<int> vecReads;
void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems);
// context for save/restore
uint8_t *context;
typedef WavefrontParams Params;
Wavefront(const Params *p);
@@ -327,50 +289,31 @@ class Wavefront : public SimObject
computeUnit = cu;
}
void validateRequestCounters();
void start(uint64_t _wfDynId, uint64_t _base_ptr);
void exec();
void updateResources();
int ready(itype_e type);
bool instructionBufferHasBranch();
// called by SCH stage to reserve
std::vector<int> reserveResources();
bool stopFetch();
void regStats();
VectorMask getPred() { return execMask() & initMask; }
bool waitingAtBarrier(int lane);
void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
const VectorMask& exec_mask);
void popFromReconvergenceStack();
uint32_t pc() const;
uint32_t rpc() const;
VectorMask execMask() const;
Addr pc() const;
void pc(Addr new_pc);
VectorMask& execMask();
bool execMask(int lane) const;
void pc(uint32_t new_pc);
void discardFetch();
/**
* Returns the size of the static hardware context of a particular wavefront
* This should be updated every time the context is changed
*/
uint32_t getStaticContextSize() const;
bool waitCntsSatisfied();
void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt);
void clearWaitCnts();
/**
* Returns the hardware context as a stream of bytes
* This method is designed for HSAIL execution
*/
void getContext(const void *out);
/**
* Sets the hardware context from a stream of bytes
* This method is designed for HSAIL execution
*/
void setContext(const void *in);
/** Freeing VRF space */
void freeRegisterFile();
TheGpuISA::GPUISA&
gpuISA()
@@ -380,14 +323,32 @@ class Wavefront : public SimObject
private:
TheGpuISA::GPUISA _gpuISA;
void reserveGmResource(GPUDynInstPtr ii);
void reserveLmResource(GPUDynInstPtr ii);
/**
* Stack containing Control Flow Graph nodes (i.e., kernel instructions)
* to be visited by the wavefront, and the associated execution masks. The
* reconvergence stack grows every time the wavefront reaches a divergence
* point (branch instruction), and shrinks every time the wavefront
* reaches a reconvergence point (immediate post-dominator instruction).
* the following are used for waitcnt instructions
* vmWaitCnt: once set, we wait for the outstanding
* number of vector mem instructions to be
* at, or below vmWaitCnt.
*
* expWaitCnt: once set, we wait for the outstanding
* number of outstanding VM writes or EXP
* insts to be at, or below expWaitCnt.
*
* lgkmWaitCnt: once set, we wait for the outstanding
* number of LDS, GDS, scalar memory,
* and message instructions to be at, or
* below lgkmWaitCnt. we currently do not
* support GDS/message ops.
*/
std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
int vmWaitCnt;
int expWaitCnt;
int lgkmWaitCnt;
status_e status;
Addr _pc;
VectorMask _execMask;
};
#endif // __WAVEFRONT_HH__
#endif // __GPU_COMPUTE_WAVEFRONT_HH__