Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912 Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com> Reviewed-by: Jason Lowe-Power <power.jg@gmail.com> Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com> Tested-by: kokoro <noreply+kokoro@google.com>
355 lines
11 KiB
C++
355 lines
11 KiB
C++
/*
|
|
* Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* For use for simulation and test purposes only
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from this
|
|
* software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#ifndef __GPU_COMPUTE_WAVEFRONT_HH__
|
|
#define __GPU_COMPUTE_WAVEFRONT_HH__
|
|
|
|
#include <cassert>
|
|
#include <deque>
|
|
#include <list>
|
|
#include <memory>
|
|
#include <unordered_map>
|
|
#include <vector>
|
|
|
|
#include "arch/gpu_isa.hh"
|
|
#include "base/logging.hh"
|
|
#include "base/types.hh"
|
|
#include "config/the_gpu_isa.hh"
|
|
#include "gpu-compute/compute_unit.hh"
|
|
#include "gpu-compute/dispatcher.hh"
|
|
#include "gpu-compute/gpu_dyn_inst.hh"
|
|
#include "gpu-compute/hsa_queue_entry.hh"
|
|
#include "gpu-compute/lds_state.hh"
|
|
#include "gpu-compute/misc.hh"
|
|
#include "params/Wavefront.hh"
|
|
#include "sim/sim_object.hh"
|
|
|
|
class Wavefront : public SimObject
|
|
{
|
|
public:
|
|
enum status_e {
|
|
// wavefront is stalled
|
|
S_STOPPED,
|
|
// wavefront is returning from a kernel
|
|
S_RETURNING,
|
|
// wavefront is running normally
|
|
S_RUNNING,
|
|
// wavefront is stalled
|
|
S_STALLED,
|
|
/**
|
|
* wavefront has unsatisfied wait counts
|
|
*
|
|
* while in this state the WF will only execute if
|
|
* the oldest instruction is the waitcnt. while in
|
|
* S_WAITCNT, the wavefront will not be ready until
|
|
* all of its waitcnts have been satisfied. the
|
|
* scoreboard ready() function will check the status
|
|
* of the waitcnts whenever the WF is in S_WAITCNT,
|
|
* and once they are satisfied, it will resume normal
|
|
* operation.
|
|
*/
|
|
S_WAITCNT
|
|
};
|
|
|
|
uint32_t oldBarrierCnt;
|
|
uint32_t barrierCnt;
|
|
uint32_t barrierId;
|
|
uint32_t barrierSlots;
|
|
// HW slot id where the WF is mapped to inside a SIMD unit
|
|
const int wfSlotId;
|
|
int kernId;
|
|
// SIMD unit where the WV has been scheduled
|
|
const int simdId;
|
|
// id of the execution unit (or pipeline) where the oldest instruction
|
|
// of the WF is scheduled
|
|
int execUnitId;
|
|
int flatLmUnitId;
|
|
int flatGmUnitId;
|
|
// pointer to parent CU
|
|
ComputeUnit *computeUnit;
|
|
int maxIbSize;
|
|
|
|
std::deque<GPUDynInstPtr> instructionBuffer;
|
|
|
|
bool pendingFetch;
|
|
bool dropFetch;
|
|
// last tick during which all WFs in the CU are not idle
|
|
Tick lastNonIdleTick;
|
|
|
|
// Execution unit resource ID's associated with this WF
|
|
// These are static mappings set at WF slot construction and
|
|
// based off of the simdId and wfSlotId.
|
|
|
|
// Index to scalarALUs resource vector in CU
|
|
int scalarAlu;
|
|
|
|
// Indices into readyList/dispatchList of resources used by this
|
|
// wavefront
|
|
int scalarAluGlobalIdx;
|
|
int globalMem;
|
|
int localMem;
|
|
int scalarMem;
|
|
|
|
// number of VGPRs required by WF
|
|
uint32_t maxVgprs;
|
|
// number of SGPRs required by WF
|
|
uint32_t maxSgprs;
|
|
void freeResources();
|
|
GPUDynInstPtr nextInstr();
|
|
void setStatus(status_e newStatus);
|
|
status_e getStatus() { return status; }
|
|
void resizeRegFiles(int num_vregs, int num_sregs);
|
|
bool isGmInstruction(GPUDynInstPtr ii);
|
|
bool isLmInstruction(GPUDynInstPtr ii);
|
|
bool isOldestInstWaitcnt();
|
|
bool isOldestInstGMem();
|
|
bool isOldestInstLMem();
|
|
bool isOldestInstPrivMem();
|
|
bool isOldestInstFlatMem();
|
|
bool isOldestInstVectorALU();
|
|
bool isOldestInstScalarALU();
|
|
bool isOldestInstScalarMem();
|
|
bool isOldestInstBarrier();
|
|
|
|
// used for passing spill address to DDInstGPU
|
|
std::vector<Addr> lastAddr;
|
|
std::vector<uint32_t> workItemId[3];
|
|
std::vector<uint32_t> workItemFlatId;
|
|
/* kernel launch parameters */
|
|
uint32_t workGroupId[3];
|
|
uint32_t workGroupSz[3];
|
|
uint32_t gridSz[3];
|
|
uint32_t wgId;
|
|
uint32_t wgSz;
|
|
/* the actual WG size can differ than the maximum size */
|
|
uint32_t actualWgSz[3];
|
|
uint32_t actualWgSzTotal;
|
|
void computeActualWgSz(HSAQueueEntry *task);
|
|
// wavefront id within a workgroup
|
|
uint32_t wfId;
|
|
uint32_t maxDynWaveId;
|
|
uint32_t dispatchId;
|
|
// vector and scalar memory requests pending in memory system
|
|
int outstandingReqs;
|
|
// outstanding global memory write requests
|
|
int outstandingReqsWrGm;
|
|
// outstanding local memory write requests
|
|
int outstandingReqsWrLm;
|
|
// outstanding global memory read requests
|
|
int outstandingReqsRdGm;
|
|
// outstanding local memory read requests
|
|
int outstandingReqsRdLm;
|
|
// outstanding scalar memory read requests
|
|
int scalarOutstandingReqsRdGm;
|
|
// outstanding scalar memory write requests
|
|
int scalarOutstandingReqsWrGm;
|
|
int rdLmReqsInPipe;
|
|
int rdGmReqsInPipe;
|
|
int wrLmReqsInPipe;
|
|
int wrGmReqsInPipe;
|
|
int scalarRdGmReqsInPipe;
|
|
int scalarWrGmReqsInPipe;
|
|
|
|
int memTraceBusy;
|
|
uint64_t lastTrace;
|
|
// number of virtual vector registers reserved by WF
|
|
int reservedVectorRegs;
|
|
// number of virtual scalar registers reserved by WF
|
|
int reservedScalarRegs;
|
|
// Index into the Vector Register File's namespace where the WF's registers
|
|
// will live while the WF is executed
|
|
uint32_t startVgprIndex;
|
|
// Index into the Scalar Register File's namespace where the WF's registers
|
|
// will live while the WF is executed
|
|
uint32_t startSgprIndex;
|
|
|
|
// Old value of destination gpr (for trace)
|
|
std::vector<uint32_t> oldVgpr;
|
|
// Id of destination gpr (for trace)
|
|
uint32_t oldVgprId;
|
|
// Tick count of last old_vgpr copy
|
|
uint64_t oldVgprTcnt;
|
|
|
|
// Old value of destination gpr (for trace)
|
|
std::vector<uint64_t> oldDgpr;
|
|
// Id of destination gpr (for trace)
|
|
uint32_t oldDgprId;
|
|
// Tick count of last old_vgpr copy
|
|
uint64_t oldDgprTcnt;
|
|
|
|
// Execution mask at wavefront start
|
|
VectorMask initMask;
|
|
|
|
// number of barriers this WF has joined
|
|
std::vector<int> barCnt;
|
|
int maxBarCnt;
|
|
// Flag to stall a wave on barrier
|
|
bool stalledAtBarrier;
|
|
|
|
// a pointer to the fraction of the LDS allocated
|
|
// to this workgroup (thus this wavefront)
|
|
LdsChunk *ldsChunk;
|
|
|
|
// unique WF id over all WFs executed across all CUs
|
|
uint64_t wfDynId;
|
|
|
|
// Wavefront slot stats
|
|
|
|
// Number of instructions executed by this wavefront slot across all
|
|
// dynamic wavefronts
|
|
Stats::Scalar numInstrExecuted;
|
|
|
|
// Number of cycles this WF spends in SCH stage
|
|
Stats::Scalar schCycles;
|
|
|
|
// Number of stall cycles encounterd by this WF in SCH stage
|
|
Stats::Scalar schStalls;
|
|
|
|
// The following stats sum to the value of schStalls, and record, per
|
|
// WF slot, what the cause of each stall was at a coarse granularity.
|
|
|
|
// Cycles WF is selected by scheduler, but RFs cannot support instruction
|
|
Stats::Scalar schRfAccessStalls;
|
|
// Cycles spent waiting for execution resources
|
|
Stats::Scalar schResourceStalls;
|
|
// cycles spent waiting for RF reads to complete in SCH stage
|
|
Stats::Scalar schOpdNrdyStalls;
|
|
// LDS arbitration stall cycles. WF attempts to execute LM instruction,
|
|
// but another wave is executing FLAT, which requires LM and GM and forces
|
|
// this WF to stall.
|
|
Stats::Scalar schLdsArbStalls;
|
|
|
|
// number of times an instruction of a WF is blocked from being issued
|
|
// due to WAR and WAW dependencies
|
|
Stats::Scalar numTimesBlockedDueWAXDependencies;
|
|
// number of times an instruction of a WF is blocked from being issued
|
|
// due to WAR and WAW dependencies
|
|
Stats::Scalar numTimesBlockedDueRAWDependencies;
|
|
|
|
// dyn inst id (per SIMD) of last instruction exec from this wave
|
|
uint64_t lastInstExec;
|
|
|
|
// Distribution to track the distance between producer and consumer
|
|
// for vector register values
|
|
Stats::Distribution vecRawDistance;
|
|
// Map to track the dyn instruction id of each vector register value
|
|
// produced, indexed by physical vector register ID
|
|
std::unordered_map<int,uint64_t> rawDist;
|
|
|
|
// Distribution to track the number of times every vector register
|
|
// value produced is consumed.
|
|
Stats::Distribution readsPerWrite;
|
|
// Counts the number of reads performed to each physical register
|
|
// - counts are reset to 0 for each dynamic wavefront launched
|
|
std::vector<int> vecReads;
|
|
|
|
void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems);
|
|
|
|
// context for save/restore
|
|
uint8_t *context;
|
|
|
|
typedef WavefrontParams Params;
|
|
Wavefront(const Params *p);
|
|
~Wavefront();
|
|
virtual void init();
|
|
|
|
void
|
|
setParent(ComputeUnit *cu)
|
|
{
|
|
computeUnit = cu;
|
|
}
|
|
|
|
void validateRequestCounters();
|
|
void start(uint64_t _wfDynId, uint64_t _base_ptr);
|
|
void exec();
|
|
// called by SCH stage to reserve
|
|
std::vector<int> reserveResources();
|
|
bool stopFetch();
|
|
void regStats();
|
|
|
|
bool waitingAtBarrier(int lane);
|
|
|
|
Addr pc() const;
|
|
void pc(Addr new_pc);
|
|
|
|
VectorMask& execMask();
|
|
bool execMask(int lane) const;
|
|
|
|
|
|
void discardFetch();
|
|
|
|
bool waitCntsSatisfied();
|
|
void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt);
|
|
void clearWaitCnts();
|
|
|
|
/** Freeing VRF space */
|
|
void freeRegisterFile();
|
|
|
|
TheGpuISA::GPUISA&
|
|
gpuISA()
|
|
{
|
|
return _gpuISA;
|
|
}
|
|
|
|
private:
|
|
TheGpuISA::GPUISA _gpuISA;
|
|
|
|
void reserveGmResource(GPUDynInstPtr ii);
|
|
void reserveLmResource(GPUDynInstPtr ii);
|
|
|
|
/**
|
|
* the following are used for waitcnt instructions
|
|
* vmWaitCnt: once set, we wait for the oustanding
|
|
* number of vector mem instructions to be
|
|
* at, or below vmWaitCnt.
|
|
*
|
|
* expWaitCnt: once set, we wait for the outstanding
|
|
* number outstanding VM writes or EXP
|
|
* insts to be at, or below expWaitCnt.
|
|
*
|
|
* lgkmWaitCnt: once set, we wait for the oustanding
|
|
* number of LDS, GDS, scalar memory,
|
|
* and message instructions to be at, or
|
|
* below lgkmCount. we currently do not
|
|
* support GDS/message ops.
|
|
*/
|
|
int vmWaitCnt;
|
|
int expWaitCnt;
|
|
int lgkmWaitCnt;
|
|
status_e status;
|
|
Addr _pc;
|
|
VectorMask _execMask;
|
|
};
|
|
|
|
#endif // __GPU_COMPUTE_WAVEFRONT_HH__
|