Files
gem5/src/gpu-compute/wavefront.hh
Gabe Black 91d83cc8a1 misc: Standardize the way create() constructs SimObjects.
The create() method on Params structs usually instantiates SimObjects
using a constructor which takes the Params struct as a parameter
somehow. There has been a lot of needless variation in how that was
done, making it annoying to pass Params down to base classes. Some of
the different forms were:

const Params &
Params &
Params *
const Params *
Params const*

This change goes through and fixes up every constructor and every
create() method to use the const Params & form. We use a reference
because the Params struct should never be null. We use const because
neither the create method nor the consuming object should modify the
record of the parameters as they came in from the config. That would
make consuming them not idempotent, and make it impossible to tell what
the actual simulation configuration was since it would change from any
user visible form (config script, config.ini, dot pdf output).

Change-Id: I77453cba52fdcfd5f4eec92dfb0bddb5a9945f31
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/35938
Reviewed-by: Gabe Black <gabeblack@google.com>
Reviewed-by: Daniel Carvalho <odanrc@yahoo.com.br>
Maintainer: Gabe Black <gabeblack@google.com>
Tested-by: kokoro <noreply+kokoro@google.com>
2020-10-14 12:06:44 +00:00

363 lines
12 KiB
C++

/*
* Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __GPU_COMPUTE_WAVEFRONT_HH__
#define __GPU_COMPUTE_WAVEFRONT_HH__
#include <cassert>
#include <deque>
#include <list>
#include <memory>
#include <unordered_map>
#include <vector>
#include "arch/gpu_isa.hh"
#include "base/logging.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/misc.hh"
#include "params/Wavefront.hh"
#include "sim/sim_object.hh"
/**
 * State kept for a GPU wavefront (WF) slot, modelled as a SimObject.
 *
 * The class groups, mostly as public data members: the scheduling
 * status, execution-resource indices, kernel launch parameters,
 * outstanding memory request counters, register file reservations and
 * per-slot statistics. Apart from a few trivial inline accessors, the
 * member functions are only declared here; their definitions are not
 * part of this header.
 */
class Wavefront : public SimObject
{
public:
// Scheduling state of the wavefront. The current value is returned by
// getStatus(); setStatus() takes a new value.
enum status_e {
// wavefront is stopped
// NOTE(review): how this differs from S_STALLED is not visible in
// this header - confirm against the users of status_e
S_STOPPED,
// wavefront is returning from a kernel
S_RETURNING,
// wavefront is running normally
S_RUNNING,
// wavefront is stalled
S_STALLED,
/**
* wavefront has unsatisfied wait counts
*
* while in this state the WF will only execute if
* the oldest instruction is the waitcnt. while in
* S_WAITCNT, the wavefront will not be ready until
* all of its waitcnts have been satisfied. the
* scoreboard ready() function will check the status
* of the waitcnts whenever the WF is in S_WAITCNT,
* and once they are satisfied, it will resume normal
* operation.
*/
S_WAITCNT,
/**
* WF is stalled at a barrier.
*/
S_BARRIER
};
// HW slot id where the WF is mapped to inside a SIMD unit
const int wfSlotId;
// NOTE(review): presumably the id of the kernel this WF belongs to -
// confirm against the code that assigns it
int kernId;
// SIMD unit where the WF has been scheduled
const int simdId;
// id of the execution unit (or pipeline) where the oldest instruction
// of the WF is scheduled
int execUnitId;
int flatLmUnitId;
int flatGmUnitId;
// pointer to parent CU (assigned by setParent())
ComputeUnit *computeUnit;
// NOTE(review): presumably the capacity limit of instructionBuffer -
// confirm in the implementation
int maxIbSize;
std::deque<GPUDynInstPtr> instructionBuffer;
bool pendingFetch;
bool dropFetch;
// last tick during which all WFs in the CU are not idle
Tick lastNonIdleTick;
// Execution unit resource ID's associated with this WF
// These are static mappings set at WF slot construction and
// based off of the simdId and wfSlotId.
// Index to scalarALUs resource vector in CU
int scalarAlu;
// Indices into readyList/dispatchList of resources used by this
// wavefront
int scalarAluGlobalIdx;
int globalMem;
int localMem;
int scalarMem;
// number of VGPRs required by WF
uint32_t maxVgprs;
// number of SGPRs required by WF
uint32_t maxSgprs;
void freeResources();
GPUDynInstPtr nextInstr();
void setStatus(status_e newStatus);
// Returns the current value of the private status member.
status_e getStatus() { return status; }
void resizeRegFiles(int num_vregs, int num_sregs);
bool isGmInstruction(GPUDynInstPtr ii);
bool isLmInstruction(GPUDynInstPtr ii);
// Predicates on the oldest instruction of the WF; the definitions are
// outside this header.
bool isOldestInstWaitcnt();
bool isOldestInstGMem();
bool isOldestInstLMem();
bool isOldestInstPrivMem();
bool isOldestInstFlatMem();
bool isOldestInstVectorALU();
bool isOldestInstScalarALU();
bool isOldestInstScalarMem();
bool isOldestInstBarrier();
// used for passing spill address to DDInstGPU
std::vector<Addr> lastAddr;
// one vector per dimension (three dimensions)
std::vector<uint32_t> workItemId[3];
std::vector<uint32_t> workItemFlatId;
/* kernel launch parameters */
uint32_t workGroupId[3];
uint32_t workGroupSz[3];
uint32_t gridSz[3];
uint32_t wgId;
uint32_t wgSz;
/* the actual WG size can differ from the maximum size */
uint32_t actualWgSz[3];
uint32_t actualWgSzTotal;
void computeActualWgSz(HSAQueueEntry *task);
// wavefront id within a workgroup
uint32_t wfId;
uint32_t maxDynWaveId;
uint32_t dispatchId;
// vector and scalar memory requests pending in memory system
int outstandingReqs;
// outstanding global memory write requests
int outstandingReqsWrGm;
// outstanding local memory write requests
int outstandingReqsWrLm;
// outstanding global memory read requests
int outstandingReqsRdGm;
// outstanding local memory read requests
int outstandingReqsRdLm;
// outstanding scalar memory read requests
int scalarOutstandingReqsRdGm;
// outstanding scalar memory write requests
int scalarOutstandingReqsWrGm;
// NOTE(review): the *ReqsInPipe counters presumably track requests that
// are still in the pipeline and not yet counted as outstanding -
// confirm in the implementation
int rdLmReqsInPipe;
int rdGmReqsInPipe;
int wrLmReqsInPipe;
int wrGmReqsInPipe;
int scalarRdGmReqsInPipe;
int scalarWrGmReqsInPipe;
int memTraceBusy;
uint64_t lastTrace;
// number of virtual vector registers reserved by WF
int reservedVectorRegs;
// number of virtual scalar registers reserved by WF
int reservedScalarRegs;
// Index into the Vector Register File's namespace where the WF's registers
// will live while the WF is executed
uint32_t startVgprIndex;
// Index into the Scalar Register File's namespace where the WF's registers
// will live while the WF is executed
uint32_t startSgprIndex;
// Old value of destination gpr (for trace)
std::vector<uint32_t> oldVgpr;
// Id of destination gpr (for trace)
uint32_t oldVgprId;
// Tick count of last oldVgpr copy
uint64_t oldVgprTcnt;
// Old value of destination gpr (for trace); 64-bit elements
std::vector<uint64_t> oldDgpr;
// Id of destination gpr (for trace)
uint32_t oldDgprId;
// Tick count of last oldDgpr copy
// NOTE(review): the original comment said "old_vgpr" here, apparently
// copied from oldVgprTcnt - confirm
uint64_t oldDgprTcnt;
// Execution mask at wavefront start
VectorMask initMask;
// a pointer to the fraction of the LDS allocated
// to this workgroup (thus this wavefront)
LdsChunk *ldsChunk;
// unique WF id over all WFs executed across all CUs
uint64_t wfDynId;
// Wavefront slot stats
// Number of instructions executed by this wavefront slot across all
// dynamic wavefronts
Stats::Scalar numInstrExecuted;
// Number of cycles this WF spends in SCH stage
Stats::Scalar schCycles;
// Number of stall cycles encountered by this WF in SCH stage
Stats::Scalar schStalls;
// The following stats sum to the value of schStalls, and record, per
// WF slot, what the cause of each stall was at a coarse granularity.
// Cycles WF is selected by scheduler, but RFs cannot support instruction
Stats::Scalar schRfAccessStalls;
// Cycles spent waiting for execution resources
Stats::Scalar schResourceStalls;
// cycles spent waiting for RF reads to complete in SCH stage
Stats::Scalar schOpdNrdyStalls;
// LDS arbitration stall cycles. WF attempts to execute LM instruction,
// but another wave is executing FLAT, which requires LM and GM and forces
// this WF to stall.
Stats::Scalar schLdsArbStalls;
// number of times an instruction of a WF is blocked from being issued
// due to WAR and WAW dependencies
Stats::Scalar numTimesBlockedDueWAXDependencies;
// number of times an instruction of a WF is blocked from being issued
// due to RAW dependencies
Stats::Scalar numTimesBlockedDueRAWDependencies;
// dyn inst id (per SIMD) of last instruction exec from this wave
uint64_t lastInstExec;
// Distribution to track the distance between producer and consumer
// for vector register values
Stats::Distribution vecRawDistance;
// Map to track the dyn instruction id of each vector register value
// produced, indexed by physical vector register ID
std::unordered_map<int,uint64_t> rawDist;
// Distribution to track the number of times every vector register
// value produced is consumed.
Stats::Distribution readsPerWrite;
// Counts the number of reads performed to each physical register
// - counts are reset to 0 for each dynamic wavefront launched
std::vector<int> vecReads;
void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems);
// context for save/restore
// NOTE(review): raw pointer; allocation and ownership are not visible
// in this header - confirm in the constructor/destructor
uint8_t *context;
typedef WavefrontParams Params;
// Constructs the wavefront from its parameter struct, which is passed
// by const reference.
Wavefront(const Params &p);
~Wavefront();
virtual void init();
// Records the compute unit that owns this wavefront by storing the
// given pointer in computeUnit.
void
setParent(ComputeUnit *cu)
{
computeUnit = cu;
}
void validateRequestCounters();
void start(uint64_t _wfDynId, uint64_t _base_ptr);
void exec();
// called by SCH stage to reserve resources
// NOTE(review): the original comment stopped after "reserve"; the
// meaning of the returned ints is defined in the implementation -
// confirm
std::vector<int> reserveResources();
bool stopFetch();
void regStats();
// Program counter getter and setter.
// NOTE(review): presumably backed by the private _pc member - the
// definitions are not in this header
Addr pc() const;
void pc(Addr new_pc);
// Execution mask access: the first overload returns a mutable reference
// to a whole mask, the second returns a bool for a single lane.
VectorMask& execMask();
bool execMask(int lane) const;
void discardFetch();
// waitcnt support; see the description of the wait counters in the
// private section below.
bool waitCntsSatisfied();
void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt);
void clearWaitCnts();
void incVMemInstsIssued();
void incExpInstsIssued();
void incLGKMInstsIssued();
void decVMemInstsIssued();
void decExpInstsIssued();
void decLGKMInstsIssued();
/** Freeing VRF space */
void freeRegisterFile();
// Returns a mutable reference to the GPU ISA object embedded in this
// wavefront (_gpuISA).
TheGpuISA::GPUISA&
gpuISA()
{
return _gpuISA;
}
// Barrier id setter/getter and related barrier helpers.
// NOTE(review): presumably backed by the private barId member - the
// definitions are not in this header
void barrierId(int bar_id);
int barrierId() const;
bool hasBarrier() const;
void releaseBarrier();
private:
// ISA object held by value; exposed through gpuISA().
TheGpuISA::GPUISA _gpuISA;
void reserveGmResource(GPUDynInstPtr ii);
void reserveLmResource(GPUDynInstPtr ii);
/**
* the following are used for waitcnt instructions
* vmWaitCnt: once set, we wait for the outstanding
* number of vector mem instructions to be
* at, or below vmWaitCnt.
*
* expWaitCnt: once set, we wait for the outstanding
* number of outstanding VM writes or EXP
* insts to be at, or below expWaitCnt.
*
* lgkmWaitCnt: once set, we wait for the outstanding
* number of LDS, GDS, scalar memory,
* and message instructions to be at, or
* below lgkmWaitCnt. we currently do not
* support GDS/message ops.
*/
int vmWaitCnt;
int expWaitCnt;
int lgkmWaitCnt;
// NOTE(review): presumably the issued-instruction counts that are
// compared against the wait counts above and adjusted by the
// inc*/dec*InstsIssued() methods - confirm in the implementation
int vmemInstsIssued;
int expInstsIssued;
int lgkmInstsIssued;
// current scheduling state; returned by getStatus()
status_e status;
Addr _pc;
VectorMask _execMask;
int barId;
};
#endif // __GPU_COMPUTE_WAVEFRONT_HH__