/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __COMPUTE_UNIT_HH__
#define __COMPUTE_UNIT_HH__

#include <deque>
#include <map>
#include <queue>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "base/callback.hh"
#include "base/compiler.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/comm.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/register_manager.hh"
#include "gpu-compute/scalar_memory_pipeline.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/port.hh"
#include "mem/token_port.hh"
#include "sim/clocked_object.hh"

namespace gem5
{

class HSAQueueEntry;
class LdsChunk;
class ScalarRegisterFile;
class Shader;
class VectorRegisterFile;

struct ComputeUnitParams;

enum EXEC_POLICY
{
    OLDEST = 0,
    RR
};

enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0,
    TLB_MISS_CACHE_HIT,
    TLB_HIT_CACHE_MISS,
    TLB_HIT_CACHE_HIT
};

/**
 * WF barrier slots. This represents the barrier resource for
 * WF-level barriers (i.e., barriers to sync WFs within a WG).
 */
class WFBarrier
{
  public:
    WFBarrier() : _numAtBarrier(0), _maxBarrierCnt(0)
    {
    }

    static const int InvalidID = -1;

    int
    numAtBarrier() const
    {
        return _numAtBarrier;
    }

    /**
     * Number of WFs that have not yet reached the barrier.
     */
    int
    numYetToReachBarrier() const
    {
        return _maxBarrierCnt - _numAtBarrier;
    }

    int
    maxBarrierCnt() const
    {
        return _maxBarrierCnt;
    }

    /**
     * Set the maximum barrier count (i.e., the number of WFs that are
     * participating in the barrier).
     */
    void
    setMaxBarrierCnt(int max_barrier_cnt)
    {
        _maxBarrierCnt = max_barrier_cnt;
    }

    /**
     * Mark that a WF has reached the barrier.
     */
    void
    incNumAtBarrier()
    {
        assert(_numAtBarrier < _maxBarrierCnt);
        ++_numAtBarrier;
    }
    /**
     * Have all WFs participating in this barrier reached the barrier?
     * If so, then the barrier is satisfied and WFs may proceed past
     * the barrier.
     */
    bool
    allAtBarrier() const
    {
        return _numAtBarrier == _maxBarrierCnt;
    }

    /**
     * Decrement the number of WFs that are participating in this barrier.
     * This should be called when a WF exits.
     */
    void
    decMaxBarrierCnt()
    {
        assert(_maxBarrierCnt > 0);
        --_maxBarrierCnt;
    }

    /**
     * Release this barrier resource so it can be used by other WGs. This
     * is generally called when a WG has finished.
     */
    void
    release()
    {
        _numAtBarrier = 0;
        _maxBarrierCnt = 0;
    }

    /**
     * Reset the barrier; this is typically done when a dynamic instance
     * of a barrier has been satisfied.
     */
    void
    reset()
    {
        _numAtBarrier = 0;
    }

  private:
    /**
     * The number of WFs in the WG that have reached the barrier. Once
     * the number of WFs that reach a barrier matches the number of WFs
     * in the WG, the barrier is satisfied.
     */
    int _numAtBarrier;

    /**
     * The maximum number of WFs that can reach this barrier. This is
     * essentially the number of WFs in the WG, and a barrier is satisfied
     * when the number of WFs that reach the barrier equals this value. If
     * a WF exits early it must decrement this value so that it is no
     * longer considered for this barrier.
     */
    int _maxBarrierCnt;
};
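// A minimal usage sketch of the WFBarrier protocol above (illustrative
// only; `slot` and `wg_size` are hypothetical names, and the real call
// sites live in ComputeUnit and the .cc file):
//
//     WFBarrier slot;
//     slot.setMaxBarrierCnt(wg_size); // WG with wg_size participating WFs
//     slot.incNumAtBarrier();         // each WF calls this on arrival
//     if (slot.allAtBarrier()) {
//         slot.reset();               // dynamic barrier instance satisfied;
//                                     // slot stays bound to the WG
//     }
//     slot.release();                 // WG finished; slot is reusable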
class ComputeUnit : public ClockedObject
{
  public:
    // Execution resources
    //
    // The ordering of units is:
    // Vector ALUs
    // Scalar ALUs
    // GM Pipe
    // LM Pipe
    // Scalar Mem Pipe
    //
    // Note: the ordering of units is important and the code assumes the
    // above ordering. However, there may be more than one resource of
    // each type (e.g., 4 VALUs or 2 SALUs)

    int numVectorGlobalMemUnits;
    // Resource control for global memory to VRF data/address bus
    WaitClass glbMemToVrfBus;
    // Resource control for Vector Register File->Global Memory pipe buses
    WaitClass vrfToGlobalMemPipeBus;
    // Resource control for Vector Global Memory execution unit
    WaitClass vectorGlobalMemUnit;

    int numVectorSharedMemUnits;
    // Resource control for local memory to VRF data/address bus
    WaitClass locMemToVrfBus;
    // Resource control for Vector Register File->Local Memory pipe buses
    WaitClass vrfToLocalMemPipeBus;
    // Resource control for Vector Shared/Local Memory execution unit
    WaitClass vectorSharedMemUnit;

    int numScalarMemUnits;
    // Resource control for scalar memory to SRF data/address bus
    WaitClass scalarMemToSrfBus;
    // Resource control for Scalar Register File->Scalar Memory pipe buses
    WaitClass srfToScalarMemPipeBus;
    // Resource control for Scalar Memory execution unit
    WaitClass scalarMemUnit;

    // vector ALU execution resources
    int numVectorALUs;
    std::vector<WaitClass> vectorALUs;

    // scalar ALU execution resources
    int numScalarALUs;
    std::vector<WaitClass> scalarALUs;

    // Return total number of execution units on this CU
    int numExeUnits() const;
    // index into readyList of the first memory unit
    int firstMemUnit() const;
    // index into readyList of the last memory unit
    int lastMemUnit() const;
    // index into scalarALUs vector of SALU used by the wavefront
    int mapWaveToScalarAlu(Wavefront *w) const;
    // index into readyList of SALU used by wavefront
    int mapWaveToScalarAluGlobalIdx(Wavefront *w) const;
    // index into readyList of Global Memory unit used by wavefront
    int mapWaveToGlobalMem(Wavefront *w) const;
    // index into readyList of Local Memory unit used by wavefront
    int mapWaveToLocalMem(Wavefront *w) const;
    // index into readyList of Scalar Memory unit used by wavefront
    int mapWaveToScalarMem(Wavefront *w) const;

    int vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
    int coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
    int numCyclesPerStoreTransfer;  // number of cycles per vector store
    int numCyclesPerLoadTransfer;  // number of cycles per vector load

    // track presence of dynamic instructions in the Schedule pipeline
    // stage. This is used to check the readiness of the oldest,
    // non-dispatched instruction of every WF in the Scoreboard stage.
    std::unordered_set<uint64_t> pipeMap;

    RegisterManager* registerManager;

    FetchStage fetchStage;
    ScoreboardCheckStage scoreboardCheckStage;
    ScheduleStage scheduleStage;
    ExecStage execStage;
    GlobalMemPipeline globalMemoryPipe;
    LocalMemPipeline localMemoryPipe;
    ScalarMemPipeline scalarMemoryPipe;

    EventFunctionWrapper tickEvent;

    typedef ComputeUnitParams Params;
    std::vector<std::vector<Wavefront*>> wfList;
    int cu_id;

    // array of vector register files, one per SIMD
    std::vector<VectorRegisterFile*> vrf;
    // array of scalar register files, one per SIMD
    std::vector<ScalarRegisterFile*> srf;

    // Width per VALU/SIMD unit: number of work items that can be executed
    // on the vector ALU simultaneously in a SIMD unit
    int simdWidth;
    // number of pipe stages for bypassing data to next dependent single
    // precision vector instruction inside the vector ALU pipeline
    int spBypassPipeLength;
    // number of pipe stages for bypassing data to next dependent double
    // precision vector instruction inside the vector ALU pipeline
    int dpBypassPipeLength;
    // number of pipe stages for scalar ALU
    int scalarPipeStages;
    // number of pipe stages for operand collection & distribution network
    int operandNetworkLength;
    // number of cycles per instruction issue period
    Cycles issuePeriod;

    // VRF to GM Bus latency
    Cycles vrf_gm_bus_latency;
    // SRF to Scalar Mem Bus latency
    Cycles srf_scm_bus_latency;
    // VRF to LM Bus latency
    Cycles vrf_lm_bus_latency;

    // tracks the last cycle a vector instruction was executed on a SIMD
    std::vector<uint64_t> lastExecCycle;

    // tracks the number of dyn insts executed per SIMD
    std::vector<uint64_t> instExecPerSimd;

    // true if we allow a separate TLB per lane
    bool perLaneTLB;
    // if 0, TLB prefetching is off.
    int prefetchDepth;
    // if fixed-stride prefetching, this is the stride.
    int prefetchStride;

    std::vector<Addr> lastVaddrCU;
    std::vector<std::vector<Addr>> lastVaddrSimd;
    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
    enums::PrefetchType prefetchType;
    EXEC_POLICY exec_policy;

    bool debugSegFault;

    // Idle CU timeout in ticks
    Tick idleCUTimeout;
    int idleWfs;
    bool functionalTLB;
    bool localMemBarrier;

    /*
     * for Counting page accesses
     */
    bool countPages;

    Shader *shader;

    Tick req_tick_latency;
    Tick resp_tick_latency;
    Tick scalar_req_tick_latency;
    Tick scalar_resp_tick_latency;

    /**
     * Number of WFs to schedule to each SIMD. This vector is populated
     * by hasDispResources(), and consumed by the subsequent call to
     * dispWorkgroup(), to schedule the specified number of WFs to the
     * SIMD units. Entry I provides the number of WFs to schedule to SIMD I.
     */
    std::vector<int> numWfsToSched;

    // number of currently reserved vector registers per SIMD unit
    std::vector<int> vectorRegsReserved;
    // number of currently reserved scalar registers per SIMD unit
    std::vector<int> scalarRegsReserved;
    // number of vector registers per SIMD unit
    int numVecRegsPerSimd;
    // number of available scalar registers per SIMD unit
    int numScalarRegsPerSimd;
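    // A sketch of how the reservation counters above are intended to be
    // used when checking dispatch resources (hypothetical form; the real
    // check is implemented in hasDispResources() in the .cc file). A WG
    // fits on SIMD `i` only if its register demand fits under the
    // per-SIMD limits, e.g.:
    //
    //     bool vgprs_avail =
    //         vectorRegsReserved[i] + wg_vgpr_demand <= numVecRegsPerSimd;
    //     bool sgprs_avail =
    //         scalarRegsReserved[i] + wg_sgpr_demand <= numScalarRegsPerSimd;
    //
    // where wg_vgpr_demand/wg_sgpr_demand are placeholders for the task's
    // register requirements.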
    // this hash map will keep track of page divergence
    // per memory instruction per wavefront. The hash map
    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
    std::map<Addr, int> pagesTouched;

    void insertInPipeMap(Wavefront *w);
    void deleteFromPipeMap(Wavefront *w);

    ComputeUnit(const Params &p);
    ~ComputeUnit();

    // Timing Functions
    int oprNetPipeLength() const { return operandNetworkLength; }
    int simdUnitWidth() const { return simdWidth; }
    int spBypassLength() const { return spBypassPipeLength; }
    int dpBypassLength() const { return dpBypassPipeLength; }
    int scalarPipeLength() const { return scalarPipeStages; }
    int storeBusLength() const { return numCyclesPerStoreTransfer; }
    int loadBusLength() const { return numCyclesPerLoadTransfer; }
    int wfSize() const { return wavefrontSize; }

    void exec();
    void initiateFetch(Wavefront *wavefront);
    void fetch(PacketPtr pkt, Wavefront *wavefront);
    void fillKernelState(Wavefront *w, HSAQueueEntry *task);

    void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
                        HSAQueueEntry *task, int bar_id,
                        bool fetchContext=false);

    void doInvalidate(RequestPtr req, int kernId);
    void doFlush(GPUDynInstPtr gpuDynInst);

    void dispWorkgroup(HSAQueueEntry *task, int num_wfs_in_wg);
    bool hasDispResources(HSAQueueEntry *task, int &num_wfs_in_wg);

    int cacheLineSize() const { return _cacheLineSize; }
    int getCacheLineBits() const { return cacheLineBits; }

    void resetRegisterPool();

  private:
    WFBarrier&
    barrierSlot(int bar_id)
    {
        assert(bar_id > WFBarrier::InvalidID);
        return wfBarrierSlots.at(bar_id);
    }

    int
    getFreeBarrierId()
    {
        assert(freeBarrierIds.size());
        auto free_bar_id = freeBarrierIds.begin();
        int bar_id = *free_bar_id;
        freeBarrierIds.erase(free_bar_id);
        return bar_id;
    }

  public:
    int numYetToReachBarrier(int bar_id);
    bool allAtBarrier(int bar_id);
    void incNumAtBarrier(int bar_id);
    int numAtBarrier(int bar_id);
    int maxBarrierCnt(int bar_id);
    void resetBarrier(int bar_id);
    void decMaxBarrierCnt(int bar_id);
    void releaseBarrier(int bar_id);
    void releaseWFsFromBarrier(int bar_id);
    int numBarrierSlots() const { return _numBarrierSlots; }

    template<typename c0, typename c1>
    void doSmReturn(GPUDynInstPtr gpuDynInst);

    virtual void init() override;
    void sendRequest(GPUDynInstPtr gpuDynInst, PortID index, PacketPtr pkt);
    void sendScalarRequest(GPUDynInstPtr gpuDynInst, PacketPtr pkt);
    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
                              bool kernelMemSync,
                              RequestPtr req=nullptr);
    void handleMemPacket(PacketPtr pkt, int memport_index);
    bool processTimingPacket(PacketPtr pkt);
    void processFetchReturn(PacketPtr pkt);
    void updatePageDivergenceDist(Addr addr);

    RequestorID requestorId() { return _requestorId; }
    RequestorID vramRequestorId();

    bool isDone() const;
    bool isVectorAluIdle(uint32_t simdId) const;

    void handleSQCReturn(PacketPtr pkt);

  protected:
    RequestorID _requestorId;

    LdsState &lds;

  public:
    LdsState &
    getLds() const
    {
        return lds;
    }

    int32_t getRefCounter(const uint32_t dispatchId,
                          const uint32_t wgId) const;

    [[nodiscard]] bool sendToLds(GPUDynInstPtr gpuDynInst);

    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
    pageDataStruct pageAccesses;

    void exitCallback();

    class GMTokenPort : public TokenRequestPort
    {
      public:
        GMTokenPort(const std::string& name, SimObject *owner,
                    PortID id = InvalidPortID)
            : TokenRequestPort(name, owner, id)
        { }
        ~GMTokenPort() { }

      protected:
        bool recvTimingResp(PacketPtr) { return false; }
        void recvReqRetry() { }
    };
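    // A sketch of the token-based flow control GMTokenPort participates
    // in (illustrative; it assumes the TokenManager interface from
    // mem/token_port.hh and hypothetical call sites in the global memory
    // pipeline): tokens are acquired before issuing requests to the
    // coalescer, which returns them as it drains requests, e.g.:
    //
    //     if (memPortTokens->haveTokens(n)) {
    //         memPortTokens->acquireTokens(n);
    //         // ... issue n request packets to the coalescer ...
    //     }
    //     // coalescer side eventually calls recvTokens(n) to return them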
    // Manager for the number of tokens available to this compute unit to
    // send global memory request packets to the coalescer. This is only
    // used between the global memory pipe and the TCP coalescer.
    TokenManager *memPortTokens;
    GMTokenPort gmTokenPort;

    /** Data access Port **/
    class DataPort : public RequestPort
    {
      public:
        DataPort(const std::string &_name, ComputeUnit *_cu, PortID id)
            : RequestPort(_name, _cu, id), computeUnit(_cu) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            GPUDynInstPtr _gpuDynInst;
            PortID port_index;
            Packet::SenderState *saved;

            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst),
                  port_index(_port_index),
                  saved(sender_state) { }
        };

        class SystemHubEvent : public Event
        {
            DataPort *dataPort;
            PacketPtr reqPkt;

          public:
            SystemHubEvent(PacketPtr pkt, DataPort *_dataPort)
                : dataPort(_dataPort), reqPkt(pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void
            process()
            {
                // DMAs do not operate on packets and therefore do not
                // convert to a response. Do that here instead.
                reqPkt->makeResponse();
                dataPort->handleResponse(reqPkt);
            }
        };

        void processMemReqEvent(PacketPtr pkt);
        EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);

        void processMemRespEvent(PacketPtr pkt);
        EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);

        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;

        bool handleResponse(PacketPtr pkt);

      protected:
        ComputeUnit *computeUnit;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };
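    // A sketch of the SenderState round trip used by these ports (the
    // details live in the .cc file; `index` is a hypothetical lane/port
    // index): the sender attaches its state to the packet, and the
    // response path recovers it with gem5's safe_cast, e.g.:
    //
    //     pkt->senderState = new DataPort::SenderState(gpuDynInst, index);
    //     memPort[index].sendTimingReq(pkt);
    //     ...
    //     // on the response path:
    //     auto ss = safe_cast<DataPort::SenderState*>(pkt->senderState);
    //     GPUDynInstPtr gpuDynInst = ss->_gpuDynInst;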
    // Scalar data cache access port
    class ScalarDataPort : public RequestPort
    {
      public:
        ScalarDataPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name, _cu), computeUnit(_cu)
        {
        }

        bool recvTimingResp(PacketPtr pkt) override;
        void recvReqRetry() override;

        struct SenderState : public Packet::SenderState
        {
            SenderState(GPUDynInstPtr gpuDynInst,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst), saved(sender_state)
            {
            }

            GPUDynInstPtr _gpuDynInst;
            Packet::SenderState *saved;
        };

        class MemReqEvent : public Event
        {
          private:
            ScalarDataPort &scalarDataPort;
            PacketPtr pkt;

          public:
            MemReqEvent(ScalarDataPort &_scalar_data_port, PacketPtr _pkt)
                : Event(), scalarDataPort(_scalar_data_port), pkt(_pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        class SystemHubEvent : public Event
        {
            ScalarDataPort *dataPort;
            PacketPtr reqPkt;

          public:
            SystemHubEvent(PacketPtr pkt, ScalarDataPort *_dataPort)
                : dataPort(_dataPort), reqPkt(pkt)
            {
                setFlags(Event::AutoDelete);
            }

            void
            process()
            {
                // DMAs do not operate on packets and therefore do not
                // convert to a response. Do that here instead.
                reqPkt->makeResponse();
                dataPort->handleResponse(reqPkt);
            }
        };

        bool handleResponse(PacketPtr pkt);

        std::deque<PacketPtr> retries;

      private:
        ComputeUnit *computeUnit;
    };

    // Instruction cache access port
    class SQCPort : public RequestPort
    {
      public:
        SQCPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name, _cu), computeUnit(_cu) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            Wavefront *wavefront;
            Packet::SenderState *saved;
            // kernel id to be used in handling I-Cache invalidate response
            int kernId;

            SenderState(Wavefront *_wavefront, Packet::SenderState
                    *sender_state=nullptr, int _kernId=-1)
                : wavefront(_wavefront), saved(sender_state),
                  kernId(_kernId) { }
        };

        std::deque<std::pair<PacketPtr, Wavefront*>> retries;

      protected:
        ComputeUnit *computeUnit;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    /** Data TLB port **/
    class DTLBPort : public RequestPort
    {
      public:
        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID id)
            : RequestPort(_name, _cu, id), computeUnit(_cu),
              stalled(false)
        { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState : public Packet::SenderState
        {
            // the memInst that this is associated with
            GPUDynInstPtr _gpuDynInst;

            // the lane in the memInst this is associated with, so we send
            // the memory request down the right port
            PortID portIndex;

            // constructor used for packets involved in timing accesses
            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    class ScalarDTLBPort : public RequestPort
    {
      public:
        ScalarDTLBPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name, _cu), computeUnit(_cu), stalled(false)
        {
        }

        struct SenderState : public Packet::SenderState
        {
            SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst) { }
            GPUDynInstPtr _gpuDynInst;
        };

        bool recvTimingResp(PacketPtr pkt) override;
        void recvReqRetry() override { assert(false); }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        std::deque<PacketPtr> retries;

      private:
        ComputeUnit *computeUnit;
        bool stalled;
    };
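    // The TLB ports above and below share a simple stall/retry discipline
    // (a sketch of the intended use, not a verbatim excerpt from the .cc
    // file): when a send fails, the port stalls and queues the packet, and
    // recvReqRetry() later drains the queue in order, e.g.:
    //
    //     if (!tlbPort[i].sendTimingReq(pkt)) {
    //         tlbPort[i].stallPort();
    //         tlbPort[i].retries.push_back(pkt);
    //     }
    //     // in recvReqRetry(): while retries is non-empty and sends
    //     // succeed, pop_front(); then unstallPort().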
    class ITLBPort : public RequestPort
    {
      public:
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name, _cu), computeUnit(_cu), stalled(false) { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState : public Packet::SenderState
        {
            // The wavefront associated with this request
            Wavefront *wavefront;

            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    /**
     * the port intended to communicate between the CU and its LDS
     */
    class LDSPort : public RequestPort
    {
      public:
        LDSPort(const std::string &_name, ComputeUnit *_cu)
            : RequestPort(_name, _cu), computeUnit(_cu)
        {
        }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the requests that were
         * not successfully sent.
         */
        std::queue<PacketPtr> retries;

        /**
         * SenderState is information carried along with the packet, esp. the
         * GPUDynInstPtr
         */
        class SenderState : public Packet::SenderState
        {
          protected:
            // The actual read/write/atomic request that goes with this command
            GPUDynInstPtr _gpuDynInst = nullptr;

          public:
            SenderState(GPUDynInstPtr gpuDynInst) : _gpuDynInst(gpuDynInst)
            {
            }

            GPUDynInstPtr
            getMemInst() const
            {
                return _gpuDynInst;
            }
        };

        virtual bool sendTimingReq(PacketPtr pkt);

      protected:
        bool stalled = false; ///< whether or not it is stalled
        ComputeUnit *computeUnit;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    /** The port to access the Local Data Store.
     *  Can be connected to an LDS object.
     */
    LDSPort ldsPort;

    TokenManager *
    getTokenManager()
    {
        return memPortTokens;
    }

    /** The memory port for SIMD data accesses.
     *  Can be connected to PhysMem for Ruby for timing simulations
     */
    std::vector<DataPort> memPort;
    // port to the TLB hierarchy (i.e., the L1 TLB)
    std::vector<DTLBPort> tlbPort;
    // port to the scalar data cache
    ScalarDataPort scalarDataPort;
    // port to the scalar data TLB
    ScalarDTLBPort scalarDTLBPort;
    // port to the SQC (i.e., the I-cache)
    SQCPort sqcPort;
    // port to the SQC TLB (there's a separate TLB for each I-cache)
    ITLBPort sqcTLBPort;

    Port &
    getPort(const std::string &if_name, PortID idx) override
    {
        if (if_name == "memory_port" && idx < memPort.size()) {
            return memPort[idx];
        } else if (if_name == "translation_port" && idx < tlbPort.size()) {
            return tlbPort[idx];
        } else if (if_name == "scalar_port") {
            return scalarDataPort;
        } else if (if_name == "scalar_tlb_port") {
            return scalarDTLBPort;
        } else if (if_name == "sqc_port") {
            return sqcPort;
        } else if (if_name == "sqc_tlb_port") {
            return sqcTLBPort;
        } else if (if_name == "ldsPort") {
            return ldsPort;
        } else if (if_name == "gmTokenPort") {
            return gmTokenPort;
        } else {
            return ClockedObject::getPort(if_name, idx);
        }
    }

    InstSeqNum getAndIncSeqNum() { return globalSeqNum++; }

  private:
    const int _cacheLineSize;
    const int _numBarrierSlots;
    int cacheLineBits;
    InstSeqNum globalSeqNum;
    int wavefrontSize;
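    // cacheLineBits is derived from _cacheLineSize (assuming the usual
    // gem5 floorLog2 idiom from base/intmath.hh, e.g. floorLog2(64) == 6
    // for 64 B lines), so line-aligned addresses and line numbers can be
    // formed cheaply:
    //
    //     Addr line_addr = addr & ~(Addr(_cacheLineSize - 1));
    //     Addr line_num  = addr >> cacheLineBits;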
    /**
     * TODO: Update these comments once the pipe stage interface has
     * been fully refactored.
     *
     * Pipeline stage interfaces.
     *
     * Buffers used to communicate between various pipeline stages.
     * List of waves which will be dispatched to each execution resource.
     * An EXREADY implies the dispatch list is non-empty and the
     * execution unit has something to execute this cycle. Currently, the
     * dispatch list of an execution resource can hold only one wave
     * because an execution resource can execute only one wave in a cycle.
     * dispatchList is used to communicate between the schedule and exec
     * stages.
     *
     * At a high level, the following intra-/inter-stage communication occurs:
     * SCB to SCH: readyList provides a per exec resource list of waves that
     *             passed dependency and readiness checks. If selected by the
     *             scheduler, attempt to add the wave to schList conditional
     *             on RF support.
     * SCH: schList holds waves that are gathering operands or waiting
     *      for execution resource availability. Once ready, waves are
     *      placed on the dispatchList as candidates for execution. A wave
     *      may spend multiple cycles in the SCH stage, on the schList, due
     *      to RF access conflicts or execution resource contention.
     * SCH to EX: dispatchList holds waves that are ready to be executed.
     *            LM/FLAT arbitration may remove an LM wave and place it
     *            back on the schList. The RF model may also force a wave
     *            back to the schList if using the detailed model.
     */
    ScoreboardCheckToSchedule scoreboardCheckToSchedule;
    ScheduleToExecute scheduleToExecute;

    /**
     * The barrier slots for this CU.
     */
    std::vector<WFBarrier> wfBarrierSlots;
    /**
     * A set used to easily retrieve a free barrier ID.
     */
    std::unordered_set<int> freeBarrierIds;

    // hold the time of the arrival of the first cache block related to
    // a particular GPUDynInst. This is used to calculate the difference
    // between the first and last cache block arrival times.
    std::unordered_map<GPUDynInstPtr, Tick> headTailMap;

  public:
    void updateInstStats(GPUDynInstPtr gpuDynInst);
    int activeWaves;
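    // A sketch of how headTailMap is meant to feed the headTailLatency
    // distribution declared below (illustrative only; `is_last_block` is a
    // hypothetical predicate standing in for the real bookkeeping in the
    // .cc file): the first returning cache block for a dynamic instruction
    // records its arrival tick, and when the last block arrives the
    // difference is sampled and the entry erased, e.g.:
    //
    //     auto it = headTailMap.find(gpuDynInst);
    //     if (it == headTailMap.end()) {
    //         headTailMap.emplace(gpuDynInst, curTick()); // head block
    //     } else if (is_last_block) {
    //         stats.headTailLatency.sample(curTick() - it->second);
    //         headTailMap.erase(it);
    //     }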
    struct ComputeUnitStats : public statistics::Group
    {
        ComputeUnitStats(statistics::Group *parent, int n_wf);

        statistics::Scalar vALUInsts;
        statistics::Formula vALUInstsPerWF;
        statistics::Scalar sALUInsts;
        statistics::Formula sALUInstsPerWF;
        statistics::Scalar instCyclesVALU;
        statistics::Scalar instCyclesSALU;
        statistics::Scalar threadCyclesVALU;
        statistics::Formula vALUUtilization;
        statistics::Scalar ldsNoFlatInsts;
        statistics::Formula ldsNoFlatInstsPerWF;
        statistics::Scalar flatVMemInsts;
        statistics::Formula flatVMemInstsPerWF;
        statistics::Scalar flatLDSInsts;
        statistics::Formula flatLDSInstsPerWF;
        statistics::Scalar vectorMemWrites;
        statistics::Formula vectorMemWritesPerWF;
        statistics::Scalar vectorMemReads;
        statistics::Formula vectorMemReadsPerWF;
        statistics::Scalar scalarMemWrites;
        statistics::Formula scalarMemWritesPerWF;
        statistics::Scalar scalarMemReads;
        statistics::Formula scalarMemReadsPerWF;

        statistics::Formula vectorMemReadsPerKiloInst;
        statistics::Formula vectorMemWritesPerKiloInst;
        statistics::Formula vectorMemInstsPerKiloInst;
        statistics::Formula scalarMemReadsPerKiloInst;
        statistics::Formula scalarMemWritesPerKiloInst;
        statistics::Formula scalarMemInstsPerKiloInst;

        // Cycles required to send register source (addr and data) from
        // register files to memory pipeline, per SIMD.
        statistics::Vector instCyclesVMemPerSimd;
        statistics::Vector instCyclesScMemPerSimd;
        statistics::Vector instCyclesLdsPerSimd;

        statistics::Scalar globalReads;
        statistics::Scalar globalWrites;
        statistics::Formula globalMemInsts;
        statistics::Scalar argReads;
        statistics::Scalar argWrites;
        statistics::Formula argMemInsts;
        statistics::Scalar spillReads;
        statistics::Scalar spillWrites;
        statistics::Formula spillMemInsts;
        statistics::Scalar groupReads;
        statistics::Scalar groupWrites;
        statistics::Formula groupMemInsts;
        statistics::Scalar privReads;
        statistics::Scalar privWrites;
        statistics::Formula privMemInsts;
        statistics::Scalar readonlyReads;
        statistics::Scalar readonlyWrites;
        statistics::Formula readonlyMemInsts;
        statistics::Scalar kernargReads;
        statistics::Scalar kernargWrites;
        statistics::Formula kernargMemInsts;

        statistics::Distribution waveLevelParallelism;

        // the following stats compute the avg. TLB access latency per
        // uncoalesced request (only for data)
        statistics::Scalar tlbRequests;
        statistics::Scalar tlbCycles;
        statistics::Formula tlbLatency;
        // hitsPerTLBLevel[x] are the hits in Level x TLB.
        // x = 0 is the page table.
        statistics::Vector hitsPerTLBLevel;

        statistics::Scalar ldsBankAccesses;
        statistics::Distribution ldsBankConflictDist;

        // over all memory instructions executed over all wavefronts
        // how many touched 0-4 pages, 4-8, ..., 60-64 pages
        statistics::Distribution pageDivergenceDist;
        // count of non-flat global memory vector instructions executed
        statistics::Scalar dynamicGMemInstrCnt;
        // count of flat global memory vector instructions executed
        statistics::Scalar dynamicFlatMemInstrCnt;
        statistics::Scalar dynamicLMemInstrCnt;

        statistics::Scalar wgBlockedDueBarrierAllocation;
        statistics::Scalar wgBlockedDueLdsAllocation;
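        // Worked example for the two instruction counters that follow: a
        // vector add that commits with 48 of 64 lanes active increments
        // numInstrExecuted by 1 and numVecOpsExecuted by 48.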
        // Number of instructions executed, i.e., if 64 (or 32 or 7) lanes
        // are active when the instruction is committed, this number is
        // still incremented by 1
        statistics::Scalar numInstrExecuted;
        // Number of cycles among successive instruction executions across
        // all wavefronts of the same CU
        statistics::Distribution execRateDist;
        // number of individual vector operations executed
        statistics::Scalar numVecOpsExecuted;
        // number of individual f16 vector operations executed
        statistics::Scalar numVecOpsExecutedF16;
        // number of individual f32 vector operations executed
        statistics::Scalar numVecOpsExecutedF32;
        // number of individual f64 vector operations executed
        statistics::Scalar numVecOpsExecutedF64;
        // number of individual FMA 16,32,64 vector operations executed
        statistics::Scalar numVecOpsExecutedFMA16;
        statistics::Scalar numVecOpsExecutedFMA32;
        statistics::Scalar numVecOpsExecutedFMA64;
        // number of individual MAC 16,32,64 vector operations executed
        statistics::Scalar numVecOpsExecutedMAC16;
        statistics::Scalar numVecOpsExecutedMAC32;
        statistics::Scalar numVecOpsExecutedMAC64;
        // number of individual MAD 16,32,64 vector operations executed
        statistics::Scalar numVecOpsExecutedMAD16;
        statistics::Scalar numVecOpsExecutedMAD32;
        statistics::Scalar numVecOpsExecutedMAD64;
        // total number of two op FP vector operations executed
        statistics::Scalar numVecOpsExecutedTwoOpFP;
        // Total cycles that something is running on the GPU
        statistics::Scalar totalCycles;
        statistics::Formula vpc; // vector ops per cycle
        statistics::Formula vpc_f16; // f16 vector ops per cycle
        statistics::Formula vpc_f32; // f32 vector ops per cycle
        statistics::Formula vpc_f64; // f64 vector ops per cycle
        statistics::Formula ipc; // vector instructions per cycle
        statistics::Distribution controlFlowDivergenceDist;
        statistics::Distribution activeLanesPerGMemInstrDist;
        statistics::Distribution activeLanesPerLMemInstrDist;
        // number of vector ALU instructions received
        statistics::Formula numALUInstsExecuted;
        // number of times a WG cannot start due to lack of free VGPRs in
        // SIMDs
        statistics::Scalar numTimesWgBlockedDueVgprAlloc;
        // number of times a WG cannot start due to lack of free SGPRs in
        // SIMDs
        statistics::Scalar numTimesWgBlockedDueSgprAlloc;
        statistics::Scalar numCASOps;
        statistics::Scalar numFailedCASOps;
        statistics::Scalar completedWfs;
        statistics::Scalar completedWGs;

        // distribution in latency difference between first and last cache
        // block arrival ticks
        statistics::Distribution headTailLatency;

        // Track the amount of interleaving between wavefronts on each SIMD.
        // This stat is sampled using instExecPerSimd to compute the number
        // of instructions that have been executed on a SIMD between a WF
        // executing two successive instructions.
        statistics::VectorDistribution instInterleave;
    } stats;
};

} // namespace gem5

#endif // __COMPUTE_UNIT_HH__