/* * Copyright (c) 2011-2017 Advanced Micro Devices, Inc. * All rights reserved. * * For use for simulation and test purposes only * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef __GPU_COMPUTE_WAVEFRONT_HH__ #define __GPU_COMPUTE_WAVEFRONT_HH__ #include #include #include #include #include #include #include "arch/gpu_isa.hh" #include "base/logging.hh" #include "base/types.hh" #include "config/the_gpu_isa.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/dispatcher.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/hsa_queue_entry.hh" #include "gpu-compute/lds_state.hh" #include "gpu-compute/misc.hh" #include "params/Wavefront.hh" #include "sim/sim_object.hh" class Wavefront : public SimObject { public: enum status_e { // wavefront is stalled S_STOPPED, // wavefront is returning from a kernel S_RETURNING, // wavefront is running normally S_RUNNING, // wavefront is stalled S_STALLED, /** * wavefront has unsatisfied wait counts * * while in this state the WF will only execute if * the oldest instruction is the waitcnt. while in * S_WAITCNT, the wavefront will not be ready until * all of its waitcnts have been satisfied. the * scoreboard ready() function will check the status * of the waitcnts whenever the WF is in S_WAITCNT, * and once they are satisfied, it will resume normal * operation. */ S_WAITCNT }; uint32_t oldBarrierCnt; uint32_t barrierCnt; uint32_t barrierId; uint32_t barrierSlots; // HW slot id where the WF is mapped to inside a SIMD unit const int wfSlotId; int kernId; // SIMD unit where the WV has been scheduled const int simdId; // id of the execution unit (or pipeline) where the oldest instruction // of the WF is scheduled int execUnitId; int flatLmUnitId; int flatGmUnitId; // pointer to parent CU ComputeUnit *computeUnit; int maxIbSize; std::deque instructionBuffer; bool pendingFetch; bool dropFetch; // last tick during which all WFs in the CU are not idle Tick lastNonIdleTick; // Execution unit resource ID's associated with this WF // These are static mappings set at WF slot construction and // based off of the simdId and wfSlotId. // Index to scalarALUs resource vector in CU int scalarAlu; // Indices into readyList/dispatchList of resources used by this // wavefront int scalarAluGlobalIdx; int globalMem; int localMem; int scalarMem; // number of VGPRs required by WF uint32_t maxVgprs; // number of SGPRs required by WF uint32_t maxSgprs; void freeResources(); GPUDynInstPtr nextInstr(); void setStatus(status_e newStatus); status_e getStatus() { return status; } void resizeRegFiles(int num_vregs, int num_sregs); bool isGmInstruction(GPUDynInstPtr ii); bool isLmInstruction(GPUDynInstPtr ii); bool isOldestInstWaitcnt(); bool isOldestInstGMem(); bool isOldestInstLMem(); bool isOldestInstPrivMem(); bool isOldestInstFlatMem(); bool isOldestInstVectorALU(); bool isOldestInstScalarALU(); bool isOldestInstScalarMem(); bool isOldestInstBarrier(); // used for passing spill address to DDInstGPU std::vector lastAddr; std::vector workItemId[3]; std::vector workItemFlatId; /* kernel launch parameters */ uint32_t workGroupId[3]; uint32_t workGroupSz[3]; uint32_t gridSz[3]; uint32_t wgId; uint32_t wgSz; /* the actual WG size can differ than the maximum size */ uint32_t actualWgSz[3]; uint32_t actualWgSzTotal; void computeActualWgSz(HSAQueueEntry *task); // wavefront id within a workgroup uint32_t wfId; uint32_t maxDynWaveId; uint32_t dispatchId; // vector and scalar memory requests pending in memory system int outstandingReqs; // outstanding global memory write requests int outstandingReqsWrGm; // outstanding local memory write requests int outstandingReqsWrLm; // outstanding global memory read requests int outstandingReqsRdGm; // outstanding local memory read requests int outstandingReqsRdLm; // outstanding scalar memory read requests int scalarOutstandingReqsRdGm; // outstanding scalar memory write requests int scalarOutstandingReqsWrGm; int rdLmReqsInPipe; int rdGmReqsInPipe; int wrLmReqsInPipe; int wrGmReqsInPipe; int scalarRdGmReqsInPipe; int scalarWrGmReqsInPipe; int memTraceBusy; uint64_t lastTrace; // number of virtual vector registers reserved by WF int reservedVectorRegs; // number of virtual scalar registers reserved by WF int reservedScalarRegs; // Index into the Vector Register File's namespace where the WF's registers // will live while the WF is executed uint32_t startVgprIndex; // Index into the Scalar Register File's namespace where the WF's registers // will live while the WF is executed uint32_t startSgprIndex; // Old value of destination gpr (for trace) std::vector oldVgpr; // Id of destination gpr (for trace) uint32_t oldVgprId; // Tick count of last old_vgpr copy uint64_t oldVgprTcnt; // Old value of destination gpr (for trace) std::vector oldDgpr; // Id of destination gpr (for trace) uint32_t oldDgprId; // Tick count of last old_vgpr copy uint64_t oldDgprTcnt; // Execution mask at wavefront start VectorMask initMask; // number of barriers this WF has joined std::vector barCnt; int maxBarCnt; // Flag to stall a wave on barrier bool stalledAtBarrier; // a pointer to the fraction of the LDS allocated // to this workgroup (thus this wavefront) LdsChunk *ldsChunk; // unique WF id over all WFs executed across all CUs uint64_t wfDynId; // Wavefront slot stats // Number of instructions executed by this wavefront slot across all // dynamic wavefronts Stats::Scalar numInstrExecuted; // Number of cycles this WF spends in SCH stage Stats::Scalar schCycles; // Number of stall cycles encounterd by this WF in SCH stage Stats::Scalar schStalls; // The following stats sum to the value of schStalls, and record, per // WF slot, what the cause of each stall was at a coarse granularity. // Cycles WF is selected by scheduler, but RFs cannot support instruction Stats::Scalar schRfAccessStalls; // Cycles spent waiting for execution resources Stats::Scalar schResourceStalls; // cycles spent waiting for RF reads to complete in SCH stage Stats::Scalar schOpdNrdyStalls; // LDS arbitration stall cycles. WF attempts to execute LM instruction, // but another wave is executing FLAT, which requires LM and GM and forces // this WF to stall. Stats::Scalar schLdsArbStalls; // number of times an instruction of a WF is blocked from being issued // due to WAR and WAW dependencies Stats::Scalar numTimesBlockedDueWAXDependencies; // number of times an instruction of a WF is blocked from being issued // due to WAR and WAW dependencies Stats::Scalar numTimesBlockedDueRAWDependencies; // dyn inst id (per SIMD) of last instruction exec from this wave uint64_t lastInstExec; // Distribution to track the distance between producer and consumer // for vector register values Stats::Distribution vecRawDistance; // Map to track the dyn instruction id of each vector register value // produced, indexed by physical vector register ID std::unordered_map rawDist; // Distribution to track the number of times every vector register // value produced is consumed. Stats::Distribution readsPerWrite; // Counts the number of reads performed to each physical register // - counts are reset to 0 for each dynamic wavefront launched std::vector vecReads; void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems); // context for save/restore uint8_t *context; typedef WavefrontParams Params; Wavefront(const Params *p); ~Wavefront(); virtual void init(); void setParent(ComputeUnit *cu) { computeUnit = cu; } void validateRequestCounters(); void start(uint64_t _wfDynId, uint64_t _base_ptr); void exec(); // called by SCH stage to reserve std::vector reserveResources(); bool stopFetch(); void regStats(); bool waitingAtBarrier(int lane); Addr pc() const; void pc(Addr new_pc); VectorMask& execMask(); bool execMask(int lane) const; void discardFetch(); bool waitCntsSatisfied(); void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt); void clearWaitCnts(); /** Freeing VRF space */ void freeRegisterFile(); TheGpuISA::GPUISA& gpuISA() { return _gpuISA; } private: TheGpuISA::GPUISA _gpuISA; void reserveGmResource(GPUDynInstPtr ii); void reserveLmResource(GPUDynInstPtr ii); /** * the following are used for waitcnt instructions * vmWaitCnt: once set, we wait for the oustanding * number of vector mem instructions to be * at, or below vmWaitCnt. * * expWaitCnt: once set, we wait for the outstanding * number outstanding VM writes or EXP * insts to be at, or below expWaitCnt. * * lgkmWaitCnt: once set, we wait for the oustanding * number of LDS, GDS, scalar memory, * and message instructions to be at, or * below lgkmCount. we currently do not * support GDS/message ops. */ int vmWaitCnt; int expWaitCnt; int lgkmWaitCnt; status_e status; Addr _pc; VectorMask _execMask; }; #endif // __GPU_COMPUTE_WAVEFRONT_HH__