/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <memory>
#include <string>

#include "base/amo.hh"
#include "base/logging.hh"
#include "base/trace.hh"
#include "debug/GPUMem.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"
#include "gpu-compute/operand_info.hh"

class GPUStaticInst;

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->stats.numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->stats.numFailedCASOps++;
        }
    }

    AtomicOpFunctor* clone() { return new AtomicOpCAS(c, s, computeUnit); }
};

class RegisterOperandInfo
{
  public:
    RegisterOperandInfo() = delete;
    RegisterOperandInfo(int op_idx, int num_dwords,
                        const std::vector<int> &virt_indices,
                        const std::vector<int> &phys_indices)
        : opIdx(op_idx), numDWORDs(num_dwords), virtIndices(virt_indices),
          physIndices(phys_indices)
    {
    }

    /**
     * The number of registers required to store this operand.
     */
    int numRegisters() const { return numDWORDs / TheGpuISA::RegSizeDWords; }
    int operandIdx() const { return opIdx; }
    /**
     * We typically only need the first virtual register for the operand
     * regardless of its size.
     */
    int virtIdx(int reg_num=0) const { return virtIndices.at(reg_num); }

  private:
    /**
     * Index of this operand within the set of its parent instruction's
     * operand list.
     */
    const int opIdx;
    /**
     * Size of this operand in DWORDs.
     */
    const int numDWORDs;
    const std::vector<int> virtIndices;
    const std::vector<int> physIndices;
};

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);

    const std::vector<OperandInfo>& srcVecRegOperands() const;
    const std::vector<OperandInfo>& dstVecRegOperands() const;
    const std::vector<OperandInfo>& srcScalarRegOperands() const;
    const std::vector<OperandInfo>& dstScalarRegOperands() const;

    int numSrcRegOperands();
    int numDstRegOperands();

    int numSrcVecRegOperands() const;
    int numDstVecRegOperands() const;
    int maxSrcVecRegOperandSize();
    int numSrcVecDWords();
    int numDstVecDWords();

    int numSrcScalarRegOperands() const;
    int numDstScalarRegOperands() const;
    int maxSrcScalarRegOperandSize();
    int numSrcScalarDWords();
    int numDstScalarDWords();

    int maxOperandSize();

    int getNumOperands() const;

    bool hasSourceSgpr() const;
    bool hasDestinationSgpr() const;
    bool hasSourceVgpr() const;
    bool hasDestinationVgpr() const;

    // returns true if the string "opcodeStr" is found in the
    // opcode of the instruction
    bool isOpcode(const std::string& opcodeStr) const;
    bool isOpcode(const std::string& opcodeStr,
                  const std::string& extStr) const;

    const std::string &disassemble() const;

    InstSeqNum seqNum() const;

    enums::StorageClassType executedAs();

    // virtual address for scalar memory operations
    Addr scalarAddr;
    // virtual addresses for vector memory operations
    std::vector<Addr> addr;
    Addr pAddr;

    // vector data to get written
    uint8_t *d_data;
    // scalar data to be transferred
    uint8_t *scalar_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // SIMD unit to which the WF of the memory instruction is mapped
    int simdId;
    // unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting wf
    int kern_id;
    // The CU id of the requesting wf
    int cu_id;
    // The workgroup id of the requesting wf
    int wg_id;
    // HW slot id where the WF is mapped to inside a SIMD unit
    int wfSlotId;
    // execution pipeline id where the memory instruction has been scheduled
    int execUnitId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;

    // Initiate the specified memory operation by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);

    // Complete the specified memory operation by writing the value back
    // to the RF in the case of a load or atomic with return; in the case
    // of a store, do nothing.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    TheGpuISA::ScalarRegU32 srcLiteral() const;

    bool isALU() const;
    bool isBranch() const;
    bool isCondBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isEndOfKernel() const;
    bool isKernelLaunch() const;
    bool isSDWAInst() const;
    bool isDPPInst() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;
    bool isSleep() const;

    bool isBarrier() const;
    bool isMemSync() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool isVector() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;
    bool readsExec() const;
    bool writesExec() const;
    bool readsMode() const;
    bool writesMode() const;
    bool ignoreExec() const;
    bool readsFlatScratch() const;
    bool writesFlatScratch() const;
    bool readsExecMask() const;
    bool writesExecMask() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    bool isF16() const;
    bool isF32() const;
    bool isF64() const;

    bool isFMA() const;
    bool isMAC() const;
    bool isMAD() const;

    // For FLAT memory ops, check the segment address against the APE
    // registers to see if it falls within one of the APE ranges for
    // LDS/SCRATCH/GPUVM. If it does not fall into one of the three APEs,
    // it will be a regular global access.
    void doApertureCheck(const VectorMask &mask);
    // Function to resolve flat accesses during the execution stage.
    void resolveFlatSegment(const VectorMask &mask);
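    // Illustrative usage sketch (comment only, not part of this interface):
    // an ISA-level atomic instruction would typically build its functor via
    // makeAtomicOpFunctor() below and attach it to the outgoing memory
    // request. The element type uint32_t and the operand buffers reg0/reg1
    // are hypothetical stand-ins for the instruction's real operands:
    //
    //     uint32_t reg0 = ...;  // atomic source operand
    //     uint32_t reg1 = ...;  // compare value (only meaningful for CAS)
    //     AtomicOpFunctorPtr amo_op =
    //         gpuDynInst->makeAtomicOpFunctor<uint32_t>(&reg0, &reg1);
    //     // amo_op is then handed to the memory Request for this lane
    //     // (exact plumbing omitted).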
    template<typename c0>
    AtomicOpFunctorPtr
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return std::make_unique<AtomicOpAnd<c0>>(*reg0);
        } else if (isAtomicOr()) {
            return std::make_unique<AtomicOpOr<c0>>(*reg0);
        } else if (isAtomicXor()) {
            return std::make_unique<AtomicOpXor<c0>>(*reg0);
        } else if (isAtomicCAS()) {
            return std::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return std::make_unique<AtomicOpExch<c0>>(*reg0);
        } else if (isAtomicAdd()) {
            return std::make_unique<AtomicOpAdd<c0>>(*reg0);
        } else if (isAtomicSub()) {
            return std::make_unique<AtomicOpSub<c0>>(*reg0);
        } else if (isAtomicInc()) {
            return std::make_unique<AtomicOpInc<c0>>();
        } else if (isAtomicDec()) {
            return std::make_unique<AtomicOpDec<c0>>();
        } else if (isAtomicMax()) {
            return std::make_unique<AtomicOpMax<c0>>(*reg0);
        } else if (isAtomicMin()) {
            return std::make_unique<AtomicOpMin<c0>>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }

    void
    setRequestFlags(RequestPtr req) const
    {
        if (isGloballyCoherent()) {
            req->setCacheCoherenceFlags(Request::GLC_BIT);
        }

        if (isSystemCoherent()) {
            req->setCacheCoherenceFlags(Request::SLC_BIT);
        }

        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }

        if (isMemSync()) {
            // the path for kernel launch and kernel end is different
            // from non-kernel mem sync.
            assert(!isKernelLaunch());
            assert(!isEndOfKernel());

            // must be wbinv inst if not kernel launch/end
            req->setCacheCoherenceFlags(Request::INV_L1);
        }
    }

    // reset the number of pending memory requests for all lanes
    void
    resetEntireStatusVector()
    {
        assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg);
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            resetStatusVector(lane);
        }
    }

    // reset the number of pending memory requests for the given lane
    void
    resetStatusVector(int lane)
    {
        setStatusVector(lane, 0);
    }

    // set the number of pending memory requests for the given lane
    void
    setStatusVector(int lane, int newVal)
    {
        // currently we can have up to 2 memory requests per lane (if the
        // lane's request goes across multiple cache lines)
        assert((newVal >= 0) && (newVal <= 2));
        statusVector[lane] = newVal;
    }

    // decrement the number of pending memory requests for the given lane
    // by 1
    void
    decrementStatusVector(int lane)
    {
        // this lane may have multiple requests, so only subtract one for
        // this request
        assert(statusVector[lane] >= 1);
        statusVector[lane]--;
    }

    // return the current number of pending memory requests for the given
    // lane
    int
    getLaneStatus(int lane) const
    {
        return statusVector[lane];
    }

    // returns true if all memory requests from all lanes have been received,
    // else returns false
    bool
    allLanesZero() const
    {
        bool allZero = true;

        // iterate over all lanes, checking the number of pending memory
        // requests they have
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            // if any lane still has pending requests, return false
            if (statusVector[lane] > 0) {
                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending "
                        "request(s) for %#x\n", cu_id, simdId, wfSlotId, lane,
                        statusVector[lane], addr[lane]);
                allZero = false;
            }
        }

        if (allZero) {
            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending"
                    " requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]);
        }
        return allZero;
    }
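    // Illustrative lifecycle sketch (comment only) for the per-lane status
    // tracking above; the issue/response call sites shown are hypothetical:
    //
    //     gpuDynInst->resetEntireStatusVector();   // before issuing the op
    //     gpuDynInst->setStatusVector(lane, 1);    // one request sent
    //     gpuDynInst->setStatusVector(lane, 2);    // request split across
    //                                              // two cache lines
    //     gpuDynInst->decrementStatusVector(lane); // on each response
    //     if (gpuDynInst->allLanesZero()) {
    //         // all responses received; the access can be completed
    //     }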
    // returns a string representing the current state of the statusVector
    std::string
    printStatusVector() const
    {
        std::string statusVec_str = "[";

        // iterate over all lanes, adding the current number of pending
        // requests for this lane to the string
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            statusVec_str += std::to_string(statusVector[lane]);
        }
        statusVec_str += "]";

        return statusVec_str;
    }

    // Map returned packets and the addresses they satisfy with which lane
    // they were requested from
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, an int per lane to
    // allow unaligned accesses
    std::vector<int> statusVector;
    // for ld_v# or st_v#
    std::vector<int> tlbHitLevel;

    // for misaligned scalar ops we track the number
    // of outstanding reqs here
    int numScalarReqs;

    Tick getAccessTime() const { return accessTime; }

    void setAccessTime(Tick currentTime) { accessTime = currentTime; }

    void profileRoundTripTime(Tick currentTime, int hopId);
    std::vector<Tick> getRoundTripTime() const { return roundTripTime; }

    void profileLineAddressTime(Addr addr, Tick currentTime, int hopId);
    const std::map<Addr, std::vector<Tick>>&
    getLineAddressTime() const { return lineAddressTime; }

    // inst used to save/restore a wavefront context
    bool isSaveRestore;

  private:
    GPUStaticInst *_staticInst;
    const InstSeqNum _seqNum;
    int maxSrcVecRegOpSize;
    int maxSrcScalarRegOpSize;

    // the time the request was started
    Tick accessTime = -1;

    // hold the tick when the instruction arrives at certain hop points
    // on its way to main memory
    std::vector<Tick> roundTripTime;

    // hold each cache block address for the instruction and a vector
    // to hold the tick when the block arrives at certain hop points
    std::map<Addr, std::vector<Tick>> lineAddressTime;
};

#endif // __GPU_DYN_INST_HH__