/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "base/amo.hh"
#include "base/logging.hh"
#include "base/trace.hh"
#include "debug/GPUMem.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"
#include "gpu-compute/operand_info.hh"

class GPUStaticInst;

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->stats.numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->stats.numFailedCASOps++;
        }
    }
    AtomicOpFunctor* clone() { return new AtomicOpCAS(c, s, computeUnit); }
};

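// Illustrative sketch (not part of the interface): the functor compares
// the current memory value against 'c' and swaps in 's' on a match; a
// mismatch only bumps the failed-CAS counter. Assuming a valid
// ComputeUnit pointer 'cu':
//     uint32_t mem = 5;
//     AtomicOpCAS<uint32_t> cas(5, 7, cu);
//     cas.execute(&mem);   // mem == 7; on a mismatch mem is untouched
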
class RegisterOperandInfo
{
  public:
    RegisterOperandInfo() = delete;
    RegisterOperandInfo(int op_idx, int num_dwords,
                        const std::vector<int> &virt_indices,
                        const std::vector<int> &phys_indices)
        : opIdx(op_idx), numDWORDs(num_dwords), virtIndices(virt_indices),
          physIndices(phys_indices)
    {
    }

    /**
     * The number of registers required to store this operand.
     */
    int numRegisters() const { return numDWORDs / TheGpuISA::RegSizeDWords; }
    int operandIdx() const { return opIdx; }
    /**
     * We typically only need the first virtual register for the operand,
     * regardless of its size.
     */
    int virtIdx(int reg_num=0) const { return virtIndices.at(reg_num); }

  private:
    /**
     * Index of this operand within its parent instruction's operand list.
     */
    const int opIdx;
    /**
     * Size of this operand in DWORDs.
     */
    const int numDWORDs;
    const std::vector<int> virtIndices;
    const std::vector<int> physIndices;
};

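// Example (illustrative, values assumed): an operand occupying 4 DWORDs
// on an ISA where TheGpuISA::RegSizeDWords == 1 reports
// numRegisters() == 4, and virtIdx() returns the first of its virtual
// register indices.
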
class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);

    const std::vector<OperandInfo>& srcVecRegOperands() const;
    const std::vector<OperandInfo>& dstVecRegOperands() const;
    const std::vector<OperandInfo>& srcScalarRegOperands() const;
    const std::vector<OperandInfo>& dstScalarRegOperands() const;

    int numSrcRegOperands();
    int numDstRegOperands();

    int numSrcVecRegOperands() const;
    int numDstVecRegOperands() const;
    int maxSrcVecRegOperandSize();
    int numSrcVecDWords();
    int numDstVecDWords();

    int numSrcScalarRegOperands() const;
    int numDstScalarRegOperands() const;
    int maxSrcScalarRegOperandSize();
    int numSrcScalarDWords();
    int numDstScalarDWords();

    int maxOperandSize();

    int getNumOperands() const;

    bool hasSourceSgpr() const;
    bool hasDestinationSgpr() const;
    bool hasSourceVgpr() const;
    bool hasDestinationVgpr() const;

    // returns true if the string "opcodeStr" is found in the
    // opcode of the instruction
    bool isOpcode(const std::string& opcodeStr) const;
    bool isOpcode(const std::string& opcodeStr,
                  const std::string& extStr) const;

    const std::string &disassemble() const;

    InstSeqNum seqNum() const;

    enums::StorageClassType executedAs();

    // virtual address for scalar memory operations
    Addr scalarAddr;
    // virtual addresses for vector memory operations
    std::vector<Addr> addr;
    Addr pAddr;

    // vector data to be written
    uint8_t *d_data;
    // scalar data to be transferred
    uint8_t *scalar_data;
    // additional data (for atomics)
    uint8_t *a_data;
    // additional data (for atomics)
    uint8_t *x_data;
    // the execution mask
    VectorMask exec_mask;

    // SIMD unit to which the WF of the memory instruction is mapped
    int simdId;
    // unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // the kernel id of the requesting WF
    int kern_id;
    // the CU id of the requesting WF
    int cu_id;
    // the workgroup id of the requesting WF
    int wg_id;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    // execution pipeline id on which the memory instruction was scheduled
    int execUnitId;
    // the execution time of this operation
    Tick time;
    // the latency of this operation
    WaitClass latency;

    // Initiate the specified memory operation by creating a memory
    // request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation: for a load, or an atomic
    // with return, write the value back to the RF; for a store, do
    // nothing.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    TheGpuISA::ScalarRegU32 srcLiteral() const;

    bool isALU() const;
    bool isBranch() const;
    bool isCondBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isEndOfKernel() const;
    bool isKernelLaunch() const;
    bool isSDWAInst() const;
    bool isDPPInst() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;
    bool isSleep() const;

    bool isBarrier() const;
    bool isMemSync() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool isVector() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;
    bool readsExec() const;
    bool writesExec() const;
    bool readsMode() const;
    bool writesMode() const;
    bool ignoreExec() const;
    bool readsFlatScratch() const;
    bool writesFlatScratch() const;
    bool readsExecMask() const;
    bool writesExecMask() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    bool isF16() const;
    bool isF32() const;
    bool isF64() const;

    bool isFMA() const;
    bool isMAC() const;
    bool isMAD() const;

    // For FLAT memory ops, check the segment address against the APE
    // registers to see if it falls within one of the APE ranges for
    // LDS/SCRATCH/GPUVM. If it does not fall into one of the three APEs,
    // it is a regular global access.
    void doApertureCheck(const VectorMask &mask);
    // resolve a flat access during the execution stage
    void resolveFlatSegment(const VectorMask &mask);

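    // Illustrative sketch of the aperture check (the APE bound names
    // below are assumptions for illustration, not the actual ISA
    // interface):
    //     if (vaddr in LDS APE)          -> group segment
    //     else if (vaddr in SCRATCH APE) -> private segment
    //     else                           -> global segment
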
    template<typename c0> AtomicOpFunctorPtr
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return std::make_unique<AtomicOpAnd<c0>>(*reg0);
        } else if (isAtomicOr()) {
            return std::make_unique<AtomicOpOr<c0>>(*reg0);
        } else if (isAtomicXor()) {
            return std::make_unique<AtomicOpXor<c0>>(*reg0);
        } else if (isAtomicCAS()) {
            return std::make_unique<AtomicOpCAS<c0>>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return std::make_unique<AtomicOpExch<c0>>(*reg0);
        } else if (isAtomicAdd()) {
            return std::make_unique<AtomicOpAdd<c0>>(*reg0);
        } else if (isAtomicSub()) {
            return std::make_unique<AtomicOpSub<c0>>(*reg0);
        } else if (isAtomicInc()) {
            return std::make_unique<AtomicOpInc<c0>>();
        } else if (isAtomicDec()) {
            return std::make_unique<AtomicOpDec<c0>>();
        } else if (isAtomicMax()) {
            return std::make_unique<AtomicOpMax<c0>>(*reg0);
        } else if (isAtomicMin()) {
            return std::make_unique<AtomicOpMin<c0>>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }

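    // Usage sketch (illustrative): 'inst' is an assumed GPUDynInstPtr
    // for an atomic add; the returned functor captures the register
    // operand and is later applied to memory by the memory system.
    //     TheGpuISA::ScalarRegU32 src = 1;
    //     AtomicOpFunctorPtr amo = inst->makeAtomicOpFunctor(&src, &src);
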
    void
    setRequestFlags(RequestPtr req) const
    {
        if (isGloballyCoherent()) {
            req->setCacheCoherenceFlags(Request::GLC_BIT);
        }

        if (isSystemCoherent()) {
            req->setCacheCoherenceFlags(Request::SLC_BIT);
        }

        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }

        if (isMemSync()) {
            // the path for kernel launch and kernel end is different
            // from non-kernel mem sync
            assert(!isKernelLaunch());
            assert(!isEndOfKernel());

            // must be a wbinv inst if not kernel launch/end
            req->setCacheCoherenceFlags(Request::INV_L1);
        }
    }

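    // Usage sketch (illustrative; assumes 'req' is a RequestPtr already
    // built for this access): a globally-coherent atomic-with-return op
    // ends up with GLC_BIT and ATOMIC_RETURN_OP set on the request.
    //     inst->setRequestFlags(req);
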
    // reset the number of pending memory requests for all lanes
    void
    resetEntireStatusVector()
    {
        assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg);
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            resetStatusVector(lane);
        }
    }

    // reset the number of pending memory requests for the given lane
    void
    resetStatusVector(int lane)
    {
        setStatusVector(lane, 0);
    }

    // set the number of pending memory requests for the given lane
    void
    setStatusVector(int lane, int newVal)
    {
        // currently we can have up to 2 memory requests per lane (if the
        // lane's request crosses multiple cache lines)
        assert((newVal >= 0) && (newVal <= 2));
        statusVector[lane] = newVal;
    }

    // decrement the number of pending memory requests for the given lane
    void
    decrementStatusVector(int lane)
    {
        // this lane may have multiple requests, so only subtract one for
        // this request
        assert(statusVector[lane] >= 1);
        statusVector[lane]--;
    }

    // return the current number of pending memory requests for the given
    // lane
    int
    getLaneStatus(int lane) const
    {
        return statusVector[lane];
    }

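    // Lifecycle sketch (illustrative): if a lane's access straddles two
    // cache lines the coalescer sets its count to 2, each returning
    // packet decrements it, and the instruction completes once
    // allLanesZero() holds:
    //     setStatusVector(lane, 2);
    //     // ...then, per response: decrementStatusVector(lane);
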
    // returns true if all memory requests from all lanes have been
    // received, else returns false
    bool
    allLanesZero() const
    {
        bool allZero = true;

        // iterate over all lanes, checking the number of pending memory
        // requests they have
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            // if any lane still has pending requests, the instruction is
            // not yet complete
            if (statusVector[lane] > 0) {
                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending "
                        "request(s) for %#x\n", cu_id, simdId, wfSlotId, lane,
                        statusVector[lane], addr[lane]);
                allZero = false;
            }
        }

        if (allZero) {
            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending"
                    " requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]);
        }
        return allZero;
    }

    // returns a string representing the current state of the statusVector
    std::string
    printStatusVector() const
    {
        std::string statusVec_str = "[";

        // iterate over all lanes, appending the current number of pending
        // requests for each lane to the string
        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
            statusVec_str += std::to_string(statusVector[lane]);
        }
        statusVec_str += "]";

        return statusVec_str;
    }

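    // Example (illustrative): with a 64-lane wavefront the string holds
    // one digit per lane, e.g. "[0200...0]" means lane 1 still has two
    // pending requests while every other lane is done.
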
    // Maps the addresses satisfied by returned packets to the lanes
    // that requested them
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, with an int per lane
    // to allow unaligned accesses
    std::vector<int> statusVector;
    // for ld_v# or st_v#
    std::vector<int> tlbHitLevel;

    // for misaligned scalar ops we track the number
    // of outstanding reqs here
    int numScalarReqs;

    Tick getAccessTime() const { return accessTime; }

    void setAccessTime(Tick currentTime) { accessTime = currentTime; }

    void profileRoundTripTime(Tick currentTime, int hopId);
    std::vector<Tick> getRoundTripTime() const { return roundTripTime; }

    void profileLineAddressTime(Addr addr, Tick currentTime, int hopId);
    const std::map<Addr, std::vector<Tick>>& getLineAddressTime() const
    { return lineAddressTime; }

    // inst used to save/restore a wavefront context
    bool isSaveRestore;
  private:
    GPUStaticInst *_staticInst;
    const InstSeqNum _seqNum;
    int maxSrcVecRegOpSize;
    int maxSrcScalarRegOpSize;

    // the time the request was started
    Tick accessTime = -1;

    // hold the tick when the instruction arrives at certain hop points
    // on its way to main memory
    std::vector<Tick> roundTripTime;

    // hold each cache block address for the instruction and a vector
    // to hold the tick when the block arrives at certain hop points
    std::map<Addr, std::vector<Tick>> lineAddressTime;
};

#endif // __GPU_DYN_INST_HH__