gem5/src/gpu-compute/wavefront.hh

/*
 * Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __GPU_COMPUTE_WAVEFRONT_HH__
#define __GPU_COMPUTE_WAVEFRONT_HH__

#include <cassert>
#include <deque>
#include <list>
#include <memory>
#include <unordered_map>
#include <vector>

#include "arch/gpu_isa.hh"
#include "base/logging.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/misc.hh"
#include "params/Wavefront.hh"
#include "sim/sim_object.hh"

/**
 * State of one wavefront (WF) slot in the GPU compute model.
 *
 * A Wavefront is a SimObject identified by the SIMD unit (simdId) and the
 * HW slot within that SIMD unit (wfSlotId) it is mapped to. It holds the
 * instruction buffer, kernel launch parameters, outstanding-memory-request
 * counters, register file reservations, waitcnt/barrier state and the
 * per-slot statistics declared below. Most member functions are defined
 * out of line, so their exact behavior is not visible from this header.
 */
class Wavefront : public SimObject
{
  public:
    enum status_e {
        // wavefront is stopped
        // NOTE(review): this comment previously read "stalled", the same
        // text as S_STALLED; confirm the intended distinction.
        S_STOPPED,
        // wavefront is returning from a kernel
        S_RETURNING,
        // wavefront is running normally
        S_RUNNING,
        // wavefront is stalled
        S_STALLED,
        /**
         * wavefront has unsatisfied wait counts
         *
         * while in this state the WF will only execute if
         * the oldest instruction is the waitcnt. while in
         * S_WAITCNT, the wavefront will not be ready until
         * all of its waitcnts have been satisfied. the
         * scoreboard ready() function will check the status
         * of the waitcnts whenever the WF is in S_WAITCNT,
         * and once they are satisfied, it will resume normal
         * operation.
         */
        S_WAITCNT,
        /**
         * WF is stalled at a barrier.
         */
        S_BARRIER
    };

    // HW slot id where the WF is mapped to inside a SIMD unit
    const int wfSlotId;
    int kernId;
    // SIMD unit where the WF has been scheduled
    const int simdId;
    // id of the execution unit (or pipeline) where the oldest instruction
    // of the WF is scheduled
    int execUnitId;
    int flatLmUnitId;
    int flatGmUnitId;
    // pointer to parent CU (non-owning; assigned through setParent())
    ComputeUnit *computeUnit;
    int maxIbSize;

    std::deque<GPUDynInstPtr> instructionBuffer;

    bool pendingFetch;
    bool dropFetch;
    // last tick during which all WFs in the CU are not idle
    Tick lastNonIdleTick;

    // Execution unit resource IDs associated with this WF.
    // These are static mappings set at WF slot construction and
    // based off of the simdId and wfSlotId.

    // Index to scalarALUs resource vector in CU
    int scalarAlu;

    // Indices into readyList/dispatchList of resources used by this
    // wavefront
    int scalarAluGlobalIdx;
    int globalMem;
    int localMem;
    int scalarMem;

    // number of VGPRs required by WF
    uint32_t maxVgprs;
    // number of SGPRs required by WF
    uint32_t maxSgprs;
    void freeResources();
    GPUDynInstPtr nextInstr();
    void setStatus(status_e newStatus);

    /**
     * Read the current status of the wavefront.
     *
     * Declared const, like the other read-only accessors of this class
     * (pc(), execMask(int), barrierId(), hasBarrier()), so the status can
     * also be queried through a const Wavefront.
     *
     * @return the value of the private status member.
     */
    status_e getStatus() const { return status; }

    void resizeRegFiles(int num_vregs, int num_sregs);
    bool isGmInstruction(GPUDynInstPtr ii);
    bool isLmInstruction(GPUDynInstPtr ii);

    // Predicates on the oldest instruction of the WF; each one reports
    // whether that instruction belongs to the category in its name.
    // NOTE(review): "oldest" presumably refers to the front of
    // instructionBuffer -- confirm against the definitions.
    bool isOldestInstWaitcnt();
    bool isOldestInstGMem();
    bool isOldestInstLMem();
    bool isOldestInstPrivMem();
    bool isOldestInstFlatMem();
    bool isOldestInstVectorALU();
    bool isOldestInstScalarALU();
    bool isOldestInstScalarMem();
    bool isOldestInstBarrier();

    // used for passing spill address to DDInstGPU
    std::vector<Addr> lastAddr;
    std::vector<uint32_t> workItemId[3];
    std::vector<uint32_t> workItemFlatId;
    /* kernel launch parameters */
    uint32_t workGroupId[3];
    uint32_t workGroupSz[3];
    uint32_t gridSz[3];
    uint32_t wgId;
    uint32_t wgSz;
    /* the actual WG size can differ from the maximum size */
    uint32_t actualWgSz[3];
    uint32_t actualWgSzTotal;
    void computeActualWgSz(HSAQueueEntry *task);
    // wavefront id within a workgroup
    uint32_t wfId;
    uint32_t maxDynWaveId;
    uint32_t dispatchId;
    // vector and scalar memory requests pending in memory system
    int outstandingReqs;
    // outstanding global memory write requests
    int outstandingReqsWrGm;
    // outstanding local memory write requests
    int outstandingReqsWrLm;
    // outstanding global memory read requests
    int outstandingReqsRdGm;
    // outstanding local memory read requests
    int outstandingReqsRdLm;
    // outstanding scalar memory read requests
    int scalarOutstandingReqsRdGm;
    // outstanding scalar memory write requests
    int scalarOutstandingReqsWrGm;
    // Per-category request counters (local/global, read/write, and
    // scalar variants).
    // NOTE(review): presumably requests that are still in the pipeline and
    // not yet counted as outstanding above -- confirm against the users.
    int rdLmReqsInPipe;
    int rdGmReqsInPipe;
    int wrLmReqsInPipe;
    int wrGmReqsInPipe;
    int scalarRdGmReqsInPipe;
    int scalarWrGmReqsInPipe;

    int memTraceBusy;
    uint64_t lastTrace;
    // number of virtual vector registers reserved by WF
    int reservedVectorRegs;
    // number of virtual scalar registers reserved by WF
    int reservedScalarRegs;
    // Index into the Vector Register File's namespace where the WF's registers
    // will live while the WF is executed
    uint32_t startVgprIndex;
    // Index into the Scalar Register File's namespace where the WF's registers
    // will live while the WF is executed
    uint32_t startSgprIndex;

    // Old value of destination gpr (for trace)
    std::vector<uint32_t> oldVgpr;
    // Id of destination gpr (for trace)
    uint32_t oldVgprId;
    // Tick count of last old_vgpr copy
    uint64_t oldVgprTcnt;

    // Old value of destination gpr, 64-bit elements (for trace)
    std::vector<uint64_t> oldDgpr;
    // Id of destination gpr (for trace)
    uint32_t oldDgprId;
    // Tick count of last old_dgpr copy
    uint64_t oldDgprTcnt;

    // Execution mask at wavefront start
    VectorMask initMask;

    // a pointer to the fraction of the LDS allocated
    // to this workgroup (thus this wavefront)
    LdsChunk *ldsChunk;

    // unique WF id over all WFs executed across all CUs
    uint64_t wfDynId;

    // Wavefront slot stats

    // Number of instructions executed by this wavefront slot across all
    // dynamic wavefronts
    Stats::Scalar numInstrExecuted;

    // Number of cycles this WF spends in SCH stage
    Stats::Scalar schCycles;

    // Number of stall cycles encountered by this WF in SCH stage
    Stats::Scalar schStalls;

    // The following stats sum to the value of schStalls, and record, per
    // WF slot, what the cause of each stall was at a coarse granularity.

    // Cycles WF is selected by scheduler, but RFs cannot support instruction
    Stats::Scalar schRfAccessStalls;
    // Cycles spent waiting for execution resources
    Stats::Scalar schResourceStalls;
    // cycles spent waiting for RF reads to complete in SCH stage
    Stats::Scalar schOpdNrdyStalls;
    // LDS arbitration stall cycles. WF attempts to execute LM instruction,
    // but another wave is executing FLAT, which requires LM and GM and forces
    // this WF to stall.
    Stats::Scalar schLdsArbStalls;

    // number of times an instruction of a WF is blocked from being issued
    // due to WAR and WAW dependencies
    Stats::Scalar numTimesBlockedDueWAXDependencies;
    // number of times an instruction of a WF is blocked from being issued
    // due to RAW dependencies
    Stats::Scalar numTimesBlockedDueRAWDependencies;

    // dyn inst id (per SIMD) of last instruction exec from this wave
    uint64_t lastInstExec;

    // Distribution to track the distance between producer and consumer
    // for vector register values
    Stats::Distribution vecRawDistance;
    // Map to track the dyn instruction id of each vector register value
    // produced, indexed by physical vector register ID
    std::unordered_map<int,uint64_t> rawDist;

    // Distribution to track the number of times every vector register
    // value produced is consumed.
    Stats::Distribution readsPerWrite;
    // Counts the number of reads performed to each physical register
    // - counts are reset to 0 for each dynamic wavefront launched
    std::vector<int> vecReads;

    void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems);

    // context for save/restore
    // NOTE(review): raw pointer; allocation and ownership are not visible
    // in this header -- confirm who frees it.
    uint8_t *context;

    typedef WavefrontParams Params;
    Wavefront(const Params &p);
    ~Wavefront();
    virtual void init();

    /**
     * Record the compute unit this wavefront belongs to.
     *
     * @param cu parent CU; only the pointer is stored.
     */
    void
    setParent(ComputeUnit *cu)
    {
        computeUnit = cu;
    }

    void validateRequestCounters();
    void start(uint64_t _wfDynId, uint64_t _base_ptr);
    void exec();
    // called by SCH stage to reserve
    std::vector<int> reserveResources();
    bool stopFetch();
    void regStats();

    // Program counter accessor and mutator.
    // NOTE(review): presumably backed by the private _pc member -- the
    // definitions are out of line.
    Addr pc() const;
    void pc(Addr new_pc);

    // Execution mask accessors: the whole mask by mutable reference, or
    // the entry for a single lane.
    // NOTE(review): presumably backed by the private _execMask member.
    VectorMask& execMask();
    bool execMask(int lane) const;


    void discardFetch();

    // waitcnt support; see the description of the wait-count members in
    // the private section below.
    bool waitCntsSatisfied();
    void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt);
    void clearWaitCnts();

    void incVMemInstsIssued();
    void incExpInstsIssued();
    void incLGKMInstsIssued();
    void decVMemInstsIssued();
    void decExpInstsIssued();
    void decLGKMInstsIssued();

    /** Freeing VRF space */
    void freeRegisterFile();

    /**
     * Access the GPU ISA state object embedded in this wavefront.
     *
     * @return mutable reference to the private _gpuISA member.
     */
    TheGpuISA::GPUISA&
    gpuISA()
    {
        return _gpuISA;
    }

    // Barrier bookkeeping: set/get the barrier id of this WF, query
    // whether it has one, and release it.
    void barrierId(int bar_id);
    int barrierId() const;
    bool hasBarrier() const;
    void releaseBarrier();

  private:
    TheGpuISA::GPUISA _gpuISA;

    void reserveGmResource(GPUDynInstPtr ii);
    void reserveLmResource(GPUDynInstPtr ii);

    /**
     * the following are used for waitcnt instructions
     * vmWaitCnt: once set, we wait for the outstanding
     *            number of vector mem instructions to be
     *            at, or below vmWaitCnt.
     *
     * expWaitCnt: once set, we wait for the number of
     *             outstanding VM writes or EXP insts to
     *             be at, or below expWaitCnt.
     *
     * lgkmWaitCnt: once set, we wait for the outstanding
     *              number of LDS, GDS, scalar memory,
     *              and message instructions to be at, or
     *              below lgkmWaitCnt. we currently do not
     *              support GDS/message ops.
     */
    int vmWaitCnt;
    int expWaitCnt;
    int lgkmWaitCnt;
    // Issued-instruction counters, adjusted through the
    // inc*/dec*InstsIssued() functions declared above.
    int vmemInstsIssued;
    int expInstsIssued;
    int lgkmInstsIssued;
    // current status, exposed through getStatus()/setStatus()
    status_e status;
    Addr _pc;
    VectorMask _execMask;
    // barrier id of this WF; see barrierId()/hasBarrier()
    int barId;
};

#endif // __GPU_COMPUTE_WAVEFRONT_HH__