gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model

Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Tony Gutierrez
2018-05-01 16:59:35 -04:00
committed by Anthony Gutierrez
parent b0eac7857a
commit b8da9abba7
86 changed files with 10299 additions and 3734 deletions

View File

@@ -31,161 +31,116 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __WAVEFRONT_HH__
#define __WAVEFRONT_HH__
#ifndef __GPU_COMPUTE_WAVEFRONT_HH__
#define __GPU_COMPUTE_WAVEFRONT_HH__
#include <cassert>
#include <deque>
#include <list>
#include <memory>
#include <stack>
#include <unordered_map>
#include <vector>
#include "arch/gpu_isa.hh"
#include "base/logging.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/condition_register_state.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/misc.hh"
#include "gpu-compute/ndrange.hh"
#include "params/Wavefront.hh"
#include "sim/sim_object.hh"
static const int MAX_NUM_INSTS_PER_WF = 12;
/**
* A reconvergence stack entry conveys the necessary state to implement
* control flow divergence.
*/
/**
 * A reconvergence stack entry conveys the necessary state to implement
 * control flow divergence: where the wavefront currently is, where it
 * will reconverge, and which lanes are active on this path.
 */
struct ReconvergenceStackEntry {
/**
 * PC of current instruction.
 */
uint32_t pc;
/**
 * PC of the immediate post-dominator instruction, i.e., the value of
 * @a pc for the first instruction that will be executed by the wavefront
 * when a reconvergence point is reached.
 */
uint32_t rpc;
/**
 * Execution mask: the set of lanes that are active while executing
 * the control-flow path this entry represents.
 */
VectorMask execMask;
};
/*
* Arguments for the hsail opcode call, are user defined and variable length.
* The hardware/finalizer can support arguments in hardware or use memory to
* pass arguments. For now, let's assume that an unlimited number of arguments
* are supported in hardware (the compiler inlines functions whenever it can
* anyways, so unless someone is interested in the implications of linking/
* library functions, I think this is a reasonable assumption given the typical
* size of an OpenCL kernel).
*
* Note that call args are different than kernel arguments:
* * All work-items in a kernel refer the same set of kernel arguments
* * Each work-item has its own set of call args. So a call argument at
* address 0x4 is different for work-item 0 and work-item 1.
*
* Ok, the table below shows an example of how we organize the call arguments in
* the CallArgMem class.
*
* int foo(int arg1, double arg2)
* ___________________________________________________
* | 0: return.0 | 4: return.1 | ... | 252: return.63 |
* |---------------------------------------------------|
* | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
* |---------------------------------------------------|
* | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
* ___________________________________________________
*/
/**
 * Per-wavefront backing store for HSAIL call-instruction arguments.
 * Values are laid out structure-of-arrays style: for a given argument
 * address, all lanes' copies are contiguous (see getLaneOffset), so one
 * buffer of funcArgsSizePerItem * wfSize bytes holds every lane's args.
 */
class CallArgMem
{
  public:
    // pointer to buffer for storing function arguments (owned)
    uint8_t *mem;
    // number of lanes (work-items) in a wavefront
    int wfSize;
    // size of function args, per work-item
    int funcArgsSizePerItem;

    /**
     * Byte offset of @p lane's copy of the value at argument-space
     * address @p addr. The address is scaled by wfSize because each
     * per-item address maps to wfSize consecutive lane slots.
     */
    template<typename CType>
    int
    getLaneOffset(int lane, int addr)
    {
        return addr * wfSize + sizeof(CType) * lane;
    }

    CallArgMem(int func_args_size_per_item, int wf_size)
        : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
    {
        // new[] instead of malloc: throws on failure rather than
        // returning nullptr, and matches the delete[] in the dtor
        mem = new uint8_t[funcArgsSizePerItem * wfSize];
    }

    // Non-copyable: mem is an owning raw pointer, so the implicitly
    // generated copy operations would cause a double free (Rule of
    // Three). Deleting them turns a latent heap-corruption bug into a
    // compile error at any call site that tries to copy.
    CallArgMem(const CallArgMem&) = delete;
    CallArgMem& operator=(const CallArgMem&) = delete;

    ~CallArgMem()
    {
        delete[] mem;
    }

    /**
     * Raw address of @p lane's copy of the value at argument-space
     * address @p addr.
     */
    template<typename CType>
    uint8_t*
    getLaneAddr(int lane, int addr)
    {
        return mem + getLaneOffset<CType>(lane, addr);
    }

    /**
     * Store @p val as @p lane's copy of the value at argument-space
     * address @p addr.
     */
    template<typename CType>
    void
    setLaneAddr(int lane, int addr, CType val)
    {
        *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
    }
};
class Wavefront : public SimObject
{
public:
enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};
// Base pointer for array of instruction pointers
uint64_t basePtr;
enum status_e {
// wavefront is stalled
S_STOPPED,
// wavefront is returning from a kernel
S_RETURNING,
// wavefront is running normally
S_RUNNING,
// wavefront is stalled
S_STALLED,
/**
* wavefront has unsatisfied wait counts
*
* while in this state the WF will only execute if
* the oldest instruction is the waitcnt. while in
* S_WAITCNT, the wavefront will not be ready until
* all of its waitcnts have been satisfied. the
* scoreboard ready() function will check the status
* of the waitcnts whenever the WF is in S_WAITCNT,
* and once they are satisfied, it will resume normal
* operation.
*/
S_WAITCNT
};
uint32_t oldBarrierCnt;
uint32_t barrierCnt;
uint32_t barrierId;
uint32_t barrierSlots;
status_e status;
// HW slot id where the WF is mapped to inside a SIMD unit
int wfSlotId;
const int wfSlotId;
int kernId;
// SIMD unit where the WV has been scheduled
int simdId;
const int simdId;
// id of the execution unit (or pipeline) where the oldest instruction
// of the WF is scheduled
int execUnitId;
int flatLmUnitId;
int flatGmUnitId;
// pointer to parent CU
ComputeUnit *computeUnit;
int maxIbSize;
std::deque<GPUDynInstPtr> instructionBuffer;
bool pendingFetch;
bool dropFetch;
// last tick during which all WFs in the CU are not idle
Tick lastNonIdleTick;
// Condition Register State (for HSAIL simulations only)
class ConditionRegisterState *condRegState;
// number of single precision VGPRs required by WF
uint32_t maxSpVgprs;
// number of double precision VGPRs required by WF
uint32_t maxDpVgprs;
// map virtual to physical vector register
uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
// Execution unit resource ID's associated with this WF
// These are static mappings set at WF slot construction and
// based off of the simdId and wfSlotId.
// Index to scalarALUs resource vector in CU
int scalarAlu;
// Indices into readyList/dispatchList of resources used by this
// wavefront
int scalarAluGlobalIdx;
int globalMem;
int localMem;
int scalarMem;
// number of VGPRs required by WF
uint32_t maxVgprs;
// number of SGPRs required by WF
uint32_t maxSgprs;
void freeResources();
GPUDynInstPtr nextInstr();
void setStatus(status_e newStatus);
status_e getStatus() { return status; }
void resizeRegFiles(int num_vregs, int num_sregs);
bool isGmInstruction(GPUDynInstPtr ii);
bool isLmInstruction(GPUDynInstPtr ii);
bool isOldestInstWaitcnt();
bool isOldestInstGMem();
bool isOldestInstLMem();
bool isOldestInstPrivMem();
bool isOldestInstFlatMem();
bool isOldestInstALU();
bool isOldestInstVectorALU();
bool isOldestInstScalarALU();
bool isOldestInstScalarMem();
bool isOldestInstBarrier();
// used for passing spill address to DDInstGPU
std::vector<Addr> lastAddr;
std::vector<uint32_t> workItemId[3];
@@ -199,36 +154,44 @@ class Wavefront : public SimObject
/* the actual WG size can differ than the maximum size */
uint32_t actualWgSz[3];
uint32_t actualWgSzTotal;
void computeActualWgSz(NDRange *ndr);
void computeActualWgSz(HSAQueueEntry *task);
// wavefront id within a workgroup
uint32_t wfId;
uint32_t maxDynWaveId;
uint32_t dispatchId;
// outstanding global+local memory requests
uint32_t outstandingReqs;
// memory requests between scoreboard
// and execute stage not yet executed
uint32_t memReqsInPipe;
// vector and scalar memory requests pending in memory system
int outstandingReqs;
// outstanding global memory write requests
uint32_t outstandingReqsWrGm;
int outstandingReqsWrGm;
// outstanding local memory write requests
uint32_t outstandingReqsWrLm;
int outstandingReqsWrLm;
// outstanding global memory read requests
uint32_t outstandingReqsRdGm;
int outstandingReqsRdGm;
// outstanding local memory read requests
uint32_t outstandingReqsRdLm;
uint32_t rdLmReqsInPipe;
uint32_t rdGmReqsInPipe;
uint32_t wrLmReqsInPipe;
uint32_t wrGmReqsInPipe;
int outstandingReqsRdLm;
// outstanding scalar memory read requests
int scalarOutstandingReqsRdGm;
// outstanding scalar memory write requests
int scalarOutstandingReqsWrGm;
int rdLmReqsInPipe;
int rdGmReqsInPipe;
int wrLmReqsInPipe;
int wrGmReqsInPipe;
int scalarRdGmReqsInPipe;
int scalarWrGmReqsInPipe;
int memTraceBusy;
uint64_t lastTrace;
// number of vector registers reserved by WF
// number of virtual vector registers reserved by WF
int reservedVectorRegs;
// number of virtual scalar registers reserved by WF
int reservedScalarRegs;
// Index into the Vector Register File's namespace where the WF's registers
// will live while the WF is executed
uint32_t startVgprIndex;
// Index into the Scalar Register File's namespace where the WF's registers
// will live while the WF is executed
uint32_t startSgprIndex;
// Old value of destination gpr (for trace)
std::vector<uint32_t> oldVgpr;
@@ -257,64 +220,63 @@ class Wavefront : public SimObject
// to this workgroup (thus this wavefront)
LdsChunk *ldsChunk;
// A pointer to the spill area
Addr spillBase;
// The size of the spill area
uint32_t spillSizePerItem;
// The vector width of the spill area
uint32_t spillWidth;
// A pointer to the private memory area
Addr privBase;
// The size of the private memory area
uint32_t privSizePerItem;
// A pointer to the read-only memory area
Addr roBase;
// size of the read-only memory area
uint32_t roSize;
// pointer to buffer for storing kernel arguments
uint8_t *kernelArgs;
// unique WF id over all WFs executed across all CUs
uint64_t wfDynId;
// number of times instruction issue for this wavefront is blocked
// due to VRF port availability
Stats::Scalar numTimesBlockedDueVrfPortAvail;
// Wavefront slot stats
// Number of instructions executed by this wavefront slot across all
// dynamic wavefronts
Stats::Scalar numInstrExecuted;
// Number of cycles this WF spends in SCH stage
Stats::Scalar schCycles;
// Number of stall cycles encounterd by this WF in SCH stage
Stats::Scalar schStalls;
// The following stats sum to the value of schStalls, and record, per
// WF slot, what the cause of each stall was at a coarse granularity.
// Cycles WF is selected by scheduler, but RFs cannot support instruction
Stats::Scalar schRfAccessStalls;
// Cycles spent waiting for execution resources
Stats::Scalar schResourceStalls;
// cycles spent waiting for RF reads to complete in SCH stage
Stats::Scalar schOpdNrdyStalls;
// LDS arbitration stall cycles. WF attempts to execute LM instruction,
// but another wave is executing FLAT, which requires LM and GM and forces
// this WF to stall.
Stats::Scalar schLdsArbStalls;
// number of times an instruction of a WF is blocked from being issued
// due to WAR and WAW dependencies
Stats::Scalar numTimesBlockedDueWAXDependencies;
// number of times an instruction of a WF is blocked from being issued
// due to WAR and WAW dependencies
Stats::Scalar numTimesBlockedDueRAWDependencies;
// distribution of executed instructions based on their register
// operands; this is used to highlight the load on the VRF
Stats::Distribution srcRegOpDist;
Stats::Distribution dstRegOpDist;
// Functions to operate on call argument memory
// argument memory for hsail call instruction
CallArgMem *callArgMem;
void
initCallArgMem(int func_args_size_per_item, int wf_size)
{
callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
}
// dyn inst id (per SIMD) of last instruction exec from this wave
uint64_t lastInstExec;
template<typename CType>
CType
readCallArgMem(int lane, int addr)
{
return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
}
// Distribution to track the distance between producer and consumer
// for vector register values
Stats::Distribution vecRawDistance;
// Map to track the dyn instruction id of each vector register value
// produced, indexed by physical vector register ID
std::unordered_map<int,uint64_t> rawDist;
template<typename CType>
void
writeCallArgMem(int lane, int addr, CType val)
{
callArgMem->setLaneAddr<CType>(lane, addr, val);
}
// Distribution to track the number of times every vector register
// value produced is consumed.
Stats::Distribution readsPerWrite;
// Counts the number of reads performed to each physical register
// - counts are reset to 0 for each dynamic wavefront launched
std::vector<int> vecReads;
void initRegState(HSAQueueEntry *task, int wgSizeInWorkItems);
// context for save/restore
uint8_t *context;
typedef WavefrontParams Params;
Wavefront(const Params *p);
@@ -327,50 +289,31 @@ class Wavefront : public SimObject
computeUnit = cu;
}
void validateRequestCounters();
void start(uint64_t _wfDynId, uint64_t _base_ptr);
void exec();
void updateResources();
int ready(itype_e type);
bool instructionBufferHasBranch();
// called by SCH stage to reserve
std::vector<int> reserveResources();
bool stopFetch();
void regStats();
VectorMask getPred() { return execMask() & initMask; }
bool waitingAtBarrier(int lane);
void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
const VectorMask& exec_mask);
void popFromReconvergenceStack();
uint32_t pc() const;
uint32_t rpc() const;
VectorMask execMask() const;
Addr pc() const;
void pc(Addr new_pc);
VectorMask& execMask();
bool execMask(int lane) const;
void pc(uint32_t new_pc);
void discardFetch();
/**
* Returns the size of the static hardware context of a particular wavefront
* This should be updated every time the context is changed
*/
uint32_t getStaticContextSize() const;
bool waitCntsSatisfied();
void setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt);
void clearWaitCnts();
/**
* Returns the hardware context as a stream of bytes
* This method is designed for HSAIL execution
*/
void getContext(const void *out);
/**
* Sets the hardware context from a stream of bytes
* This method is designed for HSAIL execution
*/
void setContext(const void *in);
/** Freeing VRF space */
void freeRegisterFile();
TheGpuISA::GPUISA&
gpuISA()
@@ -380,14 +323,32 @@ class Wavefront : public SimObject
private:
TheGpuISA::GPUISA _gpuISA;
void reserveGmResource(GPUDynInstPtr ii);
void reserveLmResource(GPUDynInstPtr ii);
/**
* Stack containing Control Flow Graph nodes (i.e., kernel instructions)
* to be visited by the wavefront, and the associated execution masks. The
* reconvergence stack grows every time the wavefront reaches a divergence
* point (branch instruction), and shrinks every time the wavefront
* reaches a reconvergence point (immediate post-dominator instruction).
* the following are used for waitcnt instructions
* vmWaitCnt: once set, we wait for the outstanding
* number of vector mem instructions to be
* at, or below vmWaitCnt.
*
* expWaitCnt: once set, we wait for the outstanding
* number of outstanding VM writes or EXP
* insts to be at, or below expWaitCnt.
*
* lgkmWaitCnt: once set, we wait for the outstanding
* number of LDS, GDS, scalar memory,
* and message instructions to be at, or
* below lgkmWaitCnt. we currently do not
* support GDS/message ops.
*/
std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
int vmWaitCnt;
int expWaitCnt;
int lgkmWaitCnt;
status_e status;
Addr _pc;
VectorMask _execMask;
};
#endif // __WAVEFRONT_HH__
#endif // __GPU_COMPUTE_WAVEFRONT_HH__