gpu-compute,mem-ruby: Refactor GPU coalescer

Remove the read/write tables and coalescing table and introduce two
levels of tables for uncoalesced and coalesced packets. Tokens are
granted to GPU instructions to place in uncoalesced table. If tokens
are available, the operation always succeeds such that the 'Aliased'
status is never returned. Coalesced accesses are placed in the
coalesced table while requests are outstanding. Requests to the same
address are added as targets to the table similar to how MSHRs
operate.

Change-Id: I44983610307b638a97472db3576d0a30df2de600
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/27429
Reviewed-by: Bradford Beckmann <brad.beckmann@amd.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Bradford Beckmann <brad.beckmann@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Matthew Poremba
2020-05-08 10:37:59 -05:00
parent 2f9cc04a5f
commit 3d57eaf9f5
10 changed files with 572 additions and 643 deletions

View File

@@ -129,6 +129,8 @@ class ComputeUnit(ClockedObject):
"memory pipeline's queues")
local_mem_queue_size = Param.Int(256, "Number of entries in the local "
"memory pipeline's queues")
max_cu_tokens = Param.Int(4, "Maximum number of tokens, i.e., the number"\
" of instructions that can be sent to coalescer")
ldsBus = Bridge() # the bridge between the CU and its LDS
ldsPort = MasterPort("The port that goes to the LDS")
localDataStore = Param.LdsState("the LDS for this CU")

View File

@@ -74,9 +74,9 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p),
req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
_masterId(p->system->getMasterId(this, "ComputeUnit")),
lds(*p->localDataStore), _cacheLineSize(p->system->cacheLineSize()),
globalSeqNum(0), wavefrontSize(p->wfSize),
kernelLaunchInst(new KernelLaunchStaticInst())
lds(*p->localDataStore), gmTokenPort(name() + ".gmTokenPort", this),
_cacheLineSize(p->system->cacheLineSize()), globalSeqNum(0),
wavefrontSize(p->wfSize), kernelLaunchInst(new KernelLaunchStaticInst())
{
/**
* This check is necessary because std::bitset only provides conversion
@@ -139,6 +139,10 @@ ComputeUnit::ComputeUnit(const Params *p) : ClockedObject(p), fetchStage(p),
memPort.resize(wfSize());
// Setup tokens for slave ports. The number of tokens in memSlaveTokens
// is the total token count for the entire vector port (i.e., this CU).
memPortTokens = new TokenManager(p->max_cu_tokens);
// resize the tlbPort vectorArray
int tlbPort_width = perLaneTLB ? wfSize() : 1;
tlbPort.resize(tlbPort_width);
@@ -612,6 +616,8 @@ ComputeUnit::init()
vectorAluInstAvail.resize(numSIMDs, false);
shrMemInstAvail = 0;
glbMemInstAvail = 0;
gmTokenPort.setTokenManager(memPortTokens);
}
bool

View File

@@ -51,6 +51,7 @@
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/port.hh"
#include "mem/token_port.hh"
#include "sim/clocked_object.hh"
static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
@@ -415,6 +416,26 @@ class ComputeUnit : public ClockedObject
CUExitCallback *cuExitCallback;
class GMTokenPort : public TokenMasterPort
{
public:
GMTokenPort(const std::string& name, SimObject *owner,
PortID id = InvalidPortID)
: TokenMasterPort(name, owner, id)
{ }
~GMTokenPort() { }
protected:
bool recvTimingResp(PacketPtr) { return false; }
void recvReqRetry() { }
};
// Manager for the number of tokens available to this compute unit to
// send global memory request packets to the coalescer. This is only
// used between the global memory pipe and the TCP coalescer.
TokenManager *memPortTokens;
GMTokenPort gmTokenPort;
/** Data access Port **/
class DataPort : public MasterPort
{
@@ -677,6 +698,12 @@ class ComputeUnit : public ClockedObject
return ldsPort;
}
TokenManager *
getTokenManager()
{
return memPortTokens;
}
/** The memory port for SIMD data accesses.
* Can be connected to PhysMem for Ruby for timing simulations
*/
@@ -712,6 +739,8 @@ class ComputeUnit : public ClockedObject
}
ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
return *ldsPort;
} else if (if_name == "gmTokenPort") {
return gmTokenPort;
} else {
panic("incorrect port name");
}

View File

@@ -33,6 +33,7 @@
#include "gpu-compute/global_memory_pipeline.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
@@ -56,6 +57,25 @@ GlobalMemPipeline::init(ComputeUnit *cu)
_name = computeUnit->name() + ".GlobalMemPipeline";
}
bool
GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
{
// We require one token from the coalescer's uncoalesced table to
// proceed
int token_count = 1;
// Make sure the vector port has tokens. There is a single pool
// of tokens so only one port in the vector port needs to be checked.
// Lane 0 is chosen arbitrarily.
DPRINTF(GPUCoalescer, "Checking for %d tokens\n", token_count);
if (!mp->computeUnit()->getTokenManager()->haveTokens(token_count)) {
DPRINTF(GPUCoalescer, "Stalling inst because coalsr is busy!\n");
return false;
}
return true;
}
void
GlobalMemPipeline::exec()
{
@@ -124,6 +144,14 @@ GlobalMemPipeline::exec()
}
}
DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",
mp->disassemble(), mp->seqNum());
// Memfences will not return tokens and must be issued, so we should
// not acquire a token for them; doing so would deplete the token
// count and eventually deadlock.
if (!mp->isMemFence()) {
assert(mp->computeUnit()->getTokenManager()->haveTokens(1));
mp->computeUnit()->getTokenManager()->acquireTokens(1);
}
mp->initiateAcc(mp);
if (!outOfOrderDataDelivery && !mp->isMemFence()) {

View File

@@ -121,6 +121,8 @@ class GlobalMemPipeline
loadVrfBankConflictCycles += num_cycles;
}
bool coalescerReady(GPUDynInstPtr mp) const;
private:
ComputeUnit *computeUnit;
std::string _name;

View File

@@ -434,6 +434,11 @@ Wavefront::ready(itype_e type)
return 0;
}
// Does the coalescer have space for our instruction?
if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
return 0;
}
if (!computeUnit->globalMemoryPipe.
isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
// Can we insert a new request to the Global Mem Request FIFO?
@@ -504,6 +509,12 @@ Wavefront::ready(itype_e type)
if (!locMemIssueRdy) {
return 0;
}
// Does the coalescer have space for our instruction?
if (!computeUnit->globalMemoryPipe.coalescerReady(ii)) {
return 0;
}
if (!computeUnit->globalMemoryPipe.
isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
// Can we insert a new request to the Global Mem Request FIFO?

File diff suppressed because it is too large Load Diff

View File

@@ -48,6 +48,7 @@
#include "mem/ruby/protocol/RubyRequestType.hh"
#include "mem/ruby/protocol/SequencerRequestType.hh"
#include "mem/ruby/system/Sequencer.hh"
#include "mem/token_port.hh"
class DataBlock;
class CacheMsg;
@@ -59,47 +60,99 @@ class RubyGPUCoalescerParams;
HSAScope reqScopeToHSAScope(const RequestPtr &req);
HSASegment reqSegmentToHSASegment(const RequestPtr &req);
struct GPUCoalescerRequest
{
PacketPtr pkt;
RubyRequestType m_type;
Cycles issue_time;
// List of packets that belongs to a specific instruction.
typedef std::list<PacketPtr> PerInstPackets;
GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type,
Cycles _issue_time)
: pkt(_pkt), m_type(_m_type), issue_time(_issue_time)
{}
};
class RequestDesc
class UncoalescedTable
{
public:
RequestDesc(PacketPtr pkt, RubyRequestType p_type, RubyRequestType s_type)
: pkt(pkt), primaryType(p_type), secondaryType(s_type)
{
}
UncoalescedTable(GPUCoalescer *gc);
~UncoalescedTable() {}
RequestDesc() : pkt(nullptr), primaryType(RubyRequestType_NULL),
secondaryType(RubyRequestType_NULL)
{
}
void insertPacket(PacketPtr pkt);
bool packetAvailable();
void printRequestTable(std::stringstream& ss);
PacketPtr pkt;
RubyRequestType primaryType;
RubyRequestType secondaryType;
// Returns a pointer to the list of packets corresponding to an
// instruction in the instruction map or nullptr if there are no
// instructions at the offset.
PerInstPackets* getInstPackets(int offset);
void updateResources();
// Check if a packet hasn't been removed from instMap for too long.
// Panics if a deadlock is detected and returns nothing otherwise.
void checkDeadlock(Tick threshold);
private:
GPUCoalescer *coalescer;
// Maps an instruction's unique sequence number to a queue of packets
// which need responses. This data structure assumes the sequence number
// is monotonically increasing (which is true for CU class) in order to
// issue packets in age order.
std::map<uint64_t, PerInstPackets> instMap;
};
std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj);
class CoalescedRequest
{
public:
CoalescedRequest(uint64_t _seqNum)
: seqNum(_seqNum), issueTime(Cycles(0)),
rubyType(RubyRequestType_NULL)
{}
~CoalescedRequest() {}
void insertPacket(PacketPtr pkt) { pkts.push_back(pkt); }
void setSeqNum(uint64_t _seqNum) { seqNum = _seqNum; }
void setIssueTime(Cycles _issueTime) { issueTime = _issueTime; }
void setRubyType(RubyRequestType type) { rubyType = type; }
uint64_t getSeqNum() const { return seqNum; }
PacketPtr getFirstPkt() const { return pkts[0]; }
Cycles getIssueTime() const { return issueTime; }
RubyRequestType getRubyType() const { return rubyType; }
std::vector<PacketPtr>& getPackets() { return pkts; }
private:
uint64_t seqNum;
Cycles issueTime;
RubyRequestType rubyType;
std::vector<PacketPtr> pkts;
};
class GPUCoalescer : public RubyPort
{
public:
class GMTokenPort : public TokenSlavePort
{
public:
GMTokenPort(const std::string& name, ClockedObject *owner,
PortID id = InvalidPortID)
: TokenSlavePort(name, owner, id)
{ }
~GMTokenPort() { }
protected:
Tick recvAtomic(PacketPtr) { return Tick(0); }
void recvFunctional(PacketPtr) { }
bool recvTimingReq(PacketPtr) { return false; }
AddrRangeList getAddrRanges() const
{
AddrRangeList ranges;
return ranges;
}
};
typedef RubyGPUCoalescerParams Params;
GPUCoalescer(const Params *);
~GPUCoalescer();
Port &getPort(const std::string &if_name,
PortID idx = InvalidPortID) override;
// Public Methods
void wakeup(); // Used only for deadlock detection
void printRequestTable(std::stringstream& ss);
void printProgress(std::ostream& out) const;
void resetStats() override;
@@ -177,13 +230,13 @@ class GPUCoalescer : public RubyPort
void print(std::ostream& out) const;
void markRemoved();
void removeRequest(GPUCoalescerRequest* request);
void evictionCallback(Addr address);
void completeIssue();
void insertKernel(int wavefront_id, PacketPtr pkt);
GMTokenPort& getGMTokenPort() { return gmTokenPort; }
void recordRequestType(SequencerRequestType requestType);
Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
@@ -224,11 +277,11 @@ class GPUCoalescer : public RubyPort
Addr pc, RubyAccessMode access_mode,
int size, DataBlock*& data_ptr);
// Alternate implementations in VIPER Coalescer
virtual void issueRequest(PacketPtr pkt, RubyRequestType type);
virtual void issueRequest(CoalescedRequest* crequest);
void kernelCallback(int wavfront_id);
void hitCallback(GPUCoalescerRequest* request,
void hitCallback(CoalescedRequest* crequest,
MachineType mach,
DataBlock& data,
bool success,
@@ -236,21 +289,23 @@ class GPUCoalescer : public RubyPort
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool isRegion);
void recordMissLatency(GPUCoalescerRequest* request,
void recordMissLatency(CoalescedRequest* crequest,
MachineType mach,
Cycles initialRequestTime,
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool success, bool isRegion);
void completeHitCallback(std::vector<PacketPtr> & mylist, int len);
PacketPtr mapAddrToPkt(Addr address);
void completeHitCallback(std::vector<PacketPtr> & mylist);
RequestStatus getRequestStatus(PacketPtr pkt,
RubyRequestType request_type);
bool insertRequest(PacketPtr pkt, RubyRequestType request_type);
virtual RubyRequestType getRequestType(PacketPtr pkt);
bool handleLlsc(Addr address, GPUCoalescerRequest* request);
// Attempt to remove a packet from the uncoalescedTable and coalesce
// with a previous request from the same instruction. If there is no
// previous instruction and the max number of outstanding requests has
// not be reached, a new coalesced request is created and added to the
// "target" list of the coalescedTable.
bool coalescePacket(PacketPtr pkt);
EventFunctionWrapper issueEvent;
@@ -258,22 +313,27 @@ class GPUCoalescer : public RubyPort
// Changed to protected to enable inheritance by VIPER Coalescer
protected:
int m_max_outstanding_requests;
int m_deadlock_threshold;
Cycles m_deadlock_threshold;
CacheMemory* m_dataCache_ptr;
CacheMemory* m_instCache_ptr;
// We need to track both the primary and secondary request types.
// The secondary request type comprises a subset of RubyRequestTypes that
// are understood by the L1 Controller. A primary request type can be any
// RubyRequestType.
typedef std::unordered_map<Addr, std::vector<RequestDesc>> CoalescingTable;
CoalescingTable reqCoalescer;
std::vector<Addr> newRequests;
// coalescingWindow is the maximum number of instructions that are
// allowed to be coalesced in a single cycle.
int coalescingWindow;
// The uncoalescedTable contains several "columns" which hold memory
// request packets for an instruction. The maximum size is the number of
// columns * the wavefront size.
UncoalescedTable uncoalescedTable;
// An MSHR-like struct for holding coalesced requests. The requests in
// this table may or may not be outstanding in the memory hierarchy. The
// maximum size is equal to the maximum outstanding requests for a CU
// (typically the number of blocks in TCP). If there are duplicates of
// an address, they are serviced in age order.
std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable;
RequestTable m_writeRequestTable;
RequestTable m_readRequestTable;
// Global outstanding request count, across all request tables
int m_outstanding_count;
bool m_deadlock_check_scheduled;
@@ -334,7 +394,12 @@ class GPUCoalescer : public RubyPort
std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
private:
private:
// Token port is used to send/receive tokens to/from GPU's global memory
// pipeline across the port boundary. There is one per <wave size> data
// ports in the CU.
GMTokenPort gmTokenPort;
// Private copy constructor and assignment operator
GPUCoalescer(const GPUCoalescer& obj);
GPUCoalescer& operator=(const GPUCoalescer& obj);

View File

@@ -42,6 +42,8 @@ class RubyGPUCoalescer(RubyPort):
# max_outstanding_requests = (wave front slots) x (wave front size)
max_outstanding_requests = Param.Int(40*64,
"max requests (incl. prefetches) outstanding")
max_coalesces_per_cycle = Param.Int(1, "max instructions that can be " \
"coalesced in a single cycle")
assume_rfo = Param.Bool(True, "assume protocol implementes Read for "
"Ownership coherence");

View File

@@ -76,15 +76,8 @@ VIPERCoalescer::~VIPERCoalescer()
{
}
// Analyzes the packet to see if this request can be coalesced.
// If request can be coalesced, this request is added to the reqCoalescer table
// and makeRequest returns RequestStatus_Issued;
// If this is the first request to a cacheline, request is added to both
// newRequests queue and to the reqCoalescer table; makeRequest
// returns RequestStatus_Issued.
// If there is a pending request to this cacheline and this request
// can't be coalesced, RequestStatus_Aliased is returned and
// the packet needs to be reissued.
// Places an uncoalesced packet in uncoalescedTable. If the packet is a
// special type (MemFence, scoping, etc), it is issued immediately.
RequestStatus
VIPERCoalescer::makeRequest(PacketPtr pkt)
{
@@ -109,7 +102,6 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
return RequestStatus_Issued;
}
// return RequestStatus_Aliased;
} else if (pkt->req->isKernel() && pkt->req->isRelease()) {
// Flush Dirty Data on Kernel End
// isKernel + isRelease
@@ -123,13 +115,10 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
}
return RequestStatus_Issued;
}
RequestStatus requestStatus = GPUCoalescer::makeRequest(pkt);
if (requestStatus!=RequestStatus_Issued) {
// Request not isssued
// enqueue Retry
DPRINTF(GPUCoalescer, "Request not issued by GPUCoaleser\n");
return requestStatus;
} else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
GPUCoalescer::makeRequest(pkt);
if (pkt->req->isKernel() && pkt->req->isAcquire()) {
// Invalidate clean Data on Kernel Begin
// isKernel + isAcquire
invL1();