arch-gcn3: add support for unaligned accesses

Previously, with HSAIL, we were guaranteed by the HSA specification that the GPU will never issue unaligned accesses. However, now that we are directly running GCN this is no longer true. Accordingly, this commit adds support for unaligned accesses. Moreover, to reduce the replication of nearly identical code for the different request types, I also added new helper functions that are called by all the different memory request producing instruction types in op_encodings.hh. Adding support for unaligned instructions requires changing the statusBitVector used to track the status of the memory requests for each lane from a bit per lane to an int per lane. This is necessary because an unaligned access may span multiple cache lines. In the worst case, each lane may span multiple cache lines. There are corresponding changes in the files that use the statusBitVector. Change-Id: I319bf2f0f644083e98ca546d2bfe68cf87a5f967 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29920 Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com> Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com> Tested-by: kokoro <noreply+kokoro@google.com>
2018-03-07 17:54:19 -05:00
parent fbcdf880ee
commit 8177fc4392
7 changed files with 298 additions and 242 deletions
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -832,7 +832,7 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
                        gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
                        gpuDynInst->disassemble(), w->outstandingReqs,
                        w->outstandingReqs - 1);
-        if (gpuDynInst->statusBitVector.none()) {
+        if (gpuDynInst->allLanesZero()) {
            // ask gm pipe to decrement request counters, instead of directly
            // performing here, to avoid asynchronous counter update and
            // instruction retirement (which may hurt waincnt effects)
@@ -1078,7 +1078,6 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
            gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
            gpuDynInst->tlbHitLevel[index] = hit_level;

-
            // translation is done. Schedule the mem_req_event at the
            // appropriate cycle to send the timing memory request to ruby
            EventFunctionWrapper *mem_req_event =
@@ -1116,9 +1115,9 @@ ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
        }
    } else {
        if (pkt->cmd == MemCmd::MemSyncReq) {
-            gpuDynInst->statusBitVector = VectorMask(0);
+            gpuDynInst->resetEntireStatusVector();
        } else {
-            gpuDynInst->statusBitVector &= (~(1ll << index));
+            gpuDynInst->decrementStatusVector(index);
        }

        // New SenderState for the memory access
@@ -1289,12 +1288,10 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
    gpuDynInst->memStatusVector[paddr].pop_back();
    gpuDynInst->pAddr = pkt->req->getPaddr();

-    gpuDynInst->statusBitVector &= (~(1ULL << index));
+    gpuDynInst->decrementStatusVector(index);
+    DPRINTF(GPUMem, "bitvector is now %s\n", gpuDynInst->printStatusVector());

-    DPRINTF(GPUMem, "bitvector is now %#x\n",
-            gpuDynInst->statusBitVector);
-
-    if (gpuDynInst->statusBitVector == VectorMask(0)) {
+    if (gpuDynInst->allLanesZero()) {
        auto iter = gpuDynInst->memStatusVector.begin();
        auto end = gpuDynInst->memStatusVector.end();

--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -42,9 +42,10 @@
 GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
                       GPUStaticInst *static_inst, InstSeqNum instSeqNum)
    : GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(),
-      (Addr)0), statusBitVector(0), numScalarReqs(0), isSaveRestore(false),
+      (Addr)0), numScalarReqs(0), isSaveRestore(false),
      _staticInst(static_inst), _seqNum(instSeqNum)
 {
+    statusVector.assign(TheGpuISA::NumVecElemPerVecReg, 0);
    tlbHitLevel.assign(computeUnit()->wfSize(), -1);
    // vector instructions can have up to 4 source/destination operands
    d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)];
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -39,6 +39,8 @@

 #include "base/amo.hh"
 #include "base/logging.hh"
+#include "base/trace.hh"
+#include "debug/GPUMem.hh"
 #include "enums/StorageClassType.hh"
 #include "gpu-compute/compute_unit.hh"
 #include "gpu-compute/gpu_exec_context.hh"
@@ -307,13 +309,103 @@ class GPUDynInst : public GPUExecContext
        }
    }

+    // reset the number of pending memory requests for all lanes
+    void
+    resetEntireStatusVector()
+    {
+        assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg);
+        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
+            resetStatusVector(lane);
+        }
+    }
+
+    // reset the number of pending memory requests for the inputted lane
+    void
+    resetStatusVector(int lane)
+    {
+        setStatusVector(lane, 0);
+    }
+
+    // set the number of pending memory requests for the inputted lane
+    void
+    setStatusVector(int lane, int newVal)
+    {
+        // currently we can have up to 2 memory requests per lane (if the
+        // lane's request goes across multiple cache lines)
+        assert((newVal >= 0) && (newVal <= 2));
+        statusVector[lane] = newVal;
+    }
+
+    // subtracts the number of pending memory requests for the inputted lane
+    // by 1
+    void
+    decrementStatusVector(int lane)
+    {
+        // this lane may have multiple requests, so only subtract one for
+        // this request
+        assert(statusVector[lane] >= 1);
+        statusVector[lane]--;
+    }
+
+    // return the current number of pending memory requests for the inputted
+    // lane
+    int
+    getLaneStatus(int lane) const
+    {
+        return statusVector[lane];
+    }
+
+    // returns true if all memory requests from all lanes have been received,
+    // else returns false
+    bool
+    allLanesZero() const
+    {
+        // local variables
+        bool allZero = true;
+
+        // iterate over all lanes, checking the number of pending memory
+        // requests they have
+        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
+            // if any lane still has pending requests, return false
+            if (statusVector[lane] > 0) {
+                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending "
+                        "request(s) for %#x\n", cu_id, simdId, wfSlotId, lane,
+                        statusVector[lane], addr[lane]);
+                allZero = false;
+            }
+        }
+
+        if (allZero) {
+            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending"
+                    " requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]);
+        }
+        return allZero;
+    }
+
+    // returns a string representing the current state of the statusVector
+    std::string
+    printStatusVector() const
+    {
+        std::string statusVec_str = "[";
+
+        // iterate over all lanes, adding the current number of pending
+        // requests for this lane to the string
+        for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
+            statusVec_str += std::to_string(statusVector[lane]);
+        }
+        statusVec_str += "]";
+
+        return statusVec_str;
+    }
+
    // Map returned packets and the addresses they satisfy with which lane they
    // were requested from
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

-    // Track the status of memory requests per lane, a bit per lane
-    VectorMask statusBitVector;
+    // Track the status of memory requests per lane, an int per lane to allow
+    // unaligned accesses
+    std::vector<int> statusVector;
    // for ld_v# or st_v#
    std::vector<int> tlbHitLevel;