gpu-compute: Support Scalar and Vector access to system pages

The amdgpu driver supports reading and writing scalar and vector memory addresses that reside in system memory. This is commonly used for things like blit kernels that perform host-to-device or device-to-host copies using GPU load/store instructions. This is done by utilizing the system hub device added in a prior changeset. Memory packets translated by the Scalar or VMEM TLBs will have the corresponding system request field set from the PTE in the TLB, which can be used in the compute unit to determine if a request is for system memory or not. Another important change is to return global memory tokens for system requests. Since these do not flow through the GPU coalescer where the token is returned, the token can be returned once the request is known to be a system request. Change-Id: I35030e0b3698f10c63a397f96b81267271e3130e Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/57711 Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com>
2022-03-15 13:34:58 -05:00
parent 347364ab0f
commit f375e79bcf
4 changed files with 125 additions and 9 deletions
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -112,6 +112,12 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
    scheduleToExecute(p),
    stats(this, p.n_wf)
 {
+    // This is not currently supported and would require adding more handling
+    // for system vs. device memory requests on the functional paths, so we
+    // fatal immediately in the constructor if this configuration is seen.
+    fatal_if(functionalTLB && FullSystem,
+             "Functional TLB not supported in full-system GPU simulation");
+
    /**
     * This check is necessary because std::bitset only provides conversion
     * to unsigned long or unsigned long long via to_ulong() or to_ullong().
@@ -800,6 +806,12 @@ ComputeUnit::init()

 bool
 ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
+{
+    return handleResponse(pkt); // also the entry point for SystemHubEvent
+}
+
+bool
+ComputeUnit::DataPort::handleResponse(PacketPtr pkt)
 {
    // Ruby has completed the memory op. Schedule the mem_resp_event at the
    // appropriate cycle to process the timing memory response
@@ -901,6 +913,12 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)

 bool
 ComputeUnit::ScalarDataPort::recvTimingResp(PacketPtr pkt)
+{
+    return handleResponse(pkt); // also the entry point for SystemHubEvent
+}
+
+bool
+ComputeUnit::ScalarDataPort::handleResponse(PacketPtr pkt)
 {
    assert(!pkt->req->isKernel());

@@ -1241,9 +1259,13 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
    assert(gpuDynInst->isGlobalSeg() ||
           gpuDynInst->executedAs() == enums::SC_GLOBAL);

+    // Fences will never be issued to system memory, so we can mark the
+    // requestor as a device memory ID here.
    if (!req) {
        req = std::make_shared<Request>(
-            0, 0, 0, requestorId(), 0, gpuDynInst->wfDynId);
+            0, 0, 0, vramRequestorId(), 0, gpuDynInst->wfDynId);
+    } else {
+        req->requestorId(vramRequestorId());
    }

    // all mem sync requests have Paddr == 0
@@ -1544,6 +1566,24 @@ ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
            new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
                                                   nullptr);

+    // Set VRAM ID for device requests
+    // For now, system vmem requests use functional reads. This is not that
+    // critical to model as the region of interest should always be accessing
+    // device memory. System vmem requests are used by blit kernels to do
+    // memcpys and load code objects into device memory.
+    if (new_pkt->req->systemReq()) {
+        // There will be multiple packets returned for the same gpuDynInst,
+        // so first check if systemReq is not already set and if so, return
+        // the token acquired when the dispatch list is filled as system
+        // requests do not require a GPU coalescer token.
+        if (!gpuDynInst->isSystemReq()) {
+            computeUnit->getTokenManager()->recvTokens(1);
+            gpuDynInst->setSystemReq();
+        }
+    } else {
+        new_pkt->req->requestorId(computeUnit->vramRequestorId());
+    }
+
    // translation is done. Schedule the mem_req_event at the appropriate
    // cycle to send the timing memory request to ruby
    EventFunctionWrapper *mem_req_event =
@@ -1582,7 +1622,11 @@ ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    [[maybe_unused]] ComputeUnit *compute_unit = computeUnit;

-    if (!(sendTimingReq(pkt))) {
+    if (pkt->req->systemReq()) {
+        assert(compute_unit->shader->systemHub);
+        SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
+        compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
+    } else if (!(sendTimingReq(pkt))) {
        retries.push_back(std::make_pair(pkt, gpuDynInst));

        DPRINTF(GPUPort,
@@ -1611,7 +1655,11 @@ ComputeUnit::ScalarDataPort::MemReqEvent::process()
    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
    [[maybe_unused]] ComputeUnit *compute_unit = scalarDataPort.computeUnit;

-    if (!(scalarDataPort.sendTimingReq(pkt))) {
+    if (pkt->req->systemReq()) {
+        assert(compute_unit->shader->systemHub);
+        SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort);
+        compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
+    } else if (!(scalarDataPort.sendTimingReq(pkt))) {
        scalarDataPort.retries.push_back(pkt);

        DPRINTF(GPUPort,
@@ -1712,15 +1760,26 @@ ComputeUnit::ScalarDTLBPort::recvTimingResp(PacketPtr pkt)
    req_pkt->senderState =
        new ComputeUnit::ScalarDataPort::SenderState(gpuDynInst);

-    if (!computeUnit->scalarDataPort.sendTimingReq(req_pkt)) {
-        computeUnit->scalarDataPort.retries.push_back(req_pkt);
-        DPRINTF(GPUMem, "send scalar req failed for: %s\n",
-                gpuDynInst->disassemble());
+    // For a system request we want to mark the GPU instruction as a system
+    // load/store so that after the request is issued to system memory we can
+    // return any token acquired for the request. Since tokens are returned
+    // by the coalescer and system requests do not take that path, this needs
+    // to be tracked.
+    //
+    // Device requests change the requestor ID to something in the device
+    // memory Ruby network.
+    if (req_pkt->req->systemReq()) {
+        gpuDynInst->setSystemReq();
    } else {
-        DPRINTF(GPUMem, "send scalar req for: %s\n",
-                gpuDynInst->disassemble());
+        req_pkt->req->requestorId(computeUnit->vramRequestorId());
    }

+    ComputeUnit::ScalarDataPort::MemReqEvent *scalar_mem_req_event
+            = new ComputeUnit::ScalarDataPort::MemReqEvent
+                (computeUnit->scalarDataPort, req_pkt);
+    computeUnit->schedule(scalar_mem_req_event, curTick() +
+                          computeUnit->req_tick_latency);
+
    return true;
 }

--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -529,6 +529,28 @@ class ComputeUnit : public ClockedObject
                  saved(sender_state) { }
        };

+        class SystemHubEvent : public Event
+        {
+          DataPort *dataPort;   // port whose handleResponse() is invoked
+          PacketPtr reqPkt;     // request packet handed to the system hub

+          public:
+            SystemHubEvent(PacketPtr pkt, DataPort *_dataPort)
+                : dataPort(_dataPort), reqPkt(pkt)
+            {
+                setFlags(Event::AutoDelete);
+            }

+            void
+            process() override
+            {
+                // DMAs do not operate on packets, so the request is never
+                // turned into a response for us. Do that here instead.
+                reqPkt->makeResponse();
+                dataPort->handleResponse(reqPkt);
+            }
+        };
+
        void processMemReqEvent(PacketPtr pkt);
        EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);

@@ -537,6 +559,8 @@ class ComputeUnit : public ClockedObject

        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;

+        bool handleResponse(PacketPtr pkt);
+
      protected:
        ComputeUnit *computeUnit;

@@ -596,6 +620,30 @@ class ComputeUnit : public ClockedObject
            const char *description() const;
        };

+        class SystemHubEvent : public Event
+        {
+          ScalarDataPort *dataPort; // port whose handleResponse() is invoked
+          PacketPtr reqPkt;         // request packet handed to the system hub

+          public:
+            SystemHubEvent(PacketPtr pkt, ScalarDataPort *_dataPort)
+                : dataPort(_dataPort), reqPkt(pkt)
+            {
+                setFlags(Event::AutoDelete);
+            }

+            void
+            process() override
+            {
+                // DMAs do not operate on packets, so the request is never
+                // turned into a response for us. Do that here instead.
+                reqPkt->makeResponse();
+                dataPort->handleResponse(reqPkt);
+            }
+        };
+
+        bool handleResponse(PacketPtr pkt);
+
        std::deque<PacketPtr> retries;

      private:
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -62,6 +62,10 @@ GlobalMemPipeline::init()
 bool
 GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
 {
+    // System requests do not need GPU coalescer tokens. Make sure nothing
+    // has bypassed the operand gather check stage.
+    assert(!mp->isSystemReq());
+
    // We require one token from the coalescer's uncoalesced table to
    // proceed
    int token_count = 1;
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -476,11 +476,16 @@ class GPUDynInst : public GPUExecContext

    // inst used to save/restore a wavefront context
    bool isSaveRestore;
+
+    bool isSystemReq() const { return systemReq; } // read-only query
+    void setSystemReq() { systemReq = true; } // only ever set to true
+
  private:
    GPUStaticInst *_staticInst;
    const InstSeqNum _seqNum;
    int maxSrcVecRegOpSize;
    int maxSrcScalarRegOpSize;
+    bool systemReq = false;

    // the time the request was started
    Tick accessTime = -1;