dev-amdgpu, gpu-compute, mem-ruby: Add support for writeback L2 in GPU (#1692)

Previously, GPU L2 caches could be configured in either writeback or
writethrough mode when used in an APU. However, in a CPU+dGPU system,
only writethrough worked. This is mainly because, in a CPU+dGPU system,
the CPU sends either PCI or SDMA requests to transfer data from the GPU
memory to the CPU. When the L2 cache is configured to be writeback, the
dirty data resides in the L2 while the CPU transfers data from GPU
memory. This leads to the wrong version being transferred. A similar
issue also crops up when the GPU command processor reads kernel
information before kernel dispatch, only to read incorrect data. This
PR contains a set of commits that fix both these issues.
This commit is contained in:
Vishnu Ramadas
2024-11-05 12:45:46 -06:00
committed by GitHub
parent 940f49b63b
commit d463868f28
11 changed files with 286 additions and 23 deletions

View File

@@ -364,14 +364,27 @@ AMDGPUDevice::readFrame(PacketPtr pkt, Addr offset)
* because this method is called by the PCIDevice::read method which
* is a non-timing read.
*/
RequestPtr req = std::make_shared<Request>(offset, pkt->getSize(), 0,
vramRequestorId());
PacketPtr readPkt = Packet::createRead(req);
RequestPtr req = std::make_shared<Request>(
offset, pkt->getSize(), 0, vramRequestorId());
PacketPtr readPkt = new Packet(req, MemCmd::ReadReq);
uint8_t *dataPtr = new uint8_t[pkt->getSize()];
readPkt->dataDynamic(dataPtr);
readPkt->req->setGPUFuncAccess(true);
readPkt->setSuppressFuncError();
cp->shader()->cuList[0]->memPort[0].sendFunctional(readPkt);
if (readPkt->cmd == MemCmd::FunctionalReadError) {
delete readPkt;
delete[] dataPtr;
RequestPtr req = std::make_shared<Request>(offset, pkt->getSize(), 0,
vramRequestorId());
PacketPtr readPkt = Packet::createRead(req);
uint8_t *dataPtr = new uint8_t[pkt->getSize()];
readPkt->dataDynamic(dataPtr);
auto system = cp->shader()->gpuCmdProc.system();
system->getDeviceMemory(readPkt)->access(readPkt);
auto system = cp->shader()->gpuCmdProc.system();
system->getDeviceMemory(readPkt)->access(readPkt);
}
pkt->setUintX(readPkt->getUintX(ByteOrder::little), ByteOrder::little);
delete readPkt;

View File

@@ -1062,7 +1062,18 @@ ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
* and doesn't have a wavefront or instruction associated with it.
*/
if (sender_state->wavefront != nullptr) {
computeUnit->handleSQCReturn(pkt);
RequestPtr req = pkt->req;
// If the sender state's isKernDispath is set, then the request came
// from the gpu command processor. The request fetches information
// that will be used in the kernel dispatch process. It should be
// handled in the gpu command processor. If the flag isn't set,
// then the request is an instruction fetch and can be handled in
// the compute unit
if (sender_state->isKernDispatch) {
computeUnit->shader->gpuCmdProc.completeTimingRead();
} else {
computeUnit->handleSQCReturn(pkt);
}
} else {
delete pkt->senderState;
delete pkt;

View File

@@ -685,11 +685,17 @@ class ComputeUnit : public ClockedObject
Packet::SenderState *saved;
// kernel id to be used in handling I-Cache invalidate response
int kernId;
bool isKernDispatch;
SenderState(Wavefront *_wavefront, Packet::SenderState
*sender_state=nullptr, int _kernId=-1)
: wavefront(_wavefront), saved(sender_state),
kernId(_kernId){ }
kernId(_kernId), isKernDispatch(false){ }
SenderState(Wavefront *_wavefront, bool _isKernDispatch,
Packet::SenderState *sender_state=nullptr, int _kernId=-1)
: wavefront(_wavefront), saved(sender_state),
kernId(_kernId), isKernDispatch(_isKernDispatch){ }
};
class MemReqEvent : public Event

View File

@@ -40,6 +40,7 @@
#include "debug/GPUInitAbi.hh"
#include "debug/GPUKernelInfo.hh"
#include "dev/amdgpu/amdgpu_device.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/shader.hh"
#include "mem/abstract_mem.hh"
@@ -100,6 +101,37 @@ GPUCommandProcessor::translate(Addr vaddr, Addr size)
1 /* vmid */, vaddr, size));
}
void
GPUCommandProcessor::performTimingRead(PacketPtr pkt)
{
// Use the shader to access the CUs and call the read request from
// the SQC port. Call submit kernel dispatch in the timing response
// function in receive timing response of SQC port. Schedule this
// timing read when...just currTick
ComputeUnit *cu = shader()->cuList[0];
pkt->senderState = new ComputeUnit::SQCPort::SenderState(
cu->wfList[0][0], true);
ComputeUnit::SQCPort::SenderState *sender_state =
safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);
ComputeUnit::SQCPort sqc_port = cu->sqcPort;
if (!sqc_port.sendTimingReq(pkt)) {
sqc_port.retries.push_back(
std::pair<PacketPtr, Wavefront*>(pkt,
sender_state->wavefront));
}
}
void
GPUCommandProcessor::completeTimingRead()
{
struct KernelDispatchData dispatchData = kernelDispatchList.front();
kernelDispatchList.pop_front();
delete dispatchData.readPkt;
if (kernelDispatchList.size() == 0)
dispatchKernelObject(dispatchData.akc, dispatchData.raw_pkt,
dispatchData.queue_id, dispatchData.host_pkt_addr);
}
/**
* submitDispatchPkt() is the entry point into the CP from the HSAPP
* and is only meant to be used with AQL kernel dispatch packets.
@@ -236,16 +268,20 @@ GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
RequestPtr request = std::make_shared<Request>(chunk_addr,
akc_alignment_granularity, flags,
walker->getDevRequestor());
Packet *readPkt = new Packet(request, MemCmd::ReadReq);
PacketPtr readPkt = new Packet(request, MemCmd::ReadReq);
readPkt->dataStatic((uint8_t *)akc + gen.complete());
// If the request spans two device memories, the device memory
// returned will be null.
assert(system()->getDeviceMemory(readPkt) != nullptr);
system()->getDeviceMemory(readPkt)->access(readPkt);
delete readPkt;
struct KernelDispatchData dispatchData;
dispatchData.akc = akc;
dispatchData.raw_pkt = raw_pkt;
dispatchData.queue_id = queue_id;
dispatchData.host_pkt_addr = host_pkt_addr;
dispatchData.readPkt = readPkt;
kernelDispatchList.push_back(dispatchData);
performTimingRead(readPkt);
}
dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
}
}
}

View File

@@ -85,12 +85,27 @@ class GPUCommandProcessor : public DmaVirtDevice
Shader* shader();
GPUComputeDriver* driver();
struct KernelDispatchData
{
AMDKernelCode *akc;
void *raw_pkt;
uint32_t queue_id;
Addr host_pkt_addr;
PacketPtr readPkt;
};
std::list<struct KernelDispatchData> kernelDispatchList;
enum AgentCmd
{
Nop = 0,
Steal = 1
};
void performTimingRead(PacketPtr pkt);
void completeTimingRead();
void submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
Addr host_pkt_addr);
void submitDispatchPkt(void *raw_pkt, uint32_t queue_id,

View File

@@ -473,6 +473,8 @@ class Request : public Extensible<Request>
/** The cause for HTM transaction abort */
HtmFailureFaultCause _htmAbortCause = HtmFailureFaultCause::INVALID;
bool _isGPUFuncAccess;
public:
/**
@@ -493,6 +495,7 @@ class Request : public Extensible<Request>
_flags.set(flags);
privateFlags.set(VALID_PADDR|VALID_SIZE);
_byteEnable = std::vector<bool>(size, true);
_isGPUFuncAccess = false;
}
Request(Addr vaddr, unsigned size, Flags flags,
@@ -502,6 +505,7 @@ class Request : public Extensible<Request>
setVirt(vaddr, size, flags, id, pc, std::move(atomic_op));
setContext(cid);
_byteEnable = std::vector<bool>(size, true);
_isGPUFuncAccess = false;
}
Request(const Request& other)
@@ -1124,6 +1128,17 @@ class Request : public Extensible<Request>
bool isCacheInvalidate() const { return _flags.isSet(INVALIDATE); }
bool isCacheMaintenance() const { return _flags.isSet(CLEAN|INVALIDATE); }
/** @} */
void
setGPUFuncAccess(bool flag) {
_isGPUFuncAccess = flag;
}
bool
getGPUFuncAccess()
{
return _isGPUFuncAccess;
}
};
} // namespace gem5

View File

@@ -73,6 +73,7 @@ machine(MachineType:TCC, "TCC Cache")
// Probes
PrbInv, desc="Invalidating probe";
InvCache, desc="Invalidating probe from TCP";
PrbDowngrade, desc="Downgrading probe";
// Coming from Memory Controller
WBAck, desc="writethrough ack from memory";
Bypass, desc="Bypass the entire L2 cache";
@@ -180,8 +181,11 @@ machine(MachineType:TCC, "TCC Cache")
void functionalRead(Addr addr, Packet *pkt) {
TBE tbe := TBEs.lookup(addr);
Entry cache_entry := getCacheEntry(addr);
if(is_valid(tbe)) {
testAndRead(addr, tbe.DataBlk, pkt);
} else if (is_valid(cache_entry)) {
testAndRead(addr, cache_entry.DataBlk, pkt);
} else {
functionalMemoryRead(pkt);
}
@@ -345,7 +349,14 @@ machine(MachineType:TCC, "TCC Cache")
DPRINTF(RubySlicc, "%s\n", in_msg);
Entry cache_entry := getCacheEntry(in_msg.addr);
TBE tbe := TBEs.lookup(in_msg.addr);
trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
if (in_msg.Type == ProbeRequestType:PrbInv) {
// Invalidate data and send it downstream
trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
} else {
// If data present in cache, then downgrade it and send it
// downstream
trigger(Event:PrbDowngrade, in_msg.addr, cache_entry, tbe);
}
}
}
}
@@ -815,6 +826,28 @@ machine(MachineType:TCC, "TCC Cache")
}
}
action(pd_sendProbeResponseDowngrade, "pd", desc="send probe downgrade") {
enqueue(responseToNB_out, ResponseMsg, 1) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes
out_msg.Sender := machineID;
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
if (getState(tbe, cache_entry, address) == State:V || getState(tbe, cache_entry, address) == State:M || getState(tbe, cache_entry, address) == State:W) {
out_msg.Hit := true;
out_msg.Dirty := true;
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.MessageSize := MessageSizeType:Response_Data;
} else {
out_msg.Hit := false;
out_msg.Dirty := false;
out_msg.MessageSize := MessageSizeType:Response_Control;
}
out_msg.Ntsl := true;
out_msg.State := CoherenceState:NA;
}
}
action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
enqueue(responseToNB_out, ResponseMsg, 1) {
out_msg.addr := address;
@@ -1212,6 +1245,25 @@ machine(MachineType:TCC, "TCC Cache")
p_popRequestQueue;
}
transition(I, PrbDowngrade) {TagArrayRead} {
pd_sendProbeResponseDowngrade;
pp_popProbeQueue;
}
transition(V, PrbDowngrade) {TagArrayRead} {
pd_sendProbeResponseDowngrade;
pp_popProbeQueue;
}
transition({M, W}, PrbDowngrade, V) {TagArrayRead, TagArrayWrite} {
pd_sendProbeResponseDowngrade;
pp_popProbeQueue;
}
transition({A, IV, WI, WIB}, PrbDowngrade) {TagArrayRead, TagArrayWrite} {
st_stallAndWaitRequest;
}
transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
pi_sendProbeResponseInv;
pp_popProbeQueue;

View File

@@ -173,8 +173,11 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
void functionalRead(Addr addr, Packet *pkt) {
TBE tbe := TBEs.lookup(addr);
Entry cache_entry := getCacheEntry(addr);
if(is_valid(tbe)) {
testAndRead(addr, tbe.DataBlk, pkt);
} else if (is_valid(cache_entry)) {
testAndRead(addr, cache_entry.DataBlk, pkt);
} else {
functionalMemoryRead(pkt);
}

View File

@@ -84,7 +84,9 @@ machine(MachineType:Directory, "AMD Baseline protocol")
BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack";
BL2, AccessPermission:Backing_Store, desc="Blocked checking for data in L2";
BL2_Pm, AccessPermission:Backing_Store, desc="Blocked waiting for probes, already got memory";
BL2_M, AccessPermission:Backing_Store, desc="Blocked waiting for memory";
F, AccessPermission:Busy, desc="sent Flus, blocked till ack";
}
@@ -105,6 +107,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
// probe responses
CPUPrbResp, desc="Probe Response Msg";
CPUPrbRespWB, desc="Probe Response Msg and Data";
ProbeAcksComplete, desc="Probe Acks Complete";
@@ -121,6 +124,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
// DMA
DmaRead, desc="DMA read";
DmaReadWB, desc="DMA read write back";
DmaWrite, desc="DMA write";
// Flush
@@ -300,7 +304,11 @@ machine(MachineType:Directory, "AMD Baseline protocol")
TBE tbe := TBEs.lookup(in_msg.LineAddress);
CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.LineAddress));
if (in_msg.Type == DMARequestType:READ) {
trigger(Event:DmaRead, in_msg.LineAddress, entry, tbe);
if (L2isWB) {
trigger(Event:DmaReadWB, in_msg.LineAddress, entry, tbe);
} else {
trigger(Event:DmaRead, in_msg.LineAddress, entry, tbe);
}
} else if (in_msg.Type == DMARequestType:WRITE) {
trigger(Event:DmaWrite, in_msg.LineAddress, entry, tbe);
} else {
@@ -359,7 +367,11 @@ machine(MachineType:Directory, "AMD Baseline protocol")
TBE tbe := TBEs.lookup(in_msg.addr);
CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe);
if (in_msg.Hit == true && L2isWB) {
trigger(Event:CPUPrbRespWB, in_msg.addr, entry, tbe);
} else {
trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe);
}
} else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
trigger(Event:StaleWB, in_msg.addr, entry, tbe);
} else {
@@ -825,6 +837,12 @@ machine(MachineType:Directory, "AMD Baseline protocol")
TCC_select_low_bit,
TCC_select_num_bits));
}
if (GPUonly && L2isWB) {
probe_dests.add(mapAddressToRange(address, MachineType:TCC,
TCC_select_low_bit,
TCC_select_num_bits));
}
}
probe_dests.remove(in_msg.Requestor);
@@ -1100,6 +1118,7 @@ machine(MachineType:Directory, "AMD Baseline protocol")
action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") {
peek(memQueue_in, MemoryMsg) {
DPRINTF(RubySlicc, "%s\n", in_msg);
if (tbe.wtData == true) {
// Keep the write-through data based on mask, but use the memory block
// for the masked-off data. If we received a probe with data, the mask
@@ -1115,6 +1134,20 @@ machine(MachineType:Directory, "AMD Baseline protocol")
}
}
action(yw_writeProbeDataToTBEWB, "yw", desc="write Probe Data to TBE") {
peek(responseNetwork_in, ResponseMsg) {
DPRINTF(RubySlicc, "%s\n", in_msg);
if (tbe.Dirty == false) {
tbe.DataBlk := in_msg.DataBlk;
tbe.Dirty := in_msg.Dirty;
tbe.LastSender := in_msg.Sender;
tbe.Cached := true;
tbe.MemData := true;
}
}
}
action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") {
peek(responseNetwork_in, ResponseMsg) {
if (in_msg.Dirty) {
@@ -1315,22 +1348,29 @@ machine(MachineType:Directory, "AMD Baseline protocol")
*/
// TRANSITIONS
transition({BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) {
transition({BL, BL2, BL2_Pm, BL2_M, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) {
st_stallAndWaitRequest;
}
// It may be possible to save multiple invalidations here!
transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {Atomic, WriteThrough}) {
// It may be possible to save multiple invalidations here!
transition({BL, BL2, BL2_Pm, BL2_M, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {Atomic, WriteThrough}) {
st_stallAndWaitRequest;
}
// The exit state is always going to be U, so wakeUpDependents logic should be covered in all the
// transitions which are flowing into U.
transition({BL, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, {DmaRead,DmaWrite}){
transition({BL, BL2, BL2_Pm, BL2_M, BDR_M, BDW_M, BS_M, BM_M, B_M, BP, BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, B}, {DmaRead, DmaReadWB, DmaWrite}){
sd_stallAndWaitRequest;
}
// transitions from U
transition(U, DmaReadWB, BL2) {
atd_allocateTBEforDMA; // Allocate a TBE
qdr_queueDmaRdReq;
pr_profileL3HitMiss;
scd_probeShrCoreDataForDma; // Send probes to the Ruby Network
}
transition(U, DmaRead, BDR_PM) {L3TagArrayRead} {
atd_allocateTBEforDMA;
qdr_queueDmaRdReq;
@@ -1567,13 +1607,75 @@ machine(MachineType:Directory, "AMD Baseline protocol")
ptl_popTriggerQueue;
}
transition({BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, BP}, CPUPrbResp) {
transition({BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, BP, BL2}, CPUPrbResp) {
y_writeProbeDataToTBE;
x_decrementAcks;
o_checkForCompletion;
pr_popResponseQueue;
}
transition({BDR_PM, BDW_PM, BS_PM, BM_PM, B_PM, BDR_Pm, BDW_Pm, BS_Pm, BM_Pm, B_Pm, BP}, CPUPrbRespWB) {
y_writeProbeDataToTBE;
x_decrementAcks;
o_checkForCompletion;
pr_popResponseQueue;
}
transition(BL2, L3Hit, BL2_Pm) {
ptl_popTriggerQueue;
}
transition({BL2, BL2_Pm}, CPUPrbRespWB, BL2_Pm) {
// Blocked on L2 and waiting for probes
yw_writeProbeDataToTBEWB;
x_decrementAcks;
o_checkForCompletion;
pr_popResponseQueue;
}
transition(BL2_Pm, CPUPrbResp) {
// Blocked on L2 probes, got the memory
x_decrementAcks;
o_checkForCompletion;
pr_popResponseQueue;
}
transition(BL2, ProbeAcksComplete, BL2_M) {
// We probed all the TCC dirs but didn't find the memory
// Send out memory request
// Transition to waiting on memory
pt_popTriggerQueue;
}
transition(BL2_Pm, ProbeAcksComplete, U) {
// We were waiting for all probes to come back now that they have we can unblock
// Send WBAck back to TCC
dd_sendResponseDmaData;
wada_wakeUpAllDependentsAddr;
dt_deallocateTBE;
pd_popDmaRequestQueue;
pt_popTriggerQueue;
}
transition(BL2, MemData, BL2_Pm) {
mt_writeMemDataToTBE;
pm_popMemQueue;
}
transition({BL2_Pm, U}, MemData) {
pm_popMemQueue;
}
transition(BL2_M, MemData, U) {
// Got the memory we were waiting for. We can unblock now.
mt_writeMemDataToTBE;
dd_sendResponseDmaData;
wada_wakeUpAllDependentsAddr;
pd_popDmaRequestQueue;
dt_deallocateTBE;
pm_popMemQueue;
}
transition(BDR_PM, ProbeAcksComplete, BDR_M) {
pt_popTriggerQueue;
}

View File

@@ -393,6 +393,11 @@ RubyPort::MemResponsePort::recvFunctional(PacketPtr pkt)
{
DPRINTF(RubyPort, "Functional access for address: %#x\n", pkt->getAddr());
// In a CPU+dGPU system, GPU functional packets are injected into
// the CPU network. This happens because the requestorId is automatically
// set to that of the CPU network for these packets. Here, we set it
// to that of the GPU RubyPort so that it uses the right network to
// access GPU caches
RubySystem *rs = owner.m_ruby_system;
// Check for pio requests and directly send them to the dedicated
@@ -407,6 +412,10 @@ RubyPort::MemResponsePort::recvFunctional(PacketPtr pkt)
assert(pkt->getAddr() + pkt->getSize() <=
owner.makeLineAddress(pkt->getAddr()) + rs->getBlockSizeBytes());
if (pkt->req->getGPUFuncAccess()) {
pkt->req->requestorId(owner.m_controller->getRequestorId());
}
if (access_backing_store) {
// The attached physmem contains the official version of data.
// The following command performs the real functional access.

View File

@@ -560,7 +560,8 @@ RubySystem::functionalRead(PacketPtr pkt)
// it only if it's not in the cache hierarchy at all.
int num_controllers = netCntrls[request_net_id].size();
if (num_invalid == (num_controllers - 1) && num_backing_store == 1) {
DPRINTF(RubySystem, "only copy in Backing_Store memory, read from it\n");
DPRINTF(RubySystem,
"only copy in Backing_Store memory, read from it\n");
ctrl_backing_store->functionalRead(line_address, pkt);
return true;
} else if (num_ro > 0 || num_rw >= 1) {