diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py
index 8c9895f1c9..972a4f98de 100644
--- a/configs/example/gpufs/system/system.py
+++ b/configs/example/gpufs/system/system.py
@@ -133,6 +133,10 @@ def makeGpuFSSystem(args):
     gpu_mem_mgr = AMDGPUMemoryManager()
     system.pc.south_bridge.gpu.memory_manager = gpu_mem_mgr
 
+    # CPU data path (SystemHub)
+    system_hub = AMDGPUSystemHub()
+    shader.system_hub = system_hub
+
     # GPU, HSAPP, and GPUCommandProc are DMA devices
     system._dma_ports.append(gpu_hsapp)
     system._dma_ports.append(gpu_cmd_proc)
@@ -141,6 +145,7 @@ def makeGpuFSSystem(args):
     system._dma_ports.append(sdma1)
     system._dma_ports.append(device_ih)
     system._dma_ports.append(pm4_pkt_proc)
+    system._dma_ports.append(system_hub)
     system._dma_ports.append(gpu_mem_mgr)
     system._dma_ports.append(hsapp_pt_walker)
     system._dma_ports.append(cp_pt_walker)
@@ -154,6 +159,7 @@ def makeGpuFSSystem(args):
     sdma1.pio = system.iobus.mem_side_ports
     device_ih.pio = system.iobus.mem_side_ports
     pm4_pkt_proc.pio = system.iobus.mem_side_ports
+    system_hub.pio = system.iobus.mem_side_ports
 
     # Full system needs special TLBs for SQC, Scalar, and vector data ports
     args.full_system = True
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index 3e5fba67f9..a0154a77dc 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -224,6 +224,7 @@ class Shader(ClockedObject):
     CUs = VectorParam.ComputeUnit('Number of compute units')
     gpu_cmd_proc = Param.GPUCommandProcessor('Command processor for GPU')
     dispatcher = Param.GPUDispatcher('GPU workgroup dispatcher')
+    system_hub = Param.AMDGPUSystemHub(NULL, 'GPU System Hub (FS Mode only)')
     n_wf = Param.Int(10, 'Number of wavefront slots per SIMD')
     impl_kern_launch_acq = Param.Bool(True, """Insert acq packet into
                                          ruby at kernel launch""")
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index cc6244b2e9..e1794a8b0f 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -979,10 +979,24 @@ ComputeUnit::DataPort::recvReqRetry()
 bool
 ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
 {
-    computeUnit->fetchStage.processFetchReturn(pkt);
+    computeUnit->handleSQCReturn(pkt);
+
     return true;
 }
 
+/**
+ * Pass a returned instruction fetch packet on to the fetch stage.
+ *
+ * Besides SQCPort::recvTimingResp(), this is also called by
+ * FetchUnit::SystemHubEvent::process(), so fetch responses that did not
+ * arrive through the SQC port are handled the same way.
+ */
+void
+ComputeUnit::handleSQCReturn(PacketPtr pkt)
+{
+    fetchStage.processFetchReturn(pkt);
+}
+
 void
 ComputeUnit::SQCPort::recvReqRetry()
 {
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index 87ed541d9d..1c211d9d74 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -463,6 +463,8 @@ class ComputeUnit : public ClockedObject
     bool isDone() const;
     bool isVectorAluIdle(uint32_t simdId) const;
 
+    void handleSQCReturn(PacketPtr pkt);
+
   protected:
     RequestorID _requestorId;
 
diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc
index 6e35818001..640e29bdba 100644
--- a/src/gpu-compute/fetch_unit.cc
+++ b/src/gpu-compute/fetch_unit.cc
@@ -206,6 +206,15 @@ FetchUnit::initiateFetch(Wavefront *wavefront)
 
         computeUnit.sqcTLBPort.sendFunctional(pkt);
 
+        /**
+         * For full system, if this is a device request we need to set the
+         * requestor ID of the packet to the GPU memory manager so it is routed
+         * through Ruby as a memory request and not a PIO request.
+         */
+        if (!pkt->req->systemReq()) {
+            pkt->req->requestorId(computeUnit.vramRequestorId());
+        }
+
         GpuTranslationState *sender_state =
              safe_cast<GpuTranslationState*>(pkt->senderState);
 
@@ -249,6 +258,15 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
         return;
     }
 
+    /**
+     * For full system, if this is a device request we need to set the
+     * requestor ID of the packet to the GPU memory manager so it is routed
+     * through Ruby as a memory request and not a PIO request.
+     */
+    if (!pkt->req->systemReq()) {
+        pkt->req->requestorId(computeUnit.vramRequestorId());
+    }
+
     /**
      * we should have reserved an entry in the fetch buffer
      * for this cache line. here we get the pointer to the
@@ -263,7 +281,11 @@ FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
     if (timingSim) {
         // translation is done. Send the appropriate timing memory request.
 
-        if (!computeUnit.sqcPort.sendTimingReq(pkt)) {
+        if (pkt->req->systemReq()) {
+            SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
+            assert(computeUnit.shader->systemHub);
+            computeUnit.shader->systemHub->sendRequest(pkt, resp_event);
+        } else if (!computeUnit.sqcPort.sendTimingReq(pkt)) {
             computeUnit.sqcPort.retries.push_back(std::make_pair(pkt,
                                                                    wavefront));
 
@@ -643,4 +665,20 @@ FetchUnit::FetchBufDesc::fetchBytesRemaining() const
     return bytes_remaining;
 }
 
+/**
+ * Callback for a fetch that FetchUnit::fetch() sent to the system hub
+ * instead of the SQC port. The request packet is converted into a
+ * response and handed to ComputeUnit::handleSQCReturn(), the same entry
+ * point SQCPort::recvTimingResp() uses.
+ *
+ * NOTE(review): when the hub schedules this event is decided in
+ * AMDGPUSystemHub::sendRequest(), which is not part of this patch.
+ */
+void
+FetchUnit::SystemHubEvent::process()
+{
+    reqPkt->makeResponse();
+    fetchUnit->computeUnit.handleSQCReturn(reqPkt);
+}
+
 } // namespace gem5
diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh
index 6002665bb0..0ba88c7d95 100644
--- a/src/gpu-compute/fetch_unit.hh
+++ b/src/gpu-compute/fetch_unit.hh
@@ -44,6 +44,7 @@
 #include "config/the_gpu_isa.hh"
 #include "gpu-compute/scheduler.hh"
 #include "mem/packet.hh"
+#include "sim/eventq.hh"
 
 namespace gem5
 {
@@ -238,6 +239,35 @@ class FetchUnit
         TheGpuISA::Decoder *_decoder;
     };
 
+    /**
+     * Response event for instruction fetches that go through the system
+     * hub rather than the SQC port. FetchUnit::fetch() creates one per
+     * such request and passes it to AMDGPUSystemHub::sendRequest();
+     * process() then returns the packet to the compute unit.
+     */
+    class SystemHubEvent : public Event
+    {
+        // Fetch unit that issued the request.
+        FetchUnit *fetchUnit;
+        // Request packet; process() turns it into the response.
+        PacketPtr reqPkt;
+
+      public:
+        /**
+         * @param pkt Fetch request being sent to the system hub.
+         * @param fetch_unit Fetch unit that issued pkt.
+         */
+        SystemHubEvent(PacketPtr pkt, FetchUnit *fetch_unit)
+            : fetchUnit(fetch_unit), reqPkt(pkt)
+        {
+            // Instances are created with new and have no other owner,
+            // so let the event queue delete them once processed.
+            setFlags(Event::AutoDelete);
+        }
+
+        void process() override;
+    };
+
     bool timingSim;
     ComputeUnit &computeUnit;
     TheGpuISA::Decoder decoder;
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
index ebacbb5c1f..73d2366b74 100644
--- a/src/gpu-compute/shader.cc
+++ b/src/gpu-compute/shader.cc
@@ -65,7 +65,7 @@ Shader::Shader(const Params &p) : ClockedObject(p),
     trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
     globalMemSize(p.globalmem),
     nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
-    _dispatcher(*p.dispatcher),
+    _dispatcher(*p.dispatcher), systemHub(p.system_hub),
     max_valu_insts(p.max_valu_insts), total_valu_insts(0),
     stats(this, p.CUs[0]->wfSize())
 {
diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh
index 96ad15db8e..0978acb376 100644
--- a/src/gpu-compute/shader.hh
+++ b/src/gpu-compute/shader.hh
@@ -44,6 +44,7 @@
 #include "cpu/simple_thread.hh"
 #include "cpu/thread_context.hh"
 #include "cpu/thread_state.hh"
+#include "dev/amdgpu/system_hub.hh"
 #include "gpu-compute/compute_unit.hh"
 #include "gpu-compute/gpu_dyn_inst.hh"
 #include "gpu-compute/hsa_queue_entry.hh"
@@ -225,6 +226,7 @@ class Shader : public ClockedObject
 
     GPUCommandProcessor &gpuCmdProc;
     GPUDispatcher &_dispatcher;
+    AMDGPUSystemHub *systemHub;
 
     int64_t max_valu_insts;
     int64_t total_valu_insts;