From 359ac63280fa591c1343f897ba495b292f390f4e Mon Sep 17 00:00:00 2001
From: KaiBatley <kaisbat@gmail.com>
Date: Wed, 3 Jan 2024 21:30:20 -0600
Subject: [PATCH] gpu-compute: Added register file cache support

The register file cache (RFC) defaults to a size of 0, which disables it
completely. To use the RFC, set --register-file-cache-size to a non-zero
multiple of two. In addition, rfc_pipe_length may be altered to increase
or decrease the RFC latency benefit.

Change-Id: I6f5bf5b750eb64155fbc8c8343e9feadce5c9f79
---
 configs/example/apu_se.py                  |  13 ++
 configs/example/gpufs/amd/AmdGPUOptions.py |   6 +
 configs/example/gpufs/system/amdgpu.py     |   7 +
 src/gpu-compute/GPU.py                     |  16 ++
 src/gpu-compute/SConscript                 |   8 +-
 src/gpu-compute/compute_unit.cc            |   4 +
 src/gpu-compute/compute_unit.hh            |   6 +
 src/gpu-compute/register_file.cc           |   4 +
 src/gpu-compute/register_file.hh           |   4 +
 src/gpu-compute/register_file_cache.cc     | 170 +++++++++++++++++++++
 src/gpu-compute/register_file_cache.hh     | 113 ++++++++++++++
 src/gpu-compute/schedule_stage.cc          |   1 +
 src/gpu-compute/vector_register_file.cc    |  23 ++-
 src/gpu-compute/wavefront.cc               |   2 +
 14 files changed, 372 insertions(+), 5 deletions(-)
 create mode 100644 src/gpu-compute/register_file_cache.cc
 create mode 100644 src/gpu-compute/register_file_cache.hh

diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index b20779fcdb..68e678f76b 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -331,6 +331,12 @@ parser.add_argument(
     default="dynamic",
     help="register allocation policy (simple/dynamic)",
 )
+parser.add_argument(
+    "--register-file-cache-size",
+    type=int,
+    default=0,  # 0 leaves the register file cache disabled
+    help="number of registers in cache",
+)
 
 parser.add_argument(
     "--dgpu",
@@ -489,6 +495,7 @@ for i in range(n_cu):
     vrfs = []
     vrf_pool_mgrs = []
     srfs = []
+    rfcs = []
     srf_pool_mgrs = []
     for j in range(args.simds_per_cu):
         for k in range(shader.n_wf):
@@ -533,10 +540,16 @@ for i in range(n_cu):
                 simd_id=j, wf_size=args.wf_size, num_regs=args.sreg_file_size
             )
         )
+        rfcs.append(
+            RegisterFileCache(
+                simd_id=j, cache_size=args.register_file_cache_size
+            )
+        )
 
     compute_units[-1].wavefronts = wavefronts
     compute_units[-1].vector_register_file = vrfs
     compute_units[-1].scalar_register_file = srfs
+    compute_units[-1].register_file_cache = rfcs
     compute_units[-1].register_manager = RegisterManager(
         policy=args.registerManagerPolicy,
         vrf_pool_managers=vrf_pool_mgrs,
diff --git a/configs/example/gpufs/amd/AmdGPUOptions.py b/configs/example/gpufs/amd/AmdGPUOptions.py
index 3d6a8cc48e..9996d33a2e 100644
--- a/configs/example/gpufs/amd/AmdGPUOptions.py
+++ b/configs/example/gpufs/amd/AmdGPUOptions.py
@@ -247,3 +247,9 @@ def addAmdGPUOptions(parser):
         default="simple",
         help="register allocation policy (simple/dynamic)",
     )
+    parser.add_argument(
+        "--register-file-cache-size",
+        type=int,
+        default=0,
+        help="number of registers in cache",
+    )
diff --git a/configs/example/gpufs/system/amdgpu.py b/configs/example/gpufs/system/amdgpu.py
index 4bca52c77e..30e059d154 100644
--- a/configs/example/gpufs/system/amdgpu.py
+++ b/configs/example/gpufs/system/amdgpu.py
@@ -84,6 +84,7 @@ def createGPU(system, args):
         vrfs = []
         vrf_pool_mgrs = []
         srfs = []
+        rfcs = []
         srf_pool_mgrs = []
         for j in range(args.simds_per_cu):
             for k in range(shader.n_wf):
@@ -133,10 +134,16 @@ def createGPU(system, args):
                     num_regs=args.sreg_file_size,
                 )
             )
+            rfcs.append(
+                RegisterFileCache(
+                    simd_id=j, cache_size=args.register_file_cache_size
+                )
+            )
 
         compute_units[-1].wavefronts = wavefronts
         compute_units[-1].vector_register_file = vrfs
         compute_units[-1].scalar_register_file = srfs
+        compute_units[-1].register_file_cache = rfcs
         compute_units[-1].register_manager = RegisterManager(
             policy=args.registerManagerPolicy,
             vrf_pool_managers=vrf_pool_mgrs,
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index 1b6c6a7494..fe0586fc16 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -96,6 +96,14 @@ class VectorRegisterFile(RegisterFile):
     cxx_header = "gpu-compute/vector_register_file.hh"
 
 
+class RegisterFileCache(SimObject):
+    type = "RegisterFileCache"
+    cxx_class = "gem5::RegisterFileCache"
+    cxx_header = "gpu-compute/register_file_cache.hh"
+    simd_id = Param.Int("SIMD ID associated with this Register File Cache")
+    cache_size = Param.Int(0, "number of entries of rfc")  # 0 disables the RFC
+
+
 class RegisterManager(SimObject):
     type = "RegisterManager"
     cxx_class = "gem5::RegisterManager"
@@ -150,6 +158,11 @@ class ComputeUnit(ClockedObject):
     dpbypass_pipe_length = Param.Int(
         4, "vector ALU Double Precision bypass latency"
     )
+
+    rfc_pipe_length = Param.Int(
+        2, "number of cycles per register file cache access"
+    )
+
     scalar_pipe_length = Param.Int(1, "number of pipe stages per scalar ALU")
     issue_period = Param.Int(4, "number of cycles per issue period")
 
@@ -261,6 +274,9 @@ class ComputeUnit(ClockedObject):
     scalar_register_file = VectorParam.ScalarRegisterFile(
         "Scalar register file"
     )
+
+    register_file_cache = VectorParam.RegisterFileCache("Register file cache")
+
     out_of_order_data_delivery = Param.Bool(
         False, "enable OoO data delivery in the GM pipeline"
     )
diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript
index 81f02d8282..e4536ba2a5 100644
--- a/src/gpu-compute/SConscript
+++ b/src/gpu-compute/SConscript
@@ -38,7 +38,7 @@ SimObject('GPU.py', sim_objects=[
     'PoolManager', 'SimplePoolManager', 'DynPoolManager', 'RegisterFile',
     'ScalarRegisterFile', 'VectorRegisterFile', 'RegisterManager', 'Wavefront',
     'ComputeUnit', 'Shader', 'GPUComputeDriver', 'GPURenderDriver',
-    'GPUDispatcher', 'GPUCommandProcessor'],
+    'GPUDispatcher', 'GPUCommandProcessor', 'RegisterFileCache'],
     enums=['PrefetchType', 'GfxVersion', 'StorageClassType'])
 SimObject('GPUStaticInstFlags.py', enums=['GPUStaticInstFlags'])
 SimObject('LdsState.py', sim_objects=['LdsState'])
@@ -71,6 +71,7 @@ Source('dyn_pool_manager.cc')
 Source('simple_pool_manager.cc')
 Source('static_register_manager_policy.cc')
 Source('vector_register_file.cc')
+Source('register_file_cache.cc')
 Source('wavefront.cc')
 
 DebugFlag('GPUAgentDisp')
@@ -96,6 +97,7 @@ DebugFlag('GPUSRF')
 DebugFlag('GPUSync')
 DebugFlag('GPUTLB')
 DebugFlag('GPUVRF')
+DebugFlag('GPURFC')
 DebugFlag('GPUVRFSched')
 DebugFlag('GPUWgLatency')
 DebugFlag('Predictor')
@@ -103,5 +105,5 @@ DebugFlag('WavefrontStack')
 
 CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
                         'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync',
-                        'GPUTLB', 'GPUVRF', 'GPUWgLatency', 'GPUKernelInfo',
-                        'GPUInitAbi'])
+                        'GPUTLB', 'GPUVRF', 'GPURFC', 'GPUWgLatency',
+                        'GPUKernelInfo', 'GPUInitAbi'])
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 8d6deeb85a..8259f0a950 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -50,6 +50,7 @@
 #include "gpu-compute/gpu_command_processor.hh"
 #include "gpu-compute/gpu_dyn_inst.hh"
 #include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/register_file_cache.hh"
 #include "gpu-compute/scalar_register_file.hh"
 #include "gpu-compute/shader.hh"
 #include "gpu-compute/simple_pool_manager.hh"
@@ -82,9 +83,11 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
           false, Event::CPU_Tick_Pri),
     cu_id(p.cu_id),
     vrf(p.vector_register_file), srf(p.scalar_register_file),
+    rfc(p.register_file_cache),
     simdWidth(p.simd_width),
     spBypassPipeLength(p.spbypass_pipe_length),
     dpBypassPipeLength(p.dpbypass_pipe_length),
+    rfcPipeLength(p.rfc_pipe_length),
     scalarPipeStages(p.scalar_pipe_length),
     operandNetworkLength(p.operand_network_length),
     issuePeriod(p.issue_period),
@@ -207,6 +210,7 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
 
     for (int i = 0; i < vrf.size(); ++i) {
         vrf[i]->setParent(this);
+        rfc[i]->setParent(this);
     }
     for (int i = 0; i < srf.size(); ++i) {
         srf[i]->setParent(this);
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
index cf73aa2723..e6bc03da7d 100644
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -66,6 +66,7 @@ class LdsChunk;
 class ScalarRegisterFile;
 class Shader;
 class VectorRegisterFile;
+class RegisterFileCache;
 
 struct ComputeUnitParams;
 
@@ -296,6 +297,8 @@ class ComputeUnit : public ClockedObject
     // array of scalar register files, one per SIMD
     std::vector<ScalarRegisterFile*> srf;
 
+    std::vector<RegisterFileCache*> rfc;
+
     // Width per VALU/SIMD unit: number of work items that can be executed
     // on the vector ALU simultaneously in a SIMD unit
     int simdWidth;
@@ -305,6 +308,8 @@ class ComputeUnit : public ClockedObject
     // number of pipe stages for bypassing data to next dependent double
     // precision vector instruction inside the vector ALU pipeline
     int dpBypassPipeLength;
+    // number of pipe stages for register file cache
+    int rfcPipeLength;
     // number of pipe stages for scalar ALU
     int scalarPipeStages;
     // number of pipe stages for operand collection & distribution network
@@ -390,6 +395,7 @@ class ComputeUnit : public ClockedObject
     int simdUnitWidth() const { return simdWidth; }
     int spBypassLength() const { return spBypassPipeLength; }
     int dpBypassLength() const { return dpBypassPipeLength; }
+    int rfcLength() const { return rfcPipeLength; }
     int scalarPipeLength() const { return scalarPipeStages; }
     int storeBusLength() const { return numCyclesPerStoreTransfer; }
     int loadBusLength() const { return numCyclesPerLoadTransfer; }
diff --git a/src/gpu-compute/register_file.cc b/src/gpu-compute/register_file.cc
index 62510e47b7..ec35c01528 100644
--- a/src/gpu-compute/register_file.cc
+++ b/src/gpu-compute/register_file.cc
@@ -194,6 +194,10 @@ RegisterFile::RegisterFileStats::RegisterFileStats(statistics::Group *parent)
     : statistics::Group(parent),
       ADD_STAT(registerReads,
               "Total number of DWORDs read from register file"),
+      ADD_STAT(rfc_cache_read_hits,
+              "Total number of DWORDs read from register file cache"),
+      ADD_STAT(rfc_cache_write_hits,
+              "Total number of writes to existing registers in the rfc"),
       ADD_STAT(registerWrites,
               "Total number of DWORDS written to register file"),
       ADD_STAT(sramReads,
diff --git a/src/gpu-compute/register_file.hh b/src/gpu-compute/register_file.hh
index 089516ce27..0d1a12b7b7 100644
--- a/src/gpu-compute/register_file.hh
+++ b/src/gpu-compute/register_file.hh
@@ -158,6 +158,10 @@ class RegisterFile : public SimObject
 
         // Total number of register reads per DWORD per thread
         statistics::Scalar registerReads;
+
+        statistics::Scalar rfc_cache_read_hits;
+        statistics::Scalar rfc_cache_write_hits;
+
         // Total number of register writes per DWORD per thread
         statistics::Scalar registerWrites;
 
diff --git a/src/gpu-compute/register_file_cache.cc b/src/gpu-compute/register_file_cache.cc
new file mode 100644
index 0000000000..f23cf399da
--- /dev/null
+++ b/src/gpu-compute/register_file_cache.cc
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2023 The University of Wisconsin
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "gpu-compute/register_file_cache.hh"
+
+#include <sstream>
+#include <string>
+
+#include "base/intmath.hh"
+#include "base/logging.hh"
+#include "debug/GPURFC.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "params/RegisterFileCache.hh"
+
+namespace gem5
+{
+
+RegisterFileCache::RegisterFileCache(const RegisterFileCacheParams &p)
+    : SimObject(p), simdId(p.simd_id), _capacity(p.cache_size)
+{ // NOTE(review): a negative cache_size is not rejected here
+    fatal_if(simdId < 0, "Illegal SIMD id for rfc"); // reject negative ids
+}
+
+// The destructor performs no explicit cleanup; in particular the OrderedRegs
+// nodes referenced by lruHash are not released here.
+RegisterFileCache::~RegisterFileCache() = default;
+
+void
+RegisterFileCache::setParent(ComputeUnit *_computeUnit)
+{
+    computeUnit = _computeUnit; // later read by waveExecuteInst()
+}
+
+bool
+RegisterFileCache::inRFC(int regIdx)
+{
+    return lruHash.count(regIdx) != 0; // present in the LRU map => cached
+}
+
+// Debug helper: render the LRU list, head (most recent) first, as text.
+std::string
+RegisterFileCache::dumpLL() const
+{
+    std::stringstream out;
+    out << "lru_order: ";
+    for (const OrderedRegs *node = lruHead; node; node = node->next) {
+        out << "reg: " << node->regIdx << " ";
+        if (node->prev) {
+            out << "(prev: " << node->prev->regIdx << ") ";
+        }
+        if (node->next) {
+            out << " (next: " << node->next->regIdx << ") ";
+        }
+    }
+    out << "\n";
+    return out.str();
+}
+
+// Mark physReg regIdx as the most recently used RFC entry, evicting the
+// least recently used entry on a miss when the cache is full. List nodes
+// are relinked or recycled in place, so none are leaked on hits/evictions.
+void
+RegisterFileCache::markRFC(int regIdx)
+{
+    if (_capacity <= 0) { // a non-positive size disables the cache
+        return;
+    }
+
+    // node that ends up at the head: hit entry, recycled tail or new node
+    OrderedRegs *node = nullptr;
+    auto hit = lruHash.find(regIdx);
+    if (hit != lruHash.end()) {
+        DPRINTF(GPURFC, "RFC SIMD[%d] cache hit physReg[%d]\n",
+            simdId, regIdx);
+        node = hit->second;
+    } else if (static_cast<int>(lruHash.size()) >= _capacity) {
+        node = lruTail;
+        DPRINTF(GPURFC, "RFC SIMD[%d] cache miss inserting physReg[%d] "
+            "evicting physReg[%d]\n", simdId, regIdx, node->regIdx);
+        lruHash.erase(node->regIdx);
+        node->regIdx = regIdx;
+        lruHash[regIdx] = node;
+    } else {
+        DPRINTF(GPURFC, "RFC SIMD[%d] cache miss inserting physReg[%d]\n",
+            simdId, regIdx);
+        node = new OrderedRegs(regIdx);
+        lruHash[regIdx] = node;
+    }
+
+    if (node == lruHead) {
+        return; // already most recent (covers a recycled sole entry)
+    }
+    if (node->prev != nullptr) {
+        // detach an existing node; new nodes have no prev and skip this
+        node->prev->next = node->next;
+        if (node->next != nullptr) {
+            node->next->prev = node->prev;
+        } else {
+            lruTail = node->prev;
+        }
+    }
+    node->prev = nullptr;
+    node->next = lruHead;
+    if (lruHead != nullptr) {
+        lruHead->prev = node;
+    } else {
+        lruTail = node; // first entry is both head and tail
+    }
+    lruHead = node;
+}
+
+// Queue RFC inserts for ii's vector dsts, rfcLength() cycles from now.
+void
+RegisterFileCache::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
+{
+    // no-op when disabled; loads, atomics and mem syncs are never cached
+    if (_capacity == 0 || ii->isLoad() || ii->isAtomic() || ii->isMemSync()) {
+        return;
+    }
+    Tick delay = computeUnit->cyclesToTicks(Cycles(computeUnit->rfcLength()));
+    for (const auto& dstVecOp : ii->dstVecRegOperands()) {
+        for (const auto& physIdx : dstVecOp.physIndices()) {
+            enqCacheInsertEvent(physIdx, delay);
+        }
+    }
+}
+
+void
+RegisterFileCache::enqCacheInsertEvent(uint32_t regIdx, uint64_t delay)
+{
+    const Tick when = curTick() + delay; // absolute tick of the insert
+    schedule(new MarkRegCachedEvent(this, regIdx), when);
+}
+
+void
+RegisterFileCache::MarkRegCachedEvent::process()
+{
+    rfc->markRFC(regIdx); // do the insert this event was queued for
+}
+
+} // namespace gem5
diff --git a/src/gpu-compute/register_file_cache.hh b/src/gpu-compute/register_file_cache.hh
new file mode 100644
index 0000000000..040f174033
--- /dev/null
+++ b/src/gpu-compute/register_file_cache.hh
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2023 The University of Wisconsin
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __REGISTER_FILE_CACHE_HH__
+#define __REGISTER_FILE_CACHE_HH__
+
+#include <limits>
+#include <unordered_set>
+#include <vector>
+
+#include "base/statistics.hh"
+#include "base/types.hh"
+#include "gpu-compute/misc.hh"
+#include "sim/sim_object.hh"
+
+namespace gem5
+{
+
+class ComputeUnit;
+class Wavefront;
+
+struct RegisterFileCacheParams;
+
+// Per-SIMD register file cache (RFC) with LRU replacement; size 0 disables it
+class RegisterFileCache : public SimObject
+{
+  public:
+    RegisterFileCache(const RegisterFileCacheParams &p);
+    virtual ~RegisterFileCache();
+    virtual void setParent(ComputeUnit *_computeUnit);
+    int cacheSize() const { return _capacity; }
+
+    // Debug functions
+    virtual std::string dumpLL() const;
+
+    // Abstract Register Event
+    class RegisterCacheEvent : public Event
+    {
+      protected:
+        RegisterFileCache *rfc;
+        int regIdx;
+
+      public:
+        RegisterCacheEvent(RegisterFileCache *rfc, int regIdx)
+            : rfc(rfc), regIdx(regIdx) { setFlags(AutoDelete); }
+    };
+
+    // Event whose process() calls markRFC(regIdx) on the owning cache
+    class MarkRegCachedEvent : public RegisterCacheEvent
+    {
+      public:
+        MarkRegCachedEvent(RegisterFileCache *rfc, int regIdx)
+            : RegisterCacheEvent(rfc, regIdx) { }
+        void process() override;
+    };
+
+    // Schedule a MarkRegCachedEvent for regIdx, delay ticks from now
+    virtual void enqCacheInsertEvent(uint32_t regIdx, uint64_t delay);
+    // Schedule inserts for ii's vector dsts (none for loads/atomics/syncs)
+    virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii);
+    // Add register to rfc using LRU replacement policy
+    virtual void markRFC(int regIdx);
+    // True if regIdx is currently held in the rfc
+    virtual bool inRFC(int regIdx);
+
+  protected:
+    ComputeUnit* computeUnit = nullptr; // null until setParent() is called
+    int simdId, _capacity;
+    // LRU list node, one per cached register
+    class OrderedRegs
+    {
+      public:
+        int regIdx;
+        OrderedRegs* next;
+        OrderedRegs* prev;
+        OrderedRegs(int val) : regIdx(val), next(nullptr), prev(nullptr) {}
+    };
+
+    // Doubly linked list, head is the most recently used
+    std::unordered_map<int, OrderedRegs*> lruHash;
+    OrderedRegs* lruHead = nullptr;
+    OrderedRegs* lruTail = nullptr;
+};
+
+} // namespace gem5
+
+#endif // __REGISTER_FILE_CACHE_HH__
diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc
index 0d475c577e..13dc423897 100644
--- a/src/gpu-compute/schedule_stage.cc
+++ b/src/gpu-compute/schedule_stage.cc
@@ -38,6 +38,7 @@
 #include "debug/GPUVRF.hh"
 #include "gpu-compute/compute_unit.hh"
 #include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/register_file_cache.hh"
 #include "gpu-compute/scalar_register_file.hh"
 #include "gpu-compute/vector_register_file.hh"
 #include "gpu-compute/wavefront.hh"
diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc
index b5f17c82d0..8935332860 100644
--- a/src/gpu-compute/vector_register_file.cc
+++ b/src/gpu-compute/vector_register_file.cc
@@ -38,6 +38,7 @@
 #include "debug/GPUVRF.hh"
 #include "gpu-compute/compute_unit.hh"
 #include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/register_file_cache.hh"
 #include "gpu-compute/simple_pool_manager.hh"
 #include "gpu-compute/wavefront.hh"
 #include "params/VectorRegisterFile.hh"
@@ -60,7 +61,8 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
 {
     for (const auto& srcVecOp : ii->srcVecRegOperands()) {
         for (const auto& physIdx : srcVecOp.physIndices()) {
-            if (regBusy(physIdx)) {
+            if (regBusy(physIdx) &&
+                    !computeUnit->rfc[simdId]->inRFC(physIdx)) {
                 DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
                         w->wfDynId, ii->disassemble(), physIdx);
                 w->stats.numTimesBlockedDueRAWDependencies++;
@@ -71,7 +73,8 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
 
     for (const auto& dstVecOp : ii->dstVecRegOperands()) {
         for (const auto& physIdx : dstVecOp.physIndices()) {
-            if (regBusy(physIdx)) {
+            if (regBusy(physIdx) &&
+                    !computeUnit->rfc[simdId]->inRFC(physIdx)) {
                 DPRINTF(GPUVRF, "WAX stall: WV[%d]: %s: physReg[%d]\n",
                         w->wfDynId, ii->disassemble(), physIdx);
                 w->stats.numTimesBlockedDueWAXDependencies++;
@@ -114,6 +117,22 @@ VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
     int DWords = ii->numSrcVecDWords();
     stats.registerReads += (DWords * w->execMask().count());
 
+    for (const auto& dstVecOp : ii->dstVecRegOperands()) {
+        for (const auto& physIdx : dstVecOp.physIndices()) {
+            if (computeUnit->rfc[simdId]->inRFC(physIdx)) {
+                stats.rfc_cache_write_hits += w->execMask().count();
+            }
+        }
+    }
+
+    for (const auto& srcVecOp : ii->srcVecRegOperands()) {
+        for (const auto& physIdx : srcVecOp.physIndices()) {
+            if (computeUnit->rfc[simdId]->inRFC(physIdx)) {
+                stats.rfc_cache_read_hits += w->execMask().count();
+            }
+        }
+    }
+
     uint64_t mask = w->execMask().to_ullong();
     int srams = w->execMask().size() / 4;
     for (int i = 0; i < srams; i++) {
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
index 0bca152e08..c7d2bc40e9 100644
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -37,6 +37,7 @@
 #include "debug/WavefrontStack.hh"
 #include "gpu-compute/compute_unit.hh"
 #include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/register_file_cache.hh"
 #include "gpu-compute/scalar_register_file.hh"
 #include "gpu-compute/shader.hh"
 #include "gpu-compute/simple_pool_manager.hh"
@@ -933,6 +934,7 @@ Wavefront::exec()
     // inform VRF of instruction execution to schedule write-back
     // and scoreboard ready for registers
     if (!ii->isScalar()) {
+        computeUnit->rfc[simdId]->waveExecuteInst(this, ii);
         computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
     }
     computeUnit->srf[simdId]->waveExecuteInst(this, ii);