gpu-compute: Added register file cache support (#730)

The register file cache (RFC) defaults to a size of 0, which disables it completely. To enable the RFC, set --register-file-cache-size to a non-zero multiple of two. In addition, rfc_pipe_length may be adjusted to increase or decrease the latency benefit of the RFC.
2024-01-05 12:57:06 -06:00
parent b652ab8558 359ac63280
commit dc85d1492c
14 changed files with 372 additions and 5 deletions
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -335,6 +335,12 @@ parser.add_argument(
    default="dynamic",
    help="register allocation policy (simple/dynamic)",
 )
+parser.add_argument(
+    "--register-file-cache-size",
+    type=int,
+    default=0,
+    help="number of registers in cache",
+)

 parser.add_argument(
    "--dgpu",
@@ -493,6 +499,7 @@ for i in range(n_cu):
    vrfs = []
    vrf_pool_mgrs = []
    srfs = []
+    rfcs = []
    srf_pool_mgrs = []
    for j in range(args.simds_per_cu):
        for k in range(shader.n_wf):
@@ -537,10 +544,16 @@ for i in range(n_cu):
                simd_id=j, wf_size=args.wf_size, num_regs=args.sreg_file_size
            )
        )
+        rfcs.append(
+            RegisterFileCache(
+                simd_id=j, cache_size=args.register_file_cache_size
+            )
+        )

    compute_units[-1].wavefronts = wavefronts
    compute_units[-1].vector_register_file = vrfs
    compute_units[-1].scalar_register_file = srfs
+    compute_units[-1].register_file_cache = rfcs
    compute_units[-1].register_manager = RegisterManager(
        policy=args.registerManagerPolicy,
        vrf_pool_managers=vrf_pool_mgrs,
--- a/configs/example/gpufs/amd/AmdGPUOptions.py
+++ b/configs/example/gpufs/amd/AmdGPUOptions.py
@@ -247,3 +247,9 @@ def addAmdGPUOptions(parser):
        default="simple",
        help="register allocation policy (simple/dynamic)",
    )
+    parser.add_argument(
+        "--register-file-cache-size",
+        type=int,
+        default=0,
+        help="number of registers in cache",
+    )
--- a/configs/example/gpufs/system/amdgpu.py
+++ b/configs/example/gpufs/system/amdgpu.py
@@ -84,6 +84,7 @@ def createGPU(system, args):
        vrfs = []
        vrf_pool_mgrs = []
        srfs = []
+        rfcs = []
        srf_pool_mgrs = []
        for j in range(args.simds_per_cu):
            for k in range(shader.n_wf):
@@ -133,10 +134,16 @@ def createGPU(system, args):
                    num_regs=args.sreg_file_size,
                )
            )
+            rfcs.append(
+                RegisterFileCache(
+                    simd_id=j, cache_size=args.register_file_cache_size
+                )
+            )

        compute_units[-1].wavefronts = wavefronts
        compute_units[-1].vector_register_file = vrfs
        compute_units[-1].scalar_register_file = srfs
+        compute_units[-1].register_file_cache = rfcs
        compute_units[-1].register_manager = RegisterManager(
            policy=args.registerManagerPolicy,
            vrf_pool_managers=vrf_pool_mgrs,
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -95,6 +95,14 @@ class VectorRegisterFile(RegisterFile):
    cxx_header = "gpu-compute/vector_register_file.hh"


+# Python-side declaration of the register file cache SimObject. It is
+# implemented by the C++ class gem5::RegisterFileCache declared in
+# gpu-compute/register_file_cache.hh.
+class RegisterFileCache(SimObject):
+    type = "RegisterFileCache"
+    cxx_class = "gem5::RegisterFileCache"
+    cxx_header = "gpu-compute/register_file_cache.hh"
+    # No default: the config script must supply the SIMD id for each cache.
+    simd_id = Param.Int("SIMD ID associated with this Register File Cache")
+    # Defaults to 0 entries; the C++ markRFC() returns immediately when the
+    # capacity is 0, so nothing is ever cached in that configuration.
+    cache_size = Param.Int(0, "number of entries of rfc")
+
+
 class RegisterManager(SimObject):
    type = "RegisterManager"
    cxx_class = "gem5::RegisterManager"
@@ -149,6 +157,11 @@ class ComputeUnit(ClockedObject):
    dpbypass_pipe_length = Param.Int(
        4, "vector ALU Double Precision bypass latency"
    )
+
+    rfc_pipe_length = Param.Int(
+        2, "number of cycles per register file cache access"
+    )
+
    scalar_pipe_length = Param.Int(1, "number of pipe stages per scalar ALU")
    issue_period = Param.Int(4, "number of cycles per issue period")

@@ -260,6 +273,9 @@ class ComputeUnit(ClockedObject):
    scalar_register_file = VectorParam.ScalarRegisterFile(
        "Scalar register file"
    )
+
+    register_file_cache = VectorParam.RegisterFileCache("Register file cache")
+
    out_of_order_data_delivery = Param.Bool(
        False, "enable OoO data delivery in the GM pipeline"
    )
--- a/src/gpu-compute/SConscript
+++ b/src/gpu-compute/SConscript
@@ -38,7 +38,7 @@ SimObject('GPU.py', sim_objects=[
    'PoolManager', 'SimplePoolManager', 'DynPoolManager', 'RegisterFile',
    'ScalarRegisterFile', 'VectorRegisterFile', 'RegisterManager', 'Wavefront',
    'ComputeUnit', 'Shader', 'GPUComputeDriver', 'GPURenderDriver',
-    'GPUDispatcher', 'GPUCommandProcessor'],
+    'GPUDispatcher', 'GPUCommandProcessor', 'RegisterFileCache'],
    enums=['PrefetchType', 'GfxVersion', 'StorageClassType'])
 SimObject('GPUStaticInstFlags.py', enums=['GPUStaticInstFlags'])
 SimObject('LdsState.py', sim_objects=['LdsState'])
@@ -71,6 +71,7 @@ Source('dyn_pool_manager.cc')
 Source('simple_pool_manager.cc')
 Source('static_register_manager_policy.cc')
 Source('vector_register_file.cc')
+Source('register_file_cache.cc')
 Source('wavefront.cc')

 DebugFlag('GPUAgentDisp')
@@ -96,6 +97,7 @@ DebugFlag('GPUSRF')
 DebugFlag('GPUSync')
 DebugFlag('GPUTLB')
 DebugFlag('GPUVRF')
+DebugFlag('GPURFC')
 DebugFlag('GPUVRFSched')
 DebugFlag('GPUWgLatency')
 DebugFlag('Predictor')
@@ -103,5 +105,5 @@ DebugFlag('WavefrontStack')

 CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
                        'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync',
-                        'GPUTLB', 'GPUVRF', 'GPUWgLatency', 'GPUKernelInfo',
-                        'GPUInitAbi'])
+                        'GPUTLB', 'GPUVRF', 'GPURFC', 'GPUWgLatency',
+                        'GPUKernelInfo', 'GPUInitAbi'])
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -50,6 +50,7 @@
 #include "gpu-compute/gpu_command_processor.hh"
 #include "gpu-compute/gpu_dyn_inst.hh"
 #include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/register_file_cache.hh"
 #include "gpu-compute/scalar_register_file.hh"
 #include "gpu-compute/shader.hh"
 #include "gpu-compute/simple_pool_manager.hh"
@@ -82,9 +83,11 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
          false, Event::CPU_Tick_Pri),
    cu_id(p.cu_id),
    vrf(p.vector_register_file), srf(p.scalar_register_file),
+    rfc(p.register_file_cache),
    simdWidth(p.simd_width),
    spBypassPipeLength(p.spbypass_pipe_length),
    dpBypassPipeLength(p.dpbypass_pipe_length),
+    rfcPipeLength(p.rfc_pipe_length),
    scalarPipeStages(p.scalar_pipe_length),
    operandNetworkLength(p.operand_network_length),
    issuePeriod(p.issue_period),
@@ -207,6 +210,7 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),

    for (int i = 0; i < vrf.size(); ++i) {
        vrf[i]->setParent(this);
+        rfc[i]->setParent(this);
    }
    for (int i = 0; i < srf.size(); ++i) {
        srf[i]->setParent(this);
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@@ -66,6 +66,7 @@ class LdsChunk;
 class ScalarRegisterFile;
 class Shader;
 class VectorRegisterFile;
+class RegisterFileCache;

 struct ComputeUnitParams;

@@ -296,6 +297,8 @@ class ComputeUnit : public ClockedObject
    // array of scalar register files, one per SIMD
    std::vector<ScalarRegisterFile*> srf;

+    std::vector<RegisterFileCache*> rfc;
+
    // Width per VALU/SIMD unit: number of work items that can be executed
    // on the vector ALU simultaneously in a SIMD unit
    int simdWidth;
@@ -305,6 +308,8 @@ class ComputeUnit : public ClockedObject
    // number of pipe stages for bypassing data to next dependent double
    // precision vector instruction inside the vector ALU pipeline
    int dpBypassPipeLength;
+    // number of pipe stages for register file cache
+    int rfcPipeLength;
    // number of pipe stages for scalar ALU
    int scalarPipeStages;
    // number of pipe stages for operand collection & distribution network
@@ -390,6 +395,7 @@ class ComputeUnit : public ClockedObject
    int simdUnitWidth() const { return simdWidth; }
    int spBypassLength() const { return spBypassPipeLength; }
    int dpBypassLength() const { return dpBypassPipeLength; }
+    int rfcLength() const { return rfcPipeLength; }
    int scalarPipeLength() const { return scalarPipeStages; }
    int storeBusLength() const { return numCyclesPerStoreTransfer; }
    int loadBusLength() const { return numCyclesPerLoadTransfer; }
--- a/src/gpu-compute/register_file.cc
+++ b/src/gpu-compute/register_file.cc
@@ -194,6 +194,10 @@ RegisterFile::RegisterFileStats::RegisterFileStats(statistics::Group *parent)
    : statistics::Group(parent),
      ADD_STAT(registerReads,
              "Total number of DWORDs read from register file"),
+      ADD_STAT(rfc_cache_read_hits,
+              "Total number of DWORDs read from register file cache"),
+      ADD_STAT(rfc_cache_write_hits,
+              "Total number of writes to existing registers in the rfc"),
      ADD_STAT(registerWrites,
              "Total number of DWORDS written to register file"),
      ADD_STAT(sramReads,
--- a/src/gpu-compute/register_file.hh
+++ b/src/gpu-compute/register_file.hh
@@ -158,6 +158,10 @@ class RegisterFile : public SimObject

        // Total number of register reads per DWORD per thread
        statistics::Scalar registerReads;
+
+        statistics::Scalar rfc_cache_read_hits;
+        statistics::Scalar rfc_cache_write_hits;
+
        // Total number of register writes per DWORD per thread
        statistics::Scalar registerWrites;

--- a/src/gpu-compute/register_file_cache.cc
+++ b/src/gpu-compute/register_file_cache.cc
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2023 The University of Wisconsin
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "gpu-compute/register_file_cache.hh"
+
+#include <sstream>
+#include <string>
+
+#include "base/intmath.hh"
+#include "base/logging.hh"
+#include "debug/GPURFC.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "params/RegisterFileCache.hh"
+
+namespace gem5
+{
+
+/**
+ * Construct a register file cache from its SimObject parameters.
+ *
+ * @param p Parameters; simd_id selects the SIMD this cache belongs to and
+ *          cache_size gives the number of registers the cache can hold
+ *          (0 means nothing is ever cached).
+ */
+RegisterFileCache::RegisterFileCache(const RegisterFileCacheParams &p)
+    : SimObject(p), simdId(p.simd_id), _capacity(p.cache_size)
+{
+    fatal_if(simdId < 0, "Illegal SIMD id for rfc");
+    // A negative capacity would be converted to a huge unsigned value when
+    // compared against the hash size in markRFC(), so the cache would never
+    // evict. Reject it up front instead.
+    fatal_if(_capacity < 0, "Illegal (negative) cache size for rfc");
+}
+
+/**
+ * Release the LRU bookkeeping nodes that are still tracked by the cache.
+ * Every node reachable through lruHash was allocated with new in markRFC().
+ */
+RegisterFileCache::~RegisterFileCache()
+{
+    for (auto &entry : lruHash) {
+        delete entry.second;
+    }
+    lruHash.clear();
+    lruHead = nullptr;
+    lruTail = nullptr;
+}
+
+/**
+ * Record the compute unit that owns this cache. waveExecuteInst() later
+ * queries it for the RFC latency and the cycle-to-tick conversion.
+ */
+void
+RegisterFileCache::setParent(ComputeUnit *_computeUnit)
+{
+    this->computeUnit = _computeUnit;
+}
+
+/**
+ * @param regIdx Physical register index to look up.
+ * @return true if the register currently has an entry in the cache.
+ */
+bool
+RegisterFileCache::inRFC(int regIdx)
+{
+    const bool cached = lruHash.count(regIdx) != 0;
+    return cached;
+}
+
+/**
+ * Debug helper: render the LRU list from head to tail as text, printing
+ * each register together with its neighbours in the list.
+ *
+ * @return A single line starting with "lru_order: " and ending in '\n'.
+ */
+std::string
+RegisterFileCache::dumpLL() const
+{
+    std::stringstream out;
+    out << "lru_order: ";
+
+    const OrderedRegs *node = lruHead;
+    while (node != nullptr) {
+        out << "reg: " << node->regIdx;
+        if (node->prev != nullptr) {
+            out << " (prev: " << node->prev->regIdx << ") ";
+        } else {
+            out << " ";
+        }
+        if (node->next != nullptr) {
+            out << " (next: " << node->next->regIdx << ") ";
+        }
+        node = node->next;
+    }
+
+    out << "\n";
+    return out.str();
+}
+
+/**
+ * Mark a physical register as cached, making it the most recently used
+ * entry. On a hit the existing entry is moved to the head of the LRU list;
+ * on a miss a new entry is inserted at the head, evicting the least
+ * recently used entry (the tail) first if the cache is full.
+ *
+ * Does nothing when the cache capacity is 0.
+ *
+ * @param regIdx Physical register index to insert or refresh.
+ */
+void
+RegisterFileCache::markRFC(int regIdx)
+{
+    if (_capacity == 0) {
+        return;
+    }
+
+    OrderedRegs *node = nullptr;
+    auto hit = lruHash.find(regIdx);
+
+    if (hit != lruHash.end()) { // Exists in cache need to update
+        DPRINTF(GPURFC, "RFC SIMD[%d] cache hit physReg[%d]\n",
+            simdId, regIdx);
+
+        node = hit->second;
+        if (node == lruHead) {
+            return;
+        }
+
+        // Unlink the node so it can be reused at the head. It is not the
+        // head, so it always has a predecessor.
+        node->prev->next = node->next;
+        if (node->next != nullptr) {
+            node->next->prev = node->prev;
+        } else {
+            lruTail = node->prev;
+        }
+        node->prev = nullptr;
+    } else {
+        if (lruHash.size() >= static_cast<size_t>(_capacity)) {
+            // Capacity is non-zero here, so a full cache has a tail.
+            OrderedRegs *victim = lruTail;
+            int val = victim->regIdx;
+            DPRINTF(GPURFC, "RFC SIMD[%d] cache miss inserting "
+                "physReg[%d] evicting physReg[%d]\n", simdId, regIdx, val);
+
+            lruTail = victim->prev;
+            if (lruTail != nullptr) {
+                lruTail->next = nullptr;
+            } else {
+                // The victim was the only entry (e.g. a capacity of one).
+                lruHead = nullptr;
+            }
+            lruHash.erase(val);
+            delete victim;
+        } else {
+            DPRINTF(GPURFC, "RFC SIMD[%d] cache miss inserting physReg[%d]\n",
+                simdId, regIdx);
+        }
+
+        node = new OrderedRegs(regIdx);
+        lruHash[regIdx] = node;
+    }
+
+    // Link the node in as the new head (most recently used).
+    node->next = lruHead;
+    if (lruHead != nullptr) {
+        lruHead->prev = node;
+    } else {
+        lruTail = node;
+    }
+    lruHead = node;
+}
+
+/**
+ * Called when a wavefront executes an instruction. For every physical
+ * destination vector register of the instruction, schedule an insertion
+ * into the cache after the compute unit's RFC latency. Loads, atomics and
+ * memory syncs are skipped.
+ *
+ * @param w  Executing wavefront (not used by this implementation).
+ * @param ii Instruction being executed.
+ */
+void
+RegisterFileCache::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
+{
+    if (ii->isLoad() || ii->isAtomic() || ii->isMemSync()) {
+        return;
+    }
+
+    const Cycles rfcLatency(computeUnit->rfcLength());
+    const Tick insertDelay = computeUnit->cyclesToTicks(rfcLatency);
+
+    for (const auto& vecDst : ii->dstVecRegOperands()) {
+        for (const auto& physReg : vecDst.physIndices()) {
+            enqCacheInsertEvent(physReg, insertDelay);
+        }
+    }
+}
+
+/**
+ * Schedule a MarkRegCachedEvent that inserts regIdx into the cache.
+ *
+ * @param regIdx Physical register index to insert.
+ * @param delay  Delay, in ticks from the current tick, before insertion.
+ */
+void
+RegisterFileCache::enqCacheInsertEvent(uint32_t regIdx, uint64_t delay)
+{
+    // The event is created with the AutoDelete flag set by its base-class
+    // constructor, so the raw pointer is not kept here.
+    auto *insertEvent = new MarkRegCachedEvent(this, regIdx);
+    schedule(insertEvent, curTick() + delay);
+}
+
+/**
+ * Event callback: mark this event's register as cached in its RFC.
+ */
+void
+RegisterFileCache::MarkRegCachedEvent::process()
+{
+    RegisterFileCache *cache = rfc;
+    cache->markRFC(regIdx);
+}
+
+}
--- a/src/gpu-compute/register_file_cache.hh
+++ b/src/gpu-compute/register_file_cache.hh
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2023 The University of Wisconsin
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __REGISTER_FILE_CACHE_HH__
+#define __REGISTER_FILE_CACHE_HH__
+
+#include <limits>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "base/statistics.hh"
+#include "base/types.hh"
+#include "gpu-compute/misc.hh"
+#include "sim/sim_object.hh"
+
+namespace gem5
+{
+
+class ComputeUnit;
+class Wavefront;
+
+struct RegisterFileCacheParams;
+
+/**
+ * Register file cache (RFC) for one SIMD unit. It tracks which physical
+ * register indices are currently cached, using LRU replacement over a
+ * doubly linked list indexed by a hash map.
+ */
+class RegisterFileCache : public SimObject
+{
+  public:
+    RegisterFileCache(const RegisterFileCacheParams &p);
+    virtual ~RegisterFileCache();
+
+    // Record the compute unit that owns this cache.
+    virtual void setParent(ComputeUnit *_computeUnit);
+
+    // Maximum number of registers the cache can hold.
+    int cacheSize() const { return _capacity; }
+
+    // Debug functions
+    // Render the LRU list, head first, as a human-readable string.
+    virtual std::string dumpLL() const;
+
+    // Abstract Register Event
+    // Carries the target cache and register index; instances delete
+    // themselves after processing because AutoDelete is set.
+    class RegisterCacheEvent : public Event
+    {
+      protected:
+        RegisterFileCache *rfc;
+        int regIdx;
+
+      public:
+        RegisterCacheEvent(RegisterFileCache *rfc, int regIdx)
+            : rfc(rfc), regIdx(regIdx) { setFlags(AutoDelete); }
+    };
+
+    // Event that marks its register as cached when processed.
+    class MarkRegCachedEvent : public RegisterCacheEvent
+    {
+      public:
+        MarkRegCachedEvent(RegisterFileCache *rfc, int regIdx)
+            : RegisterCacheEvent(rfc, regIdx) { }
+        void process() override;
+    };
+
+    // Schedule insertion of regIdx into the cache `delay` ticks from now.
+    virtual void enqCacheInsertEvent(uint32_t regIdx, uint64_t delay);
+
+    // Schedule cache insertions for the destination vector registers of
+    // an executing instruction.
+    virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii);
+
+    // Add register to rfc using LRU replacement policy
+    virtual void markRFC(int regIdx);
+
+    // True if regIdx currently has an entry in the cache.
+    virtual bool inRFC(int regIdx);
+
+  protected:
+    // Owning compute unit; null until setParent() is called.
+    ComputeUnit* computeUnit = nullptr;
+    // SIMD unit this cache is associated with.
+    int simdId;
+    // Maximum number of cached registers; 0 means nothing is cached.
+    int _capacity;
+
+    // Node of the LRU list, holding one cached register index.
+    class OrderedRegs
+    {
+      public:
+        int regIdx;
+
+        OrderedRegs* next;
+        OrderedRegs* prev;
+        OrderedRegs(int val) : regIdx(val), next(nullptr), prev(nullptr) {}
+    };
+
+    // Doubly linked list, head is the most recently used
+    // lruHash maps a register index to its node in that list.
+    std::unordered_map<int, OrderedRegs*> lruHash;
+    OrderedRegs* lruHead = nullptr;
+    OrderedRegs* lruTail = nullptr;
+
+};
+
+} // namespace gem5
+
+#endif // __REGISTER_FILE_CACHE_HH__
--- a/src/gpu-compute/schedule_stage.cc
+++ b/src/gpu-compute/schedule_stage.cc
@@ -38,6 +38,7 @@
 #include "debug/GPUVRF.hh"
 #include "gpu-compute/compute_unit.hh"
 #include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/register_file_cache.hh"
 #include "gpu-compute/scalar_register_file.hh"
 #include "gpu-compute/vector_register_file.hh"
 #include "gpu-compute/wavefront.hh"
--- a/src/gpu-compute/vector_register_file.cc
+++ b/src/gpu-compute/vector_register_file.cc
@@ -38,6 +38,7 @@
 #include "debug/GPUVRF.hh"
 #include "gpu-compute/compute_unit.hh"
 #include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/register_file_cache.hh"
 #include "gpu-compute/simple_pool_manager.hh"
 #include "gpu-compute/wavefront.hh"
 #include "params/VectorRegisterFile.hh"
@@ -60,7 +61,8 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
 {
    for (const auto& srcVecOp : ii->srcVecRegOperands()) {
        for (const auto& physIdx : srcVecOp.physIndices()) {
-            if (regBusy(physIdx)) {
+            if (regBusy(physIdx) &&
+                    !computeUnit->rfc[simdId]->inRFC(physIdx)) {
                DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
                        w->wfDynId, ii->disassemble(), physIdx);
                w->stats.numTimesBlockedDueRAWDependencies++;
@@ -71,7 +73,8 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const

    for (const auto& dstVecOp : ii->dstVecRegOperands()) {
        for (const auto& physIdx : dstVecOp.physIndices()) {
-            if (regBusy(physIdx)) {
+            if (regBusy(physIdx) &&
+                    !computeUnit->rfc[simdId]->inRFC(physIdx)) {
                DPRINTF(GPUVRF, "WAX stall: WV[%d]: %s: physReg[%d]\n",
                        w->wfDynId, ii->disassemble(), physIdx);
                w->stats.numTimesBlockedDueWAXDependencies++;
@@ -114,6 +117,22 @@ VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
    int DWords = ii->numSrcVecDWords();
    stats.registerReads += (DWords * w->execMask().count());

+    for (const auto& dstVecOp : ii->dstVecRegOperands()) {
+        for (const auto& physIdx : dstVecOp.physIndices()) {
+            if (computeUnit->rfc[simdId]->inRFC(physIdx)) {
+                stats.rfc_cache_write_hits += w->execMask().count();
+            }
+        }
+    }
+
+    for (const auto& srcVecOp : ii->srcVecRegOperands()) {
+        for (const auto& physIdx : srcVecOp.physIndices()) {
+            if (computeUnit->rfc[simdId]->inRFC(physIdx)) {
+                stats.rfc_cache_read_hits += w->execMask().count();
+            }
+        }
+    }
+
    uint64_t mask = w->execMask().to_ullong();
    int srams = w->execMask().size() / 4;
    for (int i = 0; i < srams; i++) {
--- a/src/gpu-compute/wavefront.cc
+++ b/src/gpu-compute/wavefront.cc
@@ -37,6 +37,7 @@
 #include "debug/WavefrontStack.hh"
 #include "gpu-compute/compute_unit.hh"
 #include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/register_file_cache.hh"
 #include "gpu-compute/scalar_register_file.hh"
 #include "gpu-compute/shader.hh"
 #include "gpu-compute/simple_pool_manager.hh"
@@ -933,6 +934,7 @@ Wavefront::exec()
    // inform VRF of instruction execution to schedule write-back
    // and scoreboard ready for registers
    if (!ii->isScalar()) {
+        computeUnit->rfc[simdId]->waveExecuteInst(this, ii);
        computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
    }
    computeUnit->srf[simdId]->waveExecuteInst(this, ii);