From 359ac63280fa591c1343f897ba495b292f390f4e Mon Sep 17 00:00:00 2001 From: KaiBatley Date: Wed, 3 Jan 2024 21:30:20 -0600 Subject: [PATCH] gpu-compute: Added register file cache support The RFC is defaulted to a size of 0 which removes it completely. To use the RFC set the --register-file-cache-size to a non-zero multiple of two. In addition, rfc_pipe_length may be altered to increase or decrease RFC latency benefit. Change-Id: I6f5bf5b750eb64155fbc8c8343e9feadce5c9f79 --- configs/example/apu_se.py | 13 ++ configs/example/gpufs/amd/AmdGPUOptions.py | 6 + configs/example/gpufs/system/amdgpu.py | 7 + src/gpu-compute/GPU.py | 16 ++ src/gpu-compute/SConscript | 8 +- src/gpu-compute/compute_unit.cc | 4 + src/gpu-compute/compute_unit.hh | 6 + src/gpu-compute/register_file.cc | 4 + src/gpu-compute/register_file.hh | 4 + src/gpu-compute/register_file_cache.cc | 170 +++++++++++++++++++++ src/gpu-compute/register_file_cache.hh | 113 ++++++++++++++ src/gpu-compute/schedule_stage.cc | 1 + src/gpu-compute/vector_register_file.cc | 23 ++- src/gpu-compute/wavefront.cc | 2 + 14 files changed, 372 insertions(+), 5 deletions(-) create mode 100644 src/gpu-compute/register_file_cache.cc create mode 100644 src/gpu-compute/register_file_cache.hh diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py index b20779fcdb..68e678f76b 100644 --- a/configs/example/apu_se.py +++ b/configs/example/apu_se.py @@ -331,6 +331,12 @@ parser.add_argument( default="dynamic", help="register allocation policy (simple/dynamic)", ) +parser.add_argument( + "--register-file-cache-size", + type=int, + default=0, + help="number of registers in cache", +) parser.add_argument( "--dgpu", @@ -489,6 +495,7 @@ for i in range(n_cu): vrfs = [] vrf_pool_mgrs = [] srfs = [] + rfcs = [] srf_pool_mgrs = [] for j in range(args.simds_per_cu): for k in range(shader.n_wf): @@ -533,10 +540,16 @@ for i in range(n_cu): simd_id=j, wf_size=args.wf_size, num_regs=args.sreg_file_size ) ) + rfcs.append( +
RegisterFileCache( + simd_id=j, cache_size=args.register_file_cache_size + ) + ) compute_units[-1].wavefronts = wavefronts compute_units[-1].vector_register_file = vrfs compute_units[-1].scalar_register_file = srfs + compute_units[-1].register_file_cache = rfcs compute_units[-1].register_manager = RegisterManager( policy=args.registerManagerPolicy, vrf_pool_managers=vrf_pool_mgrs, diff --git a/configs/example/gpufs/amd/AmdGPUOptions.py b/configs/example/gpufs/amd/AmdGPUOptions.py index 3d6a8cc48e..9996d33a2e 100644 --- a/configs/example/gpufs/amd/AmdGPUOptions.py +++ b/configs/example/gpufs/amd/AmdGPUOptions.py @@ -247,3 +247,9 @@ def addAmdGPUOptions(parser): default="simple", help="register allocation policy (simple/dynamic)", ) + parser.add_argument( + "--register-file-cache-size", + type=int, + default=0, + help="number of registers in cache", + ) diff --git a/configs/example/gpufs/system/amdgpu.py b/configs/example/gpufs/system/amdgpu.py index 4bca52c77e..30e059d154 100644 --- a/configs/example/gpufs/system/amdgpu.py +++ b/configs/example/gpufs/system/amdgpu.py @@ -84,6 +84,7 @@ def createGPU(system, args): vrfs = [] vrf_pool_mgrs = [] srfs = [] + rfcs = [] srf_pool_mgrs = [] for j in range(args.simds_per_cu): for k in range(shader.n_wf): @@ -133,10 +134,16 @@ def createGPU(system, args): num_regs=args.sreg_file_size, ) ) + rfcs.append( + RegisterFileCache( + simd_id=j, cache_size=args.register_file_cache_size + ) + ) compute_units[-1].wavefronts = wavefronts compute_units[-1].vector_register_file = vrfs compute_units[-1].scalar_register_file = srfs + compute_units[-1].register_file_cache = rfcs compute_units[-1].register_manager = RegisterManager( policy=args.registerManagerPolicy, vrf_pool_managers=vrf_pool_mgrs, diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index 1b6c6a7494..fe0586fc16 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -96,6 +96,14 @@ class VectorRegisterFile(RegisterFile): cxx_header = 
"gpu-compute/vector_register_file.hh" +class RegisterFileCache(SimObject): + type = "RegisterFileCache" + cxx_class = "gem5::RegisterFileCache" + cxx_header = "gpu-compute/register_file_cache.hh" + simd_id = Param.Int("SIMD ID associated with this Register File Cache") + cache_size = Param.Int(0, "number of entries of rfc") + + class RegisterManager(SimObject): type = "RegisterManager" cxx_class = "gem5::RegisterManager" @@ -150,6 +158,11 @@ class ComputeUnit(ClockedObject): dpbypass_pipe_length = Param.Int( 4, "vector ALU Double Precision bypass latency" ) + + rfc_pipe_length = Param.Int( + 2, "number of cycles per register file cache access" + ) + scalar_pipe_length = Param.Int(1, "number of pipe stages per scalar ALU") issue_period = Param.Int(4, "number of cycles per issue period") @@ -261,6 +274,9 @@ class ComputeUnit(ClockedObject): scalar_register_file = VectorParam.ScalarRegisterFile( "Scalar register file" ) + + register_file_cache = VectorParam.RegisterFileCache("Register file cache") + out_of_order_data_delivery = Param.Bool( False, "enable OoO data delivery in the GM pipeline" ) diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript index 81f02d8282..e4536ba2a5 100644 --- a/src/gpu-compute/SConscript +++ b/src/gpu-compute/SConscript @@ -38,7 +38,7 @@ SimObject('GPU.py', sim_objects=[ 'PoolManager', 'SimplePoolManager', 'DynPoolManager', 'RegisterFile', 'ScalarRegisterFile', 'VectorRegisterFile', 'RegisterManager', 'Wavefront', 'ComputeUnit', 'Shader', 'GPUComputeDriver', 'GPURenderDriver', - 'GPUDispatcher', 'GPUCommandProcessor'], + 'GPUDispatcher', 'GPUCommandProcessor', 'RegisterFileCache'], enums=['PrefetchType', 'GfxVersion', 'StorageClassType']) SimObject('GPUStaticInstFlags.py', enums=['GPUStaticInstFlags']) SimObject('LdsState.py', sim_objects=['LdsState']) @@ -71,6 +71,7 @@ Source('dyn_pool_manager.cc') Source('simple_pool_manager.cc') Source('static_register_manager_policy.cc') Source('vector_register_file.cc') 
+Source('register_file_cache.cc') Source('wavefront.cc') DebugFlag('GPUAgentDisp') @@ -96,6 +97,7 @@ DebugFlag('GPUSRF') DebugFlag('GPUSync') DebugFlag('GPUTLB') DebugFlag('GPUVRF') +DebugFlag('GPURFC') DebugFlag('GPUVRFSched') DebugFlag('GPUWgLatency') DebugFlag('Predictor') @@ -103,5 +105,5 @@ DebugFlag('WavefrontStack') CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch', 'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync', - 'GPUTLB', 'GPUVRF', 'GPUWgLatency', 'GPUKernelInfo', - 'GPUInitAbi']) + 'GPUTLB', 'GPUVRF', 'GPURFC', 'GPUWgLatency', + 'GPUKernelInfo', 'GPUInitAbi']) diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 8d6deeb85a..8259f0a950 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -50,6 +50,7 @@ #include "gpu-compute/gpu_command_processor.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/register_file_cache.hh" #include "gpu-compute/scalar_register_file.hh" #include "gpu-compute/shader.hh" #include "gpu-compute/simple_pool_manager.hh" @@ -82,9 +83,11 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p), false, Event::CPU_Tick_Pri), cu_id(p.cu_id), vrf(p.vector_register_file), srf(p.scalar_register_file), + rfc(p.register_file_cache), simdWidth(p.simd_width), spBypassPipeLength(p.spbypass_pipe_length), dpBypassPipeLength(p.dpbypass_pipe_length), + rfcPipeLength(p.rfc_pipe_length), scalarPipeStages(p.scalar_pipe_length), operandNetworkLength(p.operand_network_length), issuePeriod(p.issue_period), @@ -207,6 +210,7 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p), for (int i = 0; i < vrf.size(); ++i) { vrf[i]->setParent(this); + rfc[i]->setParent(this); } for (int i = 0; i < srf.size(); ++i) { srf[i]->setParent(this); diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index cf73aa2723..e6bc03da7d 100644 --- a/src/gpu-compute/compute_unit.hh +++ 
b/src/gpu-compute/compute_unit.hh @@ -66,6 +66,7 @@ class LdsChunk; class ScalarRegisterFile; class Shader; class VectorRegisterFile; +class RegisterFileCache; struct ComputeUnitParams; @@ -296,6 +297,8 @@ class ComputeUnit : public ClockedObject // array of scalar register files, one per SIMD std::vector srf; + std::vector rfc; + // Width per VALU/SIMD unit: number of work items that can be executed // on the vector ALU simultaneously in a SIMD unit int simdWidth; @@ -305,6 +308,8 @@ class ComputeUnit : public ClockedObject // number of pipe stages for bypassing data to next dependent double // precision vector instruction inside the vector ALU pipeline int dpBypassPipeLength; + // number of pipe stages for register file cache + int rfcPipeLength; // number of pipe stages for scalar ALU int scalarPipeStages; // number of pipe stages for operand collection & distribution network @@ -390,6 +395,7 @@ class ComputeUnit : public ClockedObject int simdUnitWidth() const { return simdWidth; } int spBypassLength() const { return spBypassPipeLength; } int dpBypassLength() const { return dpBypassPipeLength; } + int rfcLength() const { return rfcPipeLength; } int scalarPipeLength() const { return scalarPipeStages; } int storeBusLength() const { return numCyclesPerStoreTransfer; } int loadBusLength() const { return numCyclesPerLoadTransfer; } diff --git a/src/gpu-compute/register_file.cc b/src/gpu-compute/register_file.cc index 62510e47b7..ec35c01528 100644 --- a/src/gpu-compute/register_file.cc +++ b/src/gpu-compute/register_file.cc @@ -194,6 +194,10 @@ RegisterFile::RegisterFileStats::RegisterFileStats(statistics::Group *parent) : statistics::Group(parent), ADD_STAT(registerReads, "Total number of DWORDs read from register file"), + ADD_STAT(rfc_cache_read_hits, + "Total number of DWORDs read from register file cache"), + ADD_STAT(rfc_cache_write_hits, + "Total number of writes to existing registers in the rfc"), ADD_STAT(registerWrites, "Total number of DWORDS written to 
register file"), ADD_STAT(sramReads, diff --git a/src/gpu-compute/register_file.hh b/src/gpu-compute/register_file.hh index 089516ce27..0d1a12b7b7 100644 --- a/src/gpu-compute/register_file.hh +++ b/src/gpu-compute/register_file.hh @@ -158,6 +158,10 @@ class RegisterFile : public SimObject // Total number of register reads per DWORD per thread statistics::Scalar registerReads; + + statistics::Scalar rfc_cache_read_hits; + statistics::Scalar rfc_cache_write_hits; + // Total number of register writes per DWORD per thread statistics::Scalar registerWrites; diff --git a/src/gpu-compute/register_file_cache.cc b/src/gpu-compute/register_file_cache.cc new file mode 100644 index 0000000000..f23cf399da --- /dev/null +++ b/src/gpu-compute/register_file_cache.cc @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2023 The University of Wisconsin + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "gpu-compute/register_file_cache.hh" + +#include +#include + +#include "base/intmath.hh" +#include "base/logging.hh" +#include "debug/GPURFC.hh" +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" +#include "gpu-compute/wavefront.hh" +#include "params/RegisterFileCache.hh" + +namespace gem5 +{ + +RegisterFileCache::RegisterFileCache(const RegisterFileCacheParams &p) + : SimObject(p), simdId(p.simd_id), _capacity(p.cache_size) +{ + fatal_if(simdId < 0, "Illegal SIMD id for rfc"); +} + +RegisterFileCache::~RegisterFileCache() +{ +} + +void +RegisterFileCache::setParent(ComputeUnit *_computeUnit) +{ + computeUnit = _computeUnit; +} + +bool +RegisterFileCache::inRFC(int regIdx) +{ + return (lruHash.find(regIdx) != lruHash.end()); +} + +std::string +RegisterFileCache::dumpLL() const +{ + std::stringstream ss; + ss << "lru_order: "; + for (auto i=lruHead; i!=nullptr; i=i->next) { + if (i->prev == nullptr) { + ss << "reg: " << i->regIdx << " "; + } else { + ss << "reg: " << i->regIdx << " (prev: " << i->prev->regIdx<<") "; + } + if (i->next != nullptr) { + ss << " (next: " << i->next->regIdx<<") "; + } + } + ss << "\n"; + return ss.str(); +} + +void +RegisterFileCache::markRFC(int regIdx) +{ + if (_capacity == 0) { + return; + } + if (lruHash.find(regIdx) == lruHash.end()) { + if (lruHead == nullptr) { + DPRINTF(GPURFC, "RFC SIMD[%d] cache miss inserting 
physReg[%d]\n", + simdId, regIdx); + OrderedRegs *oreg = new OrderedRegs(regIdx); + lruHash[regIdx] = oreg; + lruHead = oreg; + lruTail = oreg; + return; + } + + if (lruHash.size() >= _capacity) { + int val = lruTail->regIdx; + DPRINTF(GPURFC, "RFC SIMD[%d] cache miss inserting " + "physReg[%d] evicting physReg[%d]\n", simdId, regIdx, val); + + lruTail = lruTail->prev; + lruTail->next = nullptr; + lruHash.erase(val); + } else { + DPRINTF(GPURFC, "RFC SIMD[%d] cache miss inserting physReg[%d]\n", + simdId, regIdx); + } + } else { // Exists in cache need to update + DPRINTF(GPURFC, "RFC SIMD[%d] cache hit physReg[%d]\n", + simdId, regIdx); + + if (lruHead->regIdx == regIdx) { + return; + } + if (lruHash[regIdx]==lruTail) { + lruTail = lruHash[regIdx]->prev; + } + if (lruHash[regIdx]->next != nullptr) { + lruHash[regIdx]->next->prev = lruHash[regIdx]->prev; + } + lruHash[regIdx]->prev->next = lruHash[regIdx]->next; + lruHash.erase(regIdx); + } + + OrderedRegs *oreg = new OrderedRegs(regIdx); + lruHash[regIdx] = oreg; + oreg->next = lruHead; + lruHead->prev = oreg; + lruHead = oreg; +} + +void +RegisterFileCache::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) +{ + if (!ii->isLoad() + && !(ii->isAtomic() || ii->isMemSync())) { + Cycles delay(computeUnit->rfcLength()); + Tick tickDelay = computeUnit->cyclesToTicks(delay); + + for (const auto& dstVecOp : ii->dstVecRegOperands()) { + for (const auto& physIdx : dstVecOp.physIndices()) { + enqCacheInsertEvent(physIdx, tickDelay); + } + } + } +} + +void +RegisterFileCache::enqCacheInsertEvent(uint32_t regIdx, uint64_t delay) +{ + schedule(new MarkRegCachedEvent(this, regIdx), + curTick() + delay); +} + +void +RegisterFileCache::MarkRegCachedEvent::process() +{ + rfc->markRFC(regIdx); +} + +} diff --git a/src/gpu-compute/register_file_cache.hh b/src/gpu-compute/register_file_cache.hh new file mode 100644 index 0000000000..040f174033 --- /dev/null +++ b/src/gpu-compute/register_file_cache.hh @@ -0,0 +1,113 @@ +/* + * Copyright 
(c) 2023 The University of Wisconsin + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __REGISTER_FILE_CACHE_HH__ +#define __REGISTER_FILE_CACHE_HH__ + +#include +#include +#include + +#include "base/statistics.hh" +#include "base/types.hh" +#include "gpu-compute/misc.hh" +#include "sim/sim_object.hh" + +namespace gem5 +{ + +class ComputeUnit; +class Wavefront; + +struct RegisterFileCacheParams; + +class RegisterFileCache : public SimObject +{ + public: + RegisterFileCache(const RegisterFileCacheParams &p); + virtual ~RegisterFileCache(); + virtual void setParent(ComputeUnit *_computeUnit); + int cacheSize() const { return _capacity; } + + // Debug functions + virtual std::string dumpLL() const; + + // Abstract Register Event + class RegisterCacheEvent : public Event + { + protected: + RegisterFileCache *rfc; + int regIdx; + + public: + RegisterCacheEvent(RegisterFileCache *rfc, int regIdx) + : rfc(rfc), regIdx(regIdx) { setFlags(AutoDelete); } + }; + + class MarkRegCachedEvent : public RegisterCacheEvent + { + public: + MarkRegCachedEvent(RegisterFileCache *rfc, int regIdx) + : RegisterCacheEvent(rfc, regIdx) { } + void process(); + }; + + virtual void enqCacheInsertEvent(uint32_t regIdx, uint64_t delay); + + virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii); + + // Add register to rfc using LRU replacement policy + virtual void markRFC(int regIdx); + + virtual bool inRFC(int regIdx); + + protected: + ComputeUnit* computeUnit; + int simdId, _capacity; + + class OrderedRegs + { + public: + int regIdx; + + OrderedRegs* next; + OrderedRegs* prev; + OrderedRegs(int val) : regIdx(val), next(nullptr), prev(nullptr) {} + }; + + // Doubly linked list, head is the most recently used + std::unordered_map lruHash; + OrderedRegs* lruHead = nullptr; + OrderedRegs* lruTail = nullptr; + +}; + +} // namespace gem5 + +#endif // __REGISTER_FILE_CACHE_HH__ diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc index 0d475c577e..13dc423897 100644 --- a/src/gpu-compute/schedule_stage.cc +++ 
b/src/gpu-compute/schedule_stage.cc @@ -38,6 +38,7 @@ #include "debug/GPUVRF.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_static_inst.hh" +#include "gpu-compute/register_file_cache.hh" #include "gpu-compute/scalar_register_file.hh" #include "gpu-compute/vector_register_file.hh" #include "gpu-compute/wavefront.hh" diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc index b5f17c82d0..8935332860 100644 --- a/src/gpu-compute/vector_register_file.cc +++ b/src/gpu-compute/vector_register_file.cc @@ -38,6 +38,7 @@ #include "debug/GPUVRF.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/register_file_cache.hh" #include "gpu-compute/simple_pool_manager.hh" #include "gpu-compute/wavefront.hh" #include "params/VectorRegisterFile.hh" @@ -60,7 +61,8 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const { for (const auto& srcVecOp : ii->srcVecRegOperands()) { for (const auto& physIdx : srcVecOp.physIndices()) { - if (regBusy(physIdx)) { + if (regBusy(physIdx) && + !computeUnit->rfc[simdId]->inRFC(physIdx)) { DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n", w->wfDynId, ii->disassemble(), physIdx); w->stats.numTimesBlockedDueRAWDependencies++; @@ -71,7 +73,8 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const for (const auto& dstVecOp : ii->dstVecRegOperands()) { for (const auto& physIdx : dstVecOp.physIndices()) { - if (regBusy(physIdx)) { + if (regBusy(physIdx) && + !computeUnit->rfc[simdId]->inRFC(physIdx)) { DPRINTF(GPUVRF, "WAX stall: WV[%d]: %s: physReg[%d]\n", w->wfDynId, ii->disassemble(), physIdx); w->stats.numTimesBlockedDueWAXDependencies++; @@ -114,6 +117,22 @@ VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii) int DWords = ii->numSrcVecDWords(); stats.registerReads += (DWords * w->execMask().count()); + for (const auto& dstVecOp : ii->dstVecRegOperands()) { + for (const auto& 
physIdx : dstVecOp.physIndices()) { + if (computeUnit->rfc[simdId]->inRFC(physIdx)) { + stats.rfc_cache_write_hits += w->execMask().count(); + } + } + } + + for (const auto& srcVecOp : ii->srcVecRegOperands()) { + for (const auto& physIdx : srcVecOp.physIndices()) { + if (computeUnit->rfc[simdId]->inRFC(physIdx)) { + stats.rfc_cache_read_hits += w->execMask().count(); + } + } + } + uint64_t mask = w->execMask().to_ullong(); int srams = w->execMask().size() / 4; for (int i = 0; i < srams; i++) { diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index 0bca152e08..c7d2bc40e9 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -37,6 +37,7 @@ #include "debug/WavefrontStack.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/register_file_cache.hh" #include "gpu-compute/scalar_register_file.hh" #include "gpu-compute/shader.hh" #include "gpu-compute/simple_pool_manager.hh" @@ -933,6 +934,7 @@ Wavefront::exec() // inform VRF of instruction execution to schedule write-back // and scoreboard ready for registers if (!ii->isScalar()) { + computeUnit->rfc[simdId]->waveExecuteInst(this, ii); computeUnit->vrf[simdId]->waveExecuteInst(this, ii); } computeUnit->srf[simdId]->waveExecuteInst(this, ii);