gpu-compute: Added register file cache support (#730)
The RFC defaults to a size of 0, which disables it completely. To use the RFC, set --register-file-cache-size to a non-zero multiple of two. In addition, rfc_pipe_length may be altered to increase or decrease the RFC's latency benefit.
This commit is contained in:
@@ -335,6 +335,12 @@ parser.add_argument(
|
||||
default="dynamic",
|
||||
help="register allocation policy (simple/dynamic)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--register-file-cache-size",
|
||||
type=int,
|
||||
default=0,
|
||||
help="number of registers in cache",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--dgpu",
|
||||
@@ -493,6 +499,7 @@ for i in range(n_cu):
|
||||
vrfs = []
|
||||
vrf_pool_mgrs = []
|
||||
srfs = []
|
||||
rfcs = []
|
||||
srf_pool_mgrs = []
|
||||
for j in range(args.simds_per_cu):
|
||||
for k in range(shader.n_wf):
|
||||
@@ -537,10 +544,16 @@ for i in range(n_cu):
|
||||
simd_id=j, wf_size=args.wf_size, num_regs=args.sreg_file_size
|
||||
)
|
||||
)
|
||||
rfcs.append(
|
||||
RegisterFileCache(
|
||||
simd_id=j, cache_size=args.register_file_cache_size
|
||||
)
|
||||
)
|
||||
|
||||
compute_units[-1].wavefronts = wavefronts
|
||||
compute_units[-1].vector_register_file = vrfs
|
||||
compute_units[-1].scalar_register_file = srfs
|
||||
compute_units[-1].register_file_cache = rfcs
|
||||
compute_units[-1].register_manager = RegisterManager(
|
||||
policy=args.registerManagerPolicy,
|
||||
vrf_pool_managers=vrf_pool_mgrs,
|
||||
|
||||
@@ -247,3 +247,9 @@ def addAmdGPUOptions(parser):
|
||||
default="simple",
|
||||
help="register allocation policy (simple/dynamic)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--register-file-cache-size",
|
||||
type=int,
|
||||
default=0,
|
||||
help="number of registers in cache",
|
||||
)
|
||||
|
||||
@@ -84,6 +84,7 @@ def createGPU(system, args):
|
||||
vrfs = []
|
||||
vrf_pool_mgrs = []
|
||||
srfs = []
|
||||
rfcs = []
|
||||
srf_pool_mgrs = []
|
||||
for j in range(args.simds_per_cu):
|
||||
for k in range(shader.n_wf):
|
||||
@@ -133,10 +134,16 @@ def createGPU(system, args):
|
||||
num_regs=args.sreg_file_size,
|
||||
)
|
||||
)
|
||||
rfcs.append(
|
||||
RegisterFileCache(
|
||||
simd_id=j, cache_size=args.register_file_cache_size
|
||||
)
|
||||
)
|
||||
|
||||
compute_units[-1].wavefronts = wavefronts
|
||||
compute_units[-1].vector_register_file = vrfs
|
||||
compute_units[-1].scalar_register_file = srfs
|
||||
compute_units[-1].register_file_cache = rfcs
|
||||
compute_units[-1].register_manager = RegisterManager(
|
||||
policy=args.registerManagerPolicy,
|
||||
vrf_pool_managers=vrf_pool_mgrs,
|
||||
|
||||
@@ -95,6 +95,14 @@ class VectorRegisterFile(RegisterFile):
|
||||
cxx_header = "gpu-compute/vector_register_file.hh"
|
||||
|
||||
|
||||
class RegisterFileCache(SimObject):
    # Python-side declaration of the C++ gem5::RegisterFileCache object.
    type = "RegisterFileCache"
    cxx_class = "gem5::RegisterFileCache"
    cxx_header = "gpu-compute/register_file_cache.hh"

    # Declared without a default value here.
    simd_id = Param.Int(
        "SIMD ID associated with this Register File Cache"
    )
    # Defaults to 0 entries.
    cache_size = Param.Int(
        0, "number of entries of rfc"
    )
|
||||
|
||||
|
||||
class RegisterManager(SimObject):
|
||||
type = "RegisterManager"
|
||||
cxx_class = "gem5::RegisterManager"
|
||||
@@ -149,6 +157,11 @@ class ComputeUnit(ClockedObject):
|
||||
dpbypass_pipe_length = Param.Int(
|
||||
4, "vector ALU Double Precision bypass latency"
|
||||
)
|
||||
|
||||
rfc_pipe_length = Param.Int(
|
||||
2, "number of cycles per register file cache access"
|
||||
)
|
||||
|
||||
scalar_pipe_length = Param.Int(1, "number of pipe stages per scalar ALU")
|
||||
issue_period = Param.Int(4, "number of cycles per issue period")
|
||||
|
||||
@@ -260,6 +273,9 @@ class ComputeUnit(ClockedObject):
|
||||
scalar_register_file = VectorParam.ScalarRegisterFile(
|
||||
"Scalar register file"
|
||||
)
|
||||
|
||||
register_file_cache = VectorParam.RegisterFileCache("Register file cache")
|
||||
|
||||
out_of_order_data_delivery = Param.Bool(
|
||||
False, "enable OoO data delivery in the GM pipeline"
|
||||
)
|
||||
|
||||
@@ -38,7 +38,7 @@ SimObject('GPU.py', sim_objects=[
|
||||
'PoolManager', 'SimplePoolManager', 'DynPoolManager', 'RegisterFile',
|
||||
'ScalarRegisterFile', 'VectorRegisterFile', 'RegisterManager', 'Wavefront',
|
||||
'ComputeUnit', 'Shader', 'GPUComputeDriver', 'GPURenderDriver',
|
||||
'GPUDispatcher', 'GPUCommandProcessor'],
|
||||
'GPUDispatcher', 'GPUCommandProcessor', 'RegisterFileCache'],
|
||||
enums=['PrefetchType', 'GfxVersion', 'StorageClassType'])
|
||||
SimObject('GPUStaticInstFlags.py', enums=['GPUStaticInstFlags'])
|
||||
SimObject('LdsState.py', sim_objects=['LdsState'])
|
||||
@@ -71,6 +71,7 @@ Source('dyn_pool_manager.cc')
|
||||
Source('simple_pool_manager.cc')
|
||||
Source('static_register_manager_policy.cc')
|
||||
Source('vector_register_file.cc')
|
||||
Source('register_file_cache.cc')
|
||||
Source('wavefront.cc')
|
||||
|
||||
DebugFlag('GPUAgentDisp')
|
||||
@@ -96,6 +97,7 @@ DebugFlag('GPUSRF')
|
||||
DebugFlag('GPUSync')
|
||||
DebugFlag('GPUTLB')
|
||||
DebugFlag('GPUVRF')
|
||||
DebugFlag('GPURFC')
|
||||
DebugFlag('GPUVRFSched')
|
||||
DebugFlag('GPUWgLatency')
|
||||
DebugFlag('Predictor')
|
||||
@@ -103,5 +105,5 @@ DebugFlag('WavefrontStack')
|
||||
|
||||
CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
|
||||
'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync',
|
||||
'GPUTLB', 'GPUVRF', 'GPUWgLatency', 'GPUKernelInfo',
|
||||
'GPUInitAbi'])
|
||||
'GPUTLB', 'GPUVRF', 'GPURFC', 'GPUWgLatency',
|
||||
'GPUKernelInfo', 'GPUInitAbi'])
|
||||
|
||||
@@ -50,6 +50,7 @@
|
||||
#include "gpu-compute/gpu_command_processor.hh"
|
||||
#include "gpu-compute/gpu_dyn_inst.hh"
|
||||
#include "gpu-compute/gpu_static_inst.hh"
|
||||
#include "gpu-compute/register_file_cache.hh"
|
||||
#include "gpu-compute/scalar_register_file.hh"
|
||||
#include "gpu-compute/shader.hh"
|
||||
#include "gpu-compute/simple_pool_manager.hh"
|
||||
@@ -82,9 +83,11 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
|
||||
false, Event::CPU_Tick_Pri),
|
||||
cu_id(p.cu_id),
|
||||
vrf(p.vector_register_file), srf(p.scalar_register_file),
|
||||
rfc(p.register_file_cache),
|
||||
simdWidth(p.simd_width),
|
||||
spBypassPipeLength(p.spbypass_pipe_length),
|
||||
dpBypassPipeLength(p.dpbypass_pipe_length),
|
||||
rfcPipeLength(p.rfc_pipe_length),
|
||||
scalarPipeStages(p.scalar_pipe_length),
|
||||
operandNetworkLength(p.operand_network_length),
|
||||
issuePeriod(p.issue_period),
|
||||
@@ -207,6 +210,7 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
|
||||
|
||||
for (int i = 0; i < vrf.size(); ++i) {
|
||||
vrf[i]->setParent(this);
|
||||
rfc[i]->setParent(this);
|
||||
}
|
||||
for (int i = 0; i < srf.size(); ++i) {
|
||||
srf[i]->setParent(this);
|
||||
|
||||
@@ -66,6 +66,7 @@ class LdsChunk;
|
||||
class ScalarRegisterFile;
|
||||
class Shader;
|
||||
class VectorRegisterFile;
|
||||
class RegisterFileCache;
|
||||
|
||||
struct ComputeUnitParams;
|
||||
|
||||
@@ -296,6 +297,8 @@ class ComputeUnit : public ClockedObject
|
||||
// array of scalar register files, one per SIMD
|
||||
std::vector<ScalarRegisterFile*> srf;
|
||||
|
||||
std::vector<RegisterFileCache*> rfc;
|
||||
|
||||
// Width per VALU/SIMD unit: number of work items that can be executed
|
||||
// on the vector ALU simultaneously in a SIMD unit
|
||||
int simdWidth;
|
||||
@@ -305,6 +308,8 @@ class ComputeUnit : public ClockedObject
|
||||
// number of pipe stages for bypassing data to next dependent double
|
||||
// precision vector instruction inside the vector ALU pipeline
|
||||
int dpBypassPipeLength;
|
||||
// number of pipe stages for register file cache
|
||||
int rfcPipeLength;
|
||||
// number of pipe stages for scalar ALU
|
||||
int scalarPipeStages;
|
||||
// number of pipe stages for operand collection & distribution network
|
||||
@@ -390,6 +395,7 @@ class ComputeUnit : public ClockedObject
|
||||
int simdUnitWidth() const { return simdWidth; }
|
||||
int spBypassLength() const { return spBypassPipeLength; }
|
||||
int dpBypassLength() const { return dpBypassPipeLength; }
|
||||
int rfcLength() const { return rfcPipeLength; }
|
||||
int scalarPipeLength() const { return scalarPipeStages; }
|
||||
int storeBusLength() const { return numCyclesPerStoreTransfer; }
|
||||
int loadBusLength() const { return numCyclesPerLoadTransfer; }
|
||||
|
||||
@@ -194,6 +194,10 @@ RegisterFile::RegisterFileStats::RegisterFileStats(statistics::Group *parent)
|
||||
: statistics::Group(parent),
|
||||
ADD_STAT(registerReads,
|
||||
"Total number of DWORDs read from register file"),
|
||||
ADD_STAT(rfc_cache_read_hits,
|
||||
"Total number of DWORDs read from register file cache"),
|
||||
ADD_STAT(rfc_cache_write_hits,
|
||||
"Total number of writes to existing registers in the rfc"),
|
||||
ADD_STAT(registerWrites,
|
||||
"Total number of DWORDS written to register file"),
|
||||
ADD_STAT(sramReads,
|
||||
|
||||
@@ -158,6 +158,10 @@ class RegisterFile : public SimObject
|
||||
|
||||
// Total number of register reads per DWORD per thread
|
||||
statistics::Scalar registerReads;
|
||||
|
||||
statistics::Scalar rfc_cache_read_hits;
|
||||
statistics::Scalar rfc_cache_write_hits;
|
||||
|
||||
// Total number of register writes per DWORD per thread
|
||||
statistics::Scalar registerWrites;
|
||||
|
||||
|
||||
170
src/gpu-compute/register_file_cache.cc
Normal file
170
src/gpu-compute/register_file_cache.cc
Normal file
@@ -0,0 +1,170 @@
|
||||
/*
|
||||
* Copyright (c) 2023 The University of Wisconsin
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "gpu-compute/register_file_cache.hh"
|
||||
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "base/intmath.hh"
|
||||
#include "base/logging.hh"
|
||||
#include "debug/GPURFC.hh"
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/gpu_dyn_inst.hh"
|
||||
#include "gpu-compute/shader.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
#include "params/RegisterFileCache.hh"
|
||||
|
||||
namespace gem5
|
||||
{
|
||||
|
||||
/**
 * Build a register file cache from its configuration parameters.
 *
 * @param p Parameter struct; simd_id and cache_size are read from it.
 *
 * Terminates with a fatal error when either the SIMD id or the cache size
 * is negative. A negative size would otherwise be converted to a huge
 * unsigned value when compared against the number of cached entries, so
 * nothing would ever be evicted.
 */
RegisterFileCache::RegisterFileCache(const RegisterFileCacheParams &p)
    : SimObject(p), simdId(p.simd_id), _capacity(p.cache_size)
{
    fatal_if(simdId < 0, "Illegal SIMD id for rfc");
    fatal_if(_capacity < 0, "Illegal cache size for rfc");
}
|
||||
|
||||
/**
 * Release every node that is still linked into the LRU list.
 *
 * The nodes are allocated with new in markRFC() and are only referenced
 * by this object's list and hash map, so they must be freed here.
 */
RegisterFileCache::~RegisterFileCache()
{
    OrderedRegs *node = lruHead;
    while (node != nullptr) {
        // Grab the successor before the current node is destroyed.
        OrderedRegs *following = node->next;
        delete node;
        node = following;
    }
    lruHash.clear();
    lruHead = nullptr;
    lruTail = nullptr;
}
|
||||
|
||||
void
|
||||
RegisterFileCache::setParent(ComputeUnit *_computeUnit)
|
||||
{
|
||||
computeUnit = _computeUnit;
|
||||
}
|
||||
|
||||
bool
|
||||
RegisterFileCache::inRFC(int regIdx)
|
||||
{
|
||||
return (lruHash.find(regIdx) != lruHash.end());
|
||||
}
|
||||
|
||||
std::string
|
||||
RegisterFileCache::dumpLL() const
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << "lru_order: ";
|
||||
for (auto i=lruHead; i!=nullptr; i=i->next) {
|
||||
if (i->prev == nullptr) {
|
||||
ss << "reg: " << i->regIdx << " ";
|
||||
} else {
|
||||
ss << "reg: " << i->regIdx << " (prev: " << i->prev->regIdx<<") ";
|
||||
}
|
||||
if (i->next != nullptr) {
|
||||
ss << " (next: " << i->next->regIdx<<") ";
|
||||
}
|
||||
}
|
||||
ss << "\n";
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
void
|
||||
RegisterFileCache::markRFC(int regIdx)
|
||||
{
|
||||
if (_capacity == 0) {
|
||||
return;
|
||||
}
|
||||
if (lruHash.find(regIdx) == lruHash.end()) {
|
||||
if (lruHead == nullptr) {
|
||||
DPRINTF(GPURFC, "RFC SIMD[%d] cache miss inserting physReg[%d]\n",
|
||||
simdId, regIdx);
|
||||
OrderedRegs *oreg = new OrderedRegs(regIdx);
|
||||
lruHash[regIdx] = oreg;
|
||||
lruHead = oreg;
|
||||
lruTail = oreg;
|
||||
return;
|
||||
}
|
||||
|
||||
if (lruHash.size() >= _capacity) {
|
||||
int val = lruTail->regIdx;
|
||||
DPRINTF(GPURFC, "RFC SIMD[%d] cache miss inserting "
|
||||
"physReg[%d] evicting physReg[%d]\n", simdId, regIdx, val);
|
||||
|
||||
lruTail = lruTail->prev;
|
||||
lruTail->next = nullptr;
|
||||
lruHash.erase(val);
|
||||
} else {
|
||||
DPRINTF(GPURFC, "RFC SIMD[%d] cache miss inserting physReg[%d]\n",
|
||||
simdId, regIdx);
|
||||
}
|
||||
} else { // Exists in cache need to update
|
||||
DPRINTF(GPURFC, "RFC SIMD[%d] cache hit physReg[%d]\n",
|
||||
simdId, regIdx);
|
||||
|
||||
if (lruHead->regIdx == regIdx) {
|
||||
return;
|
||||
}
|
||||
if (lruHash[regIdx]==lruTail) {
|
||||
lruTail = lruHash[regIdx]->prev;
|
||||
}
|
||||
if (lruHash[regIdx]->next != nullptr) {
|
||||
lruHash[regIdx]->next->prev = lruHash[regIdx]->prev;
|
||||
}
|
||||
lruHash[regIdx]->prev->next = lruHash[regIdx]->next;
|
||||
lruHash.erase(regIdx);
|
||||
}
|
||||
|
||||
OrderedRegs *oreg = new OrderedRegs(regIdx);
|
||||
lruHash[regIdx] = oreg;
|
||||
oreg->next = lruHead;
|
||||
lruHead->prev = oreg;
|
||||
lruHead = oreg;
|
||||
}
|
||||
|
||||
void
|
||||
RegisterFileCache::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
|
||||
{
|
||||
if (!ii->isLoad()
|
||||
&& !(ii->isAtomic() || ii->isMemSync())) {
|
||||
Cycles delay(computeUnit->rfcLength());
|
||||
Tick tickDelay = computeUnit->cyclesToTicks(delay);
|
||||
|
||||
for (const auto& dstVecOp : ii->dstVecRegOperands()) {
|
||||
for (const auto& physIdx : dstVecOp.physIndices()) {
|
||||
enqCacheInsertEvent(physIdx, tickDelay);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
RegisterFileCache::enqCacheInsertEvent(uint32_t regIdx, uint64_t delay)
|
||||
{
|
||||
schedule(new MarkRegCachedEvent(this, regIdx),
|
||||
curTick() + delay);
|
||||
}
|
||||
|
||||
void
|
||||
RegisterFileCache::MarkRegCachedEvent::process()
|
||||
{
|
||||
rfc->markRFC(regIdx);
|
||||
}
|
||||
|
||||
}
|
||||
113
src/gpu-compute/register_file_cache.hh
Normal file
113
src/gpu-compute/register_file_cache.hh
Normal file
@@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Copyright (c) 2023 The University of Wisconsin
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef __REGISTER_FILE_CACHE_HH__
|
||||
#define __REGISTER_FILE_CACHE_HH__
|
||||
|
||||
#include <limits>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
|
||||
|
||||
#include "base/statistics.hh"
|
||||
#include "base/types.hh"
|
||||
#include "gpu-compute/misc.hh"
|
||||
#include "sim/sim_object.hh"
|
||||
|
||||
namespace gem5
|
||||
{
|
||||
|
||||
class ComputeUnit;
|
||||
class Wavefront;
|
||||
|
||||
struct RegisterFileCacheParams;
|
||||
|
||||
class RegisterFileCache : public SimObject
|
||||
{
|
||||
public:
|
||||
RegisterFileCache(const RegisterFileCacheParams &p);
|
||||
virtual ~RegisterFileCache();
|
||||
virtual void setParent(ComputeUnit *_computeUnit);
|
||||
int cacheSize() const { return _capacity; }
|
||||
|
||||
// Debug functions
|
||||
virtual std::string dumpLL() const;
|
||||
|
||||
// Abstract Register Event
|
||||
class RegisterCacheEvent : public Event
|
||||
{
|
||||
protected:
|
||||
RegisterFileCache *rfc;
|
||||
int regIdx;
|
||||
|
||||
public:
|
||||
RegisterCacheEvent(RegisterFileCache *rfc, int regIdx)
|
||||
: rfc(rfc), regIdx(regIdx) { setFlags(AutoDelete); }
|
||||
};
|
||||
|
||||
class MarkRegCachedEvent : public RegisterCacheEvent
|
||||
{
|
||||
public:
|
||||
MarkRegCachedEvent(RegisterFileCache *rfc, int regIdx)
|
||||
: RegisterCacheEvent(rfc, regIdx) { }
|
||||
void process();
|
||||
};
|
||||
|
||||
virtual void enqCacheInsertEvent(uint32_t regIdx, uint64_t delay);
|
||||
|
||||
virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii);
|
||||
|
||||
// Add register to rfc using LRU replacement policy
|
||||
virtual void markRFC(int regIdx);
|
||||
|
||||
virtual bool inRFC(int regIdx);
|
||||
|
||||
protected:
|
||||
ComputeUnit* computeUnit;
|
||||
int simdId, _capacity;
|
||||
|
||||
class OrderedRegs
|
||||
{
|
||||
public:
|
||||
int regIdx;
|
||||
|
||||
OrderedRegs* next;
|
||||
OrderedRegs* prev;
|
||||
OrderedRegs(int val) : regIdx(val), next(nullptr), prev(nullptr) {}
|
||||
};
|
||||
|
||||
// Doubly linked list, head is the most recently used
|
||||
std::unordered_map<int, OrderedRegs*> lruHash;
|
||||
OrderedRegs* lruHead = nullptr;
|
||||
OrderedRegs* lruTail = nullptr;
|
||||
|
||||
};
|
||||
|
||||
} // namespace gem5
|
||||
|
||||
#endif // __REGISTER_FILE_CACHE_HH__
|
||||
@@ -38,6 +38,7 @@
|
||||
#include "debug/GPUVRF.hh"
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/gpu_static_inst.hh"
|
||||
#include "gpu-compute/register_file_cache.hh"
|
||||
#include "gpu-compute/scalar_register_file.hh"
|
||||
#include "gpu-compute/vector_register_file.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
#include "debug/GPUVRF.hh"
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/gpu_dyn_inst.hh"
|
||||
#include "gpu-compute/register_file_cache.hh"
|
||||
#include "gpu-compute/simple_pool_manager.hh"
|
||||
#include "gpu-compute/wavefront.hh"
|
||||
#include "params/VectorRegisterFile.hh"
|
||||
@@ -60,7 +61,8 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
|
||||
{
|
||||
for (const auto& srcVecOp : ii->srcVecRegOperands()) {
|
||||
for (const auto& physIdx : srcVecOp.physIndices()) {
|
||||
if (regBusy(physIdx)) {
|
||||
if (regBusy(physIdx) &&
|
||||
!computeUnit->rfc[simdId]->inRFC(physIdx)) {
|
||||
DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
|
||||
w->wfDynId, ii->disassemble(), physIdx);
|
||||
w->stats.numTimesBlockedDueRAWDependencies++;
|
||||
@@ -71,7 +73,8 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
|
||||
|
||||
for (const auto& dstVecOp : ii->dstVecRegOperands()) {
|
||||
for (const auto& physIdx : dstVecOp.physIndices()) {
|
||||
if (regBusy(physIdx)) {
|
||||
if (regBusy(physIdx) &&
|
||||
!computeUnit->rfc[simdId]->inRFC(physIdx)) {
|
||||
DPRINTF(GPUVRF, "WAX stall: WV[%d]: %s: physReg[%d]\n",
|
||||
w->wfDynId, ii->disassemble(), physIdx);
|
||||
w->stats.numTimesBlockedDueWAXDependencies++;
|
||||
@@ -114,6 +117,22 @@ VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
|
||||
int DWords = ii->numSrcVecDWords();
|
||||
stats.registerReads += (DWords * w->execMask().count());
|
||||
|
||||
for (const auto& dstVecOp : ii->dstVecRegOperands()) {
|
||||
for (const auto& physIdx : dstVecOp.physIndices()) {
|
||||
if (computeUnit->rfc[simdId]->inRFC(physIdx)) {
|
||||
stats.rfc_cache_write_hits += w->execMask().count();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& srcVecOp : ii->srcVecRegOperands()) {
|
||||
for (const auto& physIdx : srcVecOp.physIndices()) {
|
||||
if (computeUnit->rfc[simdId]->inRFC(physIdx)) {
|
||||
stats.rfc_cache_read_hits += w->execMask().count();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t mask = w->execMask().to_ullong();
|
||||
int srams = w->execMask().size() / 4;
|
||||
for (int i = 0; i < srams; i++) {
|
||||
|
||||
@@ -37,6 +37,7 @@
|
||||
#include "debug/WavefrontStack.hh"
|
||||
#include "gpu-compute/compute_unit.hh"
|
||||
#include "gpu-compute/gpu_dyn_inst.hh"
|
||||
#include "gpu-compute/register_file_cache.hh"
|
||||
#include "gpu-compute/scalar_register_file.hh"
|
||||
#include "gpu-compute/shader.hh"
|
||||
#include "gpu-compute/simple_pool_manager.hh"
|
||||
@@ -933,6 +934,7 @@ Wavefront::exec()
|
||||
// inform VRF of instruction execution to schedule write-back
|
||||
// and scoreboard ready for registers
|
||||
if (!ii->isScalar()) {
|
||||
computeUnit->rfc[simdId]->waveExecuteInst(this, ii);
|
||||
computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
|
||||
}
|
||||
computeUnit->srf[simdId]->waveExecuteInst(this, ii);
|
||||
|
||||
Reference in New Issue
Block a user