gpu-compute: Added register file cache support (#730)

The RFC defaults to a size of 0, which disables it completely. To use
the RFC, set --register-file-cache-size to a non-zero multiple of
two. In addition, rfc_pipe_length may be altered to increase or decrease
the RFC latency benefit.
This commit is contained in:
Matt Sinclair
2024-01-05 12:57:06 -06:00
committed by GitHub
14 changed files with 372 additions and 5 deletions

View File

@@ -335,6 +335,12 @@ parser.add_argument(
default="dynamic",
help="register allocation policy (simple/dynamic)",
)
parser.add_argument(
"--register-file-cache-size",
type=int,
default=0,
help="number of registers in cache",
)
parser.add_argument(
"--dgpu",
@@ -493,6 +499,7 @@ for i in range(n_cu):
vrfs = []
vrf_pool_mgrs = []
srfs = []
rfcs = []
srf_pool_mgrs = []
for j in range(args.simds_per_cu):
for k in range(shader.n_wf):
@@ -537,10 +544,16 @@ for i in range(n_cu):
simd_id=j, wf_size=args.wf_size, num_regs=args.sreg_file_size
)
)
rfcs.append(
RegisterFileCache(
simd_id=j, cache_size=args.register_file_cache_size
)
)
compute_units[-1].wavefronts = wavefronts
compute_units[-1].vector_register_file = vrfs
compute_units[-1].scalar_register_file = srfs
compute_units[-1].register_file_cache = rfcs
compute_units[-1].register_manager = RegisterManager(
policy=args.registerManagerPolicy,
vrf_pool_managers=vrf_pool_mgrs,

View File

@@ -247,3 +247,9 @@ def addAmdGPUOptions(parser):
default="simple",
help="register allocation policy (simple/dynamic)",
)
parser.add_argument(
"--register-file-cache-size",
type=int,
default=0,
help="number of registers in cache",
)

View File

@@ -84,6 +84,7 @@ def createGPU(system, args):
vrfs = []
vrf_pool_mgrs = []
srfs = []
rfcs = []
srf_pool_mgrs = []
for j in range(args.simds_per_cu):
for k in range(shader.n_wf):
@@ -133,10 +134,16 @@ def createGPU(system, args):
num_regs=args.sreg_file_size,
)
)
rfcs.append(
RegisterFileCache(
simd_id=j, cache_size=args.register_file_cache_size
)
)
compute_units[-1].wavefronts = wavefronts
compute_units[-1].vector_register_file = vrfs
compute_units[-1].scalar_register_file = srfs
compute_units[-1].register_file_cache = rfcs
compute_units[-1].register_manager = RegisterManager(
policy=args.registerManagerPolicy,
vrf_pool_managers=vrf_pool_mgrs,

View File

@@ -95,6 +95,14 @@ class VectorRegisterFile(RegisterFile):
cxx_header = "gpu-compute/vector_register_file.hh"
class RegisterFileCache(SimObject):
type = "RegisterFileCache"
cxx_class = "gem5::RegisterFileCache"
cxx_header = "gpu-compute/register_file_cache.hh"
simd_id = Param.Int("SIMD ID associated with this Register File Cache")
cache_size = Param.Int(0, "number of entries of rfc")
class RegisterManager(SimObject):
type = "RegisterManager"
cxx_class = "gem5::RegisterManager"
@@ -149,6 +157,11 @@ class ComputeUnit(ClockedObject):
dpbypass_pipe_length = Param.Int(
4, "vector ALU Double Precision bypass latency"
)
rfc_pipe_length = Param.Int(
2, "number of cycles per register file cache access"
)
scalar_pipe_length = Param.Int(1, "number of pipe stages per scalar ALU")
issue_period = Param.Int(4, "number of cycles per issue period")
@@ -260,6 +273,9 @@ class ComputeUnit(ClockedObject):
scalar_register_file = VectorParam.ScalarRegisterFile(
"Scalar register file"
)
register_file_cache = VectorParam.RegisterFileCache("Register file cache")
out_of_order_data_delivery = Param.Bool(
False, "enable OoO data delivery in the GM pipeline"
)

View File

@@ -38,7 +38,7 @@ SimObject('GPU.py', sim_objects=[
'PoolManager', 'SimplePoolManager', 'DynPoolManager', 'RegisterFile',
'ScalarRegisterFile', 'VectorRegisterFile', 'RegisterManager', 'Wavefront',
'ComputeUnit', 'Shader', 'GPUComputeDriver', 'GPURenderDriver',
'GPUDispatcher', 'GPUCommandProcessor'],
'GPUDispatcher', 'GPUCommandProcessor', 'RegisterFileCache'],
enums=['PrefetchType', 'GfxVersion', 'StorageClassType'])
SimObject('GPUStaticInstFlags.py', enums=['GPUStaticInstFlags'])
SimObject('LdsState.py', sim_objects=['LdsState'])
@@ -71,6 +71,7 @@ Source('dyn_pool_manager.cc')
Source('simple_pool_manager.cc')
Source('static_register_manager_policy.cc')
Source('vector_register_file.cc')
Source('register_file_cache.cc')
Source('wavefront.cc')
DebugFlag('GPUAgentDisp')
@@ -96,6 +97,7 @@ DebugFlag('GPUSRF')
DebugFlag('GPUSync')
DebugFlag('GPUTLB')
DebugFlag('GPUVRF')
DebugFlag('GPURFC')
DebugFlag('GPUVRFSched')
DebugFlag('GPUWgLatency')
DebugFlag('Predictor')
@@ -103,5 +105,5 @@ DebugFlag('WavefrontStack')
CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync',
'GPUTLB', 'GPUVRF', 'GPUWgLatency', 'GPUKernelInfo',
'GPUInitAbi'])
'GPUTLB', 'GPUVRF', 'GPURFC', 'GPUWgLatency',
'GPUKernelInfo', 'GPUInitAbi'])

View File

@@ -50,6 +50,7 @@
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/register_file_cache.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
@@ -82,9 +83,11 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
false, Event::CPU_Tick_Pri),
cu_id(p.cu_id),
vrf(p.vector_register_file), srf(p.scalar_register_file),
rfc(p.register_file_cache),
simdWidth(p.simd_width),
spBypassPipeLength(p.spbypass_pipe_length),
dpBypassPipeLength(p.dpbypass_pipe_length),
rfcPipeLength(p.rfc_pipe_length),
scalarPipeStages(p.scalar_pipe_length),
operandNetworkLength(p.operand_network_length),
issuePeriod(p.issue_period),
@@ -207,6 +210,7 @@ ComputeUnit::ComputeUnit(const Params &p) : ClockedObject(p),
for (int i = 0; i < vrf.size(); ++i) {
vrf[i]->setParent(this);
rfc[i]->setParent(this);
}
for (int i = 0; i < srf.size(); ++i) {
srf[i]->setParent(this);

View File

@@ -66,6 +66,7 @@ class LdsChunk;
class ScalarRegisterFile;
class Shader;
class VectorRegisterFile;
class RegisterFileCache;
struct ComputeUnitParams;
@@ -296,6 +297,8 @@ class ComputeUnit : public ClockedObject
// array of scalar register files, one per SIMD
std::vector<ScalarRegisterFile*> srf;
std::vector<RegisterFileCache*> rfc;
// Width per VALU/SIMD unit: number of work items that can be executed
// on the vector ALU simultaneously in a SIMD unit
int simdWidth;
@@ -305,6 +308,8 @@ class ComputeUnit : public ClockedObject
// number of pipe stages for bypassing data to next dependent double
// precision vector instruction inside the vector ALU pipeline
int dpBypassPipeLength;
// number of pipe stages for register file cache
int rfcPipeLength;
// number of pipe stages for scalar ALU
int scalarPipeStages;
// number of pipe stages for operand collection & distribution network
@@ -390,6 +395,7 @@ class ComputeUnit : public ClockedObject
int simdUnitWidth() const { return simdWidth; }
int spBypassLength() const { return spBypassPipeLength; }
int dpBypassLength() const { return dpBypassPipeLength; }
int rfcLength() const { return rfcPipeLength; }
int scalarPipeLength() const { return scalarPipeStages; }
int storeBusLength() const { return numCyclesPerStoreTransfer; }
int loadBusLength() const { return numCyclesPerLoadTransfer; }

View File

@@ -194,6 +194,10 @@ RegisterFile::RegisterFileStats::RegisterFileStats(statistics::Group *parent)
: statistics::Group(parent),
ADD_STAT(registerReads,
"Total number of DWORDs read from register file"),
ADD_STAT(rfc_cache_read_hits,
"Total number of DWORDs read from register file cache"),
ADD_STAT(rfc_cache_write_hits,
"Total number of writes to existing registers in the rfc"),
ADD_STAT(registerWrites,
"Total number of DWORDS written to register file"),
ADD_STAT(sramReads,

View File

@@ -158,6 +158,10 @@ class RegisterFile : public SimObject
// Total number of register reads per DWORD per thread
statistics::Scalar registerReads;
statistics::Scalar rfc_cache_read_hits;
statistics::Scalar rfc_cache_write_hits;
// Total number of register writes per DWORD per thread
statistics::Scalar registerWrites;

View File

@@ -0,0 +1,170 @@
/*
* Copyright (c) 2023 The University of Wisconsin
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "gpu-compute/register_file_cache.hh"
#include <sstream>
#include <string>
#include "base/intmath.hh"
#include "base/logging.hh"
#include "debug/GPURFC.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "params/RegisterFileCache.hh"
namespace gem5
{
/**
 * Construct the register file cache model for one SIMD unit.
 *
 * @param p Parameters; simd_id selects the owning SIMD and cache_size is
 *          the number of register entries the cache can track (0 disables
 *          the cache, see markRFC()).
 *
 * The parent compute unit is not known yet; it is supplied later through
 * setParent(), so the pointer starts out as nullptr instead of garbage.
 */
RegisterFileCache::RegisterFileCache(const RegisterFileCacheParams &p)
    : SimObject(p), computeUnit(nullptr), simdId(p.simd_id),
      _capacity(p.cache_size)
{
    fatal_if(simdId < 0, "Illegal SIMD id for rfc");
    // A negative capacity would be converted to a huge unsigned value when
    // compared against the number of cached entries, silently producing an
    // unbounded cache. Reject it up front.
    fatal_if(_capacity < 0, "Illegal (negative) cache size for rfc");
}
/**
 * Release every LRU node still tracked by the cache.
 *
 * Each entry of lruHash points at a heap-allocated OrderedRegs node that is
 * also linked into the LRU list; the map holds exactly the live nodes, so
 * deleting through the map frees the whole list.
 */
RegisterFileCache::~RegisterFileCache()
{
    for (auto &entry : lruHash) {
        delete entry.second;
    }
    lruHash.clear();
    lruHead = nullptr;
    lruTail = nullptr;
}
/**
 * Record the compute unit this register file cache belongs to.
 *
 * @param _computeUnit Owning compute unit; later used by waveExecuteInst()
 *                     to query the RFC latency and convert it to ticks.
 */
void
RegisterFileCache::setParent(ComputeUnit *_computeUnit)
{
    this->computeUnit = _computeUnit;
}
/**
 * Query whether a physical register is currently tracked by the cache.
 *
 * @param regIdx Physical register index to look up.
 * @return true if regIdx has an entry in the LRU map, false otherwise.
 */
bool
RegisterFileCache::inRFC(int regIdx)
{
    const bool cached = lruHash.count(regIdx) != 0;
    return cached;
}
std::string
RegisterFileCache::dumpLL() const
{
std::stringstream ss;
ss << "lru_order: ";
for (auto i=lruHead; i!=nullptr; i=i->next) {
if (i->prev == nullptr) {
ss << "reg: " << i->regIdx << " ";
} else {
ss << "reg: " << i->regIdx << " (prev: " << i->prev->regIdx<<") ";
}
if (i->next != nullptr) {
ss << " (next: " << i->next->regIdx<<") ";
}
}
ss << "\n";
return ss.str();
}
void
RegisterFileCache::markRFC(int regIdx)
{
if (_capacity == 0) {
return;
}
if (lruHash.find(regIdx) == lruHash.end()) {
if (lruHead == nullptr) {
DPRINTF(GPURFC, "RFC SIMD[%d] cache miss inserting physReg[%d]\n",
simdId, regIdx);
OrderedRegs *oreg = new OrderedRegs(regIdx);
lruHash[regIdx] = oreg;
lruHead = oreg;
lruTail = oreg;
return;
}
if (lruHash.size() >= _capacity) {
int val = lruTail->regIdx;
DPRINTF(GPURFC, "RFC SIMD[%d] cache miss inserting "
"physReg[%d] evicting physReg[%d]\n", simdId, regIdx, val);
lruTail = lruTail->prev;
lruTail->next = nullptr;
lruHash.erase(val);
} else {
DPRINTF(GPURFC, "RFC SIMD[%d] cache miss inserting physReg[%d]\n",
simdId, regIdx);
}
} else { // Exists in cache need to update
DPRINTF(GPURFC, "RFC SIMD[%d] cache hit physReg[%d]\n",
simdId, regIdx);
if (lruHead->regIdx == regIdx) {
return;
}
if (lruHash[regIdx]==lruTail) {
lruTail = lruHash[regIdx]->prev;
}
if (lruHash[regIdx]->next != nullptr) {
lruHash[regIdx]->next->prev = lruHash[regIdx]->prev;
}
lruHash[regIdx]->prev->next = lruHash[regIdx]->next;
lruHash.erase(regIdx);
}
OrderedRegs *oreg = new OrderedRegs(regIdx);
lruHash[regIdx] = oreg;
oreg->next = lruHead;
lruHead->prev = oreg;
lruHead = oreg;
}
/**
 * Called when a wavefront executes an instruction: schedule an RFC
 * insertion for every physical destination vector register of the
 * instruction, rfcLength() cycles from the current tick.
 *
 * Loads, atomics and memory syncs do not schedule any insertion.
 *
 * @param w  Wavefront executing the instruction (not used here).
 * @param ii Dynamic instruction being executed.
 */
void
RegisterFileCache::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
{
    if (ii->isLoad() || ii->isAtomic() || ii->isMemSync()) {
        return;
    }

    const Cycles rfcLatency(computeUnit->rfcLength());
    const Tick insertDelay = computeUnit->cyclesToTicks(rfcLatency);

    for (const auto &vecDst : ii->dstVecRegOperands()) {
        for (const auto &physReg : vecDst.physIndices()) {
            enqCacheInsertEvent(physReg, insertDelay);
        }
    }
}
void
RegisterFileCache::enqCacheInsertEvent(uint32_t regIdx, uint64_t delay)
{
schedule(new MarkRegCachedEvent(this, regIdx),
curTick() + delay);
}
void
RegisterFileCache::MarkRegCachedEvent::process()
{
rfc->markRFC(regIdx);
}
}

View File

@@ -0,0 +1,113 @@
/*
* Copyright (c) 2023 The University of Wisconsin
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __REGISTER_FILE_CACHE_HH__
#define __REGISTER_FILE_CACHE_HH__
#include <limits>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "base/statistics.hh"
#include "base/types.hh"
#include "gpu-compute/misc.hh"
#include "sim/sim_object.hh"
namespace gem5
{
class ComputeUnit;
class Wavefront;
struct RegisterFileCacheParams;
class RegisterFileCache : public SimObject
{
public:
RegisterFileCache(const RegisterFileCacheParams &p);
virtual ~RegisterFileCache();
virtual void setParent(ComputeUnit *_computeUnit);
int cacheSize() const { return _capacity; }
// Debug functions
virtual std::string dumpLL() const;
// Abstract Register Event
class RegisterCacheEvent : public Event
{
protected:
RegisterFileCache *rfc;
int regIdx;
public:
RegisterCacheEvent(RegisterFileCache *rfc, int regIdx)
: rfc(rfc), regIdx(regIdx) { setFlags(AutoDelete); }
};
class MarkRegCachedEvent : public RegisterCacheEvent
{
public:
MarkRegCachedEvent(RegisterFileCache *rfc, int regIdx)
: RegisterCacheEvent(rfc, regIdx) { }
void process();
};
virtual void enqCacheInsertEvent(uint32_t regIdx, uint64_t delay);
virtual void waveExecuteInst(Wavefront *w, GPUDynInstPtr ii);
// Add register to rfc using LRU replacement policy
virtual void markRFC(int regIdx);
virtual bool inRFC(int regIdx);
protected:
ComputeUnit* computeUnit;
int simdId, _capacity;
class OrderedRegs
{
public:
int regIdx;
OrderedRegs* next;
OrderedRegs* prev;
OrderedRegs(int val) : regIdx(val), next(nullptr), prev(nullptr) {}
};
// Doubly linked list, head is the most recently used
std::unordered_map<int, OrderedRegs*> lruHash;
OrderedRegs* lruHead = nullptr;
OrderedRegs* lruTail = nullptr;
};
} // namespace gem5
#endif // __REGISTER_FILE_CACHE_HH__

View File

@@ -38,6 +38,7 @@
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/register_file_cache.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"

View File

@@ -38,6 +38,7 @@
#include "debug/GPUVRF.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/register_file_cache.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/wavefront.hh"
#include "params/VectorRegisterFile.hh"
@@ -60,7 +61,8 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
{
for (const auto& srcVecOp : ii->srcVecRegOperands()) {
for (const auto& physIdx : srcVecOp.physIndices()) {
if (regBusy(physIdx)) {
if (regBusy(physIdx) &&
!computeUnit->rfc[simdId]->inRFC(physIdx)) {
DPRINTF(GPUVRF, "RAW stall: WV[%d]: %s: physReg[%d]\n",
w->wfDynId, ii->disassemble(), physIdx);
w->stats.numTimesBlockedDueRAWDependencies++;
@@ -71,7 +73,8 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
for (const auto& dstVecOp : ii->dstVecRegOperands()) {
for (const auto& physIdx : dstVecOp.physIndices()) {
if (regBusy(physIdx)) {
if (regBusy(physIdx) &&
!computeUnit->rfc[simdId]->inRFC(physIdx)) {
DPRINTF(GPUVRF, "WAX stall: WV[%d]: %s: physReg[%d]\n",
w->wfDynId, ii->disassemble(), physIdx);
w->stats.numTimesBlockedDueWAXDependencies++;
@@ -114,6 +117,22 @@ VectorRegisterFile::waveExecuteInst(Wavefront *w, GPUDynInstPtr ii)
int DWords = ii->numSrcVecDWords();
stats.registerReads += (DWords * w->execMask().count());
for (const auto& dstVecOp : ii->dstVecRegOperands()) {
for (const auto& physIdx : dstVecOp.physIndices()) {
if (computeUnit->rfc[simdId]->inRFC(physIdx)) {
stats.rfc_cache_write_hits += w->execMask().count();
}
}
}
for (const auto& srcVecOp : ii->srcVecRegOperands()) {
for (const auto& physIdx : srcVecOp.physIndices()) {
if (computeUnit->rfc[simdId]->inRFC(physIdx)) {
stats.rfc_cache_read_hits += w->execMask().count();
}
}
}
uint64_t mask = w->execMask().to_ullong();
int srams = w->execMask().size() / 4;
for (int i = 0; i < srams; i++) {

View File

@@ -37,6 +37,7 @@
#include "debug/WavefrontStack.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/register_file_cache.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
@@ -933,6 +934,7 @@ Wavefront::exec()
// inform VRF of instruction execution to schedule write-back
// and scoreboard ready for registers
if (!ii->isScalar()) {
computeUnit->rfc[simdId]->waveExecuteInst(this, ii);
computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
}
computeUnit->srf[simdId]->waveExecuteInst(this, ii);