mem-ruby,configs: Add GPU GLC Atomic Resource Constraints (#120)

Added a resource constraint, AtomicALUOperation, to GLC atomics
performed in the TCC.

The resource constraint uses a new class, ALUFreeListArray. The class
assumes the following:
  - There are a fixed number of atomic ALU pipelines
- While a new cache line can be processed in each pipeline each cycle,
if a cache line is currently going through a pipeline, it can't be
processed again until it's finished

Two configuration parameters have been added to tune this behavior:
- tcc-num-atomic-alus corresponds to the number of atomic ALU pipelines
- atomic-alu-latency corresponds to the latency of atomic ALU pipelines

Change-Id: I25bdde7dafc3877590bb6536efdf57b8c540a939
This commit is contained in:
Daniel Kouchekinia
2023-11-14 09:48:48 -06:00
committed by GitHub
parent f11227b4a0
commit be5c03ea9f
9 changed files with 242 additions and 5 deletions

View File

@@ -275,6 +275,8 @@ class TCC(RubyCache):
def create(self, options):
self.assoc = options.tcc_assoc
self.atomicLatency = options.atomic_alu_latency
self.atomicALUs = options.tcc_num_atomic_alus
if hasattr(options, "bw_scalor") and options.bw_scalor > 0:
s = options.num_compute_units
tcc_size = s * 128
@@ -497,6 +499,15 @@ def define_options(parser):
parser.add_argument(
"--glc-atomic-latency", type=int, default=1, help="GLC Atomic Latency"
)
parser.add_argument(
"--atomic-alu-latency", type=int, default=0, help="Atomic ALU Latency"
)
parser.add_argument(
"--tcc-num-atomic-alus",
type=int,
default=64,
help="Number of atomic ALUs in the TCC",
)
parser.add_argument(
"--tcp-num-banks",
type=int,

View File

@@ -93,6 +93,7 @@ machine(MachineType:TCC, "TCC Cache")
DataArrayWrite, desc="Write the data array";
TagArrayRead, desc="Read the data array";
TagArrayWrite, desc="Write the data array";
AtomicALUOperation, desc="Atomic ALU operation";
}
@@ -223,6 +224,8 @@ machine(MachineType:TCC, "TCC Cache")
L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
} else if (request_type == RequestType:TagArrayWrite) {
L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
} else if (request_type == RequestType:AtomicALUOperation) {
L2cache.recordRequestType(CacheRequestType:AtomicALUOperation, addr);
}
}
@@ -235,6 +238,8 @@ machine(MachineType:TCC, "TCC Cache")
return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
} else if (request_type == RequestType:TagArrayWrite) {
return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
} else if (request_type == RequestType:AtomicALUOperation) {
return L2cache.checkResourceAvailable(CacheResourceType:AtomicALUArray, addr);
} else {
error("Invalid RequestType type in checkResourceAvailable");
return true;
@@ -915,7 +920,7 @@ machine(MachineType:TCC, "TCC Cache")
st_stallAndWaitRequest;
}
transition(V, Atomic, M) {TagArrayRead, DataArrayWrite} {
transition(V, Atomic, M) {TagArrayRead, TagArrayWrite, DataArrayWrite, AtomicALUOperation} {
p_profileHit;
ut_updateTag;
owm_orWriteMask;
@@ -942,7 +947,7 @@ machine(MachineType:TCC, "TCC Cache")
st_stallAndWaitRequest;
}
transition(M, Atomic) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
transition(M, Atomic) {TagArrayRead, DataArrayWrite, AtomicALUOperation} {
p_profileHit;
owm_orWriteMask;
pa_performAtomic;
@@ -979,7 +984,7 @@ machine(MachineType:TCC, "TCC Cache")
st_stallAndWaitRequest;
}
transition({M, W}, AtomicPassOn, WI) {TagArrayRead} {
transition({M, W}, AtomicPassOn, WI) {TagArrayRead, DataArrayRead} {
t_allocateTBE;
wb_writeBack;
// after writing back the current line, we need to wait for it to be done
@@ -1098,7 +1103,7 @@ machine(MachineType:TCC, "TCC Cache")
dt_deallocateTBE;
}
transition(A, Bypass) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
transition(A, Bypass) {TagArrayRead, TagArrayWrite} {
bapdr_sendBypassedAtomicPerformedInDirectoryResponse;
dnpa_decrementNumPendingDirectoryAtomics;
pr_popResponseQueue;
@@ -1120,7 +1125,7 @@ machine(MachineType:TCC, "TCC Cache")
dt_deallocateTBE;
}
transition(A, Data, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
transition(A, Data, M) {TagArrayRead, TagArrayWrite, DataArrayWrite, AtomicALUOperation} {
a_allocateBlock;
wardb_writeAtomicResponseDirtyBytes;
pa_performAtomic;

View File

@@ -230,11 +230,13 @@ enumeration(CacheRequestType, desc="...", default="CacheRequestType_NULL") {
DataArrayWrite, desc="Write access to the cache's data array";
TagArrayRead, desc="Read access to the cache's tag array";
TagArrayWrite, desc="Write access to the cache's tag array";
AtomicALUOperation, desc="Atomic ALU operation";
}
enumeration(CacheResourceType, desc="...", default="CacheResourceType_NULL") {
DataArray, desc="Access to the cache's data array";
TagArray, desc="Access to the cache's tag array";
AtomicALUArray, desc="Access to the cache's atomic ALU array";
}
enumeration(DirectoryRequestType, desc="...", default="DirectoryRequestType_NULL") {

View File

@@ -0,0 +1,106 @@
/*
* Copyright (c) 2023 The University of Wisconsin
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "mem/ruby/structures/ALUFreeListArray.hh"
#include "base/intmath.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/cur_tick.hh"
namespace gem5
{
namespace ruby
{
/*
*
* Models num_ALUs pipelined atomic ALUs with a depth of access_latency ticks.
* Rather than reserving ALUs, this class assumes multiple requests can go
* through an ALU at the same time. As such, up to numALUs new requests can
* go through at once, with the caveat that a line already being processed
* in an ALU can't start processing again until the previous request has exited
* the pipeline.
*
* ALUs aren't mapped directly to cache lines. Rather, ALUs are treated as
* a free list.
*
* Behavior:
* Requests will go through unless one/both of the following are met:
* - There have been more than [numALUs] requests in the current cycle
* - The same line has been accessed in the past accessLatency ticks
*/
// Construct a free list of num_ALUs atomic ALU pipelines, each with a
// depth of access_latency ticks.
ALUFreeListArray::ALUFreeListArray(unsigned int num_ALUs, Tick access_latency)
    : numALUs(num_ALUs), accessLatency(access_latency)
{
}
// Check whether the line containing addr may enter an atomic ALU pipeline
// this tick. Returns false when all numALUs pipelines have already been
// claimed this tick, or when the same line is still in flight in a
// pipeline (reserved within the past accessLatency ticks).
bool ALUFreeListArray::tryAccess(Addr addr)
{
    uint32_t accesses_this_tick = 0;
    // Hoisted: loop-invariant line address of the requested access
    Addr line_addr = makeLineAddress(addr);

    // Remove records from the tail of the queue that occurred more than
    // accessLatency ticks ago — those lines have exited the pipeline.
    // Clamp at 0 to avoid unsigned underflow of Tick when
    // curTick() < accessLatency (early in simulation), which would
    // otherwise expire every record and disable the constraint.
    Tick oldest_valid_record_start =
        (curTick() > accessLatency) ? (curTick() - accessLatency) : 0;
    while (!accessQueue.empty() &&
           (accessQueue.back().startTick < oldest_valid_record_start)) {
        accessQueue.pop_back();
    }

    for (const AccessRecord& record : accessQueue) {
        // Block access if all ALUs are claimed this tick. The records
        // counted here are prior reservations, so the new access would be
        // reservation numALUs + 1; ">=" (not ">") enforces the documented
        // limit of numALUs new requests per tick.
        if (record.startTick == curTick() &&
            (++accesses_this_tick >= numALUs)) {
            return false;
        }

        // Block access if this line is already going through a pipeline
        if (record.lineAddr == line_addr) {
            return false;
        }
    }

    return true;
}
// Record that the line containing addr has entered an ALU pipeline at the
// current tick. Must only be called after tryAccess(addr) returned true,
// so the queue is already pruned and the access is known to be valid.
void ALUFreeListArray::reserve(Addr addr)
{
    // Newest records live at the front of the queue
    accessQueue.emplace_front(makeLineAddress(addr), curTick());
}
} // namespace ruby
} // namespace gem5

View File

@@ -0,0 +1,78 @@
/*
* Copyright (c) 2023 The University of Wisconsin
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __MEM_RUBY_STRUCTURES_ALUFREELISTARRAY_HH__
#define __MEM_RUBY_STRUCTURES_ALUFREELISTARRAY_HH__
#include <deque>
#include "mem/ruby/common/TypeDefines.hh"
#include "sim/cur_tick.hh"
namespace gem5
{
namespace ruby
{
// Models a fixed pool of pipelined atomic ALUs treated as a free list:
// up to numALUs new cache lines may enter per tick, and a line already in
// a pipeline cannot re-enter until accessLatency ticks have elapsed.
class ALUFreeListArray
{
  private:
    // Number of atomic ALU pipelines available per tick
    unsigned int numALUs;

    // Pipeline depth in ticks; a line occupies an ALU for this long
    Tick accessLatency;

    // One reservation: which line entered an ALU, and when
    class AccessRecord
    {
      public:
        AccessRecord(Addr line_addr, Tick start_tick)
            : lineAddr(line_addr), startTick(start_tick)
        {}

        Addr lineAddr;
        Tick startTick;
    };

    // Accesses from the past accessLatency ticks, newest at the front
    std::deque<AccessRecord> accessQueue;

  public:
    ALUFreeListArray(unsigned int num_ALUs, Tick access_latency);

    // True if addr's line may enter an ALU pipeline this tick
    bool tryAccess(Addr addr);

    // Record an access; call only after tryAccess(addr) returned true
    void reserve(Addr addr);

    Tick getLatency() const { return accessLatency; }
};
} // namespace ruby
} // namespace gem5
#endif

View File

@@ -73,6 +73,8 @@ CacheMemory::CacheMemory(const Params &p)
p.start_index_bit, p.ruby_system),
tagArray(p.tagArrayBanks, p.tagAccessLatency,
p.start_index_bit, p.ruby_system),
atomicALUArray(p.atomicALUs, p.atomicLatency *
p.ruby_system->clockPeriod()),
cacheMemoryStats(this)
{
m_cache_size = p.size;
@@ -529,6 +531,8 @@ CacheMemoryStats::CacheMemoryStats(statistics::Group *parent)
ADD_STAT(numTagArrayWrites, "Number of tag array writes"),
ADD_STAT(numTagArrayStalls, "Number of stalls caused by tag array"),
ADD_STAT(numDataArrayStalls, "Number of stalls caused by data array"),
ADD_STAT(numAtomicALUOperations, "Number of atomic ALU operations"),
ADD_STAT(numAtomicALUArrayStalls, "Number of stalls caused by atomic ALU array"),
ADD_STAT(htmTransCommitReadSet, "Read set size of a committed "
"transaction"),
ADD_STAT(htmTransCommitWriteSet, "Write set size of a committed "
@@ -564,6 +568,12 @@ CacheMemoryStats::CacheMemoryStats(statistics::Group *parent)
numDataArrayStalls
.flags(statistics::nozero);
numAtomicALUOperations
.flags(statistics::nozero);
numAtomicALUArrayStalls
.flags(statistics::nozero);
htmTransCommitReadSet
.init(8)
.flags(statistics::pdf | statistics::dist | statistics::nozero |
@@ -633,6 +643,11 @@ CacheMemory::recordRequestType(CacheRequestType requestType, Addr addr)
tagArray.reserve(addressToCacheSet(addr));
cacheMemoryStats.numTagArrayWrites++;
return;
case CacheRequestType_AtomicALUOperation:
if (m_resource_stalls)
atomicALUArray.reserve(addr);
cacheMemoryStats.numAtomicALUOperations++;
return;
default:
warn("CacheMemory access_type not found: %s",
CacheRequestType_to_string(requestType));
@@ -664,6 +679,15 @@ CacheMemory::checkResourceAvailable(CacheResourceType res, Addr addr)
cacheMemoryStats.numDataArrayStalls++;
return false;
}
} else if (res == CacheResourceType_AtomicALUArray) {
if (atomicALUArray.tryAccess(addr)) return true;
else {
DPRINTF(RubyResourceStalls,
"Atomic ALU array stall on addr %#x in line address %#x\n",
addr, makeLineAddress(addr));
cacheMemoryStats.numAtomicALUArrayStalls++;
return false;
}
} else {
panic("Unrecognized cache resource type.");
}

View File

@@ -56,6 +56,7 @@
#include "mem/ruby/slicc_interface/AbstractCacheEntry.hh"
#include "mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh"
#include "mem/ruby/structures/BankedArray.hh"
#include "mem/ruby/structures/ALUFreeListArray.hh"
#include "mem/ruby/system/CacheRecorder.hh"
#include "params/RubyCache.hh"
#include "sim/sim_object.hh"
@@ -186,6 +187,7 @@ class CacheMemory : public SimObject
BankedArray dataArray;
BankedArray tagArray;
ALUFreeListArray atomicALUArray;
int m_cache_size;
int m_cache_num_sets;
@@ -224,6 +226,9 @@ class CacheMemory : public SimObject
statistics::Scalar numTagArrayStalls;
statistics::Scalar numDataArrayStalls;
statistics::Scalar numAtomicALUOperations;
statistics::Scalar numAtomicALUArrayStalls;
// hardware transactional memory
statistics::Histogram htmTransCommitReadSet;
statistics::Histogram htmTransCommitWriteSet;

View File

@@ -44,6 +44,11 @@ class RubyCache(SimObject):
"0B", "block size in bytes. 0 means default RubyBlockSize"
)
# Atomic parameters only applicable to GPU atomics
# Zero atomic latency corresponds to instantaneous atomic ALU operations
atomicLatency = Param.Cycles(0, "Cycles for an atomic ALU operation")
atomicALUs = Param.Int(64, "Number of atomic ALUs")
dataArrayBanks = Param.Int(1, "Number of banks for the data array")
tagArrayBanks = Param.Int(1, "Number of banks for the tag array")
dataAccessLatency = Param.Cycles(1, "cycles for a data array access")

View File

@@ -55,6 +55,7 @@ Source('PersistentTable.cc')
Source('RubyPrefetcher.cc')
Source('TimerTable.cc')
Source('BankedArray.cc')
Source('ALUFreeListArray.cc')
Source('TBEStorage.cc')
if env['PROTOCOL'] == 'CHI':
Source('MN_TBETable.cc')