mem-ruby,configs: Add GPU GLC Atomic Resource Constraints (#120)
Added a resource constraint, AtomicALUOperation, to GLC atomics performed in the TCC. The resource constraint uses a new class, ALUFreeListArray. The class assumes the following: (1) there is a fixed number of atomic ALU pipelines; (2) while a new cache line can be processed in each pipeline each cycle, a cache line that is currently going through a pipeline can't be processed again until it has finished. Two configuration parameters have been added to tune this behavior: --tcc-num-atomic-alus sets the number of atomic ALU pipelines, and --atomic-alu-latency sets the latency of the atomic ALU pipelines. Change-Id: I25bdde7dafc3877590bb6536efdf57b8c540a939
This commit is contained in:
committed by
GitHub
parent
f11227b4a0
commit
be5c03ea9f
@@ -275,6 +275,8 @@ class TCC(RubyCache):
|
||||
|
||||
def create(self, options):
|
||||
self.assoc = options.tcc_assoc
|
||||
self.atomicLatency = options.atomic_alu_latency
|
||||
self.atomicALUs = options.tcc_num_atomic_alus
|
||||
if hasattr(options, "bw_scalor") and options.bw_scalor > 0:
|
||||
s = options.num_compute_units
|
||||
tcc_size = s * 128
|
||||
@@ -497,6 +499,15 @@ def define_options(parser):
|
||||
parser.add_argument(
|
||||
"--glc-atomic-latency", type=int, default=1, help="GLC Atomic Latency"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--atomic-alu-latency", type=int, default=0, help="Atomic ALU Latency"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tcc-num-atomic-alus",
|
||||
type=int,
|
||||
default=64,
|
||||
help="Number of atomic ALUs in the TCC",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tcp-num-banks",
|
||||
type=int,
|
||||
|
||||
@@ -93,6 +93,7 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
DataArrayWrite, desc="Write the data array";
|
||||
TagArrayRead, desc="Read the data array";
|
||||
TagArrayWrite, desc="Write the data array";
|
||||
AtomicALUOperation, desc="Atomic ALU operation";
|
||||
}
|
||||
|
||||
|
||||
@@ -223,6 +224,8 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
|
||||
} else if (request_type == RequestType:TagArrayWrite) {
|
||||
L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
|
||||
} else if (request_type == RequestType:AtomicALUOperation) {
|
||||
L2cache.recordRequestType(CacheRequestType:AtomicALUOperation, addr);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -235,6 +238,8 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
|
||||
} else if (request_type == RequestType:TagArrayWrite) {
|
||||
return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
|
||||
} else if (request_type == RequestType:AtomicALUOperation) {
|
||||
return L2cache.checkResourceAvailable(CacheResourceType:AtomicALUArray, addr);
|
||||
} else {
|
||||
error("Invalid RequestType type in checkResourceAvailable");
|
||||
return true;
|
||||
@@ -915,7 +920,7 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
st_stallAndWaitRequest;
|
||||
}
|
||||
|
||||
transition(V, Atomic, M) {TagArrayRead, DataArrayWrite} {
|
||||
transition(V, Atomic, M) {TagArrayRead, TagArrayWrite, DataArrayWrite, AtomicALUOperation} {
|
||||
p_profileHit;
|
||||
ut_updateTag;
|
||||
owm_orWriteMask;
|
||||
@@ -942,7 +947,7 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
st_stallAndWaitRequest;
|
||||
}
|
||||
|
||||
transition(M, Atomic) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
transition(M, Atomic) {TagArrayRead, DataArrayWrite, AtomicALUOperation} {
|
||||
p_profileHit;
|
||||
owm_orWriteMask;
|
||||
pa_performAtomic;
|
||||
@@ -979,7 +984,7 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
st_stallAndWaitRequest;
|
||||
}
|
||||
|
||||
transition({M, W}, AtomicPassOn, WI) {TagArrayRead} {
|
||||
transition({M, W}, AtomicPassOn, WI) {TagArrayRead, DataArrayRead} {
|
||||
t_allocateTBE;
|
||||
wb_writeBack;
|
||||
// after writing back the current line, we need to wait for it to be done
|
||||
@@ -1098,7 +1103,7 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
dt_deallocateTBE;
|
||||
}
|
||||
|
||||
transition(A, Bypass) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
transition(A, Bypass) {TagArrayRead, TagArrayWrite} {
|
||||
bapdr_sendBypassedAtomicPerformedInDirectoryResponse;
|
||||
dnpa_decrementNumPendingDirectoryAtomics;
|
||||
pr_popResponseQueue;
|
||||
@@ -1120,7 +1125,7 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
dt_deallocateTBE;
|
||||
}
|
||||
|
||||
transition(A, Data, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
|
||||
transition(A, Data, M) {TagArrayRead, TagArrayWrite, DataArrayWrite, AtomicALUOperation} {
|
||||
a_allocateBlock;
|
||||
wardb_writeAtomicResponseDirtyBytes;
|
||||
pa_performAtomic;
|
||||
|
||||
@@ -230,11 +230,13 @@ enumeration(CacheRequestType, desc="...", default="CacheRequestType_NULL") {
|
||||
DataArrayWrite, desc="Write access to the cache's data array";
|
||||
TagArrayRead, desc="Read access to the cache's tag array";
|
||||
TagArrayWrite, desc="Write access to the cache's tag array";
|
||||
AtomicALUOperation, desc="Atomic ALU operation";
|
||||
}
|
||||
|
||||
enumeration(CacheResourceType, desc="...", default="CacheResourceType_NULL") {
|
||||
DataArray, desc="Access to the cache's data array";
|
||||
TagArray, desc="Access to the cache's tag array";
|
||||
AtomicALUArray, desc="Access to the cache's atomic ALU array";
|
||||
}
|
||||
|
||||
enumeration(DirectoryRequestType, desc="...", default="DirectoryRequestType_NULL") {
|
||||
|
||||
106
src/mem/ruby/structures/ALUFreeListArray.cc
Normal file
106
src/mem/ruby/structures/ALUFreeListArray.cc
Normal file
@@ -0,0 +1,106 @@
|
||||
/*
|
||||
* Copyright (c) 2023 The University of Wisconsin
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "mem/ruby/structures/ALUFreeListArray.hh"
|
||||
|
||||
#include "base/intmath.hh"
|
||||
#include "mem/ruby/system/RubySystem.hh"
|
||||
#include "sim/cur_tick.hh"
|
||||
|
||||
namespace gem5
|
||||
{
|
||||
|
||||
namespace ruby
|
||||
{
|
||||
|
||||
/*
|
||||
*
|
||||
* Models num_ALUs pipelined atomic ALUs with a depth of access_latency ticks.
|
||||
* Rather than reserving ALUs, this class assumes multiple requests can go
|
||||
* through an ALU at the same time. As such, up to numALU new requests can
|
||||
* go through at once, with the caveat that a line already being processed
|
||||
* in an ALU can't start processing again until the previous request has exited
|
||||
* the pipeline.
|
||||
*
|
||||
* ALUs aren't mapped directly to cache lines. Rather, ALUs are treated as
|
||||
* a free list.
|
||||
*
|
||||
* Behavior:
|
||||
* Requests will go through unless one/both of the following are met:
|
||||
* - There have been more than [numALUs] requests in the current cycle
|
||||
* - The same line has been accessed in the past accessLatency ticks
|
||||
*/
|
||||
|
||||
/**
 * Construct the atomic ALU tracker.
 *
 * @param num_ALUs       value stored in numALUs; tryAccess() compares the
 *                       count of accesses started in the current tick
 *                       against it.
 * @param access_latency value stored in accessLatency, in ticks; tryAccess()
 *                       uses it to decide when a recorded access has expired.
 *
 * The access queue starts out empty.
 */
ALUFreeListArray::ALUFreeListArray(unsigned int num_ALUs, Tick access_latency)
    : numALUs(num_ALUs), accessLatency(access_latency)
{
}
|
||||
|
||||
/**
 * Check whether an atomic ALU operation on the line containing addr may
 * start in the current tick. This only queries (and prunes) the access
 * history; call reserve() to actually record the access.
 *
 * @param addr address of the atomic; it is converted to a line address
 *             before being compared against recorded accesses.
 * @return false if the same line has a recorded access that has not yet
 *         expired, or if numALUs accesses have already been recorded in the
 *         current tick; true otherwise.
 */
bool ALUFreeListArray::tryAccess(Addr addr)
{
    const Tick now = curTick();
    const Addr line_addr = makeLineAddress(addr);

    // Oldest start tick that still counts as "in the pipeline". Tick is an
    // unsigned type in gem5, so a plain subtraction would wrap around while
    // now < accessLatency and cause every in-flight record to be dropped;
    // clamp to 0 instead.
    const Tick oldest_valid_record_start =
        (now > accessLatency) ? (now - accessLatency) : 0;

    // Remove requests from the tail of the queue that occurred more than
    // accessLatency ticks ago. reserve() pushes at the front, so the oldest
    // records are at the back.
    while (!accessQueue.empty() &&
           accessQueue.back().startTick < oldest_valid_record_start) {
        accessQueue.pop_back();
    }

    unsigned int accesses_this_tick = 0;

    for (const AccessRecord &record : accessQueue) {
        // Block access if the line is already being used
        if (record.lineAddr == line_addr) {
            return false;
        }

        // Block access if every ALU has already accepted a request in this
        // tick. The comparison is >= because the records counted here are
        // accesses that were already admitted; admitting one more when the
        // count equals numALUs would use numALUs + 1 ALUs.
        if (record.startTick == now && ++accesses_this_tick >= numALUs) {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
/**
 * Record that an atomic ALU operation on the line containing addr starts at
 * the current tick.
 *
 * Expected to be called only after tryAccess() has succeeded for the same
 * address, so the queue has already been pruned and the access is known to
 * be allowed; no checking is repeated here.
 *
 * @param addr address of the atomic; stored as its line address.
 */
void ALUFreeListArray::reserve(Addr addr)
{
    const Addr line_addr = makeLineAddress(addr);

    // Newest records live at the front of the queue.
    accessQueue.emplace_front(line_addr, curTick());
}
|
||||
|
||||
} // namespace ruby
|
||||
} // namespace gem5
|
||||
78
src/mem/ruby/structures/ALUFreeListArray.hh
Normal file
78
src/mem/ruby/structures/ALUFreeListArray.hh
Normal file
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
* Copyright (c) 2023 The University of Wisconsin
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met: redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer;
|
||||
* redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution;
|
||||
* neither the name of the copyright holders nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef __MEM_RUBY_STRUCTURES_ALUFREELISTARRAY_HH__
|
||||
#define __MEM_RUBY_STRUCTURES_ALUFREELISTARRAY_HH__
|
||||
|
||||
#include <deque>
|
||||
|
||||
#include "mem/ruby/common/TypeDefines.hh"
|
||||
#include "sim/cur_tick.hh"
|
||||
|
||||
namespace gem5
|
||||
{
|
||||
|
||||
namespace ruby
|
||||
{
|
||||
|
||||
class ALUFreeListArray
|
||||
{
|
||||
private:
|
||||
unsigned int numALUs;
|
||||
Tick accessLatency;
|
||||
|
||||
class AccessRecord
|
||||
{
|
||||
public:
|
||||
AccessRecord(Addr line_addr, Tick start_tick) {
|
||||
this->lineAddr = line_addr;
|
||||
this->startTick = start_tick;
|
||||
}
|
||||
|
||||
Addr lineAddr;
|
||||
Tick startTick;
|
||||
};
|
||||
|
||||
// Queue of accesses from past accessLatency cycles
|
||||
std::deque<AccessRecord> accessQueue;
|
||||
|
||||
public:
|
||||
ALUFreeListArray(unsigned int num_ALUs, Tick access_latency);
|
||||
|
||||
bool tryAccess(Addr addr);
|
||||
|
||||
void reserve(Addr addr);
|
||||
|
||||
Tick getLatency() const { return accessLatency; }
|
||||
};
|
||||
|
||||
} // namespace ruby
|
||||
} // namespace gem5
|
||||
|
||||
#endif
|
||||
@@ -73,6 +73,8 @@ CacheMemory::CacheMemory(const Params &p)
|
||||
p.start_index_bit, p.ruby_system),
|
||||
tagArray(p.tagArrayBanks, p.tagAccessLatency,
|
||||
p.start_index_bit, p.ruby_system),
|
||||
atomicALUArray(p.atomicALUs, p.atomicLatency *
|
||||
p.ruby_system->clockPeriod()),
|
||||
cacheMemoryStats(this)
|
||||
{
|
||||
m_cache_size = p.size;
|
||||
@@ -529,6 +531,8 @@ CacheMemoryStats::CacheMemoryStats(statistics::Group *parent)
|
||||
ADD_STAT(numTagArrayWrites, "Number of tag array writes"),
|
||||
ADD_STAT(numTagArrayStalls, "Number of stalls caused by tag array"),
|
||||
ADD_STAT(numDataArrayStalls, "Number of stalls caused by data array"),
|
||||
ADD_STAT(numAtomicALUOperations, "Number of atomic ALU operations"),
|
||||
ADD_STAT(numAtomicALUArrayStalls, "Number of stalls caused by atomic ALU array"),
|
||||
ADD_STAT(htmTransCommitReadSet, "Read set size of a committed "
|
||||
"transaction"),
|
||||
ADD_STAT(htmTransCommitWriteSet, "Write set size of a committed "
|
||||
@@ -564,6 +568,12 @@ CacheMemoryStats::CacheMemoryStats(statistics::Group *parent)
|
||||
numDataArrayStalls
|
||||
.flags(statistics::nozero);
|
||||
|
||||
numAtomicALUOperations
|
||||
.flags(statistics::nozero);
|
||||
|
||||
numAtomicALUArrayStalls
|
||||
.flags(statistics::nozero);
|
||||
|
||||
htmTransCommitReadSet
|
||||
.init(8)
|
||||
.flags(statistics::pdf | statistics::dist | statistics::nozero |
|
||||
@@ -633,6 +643,11 @@ CacheMemory::recordRequestType(CacheRequestType requestType, Addr addr)
|
||||
tagArray.reserve(addressToCacheSet(addr));
|
||||
cacheMemoryStats.numTagArrayWrites++;
|
||||
return;
|
||||
case CacheRequestType_AtomicALUOperation:
|
||||
if (m_resource_stalls)
|
||||
atomicALUArray.reserve(addr);
|
||||
cacheMemoryStats.numAtomicALUOperations++;
|
||||
return;
|
||||
default:
|
||||
warn("CacheMemory access_type not found: %s",
|
||||
CacheRequestType_to_string(requestType));
|
||||
@@ -664,6 +679,15 @@ CacheMemory::checkResourceAvailable(CacheResourceType res, Addr addr)
|
||||
cacheMemoryStats.numDataArrayStalls++;
|
||||
return false;
|
||||
}
|
||||
} else if (res == CacheResourceType_AtomicALUArray) {
|
||||
if (atomicALUArray.tryAccess(addr)) return true;
|
||||
else {
|
||||
DPRINTF(RubyResourceStalls,
|
||||
"Atomic ALU array stall on addr %#x in line address %#x\n",
|
||||
addr, makeLineAddress(addr));
|
||||
cacheMemoryStats.numAtomicALUArrayStalls++;
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
panic("Unrecognized cache resource type.");
|
||||
}
|
||||
|
||||
@@ -56,6 +56,7 @@
|
||||
#include "mem/ruby/slicc_interface/AbstractCacheEntry.hh"
|
||||
#include "mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh"
|
||||
#include "mem/ruby/structures/BankedArray.hh"
|
||||
#include "mem/ruby/structures/ALUFreeListArray.hh"
|
||||
#include "mem/ruby/system/CacheRecorder.hh"
|
||||
#include "params/RubyCache.hh"
|
||||
#include "sim/sim_object.hh"
|
||||
@@ -186,6 +187,7 @@ class CacheMemory : public SimObject
|
||||
|
||||
BankedArray dataArray;
|
||||
BankedArray tagArray;
|
||||
ALUFreeListArray atomicALUArray;
|
||||
|
||||
int m_cache_size;
|
||||
int m_cache_num_sets;
|
||||
@@ -224,6 +226,9 @@ class CacheMemory : public SimObject
|
||||
statistics::Scalar numTagArrayStalls;
|
||||
statistics::Scalar numDataArrayStalls;
|
||||
|
||||
statistics::Scalar numAtomicALUOperations;
|
||||
statistics::Scalar numAtomicALUArrayStalls;
|
||||
|
||||
// hardware transactional memory
|
||||
statistics::Histogram htmTransCommitReadSet;
|
||||
statistics::Histogram htmTransCommitWriteSet;
|
||||
|
||||
@@ -44,6 +44,11 @@ class RubyCache(SimObject):
|
||||
"0B", "block size in bytes. 0 means default RubyBlockSize"
|
||||
)
|
||||
|
||||
# Atomic parameters only applicable to GPU atomics
|
||||
# Zero atomic latency corresponds to instantaneous atomic ALU operations
|
||||
atomicLatency = Param.Cycles(0, "Cycles for an atomic ALU operation")
|
||||
atomicALUs = Param.Int(64, "Number of atomic ALUs")
|
||||
|
||||
dataArrayBanks = Param.Int(1, "Number of banks for the data array")
|
||||
tagArrayBanks = Param.Int(1, "Number of banks for the tag array")
|
||||
dataAccessLatency = Param.Cycles(1, "cycles for a data array access")
|
||||
|
||||
@@ -55,6 +55,7 @@ Source('PersistentTable.cc')
|
||||
Source('RubyPrefetcher.cc')
|
||||
Source('TimerTable.cc')
|
||||
Source('BankedArray.cc')
|
||||
Source('ALUFreeListArray.cc')
|
||||
Source('TBEStorage.cc')
|
||||
if env['PROTOCOL'] == 'CHI':
|
||||
Source('MN_TBETable.cc')
|
||||
|
||||
Reference in New Issue
Block a user