diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py index b034cf76fd..665f739b4b 100644 --- a/configs/ruby/GPU_VIPER.py +++ b/configs/ruby/GPU_VIPER.py @@ -275,6 +275,8 @@ class TCC(RubyCache): def create(self, options): self.assoc = options.tcc_assoc + self.atomicLatency = options.atomic_alu_latency + self.atomicALUs = options.tcc_num_atomic_alus if hasattr(options, "bw_scalor") and options.bw_scalor > 0: s = options.num_compute_units tcc_size = s * 128 @@ -497,6 +499,15 @@ def define_options(parser): parser.add_argument( "--glc-atomic-latency", type=int, default=1, help="GLC Atomic Latency" ) + parser.add_argument( + "--atomic-alu-latency", type=int, default=0, help="Atomic ALU Latency" + ) + parser.add_argument( + "--tcc-num-atomic-alus", + type=int, + default=64, + help="Number of atomic ALUs in the TCC", + ) parser.add_argument( "--tcp-num-banks", type=int, diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index e3438fcd91..dfab0ed29a 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -93,6 +93,7 @@ machine(MachineType:TCC, "TCC Cache") DataArrayWrite, desc="Write the data array"; TagArrayRead, desc="Read the data array"; TagArrayWrite, desc="Write the data array"; + AtomicALUOperation, desc="Atomic ALU operation"; } @@ -223,6 +224,8 @@ machine(MachineType:TCC, "TCC Cache") L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr); } else if (request_type == RequestType:TagArrayWrite) { L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr); + } else if (request_type == RequestType:AtomicALUOperation) { + L2cache.recordRequestType(CacheRequestType:AtomicALUOperation, addr); } } @@ -235,6 +238,8 @@ machine(MachineType:TCC, "TCC Cache") return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); } else if (request_type == RequestType:TagArrayWrite) { return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr); + } else 
if (request_type == RequestType:AtomicALUOperation) { + return L2cache.checkResourceAvailable(CacheResourceType:AtomicALUArray, addr); } else { error("Invalid RequestType type in checkResourceAvailable"); return true; @@ -915,7 +920,7 @@ machine(MachineType:TCC, "TCC Cache") st_stallAndWaitRequest; } - transition(V, Atomic, M) {TagArrayRead, DataArrayWrite} { + transition(V, Atomic, M) {TagArrayRead, TagArrayWrite, DataArrayWrite, AtomicALUOperation} { p_profileHit; ut_updateTag; owm_orWriteMask; @@ -942,7 +947,7 @@ machine(MachineType:TCC, "TCC Cache") st_stallAndWaitRequest; } - transition(M, Atomic) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + transition(M, Atomic) {TagArrayRead, DataArrayWrite, AtomicALUOperation} { p_profileHit; owm_orWriteMask; pa_performAtomic; @@ -979,7 +984,7 @@ machine(MachineType:TCC, "TCC Cache") st_stallAndWaitRequest; } - transition({M, W}, AtomicPassOn, WI) {TagArrayRead} { + transition({M, W}, AtomicPassOn, WI) {TagArrayRead, DataArrayRead} { t_allocateTBE; wb_writeBack; // after writing back the current line, we need to wait for it to be done @@ -1098,7 +1103,7 @@ machine(MachineType:TCC, "TCC Cache") dt_deallocateTBE; } - transition(A, Bypass) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + transition(A, Bypass) {TagArrayRead, TagArrayWrite} { bapdr_sendBypassedAtomicPerformedInDirectoryResponse; dnpa_decrementNumPendingDirectoryAtomics; pr_popResponseQueue; @@ -1120,7 +1125,7 @@ machine(MachineType:TCC, "TCC Cache") dt_deallocateTBE; } - transition(A, Data, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} { + transition(A, Data, M) {TagArrayRead, TagArrayWrite, DataArrayWrite, AtomicALUOperation} { a_allocateBlock; wardb_writeAtomicResponseDirtyBytes; pa_performAtomic; diff --git a/src/mem/ruby/protocol/RubySlicc_Exports.sm b/src/mem/ruby/protocol/RubySlicc_Exports.sm index 2e496a8221..9ccafba41f 100644 --- a/src/mem/ruby/protocol/RubySlicc_Exports.sm +++ b/src/mem/ruby/protocol/RubySlicc_Exports.sm @@ -230,11 +230,13 @@ 
enumeration(CacheRequestType, desc="...", default="CacheRequestType_NULL") { DataArrayWrite, desc="Write access to the cache's data array"; TagArrayRead, desc="Read access to the cache's tag array"; TagArrayWrite, desc="Write access to the cache's tag array"; + AtomicALUOperation, desc="Atomic ALU operation"; } enumeration(CacheResourceType, desc="...", default="CacheResourceType_NULL") { DataArray, desc="Access to the cache's data array"; TagArray, desc="Access to the cache's tag array"; + AtomicALUArray, desc="Access to the cache's atomic ALU array"; } enumeration(DirectoryRequestType, desc="...", default="DirectoryRequestType_NULL") { diff --git a/src/mem/ruby/structures/ALUFreeListArray.cc b/src/mem/ruby/structures/ALUFreeListArray.cc new file mode 100644 index 0000000000..87b5cbfbd2 --- /dev/null +++ b/src/mem/ruby/structures/ALUFreeListArray.cc @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023 The University of Wisconsin + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "mem/ruby/structures/ALUFreeListArray.hh" + +#include "base/intmath.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "sim/cur_tick.hh" + +namespace gem5 +{ + +namespace ruby +{ + +/* +* +* Models num_ALUs pipelined atomic ALUs with a depth of access_latency ticks. +* Rather than reserving ALUs, this class assumes multiple requests can go +* through an ALU at the same time. As such, up to numALU new requests can +* go through at once, with the caveat that a line already being processed +* in an ALU can't start processing again until the previous request has exited +* the pipeline. +* +* ALUs aren't mapped directly to cache lines. Rather, ALUs are treated as +* a free list. 
+*
+* Behavior:
+* Requests will go through unless one/both of the following are met:
+* - numALUs requests have already started in the current tick
+* - The same line has been accessed in the past accessLatency ticks
+*/
+
+ALUFreeListArray::ALUFreeListArray(unsigned int num_ALUs, Tick access_latency)
+    : numALUs(num_ALUs), accessLatency(access_latency)
+{
+}
+
+// Returns true if an atomic on addr's line may start in the current tick.
+// Retires expired records as a side effect; reserve() records the access.
+bool ALUFreeListArray::tryAccess(Addr addr)
+{
+    const Tick now = curTick();
+    const Addr line_addr = makeLineAddress(addr);
+    // Tick is an unsigned type in gem5 (base/types.hh): clamp the cutoff at
+    // zero so it cannot wrap while fewer than accessLatency ticks have passed
+    const Tick oldestValidRecordStart =
+        (now > accessLatency) ? now - accessLatency : 0;
+
+    // Retire records that started before the cutoff; reserve() pushes at the
+    // front, so the oldest records sit at the back of the queue
+    while (!accessQueue.empty() &&
+           accessQueue.back().startTick < oldestValidRecordStart) {
+        accessQueue.pop_back();
+    }
+
+    uint32_t accesses_this_tick = 0;
+    for (const AccessRecord& record : accessQueue) {
+        // Block access if the line is already being used
+        if (record.lineAddr == line_addr) {
+            return false;
+        }
+        accesses_this_tick += (record.startTick == now) ? 1 : 0;
+    }
+
+    // Block access once numALUs requests have already started this tick
+    return accesses_this_tick < numALUs;
+}
+
+void ALUFreeListArray::reserve(Addr addr)
+{
+    // Only called after tryAccess, so we know queue is up to date and that
+    // the access is valid
+
+    // Add record to queue
+    accessQueue.push_front(AccessRecord(makeLineAddress(addr), curTick()));
+}
+
+} // namespace ruby
+} // namespace gem5
diff --git a/src/mem/ruby/structures/ALUFreeListArray.hh b/src/mem/ruby/structures/ALUFreeListArray.hh
new file mode 100644
index 0000000000..bed1b00b5c
--- /dev/null
+++ b/src/mem/ruby/structures/ALUFreeListArray.hh
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2023 The University of Wisconsin
+ *
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef __MEM_RUBY_STRUCTURES_ALUFREELISTARRAY_HH__
+#define __MEM_RUBY_STRUCTURES_ALUFREELISTARRAY_HH__
+
+#include <deque>
+
+#include "mem/ruby/common/TypeDefines.hh"
+#include "sim/cur_tick.hh"
+
+namespace gem5
+{
+
+namespace ruby
+{
+
+// Tracks recent atomic ALU accesses so a cache can ask whether another
+// atomic may start now (tryAccess) and then record that it did (reserve).
+class ALUFreeListArray
+{
+  private:
+    unsigned int numALUs;   // number of modelled atomic ALUs
+    Tick accessLatency;     // ALU pipeline depth, in ticks
+
+    class AccessRecord
+    {
+      public:
+        AccessRecord(Addr line_addr, Tick start_tick)
+            : lineAddr(line_addr), startTick(start_tick) {}
+
+        Addr lineAddr;
+        Tick startTick;
+    };
+
+    // Accesses started within the last accessLatency ticks, newest first
+    std::deque<AccessRecord> accessQueue;
+
+  public:
+    ALUFreeListArray(unsigned int num_ALUs, Tick access_latency);
+    // True if an access to addr's line may start in the current tick
+    bool tryAccess(Addr addr);
+    // Record an access to addr's line starting at the current tick
+    void reserve(Addr addr);
+    // Access latency this array was constructed with, in ticks
+    Tick getLatency() const { return accessLatency; }
+};
+
+} // namespace ruby
+} // namespace gem5
+
+#endif
diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc
index 975bf0b775..3b97d34d18 100644
--- a/src/mem/ruby/structures/CacheMemory.cc
+++ b/src/mem/ruby/structures/CacheMemory.cc
@@ -73,6 +73,8 @@ CacheMemory::CacheMemory(const Params &p)
               p.start_index_bit, p.ruby_system),
     tagArray(p.tagArrayBanks, p.tagAccessLatency,
              p.start_index_bit, p.ruby_system),
+    atomicALUArray(p.atomicALUs, p.atomicLatency *
+             p.ruby_system->clockPeriod()),
     cacheMemoryStats(this)
 {
     m_cache_size = p.size;
@@ -529,6 +531,8 @@ CacheMemoryStats::CacheMemoryStats(statistics::Group *parent)
       ADD_STAT(numTagArrayWrites, "Number of tag array writes"),
       ADD_STAT(numTagArrayStalls, "Number of stalls caused by tag array"),
       ADD_STAT(numDataArrayStalls, "Number of stalls caused by data array"),
+      ADD_STAT(numAtomicALUOperations, "Number of atomic ALU operations"),
+      ADD_STAT(numAtomicALUArrayStalls, "Number of stalls caused by atomic ALU array"),
       ADD_STAT(htmTransCommitReadSet, "Read set size of a committed "
                "transaction"),
       ADD_STAT(htmTransCommitWriteSet, "Write set size of a committed "
@@ -564,6 +568,12 @@
CacheMemoryStats::CacheMemoryStats(statistics::Group *parent) numDataArrayStalls .flags(statistics::nozero); + numAtomicALUOperations + .flags(statistics::nozero); + + numAtomicALUArrayStalls + .flags(statistics::nozero); + htmTransCommitReadSet .init(8) .flags(statistics::pdf | statistics::dist | statistics::nozero | @@ -633,6 +643,11 @@ CacheMemory::recordRequestType(CacheRequestType requestType, Addr addr) tagArray.reserve(addressToCacheSet(addr)); cacheMemoryStats.numTagArrayWrites++; return; + case CacheRequestType_AtomicALUOperation: + if (m_resource_stalls) + atomicALUArray.reserve(addr); + cacheMemoryStats.numAtomicALUOperations++; + return; default: warn("CacheMemory access_type not found: %s", CacheRequestType_to_string(requestType)); @@ -664,6 +679,15 @@ CacheMemory::checkResourceAvailable(CacheResourceType res, Addr addr) cacheMemoryStats.numDataArrayStalls++; return false; } + } else if (res == CacheResourceType_AtomicALUArray) { + if (atomicALUArray.tryAccess(addr)) return true; + else { + DPRINTF(RubyResourceStalls, + "Atomic ALU array stall on addr %#x in line address %#x\n", + addr, makeLineAddress(addr)); + cacheMemoryStats.numAtomicALUArrayStalls++; + return false; + } } else { panic("Unrecognized cache resource type."); } diff --git a/src/mem/ruby/structures/CacheMemory.hh b/src/mem/ruby/structures/CacheMemory.hh index a63bb02748..de7c327f63 100644 --- a/src/mem/ruby/structures/CacheMemory.hh +++ b/src/mem/ruby/structures/CacheMemory.hh @@ -56,6 +56,7 @@ #include "mem/ruby/slicc_interface/AbstractCacheEntry.hh" #include "mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh" #include "mem/ruby/structures/BankedArray.hh" +#include "mem/ruby/structures/ALUFreeListArray.hh" #include "mem/ruby/system/CacheRecorder.hh" #include "params/RubyCache.hh" #include "sim/sim_object.hh" @@ -186,6 +187,7 @@ class CacheMemory : public SimObject BankedArray dataArray; BankedArray tagArray; + ALUFreeListArray atomicALUArray; int m_cache_size; int 
m_cache_num_sets; @@ -224,6 +226,9 @@ class CacheMemory : public SimObject statistics::Scalar numTagArrayStalls; statistics::Scalar numDataArrayStalls; + statistics::Scalar numAtomicALUOperations; + statistics::Scalar numAtomicALUArrayStalls; + // hardware transactional memory statistics::Histogram htmTransCommitReadSet; statistics::Histogram htmTransCommitWriteSet; diff --git a/src/mem/ruby/structures/RubyCache.py b/src/mem/ruby/structures/RubyCache.py index f2c1b7230c..7446ac3de0 100644 --- a/src/mem/ruby/structures/RubyCache.py +++ b/src/mem/ruby/structures/RubyCache.py @@ -44,6 +44,11 @@ class RubyCache(SimObject): "0B", "block size in bytes. 0 means default RubyBlockSize" ) + # Atomic parameters only applicable to GPU atomics + # Zero atomic latency corresponds to instantaneous atomic ALU operations + atomicLatency = Param.Cycles(0, "Cycles for an atomic ALU operation") + atomicALUs = Param.Int(64, "Number of atomic ALUs") + dataArrayBanks = Param.Int(1, "Number of banks for the data array") tagArrayBanks = Param.Int(1, "Number of banks for the tag array") dataAccessLatency = Param.Cycles(1, "cycles for a data array access") diff --git a/src/mem/ruby/structures/SConscript b/src/mem/ruby/structures/SConscript index cae03909c7..7baab6a4c4 100644 --- a/src/mem/ruby/structures/SConscript +++ b/src/mem/ruby/structures/SConscript @@ -55,6 +55,7 @@ Source('PersistentTable.cc') Source('RubyPrefetcher.cc') Source('TimerTable.cc') Source('BankedArray.cc') +Source('ALUFreeListArray.cc') Source('TBEStorage.cc') if env['PROTOCOL'] == 'CHI': Source('MN_TBETable.cc')