mem-ruby,configs: Add GPU GLC Atomic Resource Constraints (#120)

Added a resource constraint, AtomicALUOperation, to GLC atomics
performed in the TCC.

The resource constraint uses a new class, ALUFreeListArray. The class
assumes the following:
  - There are a fixed number of atomic ALU pipelines
- While a new cache line can be processed in each pipeline each cycle,
if a cache line is currently going through a pipeline, it can't be
processed again until it's finished

Two configuration parameters have been added to tune this behavior:
- tcc-num-atomic-alus corresponds to the number of atomic ALU pipelines
- atomic-alu-latency corresponds to the latency of atomic ALU pipelines

Change-Id: I25bdde7dafc3877590bb6536efdf57b8c540a939
This commit is contained in:
Daniel Kouchekinia
2023-11-14 09:48:48 -06:00
committed by GitHub
parent f11227b4a0
commit be5c03ea9f
9 changed files with 242 additions and 5 deletions

View File

@@ -275,6 +275,8 @@ class TCC(RubyCache):
def create(self, options):
self.assoc = options.tcc_assoc
self.atomicLatency = options.atomic_alu_latency
self.atomicALUs = options.tcc_num_atomic_alus
if hasattr(options, "bw_scalor") and options.bw_scalor > 0:
s = options.num_compute_units
tcc_size = s * 128
@@ -497,6 +499,15 @@ def define_options(parser):
parser.add_argument(
"--glc-atomic-latency", type=int, default=1, help="GLC Atomic Latency"
)
parser.add_argument(
"--atomic-alu-latency", type=int, default=0, help="Atomic ALU Latency"
)
parser.add_argument(
"--tcc-num-atomic-alus",
type=int,
default=64,
help="Number of atomic ALUs in the TCC",
)
parser.add_argument(
"--tcp-num-banks",
type=int,

View File

@@ -93,6 +93,7 @@ machine(MachineType:TCC, "TCC Cache")
DataArrayWrite, desc="Write the data array";
TagArrayRead, desc="Read the data array";
TagArrayWrite, desc="Write the data array";
AtomicALUOperation, desc="Atomic ALU operation";
}
@@ -223,6 +224,8 @@ machine(MachineType:TCC, "TCC Cache")
L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
} else if (request_type == RequestType:TagArrayWrite) {
L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
} else if (request_type == RequestType:AtomicALUOperation) {
L2cache.recordRequestType(CacheRequestType:AtomicALUOperation, addr);
}
}
@@ -235,6 +238,8 @@ machine(MachineType:TCC, "TCC Cache")
return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
} else if (request_type == RequestType:TagArrayWrite) {
return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
} else if (request_type == RequestType:AtomicALUOperation) {
return L2cache.checkResourceAvailable(CacheResourceType:AtomicALUArray, addr);
} else {
error("Invalid RequestType type in checkResourceAvailable");
return true;
@@ -915,7 +920,7 @@ machine(MachineType:TCC, "TCC Cache")
st_stallAndWaitRequest;
}
transition(V, Atomic, M) {TagArrayRead, DataArrayWrite} {
transition(V, Atomic, M) {TagArrayRead, TagArrayWrite, DataArrayWrite, AtomicALUOperation} {
p_profileHit;
ut_updateTag;
owm_orWriteMask;
@@ -942,7 +947,7 @@ machine(MachineType:TCC, "TCC Cache")
st_stallAndWaitRequest;
}
transition(M, Atomic) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
transition(M, Atomic) {TagArrayRead, DataArrayWrite, AtomicALUOperation} {
p_profileHit;
owm_orWriteMask;
pa_performAtomic;
@@ -979,7 +984,7 @@ machine(MachineType:TCC, "TCC Cache")
st_stallAndWaitRequest;
}
transition({M, W}, AtomicPassOn, WI) {TagArrayRead} {
transition({M, W}, AtomicPassOn, WI) {TagArrayRead, DataArrayRead} {
t_allocateTBE;
wb_writeBack;
// after writing back the current line, we need to wait for it to be done
@@ -1098,7 +1103,7 @@ machine(MachineType:TCC, "TCC Cache")
dt_deallocateTBE;
}
transition(A, Bypass) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
transition(A, Bypass) {TagArrayRead, TagArrayWrite} {
bapdr_sendBypassedAtomicPerformedInDirectoryResponse;
dnpa_decrementNumPendingDirectoryAtomics;
pr_popResponseQueue;
@@ -1120,7 +1125,7 @@ machine(MachineType:TCC, "TCC Cache")
dt_deallocateTBE;
}
transition(A, Data, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
transition(A, Data, M) {TagArrayRead, TagArrayWrite, DataArrayWrite, AtomicALUOperation} {
a_allocateBlock;
wardb_writeAtomicResponseDirtyBytes;
pa_performAtomic;

View File

@@ -230,11 +230,13 @@ enumeration(CacheRequestType, desc="...", default="CacheRequestType_NULL") {
DataArrayWrite, desc="Write access to the cache's data array";
TagArrayRead, desc="Read access to the cache's tag array";
TagArrayWrite, desc="Write access to the cache's tag array";
AtomicALUOperation, desc="Atomic ALU operation";
}
enumeration(CacheResourceType, desc="...", default="CacheResourceType_NULL") {
DataArray, desc="Access to the cache's data array";
TagArray, desc="Access to the cache's tag array";
AtomicALUArray, desc="Access to the cache's atomic ALU array";
}
enumeration(DirectoryRequestType, desc="...", default="DirectoryRequestType_NULL") {

View File

@@ -0,0 +1,106 @@
/*
* Copyright (c) 2023 The University of Wisconsin
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "mem/ruby/structures/ALUFreeListArray.hh"
#include "base/intmath.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/cur_tick.hh"
namespace gem5
{
namespace ruby
{
/*
*
* Models num_ALUs pipelined atomic ALUs with a depth of access_latency ticks.
* Rather than reserving ALUs, this class assumes multiple requests can go
* through an ALU at the same time. As such, up to numALUs new requests can
* go through at once, with the caveat that a line already being processed
* in an ALU can't start processing again until the previous request has exited
* the pipeline.
*
* ALUs aren't mapped directly to cache lines. Rather, ALUs are treated as
* a free list.
*
* Behavior:
* Requests will go through unless one/both of the following are met:
* - There have been more than [numALUs] requests in the current cycle
* - The same line has been accessed in the past accessLatency ticks
*/
// Construct a free list of num_ALUs atomic ALU pipelines, each with a
// depth of access_latency ticks.
ALUFreeListArray::ALUFreeListArray(unsigned int num_ALUs, Tick access_latency)
    : numALUs(num_ALUs), accessLatency(access_latency)
{
}
// Check whether the line containing addr may enter an atomic ALU pipeline
// this tick. Returns false when all numALUs pipelines have already been
// claimed this tick, or when the same line is still in flight in a
// pipeline (reserved within the past accessLatency ticks).
bool ALUFreeListArray::tryAccess(Addr addr)
{
    uint32_t accesses_this_tick = 0;
    // Hoisted: loop-invariant line address of the requested access
    Addr line_addr = makeLineAddress(addr);

    // Remove records from the tail of the queue that occurred more than
    // accessLatency ticks ago — those lines have exited the pipeline.
    // Clamp at 0 to avoid unsigned underflow of Tick when
    // curTick() < accessLatency (early in simulation), which would
    // otherwise expire every record and disable the constraint.
    Tick oldest_valid_record_start =
        (curTick() > accessLatency) ? (curTick() - accessLatency) : 0;
    while (!accessQueue.empty() &&
           (accessQueue.back().startTick < oldest_valid_record_start)) {
        accessQueue.pop_back();
    }

    for (const AccessRecord& record : accessQueue) {
        // Block access if all ALUs are claimed this tick. The records
        // counted here are prior reservations, so the new access would be
        // reservation numALUs + 1; ">=" (not ">") enforces the documented
        // limit of numALUs new requests per tick.
        if (record.startTick == curTick() &&
            (++accesses_this_tick >= numALUs)) {
            return false;
        }

        // Block access if this line is already going through a pipeline
        if (record.lineAddr == line_addr) {
            return false;
        }
    }

    return true;
}
// Record that the line containing addr has entered an ALU pipeline at the
// current tick. Must only be called after tryAccess(addr) returned true,
// so the queue is already pruned and the access is known to be valid.
void ALUFreeListArray::reserve(Addr addr)
{
    // Newest records live at the front of the queue
    accessQueue.emplace_front(makeLineAddress(addr), curTick());
}
} // namespace ruby
} // namespace gem5

View File

@@ -0,0 +1,78 @@
/*
* Copyright (c) 2023 The University of Wisconsin
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __MEM_RUBY_STRUCTURES_ALUFREELISTARRAY_HH__
#define __MEM_RUBY_STRUCTURES_ALUFREELISTARRAY_HH__
#include <deque>
#include "mem/ruby/common/TypeDefines.hh"
#include "sim/cur_tick.hh"
namespace gem5
{
namespace ruby
{
// Models a fixed pool of pipelined atomic ALUs treated as a free list:
// up to numALUs new cache lines may enter per tick, and a line already in
// a pipeline cannot re-enter until accessLatency ticks have elapsed.
class ALUFreeListArray
{
  private:
    // Number of atomic ALU pipelines available per tick
    unsigned int numALUs;

    // Pipeline depth in ticks; a line occupies an ALU for this long
    Tick accessLatency;

    // One reservation: which line entered an ALU, and when
    class AccessRecord
    {
      public:
        AccessRecord(Addr line_addr, Tick start_tick)
            : lineAddr(line_addr), startTick(start_tick)
        {}

        Addr lineAddr;
        Tick startTick;
    };

    // Accesses from the past accessLatency ticks, newest at the front
    std::deque<AccessRecord> accessQueue;

  public:
    ALUFreeListArray(unsigned int num_ALUs, Tick access_latency);

    // True if addr's line may enter an ALU pipeline this tick
    bool tryAccess(Addr addr);

    // Record an access; call only after tryAccess(addr) returned true
    void reserve(Addr addr);

    Tick getLatency() const { return accessLatency; }
};
} // namespace ruby
} // namespace gem5
#endif

View File

@@ -73,6 +73,8 @@ CacheMemory::CacheMemory(const Params &p)
p.start_index_bit, p.ruby_system),
tagArray(p.tagArrayBanks, p.tagAccessLatency,
p.start_index_bit, p.ruby_system),
atomicALUArray(p.atomicALUs, p.atomicLatency *
p.ruby_system->clockPeriod()),
cacheMemoryStats(this)
{
m_cache_size = p.size;
@@ -529,6 +531,8 @@ CacheMemoryStats::CacheMemoryStats(statistics::Group *parent)
ADD_STAT(numTagArrayWrites, "Number of tag array writes"),
ADD_STAT(numTagArrayStalls, "Number of stalls caused by tag array"),
ADD_STAT(numDataArrayStalls, "Number of stalls caused by data array"),
ADD_STAT(numAtomicALUOperations, "Number of atomic ALU operations"),
ADD_STAT(numAtomicALUArrayStalls, "Number of stalls caused by atomic ALU array"),
ADD_STAT(htmTransCommitReadSet, "Read set size of a committed "
"transaction"),
ADD_STAT(htmTransCommitWriteSet, "Write set size of a committed "
@@ -564,6 +568,12 @@ CacheMemoryStats::CacheMemoryStats(statistics::Group *parent)
numDataArrayStalls
.flags(statistics::nozero);
numAtomicALUOperations
.flags(statistics::nozero);
numAtomicALUArrayStalls
.flags(statistics::nozero);
htmTransCommitReadSet
.init(8)
.flags(statistics::pdf | statistics::dist | statistics::nozero |
@@ -633,6 +643,11 @@ CacheMemory::recordRequestType(CacheRequestType requestType, Addr addr)
tagArray.reserve(addressToCacheSet(addr));
cacheMemoryStats.numTagArrayWrites++;
return;
case CacheRequestType_AtomicALUOperation:
if (m_resource_stalls)
atomicALUArray.reserve(addr);
cacheMemoryStats.numAtomicALUOperations++;
return;
default:
warn("CacheMemory access_type not found: %s",
CacheRequestType_to_string(requestType));
@@ -664,6 +679,15 @@ CacheMemory::checkResourceAvailable(CacheResourceType res, Addr addr)
cacheMemoryStats.numDataArrayStalls++;
return false;
}
} else if (res == CacheResourceType_AtomicALUArray) {
if (atomicALUArray.tryAccess(addr)) return true;
else {
DPRINTF(RubyResourceStalls,
"Atomic ALU array stall on addr %#x in line address %#x\n",
addr, makeLineAddress(addr));
cacheMemoryStats.numAtomicALUArrayStalls++;
return false;
}
} else {
panic("Unrecognized cache resource type.");
}

View File

@@ -56,6 +56,7 @@
#include "mem/ruby/slicc_interface/AbstractCacheEntry.hh"
#include "mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh"
#include "mem/ruby/structures/BankedArray.hh"
#include "mem/ruby/structures/ALUFreeListArray.hh"
#include "mem/ruby/system/CacheRecorder.hh"
#include "params/RubyCache.hh"
#include "sim/sim_object.hh"
@@ -186,6 +187,7 @@ class CacheMemory : public SimObject
BankedArray dataArray;
BankedArray tagArray;
ALUFreeListArray atomicALUArray;
int m_cache_size;
int m_cache_num_sets;
@@ -224,6 +226,9 @@ class CacheMemory : public SimObject
statistics::Scalar numTagArrayStalls;
statistics::Scalar numDataArrayStalls;
statistics::Scalar numAtomicALUOperations;
statistics::Scalar numAtomicALUArrayStalls;
// hardware transactional memory
statistics::Histogram htmTransCommitReadSet;
statistics::Histogram htmTransCommitWriteSet;

View File

@@ -44,6 +44,11 @@ class RubyCache(SimObject):
"0B", "block size in bytes. 0 means default RubyBlockSize"
)
# Atomic parameters only applicable to GPU atomics
# Zero atomic latency corresponds to instantaneous atomic ALU operations
atomicLatency = Param.Cycles(0, "Cycles for an atomic ALU operation")
atomicALUs = Param.Int(64, "Number of atomic ALUs")
dataArrayBanks = Param.Int(1, "Number of banks for the data array")
tagArrayBanks = Param.Int(1, "Number of banks for the tag array")
dataAccessLatency = Param.Cycles(1, "cycles for a data array access")

View File

@@ -55,6 +55,7 @@ Source('PersistentTable.cc')
Source('RubyPrefetcher.cc')
Source('TimerTable.cc')
Source('BankedArray.cc')
Source('ALUFreeListArray.cc')
Source('TBEStorage.cc')
if env['PROTOCOL'] == 'CHI':
Source('MN_TBETable.cc')