diff --git a/configs/ruby/GPU_VIPER.py b/configs/ruby/GPU_VIPER.py
index b034cf76fd..665f739b4b 100644
--- a/configs/ruby/GPU_VIPER.py
+++ b/configs/ruby/GPU_VIPER.py
@@ -275,6 +275,8 @@ class TCC(RubyCache):
 
     def create(self, options):
         self.assoc = options.tcc_assoc
+        self.atomicLatency = options.atomic_alu_latency
+        self.atomicALUs = options.tcc_num_atomic_alus
         if hasattr(options, "bw_scalor") and options.bw_scalor > 0:
             s = options.num_compute_units
             tcc_size = s * 128
@@ -497,6 +499,15 @@ def define_options(parser):
     parser.add_argument(
         "--glc-atomic-latency", type=int, default=1, help="GLC Atomic Latency"
     )
+    parser.add_argument(
+        "--atomic-alu-latency", type=int, default=0, help="Atomic ALU Latency"
+    )
+    parser.add_argument(
+        "--tcc-num-atomic-alus",
+        type=int,
+        default=64,
+        help="Number of atomic ALUs in the TCC",
+    )
     parser.add_argument(
         "--tcp-num-banks",
         type=int,
diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
index e3438fcd91..dfab0ed29a 100644
--- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
@@ -93,6 +93,7 @@ machine(MachineType:TCC, "TCC Cache")
     DataArrayWrite,   desc="Write the data array";
     TagArrayRead,     desc="Read the data array";
     TagArrayWrite,    desc="Write the data array";
+    AtomicALUOperation,  desc="Atomic ALU operation";
   }
 
 
@@ -223,6 +224,8 @@ machine(MachineType:TCC, "TCC Cache")
         L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
     } else if (request_type == RequestType:TagArrayWrite) {
         L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+    } else if (request_type == RequestType:AtomicALUOperation) {
+        L2cache.recordRequestType(CacheRequestType:AtomicALUOperation, addr);
     }
   }
 
@@ -235,6 +238,8 @@ machine(MachineType:TCC, "TCC Cache")
       return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
     } else if (request_type == RequestType:TagArrayWrite) {
       return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+    } else if (request_type == RequestType:AtomicALUOperation) {
+      return L2cache.checkResourceAvailable(CacheResourceType:AtomicALUArray, addr);
     } else {
       error("Invalid RequestType type in checkResourceAvailable");
       return true;
@@ -915,7 +920,7 @@ machine(MachineType:TCC, "TCC Cache")
     st_stallAndWaitRequest;
   }
 
-  transition(V, Atomic, M) {TagArrayRead, DataArrayWrite} {
+  transition(V, Atomic, M) {TagArrayRead, TagArrayWrite, DataArrayWrite, AtomicALUOperation} {
     p_profileHit;
     ut_updateTag;
     owm_orWriteMask;
@@ -942,7 +947,7 @@ machine(MachineType:TCC, "TCC Cache")
     st_stallAndWaitRequest;
   }
 
-  transition(M, Atomic) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+  transition(M, Atomic) {TagArrayRead, DataArrayWrite, AtomicALUOperation} {
     p_profileHit;
     owm_orWriteMask;
     pa_performAtomic;
@@ -979,7 +984,7 @@ machine(MachineType:TCC, "TCC Cache")
     st_stallAndWaitRequest;
   }
 
-  transition({M, W}, AtomicPassOn, WI) {TagArrayRead} {
+  transition({M, W}, AtomicPassOn, WI) {TagArrayRead, DataArrayRead} {
     t_allocateTBE;
     wb_writeBack;
     // after writing back the current line, we need to wait for it to be done
@@ -1098,7 +1103,7 @@ machine(MachineType:TCC, "TCC Cache")
     dt_deallocateTBE;
   }
 
-  transition(A, Bypass) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+  transition(A, Bypass) {TagArrayRead, TagArrayWrite} {
     bapdr_sendBypassedAtomicPerformedInDirectoryResponse;
     dnpa_decrementNumPendingDirectoryAtomics;
     pr_popResponseQueue;
@@ -1120,7 +1125,7 @@ machine(MachineType:TCC, "TCC Cache")
     dt_deallocateTBE;
   }
 
-  transition(A, Data, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+  transition(A, Data, M) {TagArrayRead, TagArrayWrite, DataArrayWrite, AtomicALUOperation} {
     a_allocateBlock;
     wardb_writeAtomicResponseDirtyBytes;
     pa_performAtomic;
diff --git a/src/mem/ruby/protocol/RubySlicc_Exports.sm b/src/mem/ruby/protocol/RubySlicc_Exports.sm
index 2e496a8221..9ccafba41f 100644
--- a/src/mem/ruby/protocol/RubySlicc_Exports.sm
+++ b/src/mem/ruby/protocol/RubySlicc_Exports.sm
@@ -230,11 +230,13 @@ enumeration(CacheRequestType, desc="...", default="CacheRequestType_NULL") {
   DataArrayWrite,   desc="Write access to the cache's data array";
   TagArrayRead,     desc="Read access to the cache's tag array";
   TagArrayWrite,    desc="Write access to the cache's tag array";
+  AtomicALUOperation,  desc="Atomic ALU operation";
 }
 
 enumeration(CacheResourceType, desc="...", default="CacheResourceType_NULL") {
   DataArray,    desc="Access to the cache's data array";
   TagArray,     desc="Access to the cache's tag array";
+  AtomicALUArray, desc="Access to the cache's atomic ALU array";
 }
 
 enumeration(DirectoryRequestType, desc="...", default="DirectoryRequestType_NULL") {
diff --git a/src/mem/ruby/structures/ALUFreeListArray.cc b/src/mem/ruby/structures/ALUFreeListArray.cc
new file mode 100644
index 0000000000..87b5cbfbd2
--- /dev/null
+++ b/src/mem/ruby/structures/ALUFreeListArray.cc
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2023 The University of Wisconsin
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "mem/ruby/structures/ALUFreeListArray.hh"
+
+#include "base/intmath.hh"
+#include "mem/ruby/system/RubySystem.hh"
+#include "sim/cur_tick.hh"
+
+namespace gem5
+{
+
+namespace ruby
+{
+
+/*
+*
+* Models num_ALUs pipelined atomic ALUs with a depth of access_latency ticks.
+* Rather than reserving ALUs, this class assumes multiple requests can go
+* through an ALU at the same time. As such, up to numALUs new requests can
+* go through at once, with the caveat that a line already being processed
+* in an ALU can't start processing again until the previous request has exited
+* the pipeline.
+*
+* ALUs aren't mapped directly to cache lines. Rather, ALUs are treated as
+* a free list.
+*
+* Behavior:
+*   Requests will go through unless one/both of the following are met:
+*       - There have been more than [numALUs] requests in the current cycle
+*       - The same line has been accessed in the past accessLatency ticks
+*/
+
+// Store the ALU count and the per-access latency (in ticks).
+ALUFreeListArray::ALUFreeListArray(unsigned int num_ALUs, Tick access_latency)
+    : numALUs(num_ALUs), accessLatency(access_latency)
+{
+}
+
+// Return true if an atomic on addr's line may start in the current tick.
+bool ALUFreeListArray::tryAccess(Addr addr)
+{
+    const Addr line_addr = makeLineAddress(addr);
+    const Tick now = curTick();
+    uint32_t accesses_this_tick = 0;
+    // Drop records from the tail of the queue that started more than
+    // accessLatency ticks ago. Clamp at 0 so the unsigned subtraction
+    // cannot wrap early in simulation and discard in-flight records.
+    const Tick oldestValidRecordStart =
+        now > accessLatency ? now - accessLatency : 0;
+    while (!accessQueue.empty() &&
+           accessQueue.back().startTick < oldestValidRecordStart) {
+        accessQueue.pop_back();
+    }
+
+    for (const AccessRecord& record : accessQueue) {
+        // Block once numALUs requests were accepted this tick (>=, not >,
+        // since the request being checked would need one more ALU)
+        if (record.startTick == now && ++accesses_this_tick >= numALUs) {
+            return false;
+        }
+        // Block access if the line is already being used
+        if (record.lineAddr == line_addr) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void ALUFreeListArray::reserve(Addr addr)
+{
+    // Expected to run only after a successful tryAccess, so the queue has
+    // already been pruned and this access is known to be allowed.
+
+    // Newest records sit at the front; tryAccess expires from the back.
+    accessQueue.emplace_front(makeLineAddress(addr), curTick());
+}
+
+} // namespace ruby
+} // namespace gem5
diff --git a/src/mem/ruby/structures/ALUFreeListArray.hh b/src/mem/ruby/structures/ALUFreeListArray.hh
new file mode 100644
index 0000000000..bed1b00b5c
--- /dev/null
+++ b/src/mem/ruby/structures/ALUFreeListArray.hh
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2023 The University of Wisconsin
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __MEM_RUBY_STRUCTURES_ALUFREELISTARRAY_HH__
+#define __MEM_RUBY_STRUCTURES_ALUFREELISTARRAY_HH__
+
+#include <deque>
+
+#include "mem/ruby/common/TypeDefines.hh"
+#include "sim/cur_tick.hh"
+
+namespace gem5
+{
+
+namespace ruby
+{
+
+// Bookkeeping for the pipelined atomic ALUs modelled inside a Ruby cache.
+class ALUFreeListArray
+{
+  private:
+    // One accepted access: the line it targets and the tick it started
+    struct AccessRecord
+    {
+        Addr lineAddr;
+        Tick startTick;
+        AccessRecord(Addr line_addr, Tick start_tick)
+            : lineAddr(line_addr), startTick(start_tick)
+        {}
+    };
+
+    // Number of modelled atomic ALUs
+    unsigned int numALUs;
+    // Latency of one access, in ticks
+    Tick accessLatency;
+    // Queue of accesses from the past accessLatency ticks (pruned in tryAccess)
+    std::deque<AccessRecord> accessQueue;
+
+  public:
+    ALUFreeListArray(unsigned int num_ALUs, Tick access_latency);
+
+    // Report whether an access to addr's line can start this tick
+    bool tryAccess(Addr addr);
+    // Record an access to addr's line starting this tick
+    void reserve(Addr addr);
+    Tick getLatency() const { return accessLatency; }
+};
+
+} // namespace ruby
+} // namespace gem5
+
+#endif
diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc
index 975bf0b775..3b97d34d18 100644
--- a/src/mem/ruby/structures/CacheMemory.cc
+++ b/src/mem/ruby/structures/CacheMemory.cc
@@ -73,6 +73,8 @@ CacheMemory::CacheMemory(const Params &p)
               p.start_index_bit, p.ruby_system),
     tagArray(p.tagArrayBanks, p.tagAccessLatency,
              p.start_index_bit, p.ruby_system),
+    atomicALUArray(p.atomicALUs, p.atomicLatency *
+             p.ruby_system->clockPeriod()),
     cacheMemoryStats(this)
 {
     m_cache_size = p.size;
@@ -529,6 +531,8 @@ CacheMemoryStats::CacheMemoryStats(statistics::Group *parent)
       ADD_STAT(numTagArrayWrites, "Number of tag array writes"),
       ADD_STAT(numTagArrayStalls, "Number of stalls caused by tag array"),
       ADD_STAT(numDataArrayStalls, "Number of stalls caused by data array"),
+      ADD_STAT(numAtomicALUOperations, "Number of atomic ALU operations"),
+      ADD_STAT(numAtomicALUArrayStalls, "Number of stalls caused by atomic ALU array"),
       ADD_STAT(htmTransCommitReadSet, "Read set size of a committed "
                                       "transaction"),
       ADD_STAT(htmTransCommitWriteSet, "Write set size of a committed "
@@ -564,6 +568,12 @@ CacheMemoryStats::CacheMemoryStats(statistics::Group *parent)
     numDataArrayStalls
         .flags(statistics::nozero);
 
+    numAtomicALUOperations
+        .flags(statistics::nozero);
+
+    numAtomicALUArrayStalls
+        .flags(statistics::nozero);
+
     htmTransCommitReadSet
         .init(8)
         .flags(statistics::pdf | statistics::dist | statistics::nozero |
@@ -633,6 +643,11 @@ CacheMemory::recordRequestType(CacheRequestType requestType, Addr addr)
             tagArray.reserve(addressToCacheSet(addr));
         cacheMemoryStats.numTagArrayWrites++;
         return;
+    case CacheRequestType_AtomicALUOperation:
+        if (m_resource_stalls)
+            atomicALUArray.reserve(addr);
+        cacheMemoryStats.numAtomicALUOperations++;
+        return;
     default:
         warn("CacheMemory access_type not found: %s",
              CacheRequestType_to_string(requestType));
@@ -664,6 +679,15 @@ CacheMemory::checkResourceAvailable(CacheResourceType res, Addr addr)
             cacheMemoryStats.numDataArrayStalls++;
             return false;
         }
+    } else if (res == CacheResourceType_AtomicALUArray) {
+        if (atomicALUArray.tryAccess(addr)) return true;
+        else {
+            DPRINTF(RubyResourceStalls,
+                    "Atomic ALU array stall on addr %#x in line address %#x\n",
+                    addr, makeLineAddress(addr));
+            cacheMemoryStats.numAtomicALUArrayStalls++;
+            return false;
+        }
     } else {
         panic("Unrecognized cache resource type.");
     }
diff --git a/src/mem/ruby/structures/CacheMemory.hh b/src/mem/ruby/structures/CacheMemory.hh
index a63bb02748..de7c327f63 100644
--- a/src/mem/ruby/structures/CacheMemory.hh
+++ b/src/mem/ruby/structures/CacheMemory.hh
@@ -56,6 +56,7 @@
 #include "mem/ruby/slicc_interface/AbstractCacheEntry.hh"
 #include "mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh"
 #include "mem/ruby/structures/BankedArray.hh"
+#include "mem/ruby/structures/ALUFreeListArray.hh"
 #include "mem/ruby/system/CacheRecorder.hh"
 #include "params/RubyCache.hh"
 #include "sim/sim_object.hh"
@@ -186,6 +187,7 @@ class CacheMemory : public SimObject
 
     BankedArray dataArray;
     BankedArray tagArray;
+    ALUFreeListArray atomicALUArray;
 
     int m_cache_size;
     int m_cache_num_sets;
@@ -224,6 +226,9 @@ class CacheMemory : public SimObject
           statistics::Scalar numTagArrayStalls;
           statistics::Scalar numDataArrayStalls;
 
+          statistics::Scalar numAtomicALUOperations;
+          statistics::Scalar numAtomicALUArrayStalls;
+
           // hardware transactional memory
           statistics::Histogram htmTransCommitReadSet;
           statistics::Histogram htmTransCommitWriteSet;
diff --git a/src/mem/ruby/structures/RubyCache.py b/src/mem/ruby/structures/RubyCache.py
index f2c1b7230c..7446ac3de0 100644
--- a/src/mem/ruby/structures/RubyCache.py
+++ b/src/mem/ruby/structures/RubyCache.py
@@ -44,6 +44,11 @@ class RubyCache(SimObject):
         "0B", "block size in bytes. 0 means default RubyBlockSize"
     )
 
+    # Atomic parameters only applicable to GPU atomics
+    # Zero atomic latency corresponds to instantaneous atomic ALU operations
+    atomicLatency = Param.Cycles(0, "Cycles for an atomic ALU operation")
+    atomicALUs = Param.Int(64, "Number of atomic ALUs")
+
     dataArrayBanks = Param.Int(1, "Number of banks for the data array")
     tagArrayBanks = Param.Int(1, "Number of banks for the tag array")
     dataAccessLatency = Param.Cycles(1, "cycles for a data array access")
diff --git a/src/mem/ruby/structures/SConscript b/src/mem/ruby/structures/SConscript
index cae03909c7..7baab6a4c4 100644
--- a/src/mem/ruby/structures/SConscript
+++ b/src/mem/ruby/structures/SConscript
@@ -55,6 +55,7 @@ Source('PersistentTable.cc')
 Source('RubyPrefetcher.cc')
 Source('TimerTable.cc')
 Source('BankedArray.cc')
+Source('ALUFreeListArray.cc')
 Source('TBEStorage.cc')
 if env['PROTOCOL'] == 'CHI':
     Source('MN_TBETable.cc')