From f6a453362fbc330bda69d187ac3ffed991bee1ef Mon Sep 17 00:00:00 2001 From: "Ranganath (Bujji) Selagamsetty" Date: Fri, 18 Aug 2023 12:08:01 -0500 Subject: [PATCH] mem: Atomic ops to same address Augmenting the DataBlock class with a change log structure to record the effects of atomic operations on a data block and service these changes if the atomic operations require return values. Although the operations are atomic, the coalescer need not send unique memory requests for each operation. Atomic operations within a wavefront to the same address are now coalesced into a single memory request. The response of this request carries all the necessary information to provide the requesting lanes unique values as a result of their individual atomic operations. This helps reduce contention for request and response queues in simulation. Previously, only the final value of the datablock after all atomic ops to the same address was visible to the requesting waves. This change corrects this behavior by allowing each wave to see the effect of its individual atomic op if a return value is necessary. 
Change-Id: I639bea943afd317e45f8fa3bff7689f6b8df9395 --- src/mem/ruby/common/DataBlock.cc | 66 ++++++++++++++++++++-- src/mem/ruby/common/DataBlock.hh | 13 +++++ src/mem/ruby/common/WriteMask.cc | 22 ++++++++ src/mem/ruby/common/WriteMask.hh | 29 +++------- src/mem/ruby/protocol/GPU_VIPER-TCC.sm | 1 + src/mem/ruby/protocol/RubySlicc_Exports.sm | 2 + src/mem/ruby/system/GPUCoalescer.cc | 52 ++++++++++++----- 7 files changed, 146 insertions(+), 39 deletions(-) diff --git a/src/mem/ruby/common/DataBlock.cc b/src/mem/ruby/common/DataBlock.cc index f70aa79fd4..70d9bc332a 100644 --- a/src/mem/ruby/common/DataBlock.cc +++ b/src/mem/ruby/common/DataBlock.cc @@ -51,9 +51,19 @@ namespace ruby DataBlock::DataBlock(const DataBlock &cp) { - m_data = new uint8_t[RubySystem::getBlockSizeBytes()]; - memcpy(m_data, cp.m_data, RubySystem::getBlockSizeBytes()); + uint8_t *block_update; + size_t block_bytes = RubySystem::getBlockSizeBytes(); + m_data = new uint8_t[block_bytes]; + memcpy(m_data, cp.m_data, block_bytes); m_alloc = true; + // If this data block is involved in an atomic operation, the effect + // of applying the atomic operations on the data block are recorded in + // m_atomicLog. 
If so, we must copy over every entry in the change log + for (size_t i = 0; i < cp.m_atomicLog.size(); i++) { + block_update = new uint8_t[block_bytes]; + memcpy(block_update, cp.m_atomicLog[i], block_bytes); + m_atomicLog.push_back(block_update); + } } void @@ -73,7 +83,20 @@ DataBlock::clear() bool DataBlock::equal(const DataBlock& obj) const { - return !memcmp(m_data, obj.m_data, RubySystem::getBlockSizeBytes()); + size_t block_bytes = RubySystem::getBlockSizeBytes(); + // Check that the block contents match + if (memcmp(m_data, obj.m_data, block_bytes)) { + return false; + } + if (m_atomicLog.size() != obj.m_atomicLog.size()) { + return false; + } + for (size_t i = 0; i < m_atomicLog.size(); i++) { + if (memcmp(m_atomicLog[i], obj.m_atomicLog[i], block_bytes)) { + return false; + } + } + return true; } void @@ -92,7 +115,7 @@ DataBlock::atomicPartial(const DataBlock &dblk, const WriteMask &mask) for (int i = 0; i < RubySystem::getBlockSizeBytes(); i++) { m_data[i] = dblk.m_data[i]; } - mask.performAtomic(m_data); + mask.performAtomic(m_data, m_atomicLog); } void @@ -107,6 +130,28 @@ DataBlock::print(std::ostream& out) const out << std::dec << "]" << std::flush; } +int +DataBlock::numAtomicLogEntries() const +{ + return m_atomicLog.size(); +} +uint8_t* +DataBlock::popAtomicLogEntryFront() +{ + assert(m_atomicLog.size() > 0); + auto ret = m_atomicLog.front(); + m_atomicLog.pop_front(); + return ret; +} +void +DataBlock::clearAtomicLogEntries() +{ + for (auto log : m_atomicLog) { + delete [] log; + } + m_atomicLog.clear(); +} + const uint8_t* DataBlock::getData(int offset, int len) const { @@ -137,7 +182,18 @@ DataBlock::setData(PacketPtr pkt) DataBlock & DataBlock::operator=(const DataBlock & obj) { - memcpy(m_data, obj.m_data, RubySystem::getBlockSizeBytes()); + uint8_t *block_update; + size_t block_bytes = RubySystem::getBlockSizeBytes(); + // Copy entire block contents from obj to current block + memcpy(m_data, obj.m_data, block_bytes); + // If this data block 
is involved in an atomic operation, the effect + // of applying the atomic operations on the data block are recorded in + // m_atomicLog. If so, we must copy over every entry in the change log + for (size_t i = 0; i < obj.m_atomicLog.size(); i++) { + block_update = new uint8_t[block_bytes]; + memcpy(block_update, obj.m_atomicLog[i], block_bytes); + m_atomicLog.push_back(block_update); + } return *this; } diff --git a/src/mem/ruby/common/DataBlock.hh b/src/mem/ruby/common/DataBlock.hh index e147d701c5..aa94f56eb8 100644 --- a/src/mem/ruby/common/DataBlock.hh +++ b/src/mem/ruby/common/DataBlock.hh @@ -44,6 +44,7 @@ #include #include +#include #include #include @@ -71,6 +72,12 @@ class DataBlock { if (m_alloc) delete [] m_data; + + // If data block involved in atomic + // operations, free all meta data + for (auto log : m_atomicLog) { + delete [] log; + } } DataBlock& operator=(const DataBlock& obj); @@ -80,6 +87,9 @@ class DataBlock void clear(); uint8_t getByte(int whichByte) const; const uint8_t *getData(int offset, int len) const; + uint8_t* popAtomicLogEntryFront(); + int numAtomicLogEntries() const; + void clearAtomicLogEntries(); uint8_t *getDataMod(int offset); void setByte(int whichByte, uint8_t data); void setData(const uint8_t *data, int offset, int len); @@ -94,6 +104,9 @@ class DataBlock void alloc(); uint8_t *m_data; bool m_alloc; + + // Tracks block changes when atomic ops are applied + std::deque m_atomicLog; }; inline void diff --git a/src/mem/ruby/common/WriteMask.cc b/src/mem/ruby/common/WriteMask.cc index 4c24a64706..911262b4ba 100644 --- a/src/mem/ruby/common/WriteMask.cc +++ b/src/mem/ruby/common/WriteMask.cc @@ -55,5 +55,27 @@ WriteMask::print(std::ostream& out) const << std::flush; } +void +WriteMask::performAtomic(uint8_t * p, + std::deque& log) const +{ + int offset; + uint8_t *block_update; + // Here, operations occur in FIFO order from the mAtomicOp + // vector. 
This is done to match the ordering of packets + // that was seen when the initial coalesced request was created. + for (int i = 0; i < mAtomicOp.size(); i++) { + // Save the old value of the data block in case a + // return value is needed + block_update = new uint8_t[mSize]; + std::memcpy(block_update, p, mSize); + log.push_back(block_update); + // Perform the atomic operation + offset = mAtomicOp[i].first; + AtomicOpFunctor *fnctr = mAtomicOp[i].second; + (*fnctr)(&p[offset]); + } +} + } // namespace ruby } // namespace gem5 diff --git a/src/mem/ruby/common/WriteMask.hh b/src/mem/ruby/common/WriteMask.hh index 2de21da79b..47ec798500 100644 --- a/src/mem/ruby/common/WriteMask.hh +++ b/src/mem/ruby/common/WriteMask.hh @@ -222,26 +222,15 @@ class WriteMask void print(std::ostream& out) const; - void - performAtomic(uint8_t * p) const - { - for (int i = 0; i < mAtomicOp.size(); i++) { - int offset = mAtomicOp[i].first; - AtomicOpFunctor *fnctr = mAtomicOp[i].second; - (*fnctr)(&p[offset]); - } - } - - void - performAtomic(DataBlock & blk) const - { - for (int i = 0; i < mAtomicOp.size(); i++) { - int offset = mAtomicOp[i].first; - uint8_t *p = blk.getDataMod(offset); - AtomicOpFunctor *fnctr = mAtomicOp[i].second; - (*fnctr)(p); - } - } + /* + * Performs atomic operations on the data block pointed to by p. The + * atomic operations to perform are in the vector mAtomicOp. The + * effect of each atomic operation is pushed to the atomicChangeLog + * so that each individual atomic requestor may see the results of their + * specific atomic operation. 
+ */ + void performAtomic(uint8_t * p, + std::deque& atomicChangeLog) const; const AtomicOpVector& getAtomicOps() const diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index 31fc484973..20a0979af1 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -523,6 +523,7 @@ machine(MachineType:TCC, "TCC Cache") out_msg.isSLCSet := in_msg.isSLCSet; } } + cache_entry.DataBlk.clearAtomicLogEntries(); } action(bar_sendBypassedAtomicResponse, "bar", desc="send bypassed Atomic Ack") { diff --git a/src/mem/ruby/protocol/RubySlicc_Exports.sm b/src/mem/ruby/protocol/RubySlicc_Exports.sm index a32983ada4..2e496a8221 100644 --- a/src/mem/ruby/protocol/RubySlicc_Exports.sm +++ b/src/mem/ruby/protocol/RubySlicc_Exports.sm @@ -77,6 +77,8 @@ structure(DataBlock, external = "yes", desc="..."){ void copyPartial(DataBlock, int, int); void copyPartial(DataBlock, WriteMask); void atomicPartial(DataBlock, WriteMask); + int numAtomicLogEntries(); + void clearAtomicLogEntries(); } bool testAndRead(Addr addr, DataBlock datablk, Packet *pkt); diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index 8bde3f7bc8..beb8da3f9c 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -554,25 +554,48 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest, success, isRegion); // update the data // - // MUST AD DOING THIS FOR EACH REQUEST IN COALESCER + // MUST ADD DOING THIS FOR EACH REQUEST IN COALESCER std::vector pktList = crequest->getPackets(); + + uint8_t* log = nullptr; DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n", pktList.size(), request_line_address); + uint32_t offset; + int pkt_size; for (auto& pkt : pktList) { - request_address = pkt->getAddr(); + offset = getOffset(pkt->getAddr()); + pkt_size = pkt->getSize(); if (pkt->getPtr()) { - if ((type == RubyRequestType_LD) || - (type == RubyRequestType_ATOMIC) || - 
(type == RubyRequestType_ATOMIC_RETURN) || - (type == RubyRequestType_IFETCH) || - (type == RubyRequestType_RMW_Read) || - (type == RubyRequestType_Locked_RMW_Read) || - (type == RubyRequestType_Load_Linked)) { - pkt->setData( - data.getData(getOffset(request_address), pkt->getSize())); - } else { - data.setData(pkt->getPtr(), - getOffset(request_address), pkt->getSize()); + switch(type) { + // Store and AtomicNoReturns follow the same path, as the + // data response is not needed. + case RubyRequestType_ATOMIC_NO_RETURN: + assert(pkt->isAtomicOp()); + case RubyRequestType_ST: + data.setData(pkt->getPtr(), offset, pkt_size); + break; + case RubyRequestType_LD: + pkt->setData(data.getData(offset, pkt_size)); + break; + case RubyRequestType_ATOMIC_RETURN: + assert(pkt->isAtomicOp()); + // Atomic operations are performed by the WriteMask + // in packet order, set by the crequest. Thus, when + // unpacking the changes from the log, we read from + // the front of the log to correctly map response + // data into the packets. + + // Log entry contains the old value before the current + // atomic operation occurred. + log = data.popAtomicLogEntryFront(); + pkt->setData(&log[offset]); + delete [] log; + log = nullptr; + break; + default: + panic("Unsupported ruby packet type:%s\n", + RubyRequestType_to_string(type)); + break; } } else { DPRINTF(MemoryAccess, @@ -581,6 +604,7 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest, RubyRequestType_to_string(type)); } } + assert(data.numAtomicLogEntries() == 0); m_outstanding_count--; assert(m_outstanding_count >= 0);