mem: Atomic ops to same address

Augmenting the DataBlock class with a change log structure to
record the effects of atomic operations on a data block and
service these changes if the atomic operations require return
values.

Although the operations are atomic, the coalescer need not
send unique memory requests for each operation. Atomic
operations within a wavefront to the same address are now
coalesced into a single memory request. The response of this
request carries all the necessary information to provide the
requesting lanes unique values as a result of their individual
atomic operations. This helps reduce contention for request
and response queues in simulation.

Previously, only the final value of the datablock after all
atomic ops to the same address was visible to the requesting
waves. This change corrects this behavior by allowing each wave
to see the effect of its individual atomic op if a return value
is necessary.

Change-Id: I639bea943afd317e45f8fa3bff7689f6b8df9395
This commit is contained in:
Ranganath (Bujji) Selagamsetty
2023-08-18 12:08:01 -05:00
committed by Bujji
parent c218104f52
commit f6a453362f
7 changed files with 146 additions and 39 deletions

View File

@@ -51,9 +51,19 @@ namespace ruby
DataBlock::DataBlock(const DataBlock &cp)
{
    // Deep-copy the block contents from cp into freshly owned storage.
    size_t block_bytes = RubySystem::getBlockSizeBytes();
    m_data = new uint8_t[block_bytes];
    memcpy(m_data, cp.m_data, block_bytes);
    m_alloc = true;
    // If this data block is involved in an atomic operation, the effect
    // of applying the atomic operations on the data block are recorded in
    // m_atomicLog. If so, we must deep-copy every entry in the change log
    // (each entry is a block_bytes-sized snapshot owned by this object).
    for (size_t i = 0; i < cp.m_atomicLog.size(); i++) {
        uint8_t *block_update = new uint8_t[block_bytes];
        memcpy(block_update, cp.m_atomicLog[i], block_bytes);
        m_atomicLog.push_back(block_update);
    }
}
void
@@ -73,7 +83,20 @@ DataBlock::clear()
bool
DataBlock::equal(const DataBlock& obj) const
{
    // Two DataBlocks are equal only if both the raw block contents and
    // the recorded atomic change logs match entry-for-entry.
    size_t block_bytes = RubySystem::getBlockSizeBytes();
    // Check that the block contents match
    if (memcmp(m_data, obj.m_data, block_bytes)) {
        return false;
    }
    // The change logs must have the same number of snapshots...
    if (m_atomicLog.size() != obj.m_atomicLog.size()) {
        return false;
    }
    // ...and each corresponding snapshot must be byte-identical.
    for (size_t i = 0; i < m_atomicLog.size(); i++) {
        if (memcmp(m_atomicLog[i], obj.m_atomicLog[i], block_bytes)) {
            return false;
        }
    }
    return true;
}
void
@@ -92,7 +115,7 @@ DataBlock::atomicPartial(const DataBlock &dblk, const WriteMask &mask)
for (int i = 0; i < RubySystem::getBlockSizeBytes(); i++) {
m_data[i] = dblk.m_data[i];
}
mask.performAtomic(m_data);
mask.performAtomic(m_data, m_atomicLog);
}
void
@@ -107,6 +130,28 @@ DataBlock::print(std::ostream& out) const
out << std::dec << "]" << std::flush;
}
int
DataBlock::numAtomicLogEntries() const
{
    // Number of atomic-op snapshots currently pending in the change log.
    return static_cast<int>(m_atomicLog.size());
}
uint8_t*
DataBlock::popAtomicLogEntryFront()
{
    // Hand the oldest snapshot to the caller, who takes ownership of the
    // returned buffer and is responsible for delete[]-ing it.
    assert(!m_atomicLog.empty());
    uint8_t *entry = m_atomicLog.front();
    m_atomicLog.pop_front();
    return entry;
}
void
DataBlock::clearAtomicLogEntries()
{
    // Release every snapshot buffer still owned by the log, then empty it.
    for (uint8_t *entry : m_atomicLog) {
        delete [] entry;
    }
    m_atomicLog.clear();
}
const uint8_t*
DataBlock::getData(int offset, int len) const
{
@@ -137,7 +182,18 @@ DataBlock::setData(PacketPtr pkt)
DataBlock &
DataBlock::operator=(const DataBlock & obj)
{
    // Guard against self-assignment; freeing our own log below would
    // otherwise destroy the entries we are about to copy.
    if (this == &obj) {
        return *this;
    }
    size_t block_bytes = RubySystem::getBlockSizeBytes();
    // Copy entire block contents from obj to current block
    memcpy(m_data, obj.m_data, block_bytes);
    // Release any existing log entries so assignment replaces, rather
    // than appends to, this block's atomic change log (avoids leaking
    // the old snapshot buffers).
    for (auto log : m_atomicLog) {
        delete [] log;
    }
    m_atomicLog.clear();
    // If obj is involved in an atomic operation, the effect of applying
    // the atomic operations on the data block are recorded in its
    // m_atomicLog. If so, we must deep-copy every entry in the change log
    for (size_t i = 0; i < obj.m_atomicLog.size(); i++) {
        uint8_t *block_update = new uint8_t[block_bytes];
        memcpy(block_update, obj.m_atomicLog[i], block_bytes);
        m_atomicLog.push_back(block_update);
    }
    return *this;
}

View File

@@ -44,6 +44,7 @@
#include <inttypes.h>
#include <cassert>
#include <deque>
#include <iomanip>
#include <iostream>
@@ -71,6 +72,12 @@ class DataBlock
{
if (m_alloc)
delete [] m_data;
// If data block involved in atomic
// operations, free all meta data
for (auto log : m_atomicLog) {
delete [] log;
}
}
DataBlock& operator=(const DataBlock& obj);
@@ -80,6 +87,9 @@ class DataBlock
void clear();
uint8_t getByte(int whichByte) const;
const uint8_t *getData(int offset, int len) const;
uint8_t* popAtomicLogEntryFront();
int numAtomicLogEntries() const;
void clearAtomicLogEntries();
uint8_t *getDataMod(int offset);
void setByte(int whichByte, uint8_t data);
void setData(const uint8_t *data, int offset, int len);
@@ -94,6 +104,9 @@ class DataBlock
void alloc();
uint8_t *m_data;
bool m_alloc;
// Tracks block changes when atomic ops are applied
std::deque<uint8_t*> m_atomicLog;
};
inline void

View File

@@ -55,5 +55,27 @@ WriteMask::print(std::ostream& out) const
<< std::flush;
}
void
WriteMask::performAtomic(uint8_t * p,
                         std::deque<uint8_t*>& log) const
{
    // Applies every queued atomic op to the block pointed to by p.
    // Before each op, a snapshot of the current block state is pushed to
    // `log` so a requester needing a return value can observe the exact
    // state its individual op acted on. Ownership of each snapshot
    // buffer transfers to the caller via `log`.
    //
    // Operations occur in FIFO order from the mAtomicOp vector to match
    // the packet ordering seen when the coalesced request was created.
    for (size_t i = 0; i < mAtomicOp.size(); i++) {
        // Save the old value of the data block in case a
        // return value is needed
        uint8_t *block_update = new uint8_t[mSize];
        std::memcpy(block_update, p, mSize);
        log.push_back(block_update);
        // Perform the atomic operation at its byte offset in the block
        const int offset = mAtomicOp[i].first;
        AtomicOpFunctor *fnctr = mAtomicOp[i].second;
        (*fnctr)(&p[offset]);
    }
}
} // namespace ruby
} // namespace gem5

View File

@@ -222,26 +222,15 @@ class WriteMask
void print(std::ostream& out) const;
void
performAtomic(uint8_t * p) const
{
for (int i = 0; i < mAtomicOp.size(); i++) {
int offset = mAtomicOp[i].first;
AtomicOpFunctor *fnctr = mAtomicOp[i].second;
(*fnctr)(&p[offset]);
}
}
void
performAtomic(DataBlock & blk) const
{
    // Apply every queued atomic op, in order, directly to the
    // DataBlock's modifiable storage at each op's byte offset.
    for (const auto &op : mAtomicOp) {
        uint8_t *data = blk.getDataMod(op.first);
        (*op.second)(data);
    }
}
/*
* Performs atomic operations on the data block pointed to by p. The
* atomic operations to perform are in the vector mAtomicOp. The
* effect of each atomic operation is pushed to the atomicChangeLog
* so that each individual atomic requestor may see the results of their
* specific atomic operation.
*/
void performAtomic(uint8_t * p,
std::deque<uint8_t*>& atomicChangeLog) const;
const AtomicOpVector&
getAtomicOps() const

View File

@@ -523,6 +523,7 @@ machine(MachineType:TCC, "TCC Cache")
out_msg.isSLCSet := in_msg.isSLCSet;
}
}
cache_entry.DataBlk.clearAtomicLogEntries();
}
action(bar_sendBypassedAtomicResponse, "bar", desc="send bypassed Atomic Ack") {

View File

@@ -77,6 +77,8 @@ structure(DataBlock, external = "yes", desc="..."){
void copyPartial(DataBlock, int, int);
void copyPartial(DataBlock, WriteMask);
void atomicPartial(DataBlock, WriteMask);
int numAtomicLogEntries();
void clearAtomicLogEntries();
}
bool testAndRead(Addr addr, DataBlock datablk, Packet *pkt);

View File

@@ -554,25 +554,48 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
success, isRegion);
// update the data
//
// MUST AD DOING THIS FOR EACH REQUEST IN COALESCER
// MUST ADD DOING THIS FOR EACH REQUEST IN COALESCER
std::vector<PacketPtr> pktList = crequest->getPackets();
uint8_t* log = nullptr;
DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
pktList.size(), request_line_address);
uint32_t offset;
int pkt_size;
for (auto& pkt : pktList) {
request_address = pkt->getAddr();
offset = getOffset(pkt->getAddr());
pkt_size = pkt->getSize();
if (pkt->getPtr<uint8_t>()) {
if ((type == RubyRequestType_LD) ||
(type == RubyRequestType_ATOMIC) ||
(type == RubyRequestType_ATOMIC_RETURN) ||
(type == RubyRequestType_IFETCH) ||
(type == RubyRequestType_RMW_Read) ||
(type == RubyRequestType_Locked_RMW_Read) ||
(type == RubyRequestType_Load_Linked)) {
pkt->setData(
data.getData(getOffset(request_address), pkt->getSize()));
} else {
data.setData(pkt->getPtr<uint8_t>(),
getOffset(request_address), pkt->getSize());
switch(type) {
// Store and AtomicNoReturns follow the same path, as the
// data response is not needed.
case RubyRequestType_ATOMIC_NO_RETURN:
assert(pkt->isAtomicOp());
case RubyRequestType_ST:
data.setData(pkt->getPtr<uint8_t>(), offset, pkt_size);
break;
case RubyRequestType_LD:
pkt->setData(data.getData(offset, pkt_size));
break;
case RubyRequestType_ATOMIC_RETURN:
assert(pkt->isAtomicOp());
// Atomic operations are performed by the WriteMask
// in packet order, set by the crequest. Thus, when
// unpacking the changes from the log, we read from
// the front of the log to correctly map response
// data into the packets.
// Log entry contains the old value before the current
// atomic operation occurred.
log = data.popAtomicLogEntryFront();
pkt->setData(&log[offset]);
delete [] log;
log = nullptr;
break;
default:
panic("Unsupported ruby packet type:%s\n",
RubyRequestType_to_string(type));
break;
}
} else {
DPRINTF(MemoryAccess,
@@ -581,6 +604,7 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
RubyRequestType_to_string(type));
}
}
assert(data.numAtomicLogEntries() == 0);
m_outstanding_count--;
assert(m_outstanding_count >= 0);