From 4fd9d66c537671f0950687cac327c4513c9c8eed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Soria?= Date: Mon, 24 Jul 2023 16:20:10 +0200 Subject: [PATCH 1/3] tests,mem-ruby: Enhance ruby false sharing test with Atomics New ruby mem test includes a percentages of AMOs that will be executed randomly in ruby mem test Change-Id: Ie95ed78e59ea773ce6b59060eaece3701fe4478c --- configs/example/ruby_mem_test.py | 11 ++++- src/cpu/testers/memtest/MemTest.py | 1 + src/cpu/testers/memtest/memtest.cc | 72 +++++++++++++++++++++++++----- src/cpu/testers/memtest/memtest.hh | 4 ++ src/mem/request.hh | 7 +++ 5 files changed, 81 insertions(+), 14 deletions(-) diff --git a/configs/example/ruby_mem_test.py b/configs/example/ruby_mem_test.py index c90950107e..27751376fc 100644 --- a/configs/example/ruby_mem_test.py +++ b/configs/example/ruby_mem_test.py @@ -62,6 +62,12 @@ parser.add_argument( default=0, help="percentage of accesses that should be functional", ) +parser.add_argument( + "--atomic", + type=int, + default=30, + help="percentage of accesses that should be atomic", +) parser.add_argument( "--suppress-func-errors", action="store_true", @@ -105,6 +111,7 @@ cpus = [ max_loads=args.maxloads, percent_functional=args.functional, percent_uncacheable=0, + percent_atomic=args.atomic, progress_interval=args.progress, suppress_func_errors=args.suppress_func_errors, ) @@ -133,7 +140,7 @@ else: dmas = [] dma_ports = [] -for (i, dma) in enumerate(dmas): +for i, dma in enumerate(dmas): dma_ports.append(dma.test) Ruby.create_system(args, False, system, dma_ports=dma_ports) @@ -155,7 +162,7 @@ system.ruby.randomization = True assert len(cpus) == len(system.ruby._cpu_ports) -for (i, cpu) in enumerate(cpus): +for i, cpu in enumerate(cpus): # # Tie the cpu memtester ports to the correct system ports # diff --git a/src/cpu/testers/memtest/MemTest.py b/src/cpu/testers/memtest/MemTest.py index 24bd974804..15c329ee85 100644 --- a/src/cpu/testers/memtest/MemTest.py +++ 
b/src/cpu/testers/memtest/MemTest.py @@ -63,6 +63,7 @@ class MemTest(ClockedObject): percent_reads = Param.Percent(65, "Percentage reads") percent_functional = Param.Percent(50, "Percentage functional accesses") percent_uncacheable = Param.Percent(10, "Percentage uncacheable") + percent_atomic = Param.Percent(50, "Percentage atomics") # Determine how often to print progress messages and what timeout # to use for checking progress of both requests and responses diff --git a/src/cpu/testers/memtest/memtest.cc b/src/cpu/testers/memtest/memtest.cc index 7c256d8642..a84bf67cd9 100644 --- a/src/cpu/testers/memtest/memtest.cc +++ b/src/cpu/testers/memtest/memtest.cc @@ -94,6 +94,7 @@ MemTest::MemTest(const Params &p) percentReads(p.percent_reads), percentFunctional(p.percent_functional), percentUncacheable(p.percent_uncacheable), + percentAtomic(p.percent_atomic), requestorId(p.system->getRequestorId(this)), blockSize(p.system->cacheLineSize()), blockAddrMask(blockSize - 1), @@ -115,6 +116,7 @@ MemTest::MemTest(const Params &p) // set up counters numReads = 0; numWrites = 0; + numAtomics = 0; // kick things into action schedule(tickEvent, curTick()); @@ -142,7 +144,7 @@ MemTest::completeRequest(PacketPtr pkt, bool functional) outstandingAddrs.erase(remove_addr); DPRINTF(MemTest, "Completing %s at address %x (blk %x) %s\n", - pkt->isWrite() ? "write" : "read", + pkt->isWrite() ? pkt->isAtomicOp() ? "atomic" : "write" : "read", req->getPaddr(), blockAlign(req->getPaddr()), pkt->isError() ? "error" : "success"); @@ -153,7 +155,25 @@ MemTest::completeRequest(PacketPtr pkt, bool functional) panic( "%s access failed at %#x\n", pkt->isWrite() ? 
"Write" : "Read", req->getPaddr()); } else { - if (pkt->isRead()) { + if (pkt->isAtomicOp()) { + uint8_t ref_data = referenceData[req->getPaddr()]; + if (pkt_data[0] != ref_data) { + panic("%s: read of %x (blk %x) @ cycle %d " + "returns %x, expected %x\n", name(), + req->getPaddr(), blockAlign(req->getPaddr()), curTick(), + pkt_data[0], ref_data); + } + DPRINTF(MemTest, + "Completing atomic at address %x (blk %x) value %x\n", + req->getPaddr(), blockAlign(req->getPaddr()), + pkt_data[0]); + + referenceData[req->getPaddr()] = + atomicPendingData[req->getPaddr()]; + + numAtomics++; + stats.numAtomics++; + } else if (pkt->isRead()) { uint8_t ref_data = referenceData[req->getPaddr()]; if (pkt_data[0] != ref_data) { panic("%s: read of %x (blk %x) @ cycle %d " @@ -167,9 +187,10 @@ MemTest::completeRequest(PacketPtr pkt, bool functional) if (numReads == (uint64_t)nextProgressMessage) { ccprintf(std::cerr, - "%s: completed %d read, %d write accesses @%d\n", - name(), numReads, numWrites, curTick()); - nextProgressMessage += progressInterval; + "%s: completed %d read, %d write, " + "%d atomic accesses @%d\n", + name(), numReads, numWrites, numAtomics, curTick()); + nextProgressMessage += progressInterval; } if (maxLoads != 0 && numReads >= maxLoads) @@ -205,7 +226,9 @@ MemTest::MemTestStats::MemTestStats(statistics::Group *parent) ADD_STAT(numReads, statistics::units::Count::get(), "number of read accesses completed"), ADD_STAT(numWrites, statistics::units::Count::get(), - "number of write accesses completed") + "number of write accesses completed"), + ADD_STAT(numAtomics, statistics::units::Count::get(), + "number of atomic accesses completed") { } @@ -221,6 +244,8 @@ MemTest::tick() unsigned cmd = random_mt.random(0, 100); uint8_t data = random_mt.random(); bool uncacheable = random_mt.random(0, 100) < percentUncacheable; + bool do_atomic = (random_mt.random(0, 100) < percentAtomic) && + !uncacheable; unsigned base = random_mt.random(0, 1); Request::Flags flags; Addr 
paddr; @@ -281,13 +306,36 @@ MemTest::tick() pkt = new Packet(req, MemCmd::ReadReq); pkt->dataDynamic(pkt_data); } else { - DPRINTF(MemTest, "Initiating %swrite at addr %x (blk %x) value %x\n", - do_functional ? "functional " : "", req->getPaddr(), - blockAlign(req->getPaddr()), data); + if (do_atomic) { + DPRINTF(MemTest, + "Initiating atomic at addr %x (blk %x) value %x\n", + req->getPaddr(), blockAlign(req->getPaddr()), data); - pkt = new Packet(req, MemCmd::WriteReq); - pkt->dataDynamic(pkt_data); - pkt_data[0] = data; + TypedAtomicOpFunctor *_amo_op = + new AtomicGeneric3Op( + data, data, + [](uint8_t* b, uint8_t a, uint8_t c){ + *b = c; + }); + assert(_amo_op); + AtomicOpFunctorPtr amo_op = AtomicOpFunctorPtr(_amo_op); + req->setAtomicOpFunctor(std::move(amo_op)); + req->setFlags(Request::ATOMIC_RETURN_OP); + + pkt = new Packet(req, MemCmd::WriteReq); + pkt->dataDynamic(pkt_data); + pkt_data[0] = data; + atomicPendingData[req->getPaddr()] = data; + } else { + DPRINTF(MemTest, + "Initiating %swrite at addr %x (blk %x) value %x\n", + do_functional ? 
"functional " : "", req->getPaddr(), + blockAlign(req->getPaddr()), data); + + pkt = new Packet(req, MemCmd::WriteReq); + pkt->dataDynamic(pkt_data); + pkt_data[0] = data; + } } // there is no point in ticking if we are waiting for a retry diff --git a/src/cpu/testers/memtest/memtest.hh b/src/cpu/testers/memtest/memtest.hh index 3fd1674191..ee512048c1 100644 --- a/src/cpu/testers/memtest/memtest.hh +++ b/src/cpu/testers/memtest/memtest.hh @@ -131,6 +131,7 @@ class MemTest : public ClockedObject const unsigned percentReads; const unsigned percentFunctional; const unsigned percentUncacheable; + const unsigned percentAtomic; /** Request id for all generated traffic */ RequestorID requestorId; @@ -138,6 +139,7 @@ class MemTest : public ClockedObject unsigned int id; std::unordered_set outstandingAddrs; + std::unordered_map atomicPendingData; // store the expected value for the addresses we have touched std::unordered_map referenceData; @@ -169,6 +171,7 @@ class MemTest : public ClockedObject uint64_t numReads; uint64_t numWrites; + uint64_t numAtomics; const uint64_t maxLoads; const bool atomic; @@ -180,6 +183,7 @@ class MemTest : public ClockedObject MemTestStats(statistics::Group *parent); statistics::Scalar numReads; statistics::Scalar numWrites; + statistics::Scalar numAtomics; } stats; /** diff --git a/src/mem/request.hh b/src/mem/request.hh index 491aad0241..df249ac249 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -757,6 +757,13 @@ class Request : public Extensible return atomicOpFunctor.get(); } + void + setAtomicOpFunctor(AtomicOpFunctorPtr amo_op) + { + atomicOpFunctor = std::move(amo_op); + } + + /** * Accessor for hardware transactional memory abort cause. */ From 12dada2dc5623e6b012047d432ffd31780f5b55d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Soria?= Date: Mon, 24 Jul 2023 16:25:08 +0200 Subject: [PATCH 2/3] arch-arm: Correct return operand in swap instructions Swap instructions are configured as non returning AMO operations. 
This is wrong because they return the previous value stored in the target memory position Change-Id: I84d75a571a8eaeaee0dbfac344f7b34c72b47d53 --- src/arch/arm/isa/insts/amo64.isa | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/arch/arm/isa/insts/amo64.isa b/src/arch/arm/isa/insts/amo64.isa index 72eea89518..3de9a41bfe 100644 --- a/src/arch/arm/isa/insts/amo64.isa +++ b/src/arch/arm/isa/insts/amo64.isa @@ -827,35 +827,35 @@ let {{ ret_op=False, flavor="release").emit(OP_DICT['MIN']) AtomicArithmeticSingleOp("swpb", "SWPB", 1, unsign=True, - ret_op=False, flavor="normal").emit(OP_DICT['SWP']) + flavor="normal").emit(OP_DICT['SWP']) AtomicArithmeticSingleOp("swplb", "SWPLB", 1, unsign=True, - ret_op=False, flavor="release").emit(OP_DICT['SWP']) + flavor="release").emit(OP_DICT['SWP']) AtomicArithmeticSingleOp("swpab", "SWPAB", 1, unsign=True, - ret_op=False, flavor="acquire").emit(OP_DICT['SWP']) + flavor="acquire").emit(OP_DICT['SWP']) AtomicArithmeticSingleOp("swplab", "SWPLAB", 1, unsign=True, - ret_op=False, flavor="acquire_release").emit(OP_DICT['SWP']) + flavor="acquire_release").emit(OP_DICT['SWP']) AtomicArithmeticSingleOp("swph", "SWPH", 2, unsign=True, - ret_op=False, flavor="normal").emit(OP_DICT['SWP']) + flavor="normal").emit(OP_DICT['SWP']) AtomicArithmeticSingleOp("swplh", "SWPLH", 2, unsign=True, - ret_op=False, flavor="release").emit(OP_DICT['SWP']) + flavor="release").emit(OP_DICT['SWP']) AtomicArithmeticSingleOp("swpah", "SWPAH", 2, unsign=True, - ret_op=False, flavor="acquire").emit(OP_DICT['SWP']) + flavor="acquire").emit(OP_DICT['SWP']) AtomicArithmeticSingleOp("swplah", "SWPLAH", 2, unsign=True, - ret_op=False, flavor="acquire_release").emit(OP_DICT['SWP']) + flavor="acquire_release").emit(OP_DICT['SWP']) AtomicArithmeticSingleOp("swp", "SWP", 4, unsign=True, - ret_op=False, flavor="normal").emit(OP_DICT['SWP']) + flavor="normal").emit(OP_DICT['SWP']) AtomicArithmeticSingleOp("swpl", 
"SWPL", 4, unsign=True, - ret_op=False, flavor="release").emit(OP_DICT['SWP']) + flavor="release").emit(OP_DICT['SWP']) AtomicArithmeticSingleOp("swpa", "SWPA", 4, unsign=True, - ret_op=False, flavor="acquire").emit(OP_DICT['SWP']) + flavor="acquire").emit(OP_DICT['SWP']) AtomicArithmeticSingleOp("swpla", "SWPLA", 4, unsign=True, - ret_op=False, flavor="acquire_release").emit(OP_DICT['SWP']) + flavor="acquire_release").emit(OP_DICT['SWP']) AtomicArithmeticSingleOp("swp64", "SWP64", 8, unsign=True, - ret_op=False, flavor="normal").emit(OP_DICT['SWP']) + flavor="normal").emit(OP_DICT['SWP']) AtomicArithmeticSingleOp("swpl64", "SWPL64", 8, unsign=True, - ret_op=False, flavor="release").emit(OP_DICT['SWP']) + flavor="release").emit(OP_DICT['SWP']) AtomicArithmeticSingleOp("swpa64", "SWPA64", 8, unsign=True, - ret_op=False, flavor="acquire").emit(OP_DICT['SWP']) + flavor="acquire").emit(OP_DICT['SWP']) AtomicArithmeticSingleOp("swpla64", "SWPLA64", 8, unsign=True, - ret_op=False, flavor="acquire_release").emit(OP_DICT['SWP']) + flavor="acquire_release").emit(OP_DICT['SWP']) }}; From 6411b2255ccdab3ad782f385b94cd43b248a46a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Soria?= Date: Thu, 10 Aug 2023 17:04:17 +0200 Subject: [PATCH 3/3] mem-ruby,configs: Add CHI far atomics support Introduce far atomic operations in CHI protocol. 
Three configuration parameters have been used to tune this behavior: policy_type: sets the atomic policy to one of the described in our paper atomic_op_latency: simulates the AMO ALU operation latency comp_anr: configures the Atomic No return transaction to split CompDBIDResp into two different messages DBIDResp and Comp Change-Id: I087afad9ad9fcb9df42d72893c9e32ad5a5eb478 --- configs/ruby/CHI_config.py | 4 + src/mem/ruby/protocol/RubySlicc_Types.sm | 7 + .../ruby/protocol/chi/CHI-cache-actions.sm | 477 +++++++++++++++++- src/mem/ruby/protocol/chi/CHI-cache-funcs.sm | 51 +- .../protocol/chi/CHI-cache-transitions.sm | 221 +++++++- src/mem/ruby/protocol/chi/CHI-cache.sm | 69 ++- src/mem/ruby/protocol/chi/CHI-msg.sm | 7 + src/mem/ruby/slicc_interface/RubyRequest.cc | 9 + src/mem/ruby/slicc_interface/RubyRequest.hh | 2 + src/mem/ruby/system/Sequencer.cc | 99 +++- src/mem/ruby/system/Sequencer.hh | 8 + 11 files changed, 924 insertions(+), 30 deletions(-) diff --git a/configs/ruby/CHI_config.py b/configs/ruby/CHI_config.py index 4f2580c373..1288cf95d6 100644 --- a/configs/ruby/CHI_config.py +++ b/configs/ruby/CHI_config.py @@ -244,6 +244,7 @@ class CHI_L1Controller(CHI_Cache_Controller): self.alloc_on_readunique = True self.alloc_on_readonce = True self.alloc_on_writeback = True + self.alloc_on_atomic = False self.dealloc_on_unique = False self.dealloc_on_shared = False self.dealloc_backinv_unique = True @@ -280,6 +281,7 @@ class CHI_L2Controller(CHI_Cache_Controller): self.alloc_on_readunique = True self.alloc_on_readonce = True self.alloc_on_writeback = True + self.alloc_on_atomic = False self.dealloc_on_unique = False self.dealloc_on_shared = False self.dealloc_backinv_unique = True @@ -316,6 +318,7 @@ class CHI_HNFController(CHI_Cache_Controller): self.alloc_on_readunique = False self.alloc_on_readonce = True self.alloc_on_writeback = True + self.alloc_on_atomic = True self.dealloc_on_unique = True self.dealloc_on_shared = False self.dealloc_backinv_unique = False @@ 
-392,6 +395,7 @@ class CHI_DMAController(CHI_Cache_Controller): self.alloc_on_readunique = False self.alloc_on_readonce = False self.alloc_on_writeback = False + self.alloc_on_atomic = False self.dealloc_on_unique = False self.dealloc_on_shared = False self.dealloc_backinv_unique = False diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm index 8ba9d935ff..293c731c37 100644 --- a/src/mem/ruby/protocol/RubySlicc_Types.sm +++ b/src/mem/ruby/protocol/RubySlicc_Types.sm @@ -139,6 +139,13 @@ structure (Sequencer, external = "yes") { Cycles, Cycles, Cycles); void writeUniqueCallback(Addr, DataBlock); + void atomicCallback(Addr, DataBlock); + void atomicCallback(Addr, DataBlock, bool); + void atomicCallback(Addr, DataBlock, bool, MachineType); + void atomicCallback(Addr, DataBlock, bool, MachineType, + Cycles, Cycles, Cycles); + + void unaddressedCallback(Addr, RubyRequestType); void unaddressedCallback(Addr, RubyRequestType, MachineType); void unaddressedCallback(Addr, RubyRequestType, MachineType, diff --git a/src/mem/ruby/protocol/chi/CHI-cache-actions.sm b/src/mem/ruby/protocol/chi/CHI-cache-actions.sm index 42e07eb46b..4c9498423c 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache-actions.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache-actions.sm @@ -148,15 +148,22 @@ action(AllocateTBE_SeqRequest, desc="") { out_msg.is_remote_pf := false; out_msg.txnId := max_outstanding_transactions; + out_msg.atomic_op.clear(); + out_msg.atomic_op.orMask(in_msg.writeMask); + if ((in_msg.Type == RubyRequestType:LD) || (in_msg.Type == RubyRequestType:IFETCH)) { out_msg.type := CHIRequestType:Load; - } else if (in_msg.Type == RubyRequestType:ST) { + } else if (in_msg.Type == RubyRequestType:ST) { if (in_msg.Size == blockSize) { out_msg.type := CHIRequestType:StoreLine; } else { out_msg.type := CHIRequestType:Store; } + } else if (in_msg.Type == RubyRequestType:ATOMIC_RETURN) { + out_msg.type := CHIRequestType:AtomicLoad; + } else if (in_msg.Type 
== RubyRequestType:ATOMIC_NO_RETURN){ + out_msg.type := CHIRequestType:AtomicStore; } else { error("Invalid RubyRequestType"); } @@ -769,6 +776,148 @@ action(Initiate_StoreMiss, desc="") { } } +action(Initiate_Atomic_UC, desc="") { + if ((policy_type == 0) || // ALL NEAR + (policy_type == 1) || // UNIQUE NEAR + (policy_type == 2) // PRESENT NEAR + ){ + tbe.actions.push(Event:DataArrayRead); + tbe.actions.push(Event:DelayAtomic); + tbe.actions.push(Event:AtomicHit); + tbe.actions.pushNB(Event:DataArrayWrite); + tbe.actions.pushNB(Event:TagArrayWrite); + } else { + error("Invalid policy type"); + } +} + +action(Initiate_Atomic_UD, desc="") { + if ((policy_type == 0) || // ALL NEAR + (policy_type == 1) || // UNIQUE NEAR + (policy_type == 2) // PRESENT NEAR + ){ + tbe.actions.push(Event:DataArrayRead); + tbe.actions.push(Event:DelayAtomic); + tbe.actions.push(Event:AtomicHit); + tbe.actions.pushNB(Event:DataArrayWrite); + tbe.actions.pushNB(Event:TagArrayWrite); + } else { + error("Invalid policy type"); + } +} + +action(Initiate_AtomicReturn_I, desc="") { + if (policy_type == 0){ // ALL NEAR + tbe.actions.push(Event:SendReadUnique); + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); + tbe.atomic_to_be_done := true; + } else if ((policy_type == 1) || // UNIQUE NEAR + (policy_type == 2)) { // PRESENT NEAR + tbe.actions.push(Event:SendAtomicReturn_NoWait); + tbe.dataToBeInvalid := true; + tbe.doCacheFill := false; + tbe.atomic_to_be_done := false; + } else { + error("Invalid policy type"); + } +} + +action(Initiate_AtomicNoReturn_I, desc="") { + if (policy_type == 0){ // ALL NEAR + tbe.actions.push(Event:SendReadUnique); + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); + tbe.atomic_to_be_done := true; + } else if (policy_type == 1) { // UNIQUE NEAR + 
tbe.actions.push(Event:SendAtomicNoReturn); + tbe.actions.push(Event:SendANRData); + tbe.dataToBeInvalid := true; + tbe.doCacheFill := false; + tbe.atomic_to_be_done := false; + } else { + error("Invalid policy type"); + } +} + +action(Initiate_AtomicReturn_SD, desc="") { + if (policy_type == 0){ // ALL NEAR + tbe.actions.push(Event:SendReadUnique); + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); + tbe.atomic_to_be_done := true; + } else if (policy_type == 1) { // UNIQUE NEAR + tbe.actions.push(Event:SendAtomicReturn_NoWait); + tbe.dataToBeInvalid := true; + tbe.doCacheFill := false; + tbe.atomic_to_be_done := false; + } else { + error("Invalid policy type"); + } +} + +action(Initiate_AtomicNoReturn_SD, desc="") { + if (policy_type == 0){ // ALL NEAR + tbe.actions.push(Event:SendReadUnique); + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); + tbe.atomic_to_be_done := true; + } else if (policy_type == 1) { // UNIQUE NEAR + tbe.actions.push(Event:SendAtomicNoReturn); + tbe.actions.push(Event:SendANRData); + tbe.dataToBeInvalid := true; + tbe.doCacheFill := false; + tbe.atomic_to_be_done := false; + } else { + error("Invalid policy type"); + } +} + +action(Initiate_AtomicReturn_SC, desc="") { + if (policy_type == 0){ // ALL NEAR + tbe.actions.push(Event:SendReadUnique); + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); + tbe.atomic_to_be_done := true; + } else if (policy_type == 1) { // UNIQUE NEAR + tbe.actions.push(Event:SendAtomicReturn_NoWait); + tbe.dataToBeInvalid := true; + tbe.doCacheFill := false; + tbe.atomic_to_be_done := false; + } else { + error("Invalid policy type"); + } +} + +action(Initiate_AtomicNoReturn_SC, desc="") { + 
if (policy_type == 0){ // ALL NEAR + tbe.actions.push(Event:SendReadUnique); + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); + tbe.atomic_to_be_done := true; + } else if (policy_type == 1) { // UNIQUE NEAR + tbe.actions.push(Event:SendAtomicNoReturn); + tbe.actions.push(Event:SendANRData); + tbe.dataToBeInvalid := true; + tbe.doCacheFill := false; + tbe.atomic_to_be_done := false; + } else { + error("Invalid policy type"); + } +} + action(Initiate_StoreUpgrade, desc="") { assert(tbe.dataValid); assert(is_valid(cache_entry)); @@ -865,8 +1014,111 @@ action(Initiate_WriteUnique_Forward, desc="") { tbe.actions.pushNB(Event:TagArrayWrite); } +action(Initiate_AtomicReturn_LocalWrite, desc="") { + if ((tbe.dir_sharers.count() > 0) && tbe.dataMaybeDirtyUpstream) { + tbe.actions.push(Event:SendSnpUnique); + } else if (tbe.dir_sharers.count() > 0){ + // no one will send us data unless we explicitly ask + tbe.actions.push(Event:SendSnpUniqueRetToSrc); + } + tbe.actions.push(Event:SendDBIDResp_AR); + tbe.actions.pushNB(Event:WriteFEPipe); + tbe.actions.pushNB(Event:SendCompData_AR); + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:DelayAtomic); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); +} +action(Initiate_AtomicNoReturn_LocalWrite, desc="") { + if ((tbe.dir_sharers.count() > 0) && tbe.dataMaybeDirtyUpstream) { + tbe.actions.push(Event:SendSnpUnique); + } else if (tbe.dir_sharers.count() > 0){ + // no one will send us data unless we explicitly ask + tbe.actions.push(Event:SendSnpUniqueRetToSrc); + } + if (comp_wu) { + tbe.actions.push(Event:SendDBIDResp_ANR); + tbe.actions.pushNB(Event:WriteFEPipe); + tbe.actions.pushNB(Event:SendComp_ANR); + } else { + tbe.actions.push(Event:SendCompDBIDResp_ANR); + tbe.actions.pushNB(Event:WriteFEPipe); + } + 
tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:DelayAtomic); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); +} + + +action(Initiate_AtomicReturn_Forward, desc="") { + if ((tbe.dir_sharers.count() > 0) && + (tbe.dir_sharers.isElement(tbe.requestor))){ + tbe.dir_sharers.remove(tbe.requestor); + } + tbe.actions.push(Event:SendAtomicReturn); + tbe.actions.push(Event:SendCompData_AR); + tbe.actions.pushNB(Event:TagArrayWrite); + + tbe.dataToBeInvalid := true; +} + +action(Initiate_AtomicNoReturn_Forward, desc="") { + if ((tbe.dir_sharers.count() > 0) && + (tbe.dir_sharers.isElement(tbe.requestor))){ + tbe.dir_sharers.remove(tbe.requestor); + } + if (comp_wu) { + tbe.actions.push(Event:SendAtomicNoReturn); + tbe.actions.push(Event:SendDBIDResp_ANR); + tbe.actions.pushNB(Event:SendComp_ANR); + } else { + tbe.actions.push(Event:SendAtomicNoReturn); + tbe.actions.push(Event:SendCompDBIDResp_ANR); + } + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:SendANRData); + tbe.actions.pushNB(Event:TagArrayWrite); + + tbe.dataToBeInvalid := true; +} + +action(Initiate_AtomicReturn_Miss, desc="") { + tbe.actions.push(Event:SendReadNoSnp); + tbe.actions.pushNB(Event:WriteFEPipe); + tbe.actions.push(Event:SendDBIDResp_AR); + tbe.actions.pushNB(Event:WriteFEPipe); + tbe.actions.pushNB(Event:SendCompData_AR); + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:DelayAtomic); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); +} + +action(Initiate_AtomicNoReturn_Miss, desc="") { + assert(is_HN); + tbe.actions.push(Event:SendReadNoSnp); + if (comp_wu) { + tbe.actions.push(Event:SendDBIDResp_ANR); + tbe.actions.pushNB(Event:WriteFEPipe); + tbe.actions.pushNB(Event:SendComp_ANR); + } else { + tbe.actions.push(Event:SendCompDBIDResp_ANR); + tbe.actions.pushNB(Event:WriteFEPipe); + } + + 
tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:DelayAtomic); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); +} + action(Initiate_CopyBack, desc="") { // expect to receive this data after Send_CompDBIDResp if (tbe.reqType == CHIRequestType:WriteBackFull) { @@ -1157,7 +1409,9 @@ action(Send_ReadShared, desc="") { action(Send_ReadNoSnp, desc="") { assert(is_HN); - assert(tbe.use_DMT == false); + assert((tbe.use_DMT == false) || + ((tbe.reqType == CHIRequestType:AtomicReturn) || + (tbe.reqType == CHIRequestType:AtomicNoReturn))); clearExpectedReqResp(tbe); tbe.expected_req_resp.addExpectedDataType(CHIDataType:CompData_UC); @@ -1368,6 +1622,45 @@ action(Send_WriteUnique, desc="") { tbe.expected_req_resp.addExpectedCount(1); } +action(Send_AtomicReturn, desc="") { + assert(is_valid(tbe)); + + enqueue(reqOutPort, CHIRequestMsg, request_latency) { + prepareRequestAtomic(tbe, CHIRequestType:AtomicReturn, out_msg); + out_msg.Destination.add(mapAddressToDownstreamMachine(tbe.addr)); + allowRequestRetry(tbe, out_msg); + } + clearExpectedReqResp(tbe); + tbe.expected_req_resp.addExpectedRespType(CHIResponseType:DBIDResp); + tbe.expected_req_resp.addExpectedCount(1); +} + +action(Send_AtomicReturn_NoWait, desc="") { + assert(is_valid(tbe)); + + enqueue(reqOutPort, CHIRequestMsg, request_latency) { + prepareRequestAtomic(tbe, CHIRequestType:AtomicReturn, out_msg); + out_msg.Destination.add(mapAddressToDownstreamMachine(tbe.addr)); + allowRequestRetry(tbe, out_msg); + } + + tbe.dataAMOValid := false; +} + +action(Send_AtomicNoReturn, desc="") { + assert(is_valid(tbe)); + + enqueue(reqOutPort, CHIRequestMsg, request_latency) { + prepareRequestAtomic(tbe, CHIRequestType:AtomicNoReturn, out_msg); + out_msg.Destination.add(mapAddressToDownstreamMachine(tbe.addr)); + allowRequestRetry(tbe, out_msg); + } + tbe.expected_req_resp.addExpectedRespType(CHIResponseType:CompDBIDResp); + 
tbe.expected_req_resp.addExpectedRespType(CHIResponseType:DBIDResp); + tbe.expected_req_resp.addExpectedCount(1); +} + + action(Send_SnpCleanInvalid, desc="") { assert(is_valid(tbe)); assert(tbe.expected_snp_resp.hasExpected() == false); @@ -1636,6 +1929,20 @@ action(ExpectNCBWrData, desc="") { tbe.dataBlkValid.setMask(addressOffset(tbe.accAddr, tbe.addr), tbe.accSize, false); } +action(ExpectNCBWrData_A, desc="") { + // Expected data + int num_msgs := tbe.accSize / data_channel_size; + if ((tbe.accSize % data_channel_size) != 0) { + num_msgs := num_msgs + 1; + } + tbe.expected_req_resp.clear(num_msgs); + tbe.expected_req_resp.addExpectedDataType(CHIDataType:NCBWrData); + tbe.expected_req_resp.setExpectedCount(1); + + // In atomic operations we do not expect real data for the current block + // Thus the mask bits do not care +} + action(ExpectCompAck, desc="") { assert(is_valid(tbe)); tbe.expected_req_resp.addExpectedRespType(CHIResponseType:CompAck); @@ -1658,7 +1965,22 @@ action(Receive_ReqDataResp, desc="") { } // Copy data to tbe only if we didn't have valid data or the received // data is dirty - if ((tbe.dataBlkValid.isFull() == false) || + if ((in_msg.type == CHIDataType:NCBWrData) && + ((tbe.reqType == CHIRequestType:AtomicReturn) || + (tbe.reqType == CHIRequestType:AtomicNoReturn))){ + // DO NOTHING + } else if ((in_msg.type == CHIDataType:CompData_I) && + ((tbe.reqType == CHIRequestType:AtomicReturn) || + (tbe.reqType == CHIRequestType:AtomicLoad))) { + if(tbe.dataBlkValid.isFull()){ + tbe.dataBlkValid.clear(); + } + tbe.oldDataBlk.copyPartial(in_msg.dataBlk, in_msg.bitMask); + assert(tbe.dataBlkValid.isOverlap(in_msg.bitMask) == false); + tbe.dataBlkValid.orMask(in_msg.bitMask); + DPRINTF(RubySlicc, "Received %s\n", tbe.oldDataBlk); + DPRINTF(RubySlicc, "dataBlkValid = %s\n", tbe.dataBlkValid); + } else if ((tbe.dataBlkValid.isFull() == false) || (in_msg.type == CHIDataType:CompData_UD_PD) || (in_msg.type == CHIDataType:CompData_SD_PD) || (in_msg.type == 
CHIDataType:CBWrData_UD_PD) || @@ -1683,7 +2005,8 @@ action(Receive_RespSepDataFromCompData, desc="") { if (tbe.expected_req_resp.receiveResp(CHIResponseType:RespSepData) == false) { error("Received unexpected message"); } - if (is_HN == false) { + if ((is_HN == false) && (tbe.reqType != CHIRequestType:AtomicReturn) && + ((tbe.reqType != CHIRequestType:AtomicLoad) || (tbe.atomic_to_be_done == true))){ // must now ack the responder tbe.actions.pushFrontNB(Event:SendCompAck); } @@ -1905,6 +2228,7 @@ action(UpdateDataState_FromReqDataResp, desc="") { } else if (in_msg.type == CHIDataType:CompData_I) { tbe.dataValid := true; + tbe.dataAMOValid := true; tbe.dataToBeInvalid := true; assert(tbe.dataMaybeDirtyUpstream == false); @@ -1946,7 +2270,9 @@ action(UpdateDataState_FromReqDataResp, desc="") { action(UpdateDataState_FromWUDataResp, desc="") { assert(is_valid(tbe)); - if (tbe.expected_req_resp.hasReceivedData()) { + if (tbe.expected_req_resp.hasReceivedData() && + (tbe.reqType != CHIRequestType:AtomicReturn) && + (tbe.reqType != CHIRequestType:AtomicNoReturn)) { assert(tbe.dataBlkValid.test(addressOffset(tbe.accAddr, tbe.addr))); assert(tbe.dataBlkValid.test(addressOffset(tbe.accAddr, tbe.addr) + tbe.accSize - 1)); @@ -1964,6 +2290,22 @@ action(UpdateDataState_FromWUDataResp, desc="") { printTBEState(tbe); } +action(UpdateDataState_FromADataResp, desc="") { + assert(is_valid(tbe)); + if (is_HN && (tbe.expected_req_resp.hasReceivedData()) && + ((tbe.reqType == CHIRequestType:AtomicReturn) || + (tbe.reqType == CHIRequestType:AtomicNoReturn))) { + DPRINTF(RubySlicc, "Atomic before %s\n", tbe.dataBlk); + + tbe.oldDataBlk := tbe.dataBlk; + tbe.dataBlk.atomicPartial(tbe.dataBlk, tbe.atomic_op); + tbe.dataDirty := true; + + DPRINTF(RubySlicc, "Atomic after %s\n", tbe.dataBlk); + } + printTBEState(tbe); +} + action(UpdateDataState_FromCUResp, desc="") { assert(is_valid(tbe)); peek(rspInPort, CHIResponseMsg) { @@ -2127,6 +2469,10 @@ action(Receive_ReqResp_WUNeedComp, desc="") 
{ tbe.defer_expected_comp := true; } +action(Receive_ReqResp_AR, desc="") { + tbe.actions.pushFrontNB(Event:SendARData); +} + action(Receive_ReqResp_WUComp, desc="") { if (tbe.defer_expected_comp) { tbe.defer_expected_comp := false; @@ -2320,6 +2666,36 @@ action(CheckWUComp, desc="") { } } +action(Send_ARData, desc="") { + assert(is_valid(tbe)); + tbe.snd_msgType := CHIDataType:NCBWrData; + tbe.snd_destination := mapAddressToDownstreamMachine(tbe.addr); + setupPendingAtomicSend(tbe); +} + +action(Send_ANRData, desc="") { + assert(is_valid(tbe)); + tbe.snd_msgType := CHIDataType:NCBWrData; + tbe.snd_destination := mapAddressToDownstreamMachine(tbe.addr); + setupPendingAtomicSend(tbe); +} + +action(CheckARComp, desc="") { + assert(is_valid(tbe)); + tbe.expected_req_resp.addExpectedDataType(CHIDataType:CompData_I); + tbe.expected_req_resp.addExpectedRespType(CHIResponseType:RespSepData); + tbe.expected_req_resp.addExpectedCount(2); +} + +action(CheckANRComp, desc="") { + assert(is_valid(tbe)); + if (tbe.defer_expected_comp) { + tbe.defer_expected_comp := false; + tbe.expected_req_resp.addExpectedCount(1); + tbe.expected_req_resp.addExpectedRespType(CHIResponseType:Comp); + } +} + action(Send_SnpRespData, desc="") { assert(is_HN == false); assert(is_valid(tbe)); @@ -2531,7 +2907,12 @@ action(Send_Data, desc="") { } tbe.snd_pendBytes.setMask(offset, range, false); - out_msg.dataBlk := tbe.dataBlk; + if (tbe.reqType == CHIRequestType:AtomicReturn){ + out_msg.dataBlk := tbe.oldDataBlk; + } else { + out_msg.dataBlk := tbe.dataBlk; + } + out_msg.bitMask.setMask(offset, range); out_msg.responder := machineID; @@ -2673,6 +3054,36 @@ action(Send_Comp_WU, desc="") { } } + +action(Send_CompData_AR, desc="") { + assert(is_valid(tbe)); + assert(tbe.dataValid); + + if (is_HN) { + tbe.oldDataBlk := tbe.dataBlk; + } + + tbe.snd_msgType := CHIDataType:CompData_I; + tbe.dataMaybeDirtyUpstream := false; + tbe.requestorToBeExclusiveOwner := false; + tbe.requestorToBeOwner := false; + 
tbe.snd_destination := tbe.requestor; + setupPendingSend(tbe); + printTBEState(tbe); + +} + +action(Send_Comp_ANR, desc="") { + assert(is_valid(tbe)); + enqueue(rspOutPort, CHIResponseMsg, comp_anr_latency + response_latency) { + out_msg.addr := address; + out_msg.type := CHIResponseType:Comp; + out_msg.responder := machineID; + out_msg.Destination.add(tbe.requestor); + } +} + + action(Send_SnpRespI, desc="") { enqueue(rspOutPort, CHIResponseMsg, response_latency) { out_msg.addr := address; @@ -3003,6 +3414,22 @@ action(Callback_StoreHit, desc="") { } } +action(Callback_AtomicHit, desc="") { + assert(is_valid(tbe)); + assert(tbe.dataValid); + assert((tbe.reqType == CHIRequestType:AtomicLoad) || + (tbe.reqType == CHIRequestType:AtomicStore)); + DPRINTF(RubySlicc, "Atomic before %s\n", tbe.dataBlk); + + DataBlock oldDataBlk; + oldDataBlk := tbe.dataBlk; + tbe.dataBlk.atomicPartial(tbe.dataBlk, tbe.atomic_op); + + sequencer.atomicCallback(tbe.addr, oldDataBlk, false); + DPRINTF(RubySlicc, "Atomic after %s\n", tbe.dataBlk); + tbe.dataDirty := true; +} + action(Callback_ExpressPrefetchHit, desc="") { // have not allocated TBE, but must clear the reservation assert(is_invalid(tbe)); @@ -3051,6 +3478,25 @@ action(Callback_Miss, desc="") { // also decay the timeout scLockDecayLatency(); } + } else if (tbe.dataValid && tbe.atomic_to_be_done && + ((tbe.reqType == CHIRequestType:AtomicLoad) || + (tbe.reqType == CHIRequestType:AtomicStore))){ + assert(is_valid(tbe)); + assert(tbe.dataValid); + assert((tbe.reqType == CHIRequestType:AtomicLoad) || + (tbe.reqType == CHIRequestType:AtomicStore)); + DPRINTF(RubySlicc, "Atomic before %s\n", tbe.dataBlk); + + DataBlock oldDataBlk; + oldDataBlk := tbe.dataBlk; + tbe.dataBlk.atomicPartial(tbe.dataBlk, tbe.atomic_op); + + sequencer.atomicCallback(tbe.addr, oldDataBlk, false); + DPRINTF(RubySlicc, "Atomic after %s\n", tbe.dataBlk); + tbe.dataDirty := true; + } else if (tbe.dataValid && tbe.dataAMOValid && (tbe.reqType == 
CHIRequestType:AtomicLoad)) { + DPRINTF(RubySlicc, "Atomic before %s\n", tbe.oldDataBlk); + sequencer.atomicCallback(tbe.addr, tbe.oldDataBlk, false); } } @@ -3070,6 +3516,18 @@ action(Unset_Timeout_Cache, desc="") { wakeup_port(snpRdyPort, address); } +action(Callback_AtomicNoReturn, desc="") { + assert(is_valid(tbe)); + assert((tbe.is_local_pf || tbe.is_remote_pf) == false); + assert((tbe.reqType == CHIRequestType:AtomicNoReturn) || + (tbe.reqType == CHIRequestType:AtomicStore)); + + if(tbe.reqType == CHIRequestType:AtomicStore){ + sequencer.atomicCallback(tbe.addr, tbe.dataBlk); + DPRINTF(RubySlicc, "AtomicNoReturn %s\n", tbe.dataBlk); + } +} + action(Callback_WriteUnique, desc="") { assert(is_valid(tbe)); assert((tbe.is_local_pf || tbe.is_remote_pf) == false); @@ -3183,7 +3641,7 @@ action(Profile_OutgoingEnd_DatalessResp, desc="") { action(TagArrayRead, desc="") { assert(is_valid(tbe)); tbe.delayNextAction := curTick() + cyclesToTicks( - tagLatency(fromSequencer(tbe.reqType))); + tagLatency(fromSequencer(tbe.reqType))); } action(TagArrayWrite, desc="") { @@ -3235,6 +3693,11 @@ action(FillPipe, desc="") { tbe.delayNextAction := curTick() + cyclesToTicks(fill_latency); } +action(DelayAtomic, desc="") { + assert(is_valid(tbe)); + tbe.delayNextAction := curTick() + cyclesToTicks(atomic_op_latency); +} + action(SnpSharedPipe, desc="") { assert(is_valid(tbe)); tbe.delayNextAction := curTick() + cyclesToTicks(snp_latency); diff --git a/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm b/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm index 4d8c35053c..371ad05109 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm @@ -302,7 +302,9 @@ Cycles dataLatency() { bool fromSequencer(CHIRequestType reqType) { return reqType == CHIRequestType:Load || reqType == CHIRequestType:Store || - reqType == CHIRequestType:StoreLine; + reqType == CHIRequestType:StoreLine || + reqType == CHIRequestType:AtomicLoad || + reqType == 
CHIRequestType:AtomicStore; } bool inCache(Addr addr) { @@ -434,6 +436,9 @@ TBE allocateRequestTBE(Addr addr, CHIRequestMsg in_msg), return_by_pointer="yes" tbe.is_local_pf := in_msg.is_local_pf; tbe.is_remote_pf := in_msg.is_remote_pf; + tbe.atomic_op.clear(); + tbe.atomic_op.orMask(in_msg.atomic_op); + tbe.use_DMT := false; tbe.use_DCT := false; @@ -622,6 +627,13 @@ void setupPendingPartialSend(TBE tbe) { scheduleSendData(tbe, 0); } +void setupPendingAtomicSend(TBE tbe) { + assert(blockSize >= data_channel_size); + assert((blockSize % data_channel_size) == 0); + tbe.snd_pendBytes.setMask(0,tbe.accSize,true); + scheduleSendData(tbe, 0); +} + // common code for downstream requests void prepareRequest(TBE tbe, CHIRequestType type, CHIRequestMsg & out_msg) { out_msg.addr := tbe.addr; @@ -644,6 +656,17 @@ void prepareRequest(TBE tbe, CHIRequestType type, CHIRequestMsg & out_msg) { assert(tbe.txnId != static_cast(Addr, "value", -1)); } +void prepareRequestAtomic(TBE tbe, CHIRequestType type, + CHIRequestMsg & out_msg) { + assert((type == CHIRequestType:AtomicReturn) || + (type == CHIRequestType:AtomicNoReturn)); + prepareRequest(tbe, type, out_msg); + out_msg.accAddr := tbe.accAddr; + out_msg.accSize := tbe.accSize; + out_msg.atomic_op.clear(); + out_msg.atomic_op.orMask(tbe.atomic_op); +} + void allowRequestRetry(TBE tbe, CHIRequestMsg & out_msg) { out_msg.allowRetry := true; tbe.pendReqAllowRetry := true; @@ -672,6 +695,8 @@ void prepareRequestRetry(TBE tbe, CHIRequestMsg & out_msg) { out_msg.seqReq := tbe.seqReq; out_msg.is_local_pf := false; out_msg.is_remote_pf := tbe.is_local_pf || tbe.is_remote_pf; + out_msg.atomic_op.clear(); + out_msg.atomic_op.orMask(tbe.atomic_op); } void prepareRequestRetryDVM(TBE tbe, CHIRequestMsg & out_msg) { @@ -773,8 +798,12 @@ bool needCacheEntry(CHIRequestType req_type, (req_type == CHIRequestType:WriteEvictFull) || (is_HN && (req_type == CHIRequestType:WriteUniqueFull)))) || (alloc_on_seq_acc && ((req_type == CHIRequestType:Load) || 
- (req_type == CHIRequestType:Store))) || - (alloc_on_seq_line_write && (req_type == CHIRequestType:StoreLine)); + (req_type == CHIRequestType:Store) || + (req_type == CHIRequestType:AtomicLoad) || + (req_type == CHIRequestType:AtomicStore))) || + (alloc_on_seq_line_write && (req_type == CHIRequestType:StoreLine)) || + (alloc_on_atomic && ((req_type == CHIRequestType:AtomicReturn) || + (req_type == CHIRequestType:AtomicNoReturn))); } } @@ -1174,6 +1203,10 @@ Event reqToEvent(CHIRequestType type, bool is_prefetch) { return Event:Store; } else if (type == CHIRequestType:StoreLine) { return Event:Store; + } else if (type == CHIRequestType:AtomicLoad) { + return Event:AtomicLoad; + } else if (type == CHIRequestType:AtomicStore){ + return Event:AtomicStore; } else if (type == CHIRequestType:ReadShared) { return Event:ReadShared; } else if (type == CHIRequestType:ReadNotSharedDirty) { @@ -1214,6 +1247,18 @@ Event reqToEvent(CHIRequestType type, bool is_prefetch) { return Event:DvmTlbi_Initiate; } else if (type == CHIRequestType:DvmSync_Initiate) { return Event:DvmSync_Initiate; + } else if (type == CHIRequestType:AtomicReturn){ + if (is_HN) { + return Event:AtomicReturn_PoC; + } else { + return Event:AtomicReturn; + } + } else if (type == CHIRequestType:AtomicNoReturn){ + if (is_HN) { + return Event:AtomicNoReturn_PoC; + } else { + return Event:AtomicNoReturn; + } } else { error("Invalid CHIRequestType"); } diff --git a/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm b/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm index cb9ffa567a..0e8c6ec0e3 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm @@ -155,6 +155,12 @@ transition({BUSY_INTR,BUSY_BLKD}, FillPipe) { ProcessNextState_ClearPending; } +transition({BUSY_INTR,BUSY_BLKD}, DelayAtomic) { + Pop_TriggerQueue; + DelayAtomic; + ProcessNextState_ClearPending; +} + transition({BUSY_INTR,BUSY_BLKD}, SnpSharedPipe) { Pop_TriggerQueue; SnpSharedPipe; 
@@ -418,8 +424,82 @@ transition({RSC,RSD,RUSD,RUSC,RU,I}, WriteUnique, BUSY_BLKD) { ProcessNextState; } +// AtomicReturn and AtomicNoReturn -// Load / Store from sequencer & Prefetch from prefetcher +transition({I,SC,SC_RSC,SD,SD_RSD,SD_RSC,RSD,RUSD, + UD,UD_RSC,UD_RSD,UD_RU,UC,UC_RSC,UC_RU,RSC,RU}, AtomicReturn, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicReturn_Forward; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition({I,SC,SC_RSC,SD,SD_RSD,SD_RSC,RSD,RUSD, + UD,UD_RSC,UD_RSD,UD_RU,UC,UC_RSC,UC_RU,RSC,RU}, AtomicNoReturn, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicNoReturn_Forward; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition({UD,UD_RU,UD_RSD,UD_RSC,UC,UC_RU,UC_RSC}, + AtomicReturn_PoC, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicReturn_LocalWrite; + Profile_Hit; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition({UD,UD_RU,UD_RSD,UD_RSC,UC,UC_RU,UC_RSC}, + AtomicNoReturn_PoC, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicNoReturn_LocalWrite; + Profile_Hit; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition({SD, SD_RSD, SD_RSC, SC, SC_RSC, RSC, RSD, RUSC, RUSD, RU}, + AtomicReturn_PoC, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicReturn_LocalWrite; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition({SD, SD_RSD, SD_RSC, SC, SC_RSC, RSC, RSD, RUSC, RUSD, RU}, + AtomicNoReturn_PoC, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicNoReturn_LocalWrite; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(I, AtomicReturn_PoC, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicReturn_Miss; + Allocate_DirEntry; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(I, AtomicNoReturn_PoC, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicNoReturn_Miss; + Allocate_DirEntry; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + + +// Load / Store / Atomic from sequencer & Prefetch from prefetcher transition({UD,UD_T,SD,UC,SC}, 
Load, BUSY_BLKD) { Initiate_Request; @@ -460,6 +540,28 @@ transition(BUSY_BLKD, StoreHit) { ProcessNextState_ClearPending; } +transition(UC, {AtomicLoad,AtomicStore}, BUSY_BLKD) { + Initiate_Request; + Initiate_Atomic_UC; + Profile_Hit; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition({UD,UD_T}, {AtomicLoad,AtomicStore}, BUSY_BLKD) { + Initiate_Request; + Initiate_Atomic_UD; + Profile_Hit; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(BUSY_BLKD, AtomicHit) { + Pop_TriggerQueue; + Callback_AtomicHit; + ProcessNextState_ClearPending; +} + transition(I, {Load,Prefetch}, BUSY_BLKD) { Initiate_Request; Initiate_LoadMiss; @@ -494,6 +596,55 @@ transition({BUSY_BLKD,BUSY_INTR}, UseTimeout) { Unset_Timeout_TBE; } +transition(I, AtomicLoad, BUSY_BLKD){ + Initiate_Request; + Initiate_AtomicReturn_I; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(I, AtomicStore, BUSY_BLKD){ + Initiate_Request; + Initiate_AtomicNoReturn_I; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(SD, AtomicLoad, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicReturn_SD; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(SC, AtomicLoad, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicReturn_SC; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(SD, AtomicStore, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicNoReturn_SD; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(SC, AtomicStore, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicNoReturn_SC; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + + // Evict from Upstream transition({UD_RSC,SD_RSC,UC_RSC,SC_RSC,RSC,RSD,RUSD,RUSC,UD_RSD,SD_RSD}, Evict, BUSY_BLKD) { @@ -691,13 +842,15 @@ transition(BUSY_INTR, {SnpOnce,SnpOnceFwd}, BUSY_BLKD) { transition({BUSY_BLKD,BUSY_INTR}, {ReadShared, ReadNotSharedDirty, ReadUnique, ReadUnique_PoC, ReadOnce, CleanUnique, CleanUnique_Stale, - Load, Store, Prefetch, + Load, 
Store, AtomicLoad, AtomicStore, Prefetch, WriteBackFull, WriteBackFull_Stale, WriteEvictFull, WriteEvictFull_Stale, WriteCleanFull, WriteCleanFull_Stale, Evict, Evict_Stale, WriteUnique,WriteUniquePtl_PoC, - WriteUniqueFull_PoC,WriteUniqueFull_PoC_Alloc}) { + WriteUniqueFull_PoC,WriteUniqueFull_PoC_Alloc, + AtomicReturn,AtomicReturn_PoC, + AtomicNoReturn,AtomicNoReturn_PoC}) { StallRequest; } @@ -754,6 +907,30 @@ transition(BUSY_BLKD, SendWriteUnique, BUSY_INTR) {DestinationAvailable} { ProcessNextState_ClearPending; } +transition(BUSY_BLKD, SendAtomicReturn, BUSY_INTR) {DestinationAvailable} { + Pop_TriggerQueue; + Send_AtomicReturn; + CheckARComp; + Profile_OutgoingStart; + ProcessNextState_ClearPending; +} + +transition(BUSY_BLKD, SendAtomicReturn_NoWait, BUSY_INTR) { + Pop_TriggerQueue; + Send_AtomicReturn_NoWait; + CheckARComp; + Profile_OutgoingStart; + ProcessNextState_ClearPending; +} + +transition(BUSY_BLKD, SendAtomicNoReturn, BUSY_INTR) {DestinationAvailable} { + Pop_TriggerQueue; + Send_AtomicNoReturn; + Profile_OutgoingStart; + ProcessNextState_ClearPending; +} + + transition(BUSY_BLKD, SendWriteNoSnp, BUSY_INTR) {DestinationAvailable} { Pop_TriggerQueue; Send_WriteNoSnp; @@ -804,6 +981,20 @@ transition(BUSY_BLKD, SendWUDataCB) { ProcessNextState_ClearPending; } +transition({BUSY_BLKD,BUSY_INTR}, SendARData) { + Pop_TriggerQueue; + Send_ARData; + ProcessNextState_ClearPending; +} + +transition({BUSY_BLKD,BUSY_INTR}, SendANRData) { + Pop_TriggerQueue; + Callback_AtomicNoReturn; + Send_ANRData; + CheckANRComp; + ProcessNextState_ClearPending; +} + transition(BUSY_BLKD, SendInvSnpResp) { Pop_TriggerQueue; Send_InvSnpResp; @@ -1025,6 +1216,26 @@ transition({BUSY_BLKD,BUSY_INTR}, SendComp_WU) { ProcessNextState_ClearPending; } +transition(BUSY_BLKD, SendCompDBIDResp_ANR) { + Pop_TriggerQueue; + ExpectNCBWrData_A; + Send_CompDBIDResp; + ProcessNextState_ClearPending; +} + +transition(BUSY_BLKD, SendDBIDResp_AR) { + Pop_TriggerQueue; + ExpectNCBWrData_A; + 
Send_DBIDResp; + ProcessNextState_ClearPending; +} + +transition({BUSY_BLKD,BUSY_INTR}, SendCompData_AR) { + Pop_TriggerQueue; + Send_CompData_AR; + ProcessNextState_ClearPending; +} + transition(BUSY_BLKD, SendCompDBIDRespStale) { Pop_TriggerQueue; Send_CompDBIDResp_Stale; @@ -1085,6 +1296,7 @@ transition(BUSY_BLKD, transition({BUSY_BLKD,BUSY_INTR}, NCBWrData) { Receive_ReqDataResp; UpdateDataState_FromWUDataResp; + UpdateDataState_FromADataResp; Pop_DataInQueue; ProcessNextState; } @@ -1238,10 +1450,11 @@ transition(BUSY_INTR, CompDBIDResp, BUSY_BLKD) { } // alternative flow for WU with separate Comp -transition(BUSY_INTR, DBIDResp, BUSY_BLKD) { +transition({BUSY_INTR,BUSY_BLKD}, DBIDResp, BUSY_BLKD) { Receive_ReqResp; Receive_ReqResp_CopyDBID; Receive_ReqResp_WUNeedComp; + Receive_ReqResp_AR; Pop_RespInQueue; ProcessNextState; } diff --git a/src/mem/ruby/protocol/chi/CHI-cache.sm b/src/mem/ruby/protocol/chi/CHI-cache.sm index e40989df47..f806488b45 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache.sm @@ -51,6 +51,7 @@ machine(MachineType:Cache, "Cache coherency protocol") : // sending necessary snoops. Cycles read_hit_latency := 0; Cycles read_miss_latency := 0; + Cycles atomic_op_latency := 0; Cycles write_fe_latency := 0; // Front-end: Rcv req -> Snd req Cycles write_be_latency := 0; // Back-end: Rcv ack -> Snd data Cycles fill_latency := 0; // Fill latency @@ -126,11 +127,24 @@ machine(MachineType:Cache, "Cache coherency protocol") : // possible. 
bool enable_DCT; + // Atomic Operation Policy + // All Near executes all Atomics at L1 (variable set to 0; default) + // Unique Near executes Atomics at HNF for states I, SC, SD (set to 1) + // Present Near execites all Atomics at L1 except when state is I (set to 2) + int policy_type := 1; + + // Use separate Comp/DBIDResp responses for WriteUnique bool comp_wu := "False"; // additional latency for the WU Comp response Cycles comp_wu_latency := 0; + + // Use separate Comp/DBIDResp responses for AtomicNoResponse + bool comp_anr := "False"; + // additional latency for the ANR Comp response + Cycles comp_anr_latency := 0; + // Controls cache clusivity for different request types. // set all alloc_on* to false to completelly disable caching bool alloc_on_readshared; @@ -139,6 +153,7 @@ machine(MachineType:Cache, "Cache coherency protocol") : bool alloc_on_writeback; bool alloc_on_seq_acc; bool alloc_on_seq_line_write; + bool alloc_on_atomic; // Controls if the clusivity is strict. bool dealloc_on_unique; bool dealloc_on_shared; @@ -285,6 +300,8 @@ machine(MachineType:Cache, "Cache coherency protocol") : // See CHIRequestType in CHi-msg.sm for descriptions Load, desc="", in_trans="yes"; Store, desc="", in_trans="yes"; + AtomicLoad, desc="", in_trans="yes"; + AtomicStore, desc="", in_trans="yes"; Prefetch, desc="", in_trans="yes"; ReadShared, desc="", in_trans="yes"; ReadNotSharedDirty, desc="", in_trans="yes"; @@ -300,6 +317,10 @@ machine(MachineType:Cache, "Cache coherency protocol") : WriteUniquePtl_PoC, desc="", in_trans="yes"; WriteUniqueFull_PoC, desc="", in_trans="yes"; WriteUniqueFull_PoC_Alloc, desc="", in_trans="yes"; + AtomicReturn, desc="", in_trans="yes"; + AtomicNoReturn, desc="", in_trans="yes"; + AtomicReturn_PoC, desc="", in_trans="yes"; + AtomicNoReturn_PoC, desc="", in_trans="yes"; SnpCleanInvalid, desc="", in_trans="yes"; SnpShared, desc="", in_trans="yes"; SnpSharedFwd, desc="", in_trans="yes"; @@ -418,11 +439,12 @@ machine(MachineType:Cache, "Cache 
coherency protocol") : DataArrayWriteOnFill, desc="Write the cache data array (cache fill)"; // Events for modeling the pipeline latency - ReadHitPipe, desc="Latency of reads served from local cache"; - ReadMissPipe, desc="Latency of reads not served from local cache"; - WriteFEPipe, desc="Front-end latency of write requests"; - WriteBEPipe, desc="Back-end latency of write requests"; - FillPipe, desc="Cache fill latency"; + ReadHitPipe, desc="Latency of reads served from local cache"; + ReadMissPipe, desc="Latency of reads not served from local cache"; + WriteFEPipe, desc="Front-end latency of write requests"; + WriteBEPipe, desc="Back-end latency of write requests"; + FillPipe, desc="Cache fill latency"; + DelayAtomic, desc="Atomic operation latency"; SnpSharedPipe, desc="Latency for SnpShared requests"; SnpInvPipe, desc="Latency for SnpUnique and SnpCleanInv requests"; SnpOncePipe, desc="Latency for SnpOnce requests"; @@ -435,9 +457,9 @@ machine(MachineType:Cache, "Cache coherency protocol") : SendReadUnique, out_trans="yes", desc="Send a ReadUnique"; SendCompAck, desc="Send CompAck"; // Read handling at the completer - SendCompData, desc="Send CompData"; - WaitCompAck, desc="Expect to receive CompAck"; - SendRespSepData, desc="Send RespSepData for a DMT request"; + SendCompData, desc="Send CompData"; + WaitCompAck, desc="Expect to receive CompAck"; + SendRespSepData, desc="Send RespSepData for a DMT request"; // Send a write request downstream. 
SendWriteBackOrWriteEvict, out_trans="yes", desc="Send a WriteBackFull (if line is UD or SD) or WriteEvictFull (if UC)"; @@ -449,11 +471,25 @@ machine(MachineType:Cache, "Cache coherency protocol") : SendWUData, desc="Send write unique data"; SendWUDataCB, desc="Send write unique data from a sequencer callback"; // Write handling at the completer - SendCompDBIDResp, desc="Ack WB with CompDBIDResp"; - SendCompDBIDRespStale, desc="Ack stale WB with CompDBIDResp"; - SendCompDBIDResp_WU, desc="Ack WU with CompDBIDResp and set expected data"; - SendDBIDResp_WU, desc="Ack WU with DBIDResp and set expected data"; - SendComp_WU, desc="Ack WU completion"; + SendCompDBIDResp, desc="Ack WB with CompDBIDResp"; + SendCompDBIDRespStale, desc="Ack stale WB with CompDBIDResp"; + SendCompDBIDResp_WU, desc="Ack WU with CompDBIDResp and set expected data"; + SendDBIDResp_WU, desc="Ack WU with DBIDResp and set expected data"; + SendComp_WU, desc="Ack WU completion"; + + // Send an atomic request downstream. 
+ SendAtomicReturn, out_trans="yes", desc="Send atomic request with return"; + SendAtomicReturn_NoWait, out_trans="yes", desc="Send atomic request with return, but no DBID"; + SendAtomicNoReturn, out_trans="yes", desc="Send atomic request without return"; + SendARData, desc="Send atomic return request data"; + SendANRData, desc="Send atomic no return request data"; + // Atomic handling at the completer + SendDBIDResp_AR, desc="Ack AR with DBIDResp and set expected data"; + SendCompData_AR, desc="Ack AR completion"; + SendCompDBIDResp_ANR, desc="Ack ANR with CompDBIDResp and set expected data"; + SendDBIDResp_ANR, desc="Ack ANR with DBIDResp and set expected data"; + SendComp_ANR, desc="Ack ANR completion"; + // Dataless requests SendEvict, out_trans="yes", desc="Send a Evict"; @@ -502,6 +538,7 @@ machine(MachineType:Cache, "Cache coherency protocol") : // Misc triggers LoadHit, desc="Complete a load hit"; StoreHit, desc="Complete a store hit"; + AtomicHit, desc="Complete an atomic hit"; UseTimeout, desc="Transition from UD_T -> UD"; RestoreFromHazard, desc="Restore from a snoop hazard"; TX_Data, desc="Transmit pending data messages"; @@ -613,6 +650,10 @@ machine(MachineType:Cache, "Cache coherency protocol") : bool is_local_pf, desc="Request generated by a local prefetcher"; bool is_remote_pf, desc="Request generated a prefetcher in another cache"; + // Atomic info associated with the transaction + WriteMask atomic_op, desc="Atomic Operation Wrapper"; + bool atomic_to_be_done, desc="We have yet to perform the atomic"; + // NOTE: seqReq is a smart pointer pointing to original CPU request object // that triggers transactions associated with this TBE. seqReq carries some // information (e.g., PC of requesting instruction, virtual address of this @@ -630,8 +671,10 @@ machine(MachineType:Cache, "Cache coherency protocol") : // stable state. 
bool hasUseTimeout, desc="Line is locked under store/use timeout"; DataBlock dataBlk, desc="Local copy of the line"; + DataBlock oldDataBlk, desc="Local copy of the line before executing atomic"; WriteMask dataBlkValid, desc="Marks which bytes in the DataBlock are valid"; bool dataValid, desc="Local copy is valid"; + bool dataAMOValid, desc="Local copy is valid for AMO"; bool dataDirty, desc="Local copy is dirtry"; bool dataMaybeDirtyUpstream, desc="Line maybe dirty upstream"; bool dataUnique, desc="Line is unique either locally or upsatream"; diff --git a/src/mem/ruby/protocol/chi/CHI-msg.sm b/src/mem/ruby/protocol/chi/CHI-msg.sm index f3c2d66363..b9e11d9dd9 100644 --- a/src/mem/ruby/protocol/chi/CHI-msg.sm +++ b/src/mem/ruby/protocol/chi/CHI-msg.sm @@ -46,6 +46,8 @@ enumeration(CHIRequestType, desc="") { Load; Store; StoreLine; + AtomicLoad; + AtomicStore; // Incoming DVM-related requests generated by the sequencer DvmTlbi_Initiate; DvmSync_Initiate; @@ -66,6 +68,9 @@ enumeration(CHIRequestType, desc="") { WriteUniquePtl; WriteUniqueFull; + AtomicReturn; + AtomicNoReturn; + SnpSharedFwd; SnpNotSharedDirtyFwd; SnpUniqueFwd; @@ -108,6 +113,8 @@ structure(CHIRequestMsg, desc="", interface="Message") { bool is_local_pf, desc="Request generated by a local prefetcher"; bool is_remote_pf, desc="Request generated a prefetcher in another cache"; + WriteMask atomic_op, desc="Atomic Operation Wrapper"; + bool usesTxnId, desc="True if using a Transaction ID", default="false"; Addr txnId, desc="Transaction ID", default="0"; diff --git a/src/mem/ruby/slicc_interface/RubyRequest.cc b/src/mem/ruby/slicc_interface/RubyRequest.cc index 643c1dec6f..c6faf2d76f 100644 --- a/src/mem/ruby/slicc_interface/RubyRequest.cc +++ b/src/mem/ruby/slicc_interface/RubyRequest.cc @@ -123,5 +123,14 @@ RubyRequest::functionalWrite(Packet *pkt) return cBase < cTail; } +void +RubyRequest::setWriteMask(uint32_t offset, uint32_t len, + std::vector< std::pair> atomicOps) +{ + m_writeMask.setMask(offset, 
len); + m_writeMask.setAtomicOps(atomicOps); +} + + } // namespace ruby } // namespace gem5 diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh index 89ce83451e..1e9674b9f5 100644 --- a/src/mem/ruby/slicc_interface/RubyRequest.hh +++ b/src/mem/ruby/slicc_interface/RubyRequest.hh @@ -226,6 +226,8 @@ class RubyRequest : public Message const PrefetchBit& getPrefetch() const { return m_Prefetch; } RequestPtr getRequestPtr() const { return m_pkt->req; } + void setWriteMask(uint32_t offset, uint32_t len, + std::vector< std::pair> atomicOps); void print(std::ostream& out) const; bool functionalRead(Packet *pkt); bool functionalRead(Packet *pkt, WriteMask &mask); diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 82fc19b57c..48054febef 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -466,8 +466,12 @@ Sequencer::writeCallback(Addr address, DataBlock& data, bool ruby_request = true; while (!seq_req_list.empty()) { SequencerRequest &seq_req = seq_req_list.front(); + // Atomic Request may be executed remotly in the cache hierarchy + bool atomic_req = + ((seq_req.m_type == RubyRequestType_ATOMIC_RETURN) || + (seq_req.m_type == RubyRequestType_ATOMIC_NO_RETURN)); - if (noCoales && !ruby_request) { + if ((noCoales || atomic_req) && !ruby_request) { // Do not process follow-up requests // (e.g. 
if full line no present) // Reissue to the cache hierarchy @@ -479,6 +483,8 @@ Sequencer::writeCallback(Addr address, DataBlock& data, assert(seq_req.m_type != RubyRequestType_LD); assert(seq_req.m_type != RubyRequestType_Load_Linked); assert(seq_req.m_type != RubyRequestType_IFETCH); + assert(seq_req.m_type != RubyRequestType_ATOMIC_RETURN); + assert(seq_req.m_type != RubyRequestType_ATOMIC_NO_RETURN); } // handle write request @@ -594,6 +600,62 @@ Sequencer::readCallback(Addr address, DataBlock& data, } } +void +Sequencer::atomicCallback(Addr address, DataBlock& data, + const bool externalHit, const MachineType mach, + const Cycles initialRequestTime, + const Cycles forwardRequestTime, + const Cycles firstResponseTime) +{ + // + // Free the first request (an atomic operation) from the list. + // Then issue the next request to ruby system as we cannot + // assume the cache line is present in the cache + // (the opperation could be performed remotly) + // + assert(address == makeLineAddress(address)); + assert(m_RequestTable.find(address) != m_RequestTable.end()); + auto &seq_req_list = m_RequestTable[address]; + + // Perform hitCallback only on the first cpu request that + // issued the ruby request + bool ruby_request = true; + while (!seq_req_list.empty()) { + SequencerRequest &seq_req = seq_req_list.front(); + + if (ruby_request) { + // Check that the request was an atomic memory operation + // and record the latency + assert((seq_req.m_type == RubyRequestType_ATOMIC_RETURN) || + (seq_req.m_type == RubyRequestType_ATOMIC_NO_RETURN)); + recordMissLatency(&seq_req, true, mach, externalHit, + initialRequestTime, forwardRequestTime, + firstResponseTime); + } else { + // Read, Write or Atomic request: + // reissue request to the cache hierarchy + // (we don't know if op was performed remotly) + issueRequest(seq_req.pkt, seq_req.m_second_type); + break; + } + + // Atomics clean the monitor entry + llscClearMonitor(address); + + markRemoved(); + ruby_request = false; 
+ hitCallback(&seq_req, data, true, mach, externalHit, + initialRequestTime, forwardRequestTime, + firstResponseTime, false); + seq_req_list.pop_front(); + } + + // free all outstanding requests corresponding to this address + if (seq_req_list.empty()) { + m_RequestTable.erase(address); + } +} + void Sequencer::hitCallback(SequencerRequest* srequest, DataBlock& data, bool llscSuccess, @@ -637,10 +699,16 @@ Sequencer::hitCallback(SequencerRequest* srequest, DataBlock& data, (type == RubyRequestType_IFETCH) || (type == RubyRequestType_RMW_Read) || (type == RubyRequestType_Locked_RMW_Read) || - (type == RubyRequestType_Load_Linked)) { + (type == RubyRequestType_Load_Linked) || + (type == RubyRequestType_ATOMIC_RETURN)) { pkt->setData( data.getData(getOffset(request_address), pkt->getSize())); - DPRINTF(RubySequencer, "read data %s\n", data); + + if (type == RubyRequestType_ATOMIC_RETURN) { + DPRINTF(RubySequencer, "ATOMIC RETURN data %s\n", data); + } else { + DPRINTF(RubySequencer, "read data %s\n", data); + } } else if (pkt->req->isSwap()) { assert(!pkt->isMaskedWrite()); std::vector overwrite_val(pkt->getSize()); @@ -807,6 +875,19 @@ Sequencer::makeRequest(PacketPtr pkt) } else if (pkt->req->isTlbiCmd()) { primary_type = secondary_type = tlbiCmdToRubyRequestType(pkt); DPRINTF(RubySequencer, "Issuing TLBI\n"); +#if defined (PROTOCOL_CHI) + } else if (pkt->isAtomicOp()) { + if (pkt->req->isAtomicReturn()){ + DPRINTF(RubySequencer, "Issuing ATOMIC RETURN \n"); + primary_type = secondary_type = + RubyRequestType_ATOMIC_RETURN; + } else { + DPRINTF(RubySequencer, "Issuing ATOMIC NO RETURN\n"); + primary_type = secondary_type = + RubyRequestType_ATOMIC_NO_RETURN; + + } +#endif } else { // // To support SwapReq, we need to check isWrite() first: a SwapReq @@ -914,6 +995,18 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) RubyAccessMode_Supervisor, pkt, PrefetchBit_No, proc_id, core_id); + if (pkt->isAtomicOp() && + ((secondary_type == 
RubyRequestType_ATOMIC_RETURN) || + (secondary_type == RubyRequestType_ATOMIC_NO_RETURN))){ + // Create the blocksize, access mask and atomicops + uint32_t offset = getOffset(pkt->getAddr()); + std::vector<std::pair<int, AtomicOpFunctor*>> atomicOps; + atomicOps.push_back(std::make_pair<int, AtomicOpFunctor*> + (offset, pkt->getAtomicOp())); + + msg->setWriteMask(offset, pkt->getSize(), atomicOps); + } + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %#x %s\n", curTick(), m_version, "Seq", "Begin", "", "", printAddress(msg->getPhysicalAddress()), diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh index 020a7d8c20..8f736da6d5 100644 --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -126,6 +126,14 @@ class Sequencer : public RubyPort const Cycles forwardRequestTime = Cycles(0), const Cycles firstResponseTime = Cycles(0)); + void atomicCallback(Addr address, + DataBlock& data, + const bool externalHit = false, + const MachineType mach = MachineType_NUM, + const Cycles initialRequestTime = Cycles(0), + const Cycles forwardRequestTime = Cycles(0), + const Cycles firstResponseTime = Cycles(0)); + void unaddressedCallback(Addr unaddressedReqId, RubyRequestType requestType, const MachineType mach = MachineType_NUM,