diff --git a/configs/ruby/CHI_config.py b/configs/ruby/CHI_config.py index 4f2580c373..1288cf95d6 100644 --- a/configs/ruby/CHI_config.py +++ b/configs/ruby/CHI_config.py @@ -244,6 +244,7 @@ class CHI_L1Controller(CHI_Cache_Controller): self.alloc_on_readunique = True self.alloc_on_readonce = True self.alloc_on_writeback = True + self.alloc_on_atomic = False self.dealloc_on_unique = False self.dealloc_on_shared = False self.dealloc_backinv_unique = True @@ -280,6 +281,7 @@ class CHI_L2Controller(CHI_Cache_Controller): self.alloc_on_readunique = True self.alloc_on_readonce = True self.alloc_on_writeback = True + self.alloc_on_atomic = False self.dealloc_on_unique = False self.dealloc_on_shared = False self.dealloc_backinv_unique = True @@ -316,6 +318,7 @@ class CHI_HNFController(CHI_Cache_Controller): self.alloc_on_readunique = False self.alloc_on_readonce = True self.alloc_on_writeback = True + self.alloc_on_atomic = True self.dealloc_on_unique = True self.dealloc_on_shared = False self.dealloc_backinv_unique = False @@ -392,6 +395,7 @@ class CHI_DMAController(CHI_Cache_Controller): self.alloc_on_readunique = False self.alloc_on_readonce = False self.alloc_on_writeback = False + self.alloc_on_atomic = False self.dealloc_on_unique = False self.dealloc_on_shared = False self.dealloc_backinv_unique = False diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm index 8ba9d935ff..293c731c37 100644 --- a/src/mem/ruby/protocol/RubySlicc_Types.sm +++ b/src/mem/ruby/protocol/RubySlicc_Types.sm @@ -139,6 +139,13 @@ structure (Sequencer, external = "yes") { Cycles, Cycles, Cycles); void writeUniqueCallback(Addr, DataBlock); + void atomicCallback(Addr, DataBlock); + void atomicCallback(Addr, DataBlock, bool); + void atomicCallback(Addr, DataBlock, bool, MachineType); + void atomicCallback(Addr, DataBlock, bool, MachineType, + Cycles, Cycles, Cycles); + + void unaddressedCallback(Addr, RubyRequestType); void unaddressedCallback(Addr, 
RubyRequestType, MachineType); void unaddressedCallback(Addr, RubyRequestType, MachineType, diff --git a/src/mem/ruby/protocol/chi/CHI-cache-actions.sm b/src/mem/ruby/protocol/chi/CHI-cache-actions.sm index 42e07eb46b..4c9498423c 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache-actions.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache-actions.sm @@ -148,15 +148,22 @@ action(AllocateTBE_SeqRequest, desc="") { out_msg.is_remote_pf := false; out_msg.txnId := max_outstanding_transactions; + out_msg.atomic_op.clear(); + out_msg.atomic_op.orMask(in_msg.writeMask); + if ((in_msg.Type == RubyRequestType:LD) || (in_msg.Type == RubyRequestType:IFETCH)) { out_msg.type := CHIRequestType:Load; - } else if (in_msg.Type == RubyRequestType:ST) { + } else if (in_msg.Type == RubyRequestType:ST) { if (in_msg.Size == blockSize) { out_msg.type := CHIRequestType:StoreLine; } else { out_msg.type := CHIRequestType:Store; } + } else if (in_msg.Type == RubyRequestType:ATOMIC_RETURN) { + out_msg.type := CHIRequestType:AtomicLoad; + } else if (in_msg.Type == RubyRequestType:ATOMIC_NO_RETURN){ + out_msg.type := CHIRequestType:AtomicStore; } else { error("Invalid RubyRequestType"); } @@ -769,6 +776,148 @@ action(Initiate_StoreMiss, desc="") { } } +action(Initiate_Atomic_UC, desc="") { + if ((policy_type == 0) || // ALL NEAR + (policy_type == 1) || // UNIQUE NEAR + (policy_type == 2) // PRESENT NEAR + ){ + tbe.actions.push(Event:DataArrayRead); + tbe.actions.push(Event:DelayAtomic); + tbe.actions.push(Event:AtomicHit); + tbe.actions.pushNB(Event:DataArrayWrite); + tbe.actions.pushNB(Event:TagArrayWrite); + } else { + error("Invalid policy type"); + } +} + +action(Initiate_Atomic_UD, desc="") { + if ((policy_type == 0) || // ALL NEAR + (policy_type == 1) || // UNIQUE NEAR + (policy_type == 2) // PRESENT NEAR + ){ + tbe.actions.push(Event:DataArrayRead); + tbe.actions.push(Event:DelayAtomic); + tbe.actions.push(Event:AtomicHit); + tbe.actions.pushNB(Event:DataArrayWrite); + 
tbe.actions.pushNB(Event:TagArrayWrite); + } else { + error("Invalid policy type"); + } +} + +action(Initiate_AtomicReturn_I, desc="") { + if (policy_type == 0){ // ALL NEAR + tbe.actions.push(Event:SendReadUnique); + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); + tbe.atomic_to_be_done := true; + } else if ((policy_type == 1) || // UNIQUE NEAR + (policy_type == 2)) { // PRESENT NEAR + tbe.actions.push(Event:SendAtomicReturn_NoWait); + tbe.dataToBeInvalid := true; + tbe.doCacheFill := false; + tbe.atomic_to_be_done := false; + } else { + error("Invalid policy type"); + } +} + +action(Initiate_AtomicNoReturn_I, desc="") { + if (policy_type == 0){ // ALL NEAR + tbe.actions.push(Event:SendReadUnique); + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); + tbe.atomic_to_be_done := true; + } else if (policy_type == 1) { // UNIQUE NEAR + tbe.actions.push(Event:SendAtomicNoReturn); + tbe.actions.push(Event:SendANRData); + tbe.dataToBeInvalid := true; + tbe.doCacheFill := false; + tbe.atomic_to_be_done := false; + } else { + error("Invalid policy type"); + } +} + +action(Initiate_AtomicReturn_SD, desc="") { + if (policy_type == 0){ // ALL NEAR + tbe.actions.push(Event:SendReadUnique); + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); + tbe.atomic_to_be_done := true; + } else if (policy_type == 1) { // UNIQUE NEAR + tbe.actions.push(Event:SendAtomicReturn_NoWait); + tbe.dataToBeInvalid := true; + tbe.doCacheFill := false; + tbe.atomic_to_be_done := false; + } else { + error("Invalid policy type"); + } +} + +action(Initiate_AtomicNoReturn_SD, desc="") { + if (policy_type == 0){ // ALL NEAR + tbe.actions.push(Event:SendReadUnique); + 
tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); + tbe.atomic_to_be_done := true; + } else if (policy_type == 1) { // UNIQUE NEAR + tbe.actions.push(Event:SendAtomicNoReturn); + tbe.actions.push(Event:SendANRData); + tbe.dataToBeInvalid := true; + tbe.doCacheFill := false; + tbe.atomic_to_be_done := false; + } else { + error("Invalid policy type"); + } +} + +action(Initiate_AtomicReturn_SC, desc="") { + if (policy_type == 0){ // ALL NEAR + tbe.actions.push(Event:SendReadUnique); + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); + tbe.atomic_to_be_done := true; + } else if (policy_type == 1) { // UNIQUE NEAR + tbe.actions.push(Event:SendAtomicReturn_NoWait); + tbe.dataToBeInvalid := true; + tbe.doCacheFill := false; + tbe.atomic_to_be_done := false; + } else { + error("Invalid policy type"); + } +} + +action(Initiate_AtomicNoReturn_SC, desc="") { + if (policy_type == 0){ // ALL NEAR + tbe.actions.push(Event:SendReadUnique); + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); + tbe.atomic_to_be_done := true; + } else if (policy_type == 1) { // UNIQUE NEAR + tbe.actions.push(Event:SendAtomicNoReturn); + tbe.actions.push(Event:SendANRData); + tbe.dataToBeInvalid := true; + tbe.doCacheFill := false; + tbe.atomic_to_be_done := false; + } else { + error("Invalid policy type"); + } +} + action(Initiate_StoreUpgrade, desc="") { assert(tbe.dataValid); assert(is_valid(cache_entry)); @@ -865,8 +1014,111 @@ action(Initiate_WriteUnique_Forward, desc="") { tbe.actions.pushNB(Event:TagArrayWrite); } +action(Initiate_AtomicReturn_LocalWrite, desc="") { + if ((tbe.dir_sharers.count() > 0) && tbe.dataMaybeDirtyUpstream) { + 
tbe.actions.push(Event:SendSnpUnique); + } else if (tbe.dir_sharers.count() > 0){ + // no one will send us data unless we explicitly ask + tbe.actions.push(Event:SendSnpUniqueRetToSrc); + } + tbe.actions.push(Event:SendDBIDResp_AR); + tbe.actions.pushNB(Event:WriteFEPipe); + tbe.actions.pushNB(Event:SendCompData_AR); + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:DelayAtomic); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); +} +action(Initiate_AtomicNoReturn_LocalWrite, desc="") { + if ((tbe.dir_sharers.count() > 0) && tbe.dataMaybeDirtyUpstream) { + tbe.actions.push(Event:SendSnpUnique); + } else if (tbe.dir_sharers.count() > 0){ + // no one will send us data unless we explicitly ask + tbe.actions.push(Event:SendSnpUniqueRetToSrc); + } + if (comp_wu) { + tbe.actions.push(Event:SendDBIDResp_ANR); + tbe.actions.pushNB(Event:WriteFEPipe); + tbe.actions.pushNB(Event:SendComp_ANR); + } else { + tbe.actions.push(Event:SendCompDBIDResp_ANR); + tbe.actions.pushNB(Event:WriteFEPipe); + } + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:DelayAtomic); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); +} + + +action(Initiate_AtomicReturn_Forward, desc="") { + if ((tbe.dir_sharers.count() > 0) && + (tbe.dir_sharers.isElement(tbe.requestor))){ + tbe.dir_sharers.remove(tbe.requestor); + } + tbe.actions.push(Event:SendAtomicReturn); + tbe.actions.push(Event:SendCompData_AR); + tbe.actions.pushNB(Event:TagArrayWrite); + + tbe.dataToBeInvalid := true; +} + +action(Initiate_AtomicNoReturn_Forward, desc="") { + if ((tbe.dir_sharers.count() > 0) && + (tbe.dir_sharers.isElement(tbe.requestor))){ + tbe.dir_sharers.remove(tbe.requestor); + } + if (comp_wu) { + tbe.actions.push(Event:SendAtomicNoReturn); + tbe.actions.push(Event:SendDBIDResp_ANR); + tbe.actions.pushNB(Event:SendComp_ANR); + } else { + 
tbe.actions.push(Event:SendAtomicNoReturn); + tbe.actions.push(Event:SendCompDBIDResp_ANR); + } + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:SendANRData); + tbe.actions.pushNB(Event:TagArrayWrite); + + tbe.dataToBeInvalid := true; +} + +action(Initiate_AtomicReturn_Miss, desc="") { + tbe.actions.push(Event:SendReadNoSnp); + tbe.actions.pushNB(Event:WriteFEPipe); + tbe.actions.push(Event:SendDBIDResp_AR); + tbe.actions.pushNB(Event:WriteFEPipe); + tbe.actions.pushNB(Event:SendCompData_AR); + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:DelayAtomic); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); +} + +action(Initiate_AtomicNoReturn_Miss, desc="") { + assert(is_HN); + tbe.actions.push(Event:SendReadNoSnp); + if (comp_wu) { + tbe.actions.push(Event:SendDBIDResp_ANR); + tbe.actions.pushNB(Event:WriteFEPipe); + tbe.actions.pushNB(Event:SendComp_ANR); + } else { + tbe.actions.push(Event:SendCompDBIDResp_ANR); + tbe.actions.pushNB(Event:WriteFEPipe); + } + + tbe.actions.push(Event:WriteFEPipe); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:DelayAtomic); + tbe.actions.push(Event:WriteBEPipe); + tbe.actions.push(Event:TagArrayWrite); +} + action(Initiate_CopyBack, desc="") { // expect to receive this data after Send_CompDBIDResp if (tbe.reqType == CHIRequestType:WriteBackFull) { @@ -1157,7 +1409,9 @@ action(Send_ReadShared, desc="") { action(Send_ReadNoSnp, desc="") { assert(is_HN); - assert(tbe.use_DMT == false); + assert((tbe.use_DMT == false) || + ((tbe.reqType == CHIRequestType:AtomicReturn) || + (tbe.reqType == CHIRequestType:AtomicNoReturn))); clearExpectedReqResp(tbe); tbe.expected_req_resp.addExpectedDataType(CHIDataType:CompData_UC); @@ -1368,6 +1622,45 @@ action(Send_WriteUnique, desc="") { tbe.expected_req_resp.addExpectedCount(1); } +action(Send_AtomicReturn, desc="") { + assert(is_valid(tbe)); + + enqueue(reqOutPort, CHIRequestMsg, 
request_latency) { + prepareRequestAtomic(tbe, CHIRequestType:AtomicReturn, out_msg); + out_msg.Destination.add(mapAddressToDownstreamMachine(tbe.addr)); + allowRequestRetry(tbe, out_msg); + } + clearExpectedReqResp(tbe); + tbe.expected_req_resp.addExpectedRespType(CHIResponseType:DBIDResp); + tbe.expected_req_resp.addExpectedCount(1); +} + +action(Send_AtomicReturn_NoWait, desc="") { + assert(is_valid(tbe)); + + enqueue(reqOutPort, CHIRequestMsg, request_latency) { + prepareRequestAtomic(tbe, CHIRequestType:AtomicReturn, out_msg); + out_msg.Destination.add(mapAddressToDownstreamMachine(tbe.addr)); + allowRequestRetry(tbe, out_msg); + } + + tbe.dataAMOValid := false; +} + +action(Send_AtomicNoReturn, desc="") { + assert(is_valid(tbe)); + + enqueue(reqOutPort, CHIRequestMsg, request_latency) { + prepareRequestAtomic(tbe, CHIRequestType:AtomicNoReturn, out_msg); + out_msg.Destination.add(mapAddressToDownstreamMachine(tbe.addr)); + allowRequestRetry(tbe, out_msg); + } + tbe.expected_req_resp.addExpectedRespType(CHIResponseType:CompDBIDResp); + tbe.expected_req_resp.addExpectedRespType(CHIResponseType:DBIDResp); + tbe.expected_req_resp.addExpectedCount(1); +} + + action(Send_SnpCleanInvalid, desc="") { assert(is_valid(tbe)); assert(tbe.expected_snp_resp.hasExpected() == false); @@ -1636,6 +1929,20 @@ action(ExpectNCBWrData, desc="") { tbe.dataBlkValid.setMask(addressOffset(tbe.accAddr, tbe.addr), tbe.accSize, false); } +action(ExpectNCBWrData_A, desc="") { + // Expected data + int num_msgs := tbe.accSize / data_channel_size; + if ((tbe.accSize % data_channel_size) != 0) { + num_msgs := num_msgs + 1; + } + tbe.expected_req_resp.clear(num_msgs); + tbe.expected_req_resp.addExpectedDataType(CHIDataType:NCBWrData); + tbe.expected_req_resp.setExpectedCount(1); + + // In atomic operations we do not expect real data for the current block + // Thus the mask bits do not care +} + action(ExpectCompAck, desc="") { assert(is_valid(tbe)); 
tbe.expected_req_resp.addExpectedRespType(CHIResponseType:CompAck); @@ -1658,7 +1965,22 @@ action(Receive_ReqDataResp, desc="") { } // Copy data to tbe only if we didn't have valid data or the received // data is dirty - if ((tbe.dataBlkValid.isFull() == false) || + if ((in_msg.type == CHIDataType:NCBWrData) && + ((tbe.reqType == CHIRequestType:AtomicReturn) || + (tbe.reqType == CHIRequestType:AtomicNoReturn))){ + // DO NOTHING + } else if ((in_msg.type == CHIDataType:CompData_I) && + ((tbe.reqType == CHIRequestType:AtomicReturn) || + (tbe.reqType == CHIRequestType:AtomicLoad))) { + if(tbe.dataBlkValid.isFull()){ + tbe.dataBlkValid.clear(); + } + tbe.oldDataBlk.copyPartial(in_msg.dataBlk, in_msg.bitMask); + assert(tbe.dataBlkValid.isOverlap(in_msg.bitMask) == false); + tbe.dataBlkValid.orMask(in_msg.bitMask); + DPRINTF(RubySlicc, "Received %s\n", tbe.oldDataBlk); + DPRINTF(RubySlicc, "dataBlkValid = %s\n", tbe.dataBlkValid); + } else if ((tbe.dataBlkValid.isFull() == false) || (in_msg.type == CHIDataType:CompData_UD_PD) || (in_msg.type == CHIDataType:CompData_SD_PD) || (in_msg.type == CHIDataType:CBWrData_UD_PD) || @@ -1683,7 +2005,8 @@ action(Receive_RespSepDataFromCompData, desc="") { if (tbe.expected_req_resp.receiveResp(CHIResponseType:RespSepData) == false) { error("Received unexpected message"); } - if (is_HN == false) { + if ((is_HN == false) && (tbe.reqType != CHIRequestType:AtomicReturn) && + ((tbe.reqType != CHIRequestType:AtomicLoad) || (tbe.atomic_to_be_done == true))){ // must now ack the responder tbe.actions.pushFrontNB(Event:SendCompAck); } @@ -1905,6 +2228,7 @@ action(UpdateDataState_FromReqDataResp, desc="") { } else if (in_msg.type == CHIDataType:CompData_I) { tbe.dataValid := true; + tbe.dataAMOValid := true; tbe.dataToBeInvalid := true; assert(tbe.dataMaybeDirtyUpstream == false); @@ -1946,7 +2270,9 @@ action(UpdateDataState_FromReqDataResp, desc="") { action(UpdateDataState_FromWUDataResp, desc="") { assert(is_valid(tbe)); - if 
(tbe.expected_req_resp.hasReceivedData()) { + if (tbe.expected_req_resp.hasReceivedData() && + (tbe.reqType != CHIRequestType:AtomicReturn) && + (tbe.reqType != CHIRequestType:AtomicNoReturn)) { assert(tbe.dataBlkValid.test(addressOffset(tbe.accAddr, tbe.addr))); assert(tbe.dataBlkValid.test(addressOffset(tbe.accAddr, tbe.addr) + tbe.accSize - 1)); @@ -1964,6 +2290,22 @@ action(UpdateDataState_FromWUDataResp, desc="") { printTBEState(tbe); } +action(UpdateDataState_FromADataResp, desc="") { + assert(is_valid(tbe)); + if (is_HN && (tbe.expected_req_resp.hasReceivedData()) && + ((tbe.reqType == CHIRequestType:AtomicReturn) || + (tbe.reqType == CHIRequestType:AtomicNoReturn))) { + DPRINTF(RubySlicc, "Atomic before %s\n", tbe.dataBlk); + + tbe.oldDataBlk := tbe.dataBlk; + tbe.dataBlk.atomicPartial(tbe.dataBlk, tbe.atomic_op); + tbe.dataDirty := true; + + DPRINTF(RubySlicc, "Atomic after %s\n", tbe.dataBlk); + } + printTBEState(tbe); +} + action(UpdateDataState_FromCUResp, desc="") { assert(is_valid(tbe)); peek(rspInPort, CHIResponseMsg) { @@ -2127,6 +2469,10 @@ action(Receive_ReqResp_WUNeedComp, desc="") { tbe.defer_expected_comp := true; } +action(Receive_ReqResp_AR, desc="") { + tbe.actions.pushFrontNB(Event:SendARData); +} + action(Receive_ReqResp_WUComp, desc="") { if (tbe.defer_expected_comp) { tbe.defer_expected_comp := false; @@ -2320,6 +2666,36 @@ action(CheckWUComp, desc="") { } } +action(Send_ARData, desc="") { + assert(is_valid(tbe)); + tbe.snd_msgType := CHIDataType:NCBWrData; + tbe.snd_destination := mapAddressToDownstreamMachine(tbe.addr); + setupPendingAtomicSend(tbe); +} + +action(Send_ANRData, desc="") { + assert(is_valid(tbe)); + tbe.snd_msgType := CHIDataType:NCBWrData; + tbe.snd_destination := mapAddressToDownstreamMachine(tbe.addr); + setupPendingAtomicSend(tbe); +} + +action(CheckARComp, desc="") { + assert(is_valid(tbe)); + tbe.expected_req_resp.addExpectedDataType(CHIDataType:CompData_I); + 
tbe.expected_req_resp.addExpectedRespType(CHIResponseType:RespSepData); + tbe.expected_req_resp.addExpectedCount(2); +} + +action(CheckANRComp, desc="") { + assert(is_valid(tbe)); + if (tbe.defer_expected_comp) { + tbe.defer_expected_comp := false; + tbe.expected_req_resp.addExpectedCount(1); + tbe.expected_req_resp.addExpectedRespType(CHIResponseType:Comp); + } +} + action(Send_SnpRespData, desc="") { assert(is_HN == false); assert(is_valid(tbe)); @@ -2531,7 +2907,12 @@ action(Send_Data, desc="") { } tbe.snd_pendBytes.setMask(offset, range, false); - out_msg.dataBlk := tbe.dataBlk; + if (tbe.reqType == CHIRequestType:AtomicReturn){ + out_msg.dataBlk := tbe.oldDataBlk; + } else { + out_msg.dataBlk := tbe.dataBlk; + } + out_msg.bitMask.setMask(offset, range); out_msg.responder := machineID; @@ -2673,6 +3054,36 @@ action(Send_Comp_WU, desc="") { } } + +action(Send_CompData_AR, desc="") { + assert(is_valid(tbe)); + assert(tbe.dataValid); + + if (is_HN) { + tbe.oldDataBlk := tbe.dataBlk; + } + + tbe.snd_msgType := CHIDataType:CompData_I; + tbe.dataMaybeDirtyUpstream := false; + tbe.requestorToBeExclusiveOwner := false; + tbe.requestorToBeOwner := false; + tbe.snd_destination := tbe.requestor; + setupPendingSend(tbe); + printTBEState(tbe); + +} + +action(Send_Comp_ANR, desc="") { + assert(is_valid(tbe)); + enqueue(rspOutPort, CHIResponseMsg, comp_anr_latency + response_latency) { + out_msg.addr := address; + out_msg.type := CHIResponseType:Comp; + out_msg.responder := machineID; + out_msg.Destination.add(tbe.requestor); + } +} + + action(Send_SnpRespI, desc="") { enqueue(rspOutPort, CHIResponseMsg, response_latency) { out_msg.addr := address; @@ -3003,6 +3414,22 @@ action(Callback_StoreHit, desc="") { } } +action(Callback_AtomicHit, desc="") { + assert(is_valid(tbe)); + assert(tbe.dataValid); + assert((tbe.reqType == CHIRequestType:AtomicLoad) || + (tbe.reqType == CHIRequestType:AtomicStore)); + DPRINTF(RubySlicc, "Atomic before %s\n", tbe.dataBlk); + + DataBlock 
oldDataBlk; + oldDataBlk := tbe.dataBlk; + tbe.dataBlk.atomicPartial(tbe.dataBlk, tbe.atomic_op); + + sequencer.atomicCallback(tbe.addr, oldDataBlk, false); + DPRINTF(RubySlicc, "Atomic after %s\n", tbe.dataBlk); + tbe.dataDirty := true; +} + action(Callback_ExpressPrefetchHit, desc="") { // have not allocated TBE, but must clear the reservation assert(is_invalid(tbe)); @@ -3051,6 +3478,25 @@ action(Callback_Miss, desc="") { // also decay the timeout scLockDecayLatency(); } + } else if (tbe.dataValid && tbe.atomic_to_be_done && + ((tbe.reqType == CHIRequestType:AtomicLoad) || + (tbe.reqType == CHIRequestType:AtomicStore))){ + assert(is_valid(tbe)); + assert(tbe.dataValid); + assert((tbe.reqType == CHIRequestType:AtomicLoad) || + (tbe.reqType == CHIRequestType:AtomicStore)); + DPRINTF(RubySlicc, "Atomic before %s\n", tbe.dataBlk); + + DataBlock oldDataBlk; + oldDataBlk := tbe.dataBlk; + tbe.dataBlk.atomicPartial(tbe.dataBlk, tbe.atomic_op); + + sequencer.atomicCallback(tbe.addr, oldDataBlk, false); + DPRINTF(RubySlicc, "Atomic after %s\n", tbe.dataBlk); + tbe.dataDirty := true; + } else if (tbe.dataValid && tbe.dataAMOValid && (tbe.reqType == CHIRequestType:AtomicLoad)) { + DPRINTF(RubySlicc, "Atomic before %s\n", tbe.oldDataBlk); + sequencer.atomicCallback(tbe.addr, tbe.oldDataBlk, false); } } @@ -3070,6 +3516,18 @@ action(Unset_Timeout_Cache, desc="") { wakeup_port(snpRdyPort, address); } +action(Callback_AtomicNoReturn, desc="") { + assert(is_valid(tbe)); + assert((tbe.is_local_pf || tbe.is_remote_pf) == false); + assert((tbe.reqType == CHIRequestType:AtomicNoReturn) || + (tbe.reqType == CHIRequestType:AtomicStore)); + + if(tbe.reqType == CHIRequestType:AtomicStore){ + sequencer.atomicCallback(tbe.addr, tbe.dataBlk); + DPRINTF(RubySlicc, "AtomicNoReturn %s\n", tbe.dataBlk); + } +} + action(Callback_WriteUnique, desc="") { assert(is_valid(tbe)); assert((tbe.is_local_pf || tbe.is_remote_pf) == false); @@ -3183,7 +3641,7 @@ action(Profile_OutgoingEnd_DatalessResp, 
desc="") { action(TagArrayRead, desc="") { assert(is_valid(tbe)); tbe.delayNextAction := curTick() + cyclesToTicks( - tagLatency(fromSequencer(tbe.reqType))); + tagLatency(fromSequencer(tbe.reqType))); } action(TagArrayWrite, desc="") { @@ -3235,6 +3693,11 @@ action(FillPipe, desc="") { tbe.delayNextAction := curTick() + cyclesToTicks(fill_latency); } +action(DelayAtomic, desc="") { + assert(is_valid(tbe)); + tbe.delayNextAction := curTick() + cyclesToTicks(atomic_op_latency); +} + action(SnpSharedPipe, desc="") { assert(is_valid(tbe)); tbe.delayNextAction := curTick() + cyclesToTicks(snp_latency); diff --git a/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm b/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm index 4d8c35053c..371ad05109 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm @@ -302,7 +302,9 @@ Cycles dataLatency() { bool fromSequencer(CHIRequestType reqType) { return reqType == CHIRequestType:Load || reqType == CHIRequestType:Store || - reqType == CHIRequestType:StoreLine; + reqType == CHIRequestType:StoreLine || + reqType == CHIRequestType:AtomicLoad || + reqType == CHIRequestType:AtomicStore; } bool inCache(Addr addr) { @@ -434,6 +436,9 @@ TBE allocateRequestTBE(Addr addr, CHIRequestMsg in_msg), return_by_pointer="yes" tbe.is_local_pf := in_msg.is_local_pf; tbe.is_remote_pf := in_msg.is_remote_pf; + tbe.atomic_op.clear(); + tbe.atomic_op.orMask(in_msg.atomic_op); + tbe.use_DMT := false; tbe.use_DCT := false; @@ -622,6 +627,13 @@ void setupPendingPartialSend(TBE tbe) { scheduleSendData(tbe, 0); } +void setupPendingAtomicSend(TBE tbe) { + assert(blockSize >= data_channel_size); + assert((blockSize % data_channel_size) == 0); + tbe.snd_pendBytes.setMask(0,tbe.accSize,true); + scheduleSendData(tbe, 0); +} + // common code for downstream requests void prepareRequest(TBE tbe, CHIRequestType type, CHIRequestMsg & out_msg) { out_msg.addr := tbe.addr; @@ -644,6 +656,17 @@ void prepareRequest(TBE tbe, 
CHIRequestType type, CHIRequestMsg & out_msg) { assert(tbe.txnId != static_cast(Addr, "value", -1)); } +void prepareRequestAtomic(TBE tbe, CHIRequestType type, + CHIRequestMsg & out_msg) { + assert((type == CHIRequestType:AtomicReturn) || + (type == CHIRequestType:AtomicNoReturn)); + prepareRequest(tbe, type, out_msg); + out_msg.accAddr := tbe.accAddr; + out_msg.accSize := tbe.accSize; + out_msg.atomic_op.clear(); + out_msg.atomic_op.orMask(tbe.atomic_op); +} + void allowRequestRetry(TBE tbe, CHIRequestMsg & out_msg) { out_msg.allowRetry := true; tbe.pendReqAllowRetry := true; @@ -672,6 +695,8 @@ void prepareRequestRetry(TBE tbe, CHIRequestMsg & out_msg) { out_msg.seqReq := tbe.seqReq; out_msg.is_local_pf := false; out_msg.is_remote_pf := tbe.is_local_pf || tbe.is_remote_pf; + out_msg.atomic_op.clear(); + out_msg.atomic_op.orMask(tbe.atomic_op); } void prepareRequestRetryDVM(TBE tbe, CHIRequestMsg & out_msg) { @@ -773,8 +798,12 @@ bool needCacheEntry(CHIRequestType req_type, (req_type == CHIRequestType:WriteEvictFull) || (is_HN && (req_type == CHIRequestType:WriteUniqueFull)))) || (alloc_on_seq_acc && ((req_type == CHIRequestType:Load) || - (req_type == CHIRequestType:Store))) || - (alloc_on_seq_line_write && (req_type == CHIRequestType:StoreLine)); + (req_type == CHIRequestType:Store) || + (req_type == CHIRequestType:AtomicLoad) || + (req_type == CHIRequestType:AtomicStore))) || + (alloc_on_seq_line_write && (req_type == CHIRequestType:StoreLine)) || + (alloc_on_atomic && ((req_type == CHIRequestType:AtomicReturn) || + (req_type == CHIRequestType:AtomicNoReturn))); } } @@ -1174,6 +1203,10 @@ Event reqToEvent(CHIRequestType type, bool is_prefetch) { return Event:Store; } else if (type == CHIRequestType:StoreLine) { return Event:Store; + } else if (type == CHIRequestType:AtomicLoad) { + return Event:AtomicLoad; + } else if (type == CHIRequestType:AtomicStore){ + return Event:AtomicStore; } else if (type == CHIRequestType:ReadShared) { return Event:ReadShared; } else 
if (type == CHIRequestType:ReadNotSharedDirty) { @@ -1214,6 +1247,18 @@ Event reqToEvent(CHIRequestType type, bool is_prefetch) { return Event:DvmTlbi_Initiate; } else if (type == CHIRequestType:DvmSync_Initiate) { return Event:DvmSync_Initiate; + } else if (type == CHIRequestType:AtomicReturn){ + if (is_HN) { + return Event:AtomicReturn_PoC; + } else { + return Event:AtomicReturn; + } + } else if (type == CHIRequestType:AtomicNoReturn){ + if (is_HN) { + return Event:AtomicNoReturn_PoC; + } else { + return Event:AtomicNoReturn; + } } else { error("Invalid CHIRequestType"); } diff --git a/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm b/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm index cb9ffa567a..0e8c6ec0e3 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm @@ -155,6 +155,12 @@ transition({BUSY_INTR,BUSY_BLKD}, FillPipe) { ProcessNextState_ClearPending; } +transition({BUSY_INTR,BUSY_BLKD}, DelayAtomic) { + Pop_TriggerQueue; + DelayAtomic; + ProcessNextState_ClearPending; +} + transition({BUSY_INTR,BUSY_BLKD}, SnpSharedPipe) { Pop_TriggerQueue; SnpSharedPipe; @@ -418,8 +424,82 @@ transition({RSC,RSD,RUSD,RUSC,RU,I}, WriteUnique, BUSY_BLKD) { ProcessNextState; } +// AtomicReturn and AtomicNoReturn -// Load / Store from sequencer & Prefetch from prefetcher +transition({I,SC,SC_RSC,SD,SD_RSD,SD_RSC,RSD,RUSD, + UD,UD_RSC,UD_RSD,UD_RU,UC,UC_RSC,UC_RU,RSC,RU}, AtomicReturn, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicReturn_Forward; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition({I,SC,SC_RSC,SD,SD_RSD,SD_RSC,RSD,RUSD, + UD,UD_RSC,UD_RSD,UD_RU,UC,UC_RSC,UC_RU,RSC,RU}, AtomicNoReturn, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicNoReturn_Forward; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition({UD,UD_RU,UD_RSD,UD_RSC,UC,UC_RU,UC_RSC}, + AtomicReturn_PoC, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicReturn_LocalWrite; + 
Profile_Hit; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition({UD,UD_RU,UD_RSD,UD_RSC,UC,UC_RU,UC_RSC}, + AtomicNoReturn_PoC, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicNoReturn_LocalWrite; + Profile_Hit; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition({SD, SD_RSD, SD_RSC, SC, SC_RSC, RSC, RSD, RUSC, RUSD, RU}, + AtomicReturn_PoC, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicReturn_LocalWrite; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition({SD, SD_RSD, SD_RSC, SC, SC_RSC, RSC, RSD, RUSC, RUSD, RU}, + AtomicNoReturn_PoC, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicNoReturn_LocalWrite; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(I, AtomicReturn_PoC, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicReturn_Miss; + Allocate_DirEntry; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(I, AtomicNoReturn_PoC, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicNoReturn_Miss; + Allocate_DirEntry; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + + +// Load / Store / Atomic from sequencer & Prefetch from prefetcher transition({UD,UD_T,SD,UC,SC}, Load, BUSY_BLKD) { Initiate_Request; @@ -460,6 +540,28 @@ transition(BUSY_BLKD, StoreHit) { ProcessNextState_ClearPending; } +transition(UC, {AtomicLoad,AtomicStore}, BUSY_BLKD) { + Initiate_Request; + Initiate_Atomic_UC; + Profile_Hit; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition({UD,UD_T}, {AtomicLoad,AtomicStore}, BUSY_BLKD) { + Initiate_Request; + Initiate_Atomic_UD; + Profile_Hit; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(BUSY_BLKD, AtomicHit) { + Pop_TriggerQueue; + Callback_AtomicHit; + ProcessNextState_ClearPending; +} + transition(I, {Load,Prefetch}, BUSY_BLKD) { Initiate_Request; Initiate_LoadMiss; @@ -494,6 +596,55 @@ transition({BUSY_BLKD,BUSY_INTR}, UseTimeout) { Unset_Timeout_TBE; } +transition(I, AtomicLoad, BUSY_BLKD){ + Initiate_Request; + Initiate_AtomicReturn_I; + 
Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(I, AtomicStore, BUSY_BLKD){ + Initiate_Request; + Initiate_AtomicNoReturn_I; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(SD, AtomicLoad, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicReturn_SD; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(SC, AtomicLoad, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicReturn_SC; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(SD, AtomicStore, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicNoReturn_SD; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + +transition(SC, AtomicStore, BUSY_BLKD) { + Initiate_Request; + Initiate_AtomicNoReturn_SC; + Profile_Miss; + Pop_ReqRdyQueue; + ProcessNextState; +} + + // Evict from Upstream transition({UD_RSC,SD_RSC,UC_RSC,SC_RSC,RSC,RSD,RUSD,RUSC,UD_RSD,SD_RSD}, Evict, BUSY_BLKD) { @@ -691,13 +842,15 @@ transition(BUSY_INTR, {SnpOnce,SnpOnceFwd}, BUSY_BLKD) { transition({BUSY_BLKD,BUSY_INTR}, {ReadShared, ReadNotSharedDirty, ReadUnique, ReadUnique_PoC, ReadOnce, CleanUnique, CleanUnique_Stale, - Load, Store, Prefetch, + Load, Store, AtomicLoad, AtomicStore, Prefetch, WriteBackFull, WriteBackFull_Stale, WriteEvictFull, WriteEvictFull_Stale, WriteCleanFull, WriteCleanFull_Stale, Evict, Evict_Stale, WriteUnique,WriteUniquePtl_PoC, - WriteUniqueFull_PoC,WriteUniqueFull_PoC_Alloc}) { + WriteUniqueFull_PoC,WriteUniqueFull_PoC_Alloc, + AtomicReturn,AtomicReturn_PoC, + AtomicNoReturn,AtomicNoReturn_PoC}) { StallRequest; } @@ -754,6 +907,30 @@ transition(BUSY_BLKD, SendWriteUnique, BUSY_INTR) {DestinationAvailable} { ProcessNextState_ClearPending; } +transition(BUSY_BLKD, SendAtomicReturn, BUSY_INTR) {DestinationAvailable} { + Pop_TriggerQueue; + Send_AtomicReturn; + CheckARComp; + Profile_OutgoingStart; + ProcessNextState_ClearPending; +} + +transition(BUSY_BLKD, SendAtomicReturn_NoWait, BUSY_INTR) { + Pop_TriggerQueue; + 
Send_AtomicReturn_NoWait; + CheckARComp; + Profile_OutgoingStart; + ProcessNextState_ClearPending; +} + +transition(BUSY_BLKD, SendAtomicNoReturn, BUSY_INTR) {DestinationAvailable} { + Pop_TriggerQueue; + Send_AtomicNoReturn; + Profile_OutgoingStart; + ProcessNextState_ClearPending; +} + + transition(BUSY_BLKD, SendWriteNoSnp, BUSY_INTR) {DestinationAvailable} { Pop_TriggerQueue; Send_WriteNoSnp; @@ -804,6 +981,20 @@ transition(BUSY_BLKD, SendWUDataCB) { ProcessNextState_ClearPending; } +transition({BUSY_BLKD,BUSY_INTR}, SendARData) { + Pop_TriggerQueue; + Send_ARData; + ProcessNextState_ClearPending; +} + +transition({BUSY_BLKD,BUSY_INTR}, SendANRData) { + Pop_TriggerQueue; + Callback_AtomicNoReturn; + Send_ANRData; + CheckANRComp; + ProcessNextState_ClearPending; +} + transition(BUSY_BLKD, SendInvSnpResp) { Pop_TriggerQueue; Send_InvSnpResp; @@ -1025,6 +1216,26 @@ transition({BUSY_BLKD,BUSY_INTR}, SendComp_WU) { ProcessNextState_ClearPending; } +transition(BUSY_BLKD, SendCompDBIDResp_ANR) { + Pop_TriggerQueue; + ExpectNCBWrData_A; + Send_CompDBIDResp; + ProcessNextState_ClearPending; +} + +transition(BUSY_BLKD, SendDBIDResp_AR) { + Pop_TriggerQueue; + ExpectNCBWrData_A; + Send_DBIDResp; + ProcessNextState_ClearPending; +} + +transition({BUSY_BLKD,BUSY_INTR}, SendCompData_AR) { + Pop_TriggerQueue; + Send_CompData_AR; + ProcessNextState_ClearPending; +} + transition(BUSY_BLKD, SendCompDBIDRespStale) { Pop_TriggerQueue; Send_CompDBIDResp_Stale; @@ -1085,6 +1296,7 @@ transition(BUSY_BLKD, transition({BUSY_BLKD,BUSY_INTR}, NCBWrData) { Receive_ReqDataResp; UpdateDataState_FromWUDataResp; + UpdateDataState_FromADataResp; Pop_DataInQueue; ProcessNextState; } @@ -1238,10 +1450,11 @@ transition(BUSY_INTR, CompDBIDResp, BUSY_BLKD) { } // alternative flow for WU with separate Comp -transition(BUSY_INTR, DBIDResp, BUSY_BLKD) { +transition({BUSY_INTR,BUSY_BLKD}, DBIDResp, BUSY_BLKD) { Receive_ReqResp; Receive_ReqResp_CopyDBID; Receive_ReqResp_WUNeedComp; + Receive_ReqResp_AR; 
Pop_RespInQueue; ProcessNextState; } diff --git a/src/mem/ruby/protocol/chi/CHI-cache.sm b/src/mem/ruby/protocol/chi/CHI-cache.sm index e40989df47..f806488b45 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache.sm @@ -51,6 +51,7 @@ machine(MachineType:Cache, "Cache coherency protocol") : // sending necessary snoops. Cycles read_hit_latency := 0; Cycles read_miss_latency := 0; + Cycles atomic_op_latency := 0; Cycles write_fe_latency := 0; // Front-end: Rcv req -> Snd req Cycles write_be_latency := 0; // Back-end: Rcv ack -> Snd data Cycles fill_latency := 0; // Fill latency @@ -126,11 +127,24 @@ machine(MachineType:Cache, "Cache coherency protocol") : // possible. bool enable_DCT; + // Atomic Operation Policy + // All Near executes all Atomics at L1 (variable set to 0; default) + // Unique Near executes Atomics at HNF for states I, SC, SD (set to 1) + // Present Near executes all Atomics at L1 except when state is I (set to 2) + int policy_type := 1; + + // Use separate Comp/DBIDResp responses for WriteUnique bool comp_wu := "False"; // additional latency for the WU Comp response Cycles comp_wu_latency := 0; + + // Use separate Comp/DBIDResp responses for AtomicNoReturn + bool comp_anr := "False"; + // additional latency for the ANR Comp response + Cycles comp_anr_latency := 0; + // Controls cache clusivity for different request types. // set all alloc_on* to false to completelly disable caching bool alloc_on_readshared; @@ -139,6 +153,7 @@ machine(MachineType:Cache, "Cache coherency protocol") : bool alloc_on_writeback; bool alloc_on_seq_acc; bool alloc_on_seq_line_write; + bool alloc_on_atomic; // Controls if the clusivity is strict. 
bool dealloc_on_unique; bool dealloc_on_shared; @@ -285,6 +300,8 @@ machine(MachineType:Cache, "Cache coherency protocol") : // See CHIRequestType in CHi-msg.sm for descriptions Load, desc="", in_trans="yes"; Store, desc="", in_trans="yes"; + AtomicLoad, desc="", in_trans="yes"; + AtomicStore, desc="", in_trans="yes"; Prefetch, desc="", in_trans="yes"; ReadShared, desc="", in_trans="yes"; ReadNotSharedDirty, desc="", in_trans="yes"; @@ -300,6 +317,10 @@ machine(MachineType:Cache, "Cache coherency protocol") : WriteUniquePtl_PoC, desc="", in_trans="yes"; WriteUniqueFull_PoC, desc="", in_trans="yes"; WriteUniqueFull_PoC_Alloc, desc="", in_trans="yes"; + AtomicReturn, desc="", in_trans="yes"; + AtomicNoReturn, desc="", in_trans="yes"; + AtomicReturn_PoC, desc="", in_trans="yes"; + AtomicNoReturn_PoC, desc="", in_trans="yes"; SnpCleanInvalid, desc="", in_trans="yes"; SnpShared, desc="", in_trans="yes"; SnpSharedFwd, desc="", in_trans="yes"; @@ -418,11 +439,12 @@ machine(MachineType:Cache, "Cache coherency protocol") : DataArrayWriteOnFill, desc="Write the cache data array (cache fill)"; // Events for modeling the pipeline latency - ReadHitPipe, desc="Latency of reads served from local cache"; - ReadMissPipe, desc="Latency of reads not served from local cache"; - WriteFEPipe, desc="Front-end latency of write requests"; - WriteBEPipe, desc="Back-end latency of write requests"; - FillPipe, desc="Cache fill latency"; + ReadHitPipe, desc="Latency of reads served from local cache"; + ReadMissPipe, desc="Latency of reads not served from local cache"; + WriteFEPipe, desc="Front-end latency of write requests"; + WriteBEPipe, desc="Back-end latency of write requests"; + FillPipe, desc="Cache fill latency"; + DelayAtomic, desc="Atomic operation latency"; SnpSharedPipe, desc="Latency for SnpShared requests"; SnpInvPipe, desc="Latency for SnpUnique and SnpCleanInv requests"; SnpOncePipe, desc="Latency for SnpOnce requests"; @@ -435,9 +457,9 @@ machine(MachineType:Cache, "Cache 
coherency protocol") : SendReadUnique, out_trans="yes", desc="Send a ReadUnique"; SendCompAck, desc="Send CompAck"; // Read handling at the completer - SendCompData, desc="Send CompData"; - WaitCompAck, desc="Expect to receive CompAck"; - SendRespSepData, desc="Send RespSepData for a DMT request"; + SendCompData, desc="Send CompData"; + WaitCompAck, desc="Expect to receive CompAck"; + SendRespSepData, desc="Send RespSepData for a DMT request"; // Send a write request downstream. SendWriteBackOrWriteEvict, out_trans="yes", desc="Send a WriteBackFull (if line is UD or SD) or WriteEvictFull (if UC)"; @@ -449,11 +471,25 @@ machine(MachineType:Cache, "Cache coherency protocol") : SendWUData, desc="Send write unique data"; SendWUDataCB, desc="Send write unique data from a sequencer callback"; // Write handling at the completer - SendCompDBIDResp, desc="Ack WB with CompDBIDResp"; - SendCompDBIDRespStale, desc="Ack stale WB with CompDBIDResp"; - SendCompDBIDResp_WU, desc="Ack WU with CompDBIDResp and set expected data"; - SendDBIDResp_WU, desc="Ack WU with DBIDResp and set expected data"; - SendComp_WU, desc="Ack WU completion"; + SendCompDBIDResp, desc="Ack WB with CompDBIDResp"; + SendCompDBIDRespStale, desc="Ack stale WB with CompDBIDResp"; + SendCompDBIDResp_WU, desc="Ack WU with CompDBIDResp and set expected data"; + SendDBIDResp_WU, desc="Ack WU with DBIDResp and set expected data"; + SendComp_WU, desc="Ack WU completion"; + + // Send an atomic request downstream. 
+ SendAtomicReturn, out_trans="yes", desc="Send atomic request with return"; + SendAtomicReturn_NoWait, out_trans="yes", desc="Send atomic request with return, but no DBID"; + SendAtomicNoReturn, out_trans="yes", desc="Send atomic request without return"; + SendARData, desc="Send atomic return request data"; + SendANRData, desc="Send atomic no return request data"; + // Atomic handling at the completer + SendDBIDResp_AR, desc="Ack AR with DBIDResp and set expected data"; + SendCompData_AR, desc="Ack AR completion"; + SendCompDBIDResp_ANR, desc="Ack ANR with CompDBIDResp and set expected data"; + SendDBIDResp_ANR, desc="Ack ANR with DBIDResp and set expected data"; + SendComp_ANR, desc="Ack ANR completion"; + // Dataless requests SendEvict, out_trans="yes", desc="Send a Evict"; @@ -502,6 +538,7 @@ machine(MachineType:Cache, "Cache coherency protocol") : // Misc triggers LoadHit, desc="Complete a load hit"; StoreHit, desc="Complete a store hit"; + AtomicHit, desc="Complete an atomic hit"; UseTimeout, desc="Transition from UD_T -> UD"; RestoreFromHazard, desc="Restore from a snoop hazard"; TX_Data, desc="Transmit pending data messages"; @@ -613,6 +650,10 @@ machine(MachineType:Cache, "Cache coherency protocol") : bool is_local_pf, desc="Request generated by a local prefetcher"; bool is_remote_pf, desc="Request generated a prefetcher in another cache"; + // Atomic info associated with the transaction + WriteMask atomic_op, desc="Atomic Operation Wrapper"; + bool atomic_to_be_done, desc="We have yet to perform the atomic"; + // NOTE: seqReq is a smart pointer pointing to original CPU request object // that triggers transactions associated with this TBE. seqReq carries some // information (e.g., PC of requesting instruction, virtual address of this @@ -630,8 +671,10 @@ machine(MachineType:Cache, "Cache coherency protocol") : // stable state. 
bool hasUseTimeout, desc="Line is locked under store/use timeout"; DataBlock dataBlk, desc="Local copy of the line"; + DataBlock oldDataBlk, desc="Local copy of the line before executing atomic"; WriteMask dataBlkValid, desc="Marks which bytes in the DataBlock are valid"; bool dataValid, desc="Local copy is valid"; + bool dataAMOValid, desc="Local copy is valid for AMO"; bool dataDirty, desc="Local copy is dirtry"; bool dataMaybeDirtyUpstream, desc="Line maybe dirty upstream"; bool dataUnique, desc="Line is unique either locally or upsatream"; diff --git a/src/mem/ruby/protocol/chi/CHI-msg.sm b/src/mem/ruby/protocol/chi/CHI-msg.sm index f3c2d66363..b9e11d9dd9 100644 --- a/src/mem/ruby/protocol/chi/CHI-msg.sm +++ b/src/mem/ruby/protocol/chi/CHI-msg.sm @@ -46,6 +46,8 @@ enumeration(CHIRequestType, desc="") { Load; Store; StoreLine; + AtomicLoad; + AtomicStore; // Incoming DVM-related requests generated by the sequencer DvmTlbi_Initiate; DvmSync_Initiate; @@ -66,6 +68,9 @@ enumeration(CHIRequestType, desc="") { WriteUniquePtl; WriteUniqueFull; + AtomicReturn; + AtomicNoReturn; + SnpSharedFwd; SnpNotSharedDirtyFwd; SnpUniqueFwd; @@ -108,6 +113,8 @@ structure(CHIRequestMsg, desc="", interface="Message") { bool is_local_pf, desc="Request generated by a local prefetcher"; bool is_remote_pf, desc="Request generated a prefetcher in another cache"; + WriteMask atomic_op, desc="Atomic Operation Wrapper"; + bool usesTxnId, desc="True if using a Transaction ID", default="false"; Addr txnId, desc="Transaction ID", default="0"; diff --git a/src/mem/ruby/slicc_interface/RubyRequest.cc b/src/mem/ruby/slicc_interface/RubyRequest.cc index 643c1dec6f..c6faf2d76f 100644 --- a/src/mem/ruby/slicc_interface/RubyRequest.cc +++ b/src/mem/ruby/slicc_interface/RubyRequest.cc @@ -123,5 +123,14 @@ RubyRequest::functionalWrite(Packet *pkt) return cBase < cTail; } +void +RubyRequest::setWriteMask(uint32_t offset, uint32_t len, + std::vector< std::pair<int, AtomicOpFunctor*>> atomicOps) +{ + m_writeMask.setMask(offset, 
len); + m_writeMask.setAtomicOps(atomicOps); +} + + } // namespace ruby } // namespace gem5 diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh index 89ce83451e..1e9674b9f5 100644 --- a/src/mem/ruby/slicc_interface/RubyRequest.hh +++ b/src/mem/ruby/slicc_interface/RubyRequest.hh @@ -226,6 +226,8 @@ class RubyRequest : public Message const PrefetchBit& getPrefetch() const { return m_Prefetch; } RequestPtr getRequestPtr() const { return m_pkt->req; } + void setWriteMask(uint32_t offset, uint32_t len, + std::vector< std::pair<int, AtomicOpFunctor*>> atomicOps); void print(std::ostream& out) const; bool functionalRead(Packet *pkt); bool functionalRead(Packet *pkt, WriteMask &mask); diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 82fc19b57c..48054febef 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -466,8 +466,12 @@ Sequencer::writeCallback(Addr address, DataBlock& data, bool ruby_request = true; while (!seq_req_list.empty()) { SequencerRequest &seq_req = seq_req_list.front(); + // Atomic Request may be executed remotely in the cache hierarchy + bool atomic_req = + ((seq_req.m_type == RubyRequestType_ATOMIC_RETURN) || + (seq_req.m_type == RubyRequestType_ATOMIC_NO_RETURN)); - if (noCoales && !ruby_request) { + if ((noCoales || atomic_req) && !ruby_request) { // Do not process follow-up requests // (e.g. 
if full line not present) // Reissue to the cache hierarchy @@ -479,6 +483,8 @@ Sequencer::writeCallback(Addr address, DataBlock& data, assert(seq_req.m_type != RubyRequestType_LD); assert(seq_req.m_type != RubyRequestType_Load_Linked); assert(seq_req.m_type != RubyRequestType_IFETCH); + assert(seq_req.m_type != RubyRequestType_ATOMIC_RETURN); + assert(seq_req.m_type != RubyRequestType_ATOMIC_NO_RETURN); } // handle write request @@ -594,6 +600,62 @@ Sequencer::readCallback(Addr address, DataBlock& data, } } +void +Sequencer::atomicCallback(Addr address, DataBlock& data, + const bool externalHit, const MachineType mach, + const Cycles initialRequestTime, + const Cycles forwardRequestTime, + const Cycles firstResponseTime) +{ + // + // Free the first request (an atomic operation) from the list. + // Then issue the next request to ruby system as we cannot + // assume the cache line is present in the cache + // (the operation could be performed remotely) + // + assert(address == makeLineAddress(address)); + assert(m_RequestTable.find(address) != m_RequestTable.end()); + auto &seq_req_list = m_RequestTable[address]; + + // Perform hitCallback only on the first cpu request that + // issued the ruby request + bool ruby_request = true; + while (!seq_req_list.empty()) { + SequencerRequest &seq_req = seq_req_list.front(); + + if (ruby_request) { + // Check that the request was an atomic memory operation + // and record the latency + assert((seq_req.m_type == RubyRequestType_ATOMIC_RETURN) || + (seq_req.m_type == RubyRequestType_ATOMIC_NO_RETURN)); + recordMissLatency(&seq_req, true, mach, externalHit, + initialRequestTime, forwardRequestTime, + firstResponseTime); + } else { + // Read, Write or Atomic request: + // reissue request to the cache hierarchy + // (we don't know if op was performed remotely) + issueRequest(seq_req.pkt, seq_req.m_second_type); + break; + } + + // Atomics clean the monitor entry + llscClearMonitor(address); + + markRemoved(); + ruby_request = false; 
+ hitCallback(&seq_req, data, true, mach, externalHit, + initialRequestTime, forwardRequestTime, + firstResponseTime, false); + seq_req_list.pop_front(); + } + + // free all outstanding requests corresponding to this address + if (seq_req_list.empty()) { + m_RequestTable.erase(address); + } +} + void Sequencer::hitCallback(SequencerRequest* srequest, DataBlock& data, bool llscSuccess, @@ -637,10 +699,16 @@ Sequencer::hitCallback(SequencerRequest* srequest, DataBlock& data, (type == RubyRequestType_IFETCH) || (type == RubyRequestType_RMW_Read) || (type == RubyRequestType_Locked_RMW_Read) || - (type == RubyRequestType_Load_Linked)) { + (type == RubyRequestType_Load_Linked) || + (type == RubyRequestType_ATOMIC_RETURN)) { pkt->setData( data.getData(getOffset(request_address), pkt->getSize())); - DPRINTF(RubySequencer, "read data %s\n", data); + + if (type == RubyRequestType_ATOMIC_RETURN) { + DPRINTF(RubySequencer, "ATOMIC RETURN data %s\n", data); + } else { + DPRINTF(RubySequencer, "read data %s\n", data); + } } else if (pkt->req->isSwap()) { assert(!pkt->isMaskedWrite()); std::vector<uint8_t> overwrite_val(pkt->getSize()); @@ -807,6 +875,19 @@ Sequencer::makeRequest(PacketPtr pkt) } else if (pkt->req->isTlbiCmd()) { primary_type = secondary_type = tlbiCmdToRubyRequestType(pkt); DPRINTF(RubySequencer, "Issuing TLBI\n"); +#if defined (PROTOCOL_CHI) + } else if (pkt->isAtomicOp()) { + if (pkt->req->isAtomicReturn()){ + DPRINTF(RubySequencer, "Issuing ATOMIC RETURN \n"); + primary_type = secondary_type = + RubyRequestType_ATOMIC_RETURN; + } else { + DPRINTF(RubySequencer, "Issuing ATOMIC NO RETURN\n"); + primary_type = secondary_type = + RubyRequestType_ATOMIC_NO_RETURN; + + } +#endif } else { // // To support SwapReq, we need to check isWrite() first: a SwapReq @@ -914,6 +995,18 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) RubyAccessMode_Supervisor, pkt, PrefetchBit_No, proc_id, core_id); + if (pkt->isAtomicOp() && + ((secondary_type == 
RubyRequestType_ATOMIC_RETURN) || + (secondary_type == RubyRequestType_ATOMIC_NO_RETURN))){ + // Create the blocksize, access mask and atomicops + uint32_t offset = getOffset(pkt->getAddr()); + std::vector<std::pair<int, AtomicOpFunctor*>> atomicOps; + atomicOps.push_back(std::make_pair + (offset, pkt->getAtomicOp())); + + msg->setWriteMask(offset, pkt->getSize(), atomicOps); + } + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %#x %s\n", curTick(), m_version, "Seq", "Begin", "", "", printAddress(msg->getPhysicalAddress()), diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh index 020a7d8c20..8f736da6d5 100644 --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -126,6 +126,14 @@ class Sequencer : public RubyPort const Cycles forwardRequestTime = Cycles(0), const Cycles firstResponseTime = Cycles(0)); + void atomicCallback(Addr address, + DataBlock& data, + const bool externalHit = false, + const MachineType mach = MachineType_NUM, + const Cycles initialRequestTime = Cycles(0), + const Cycles forwardRequestTime = Cycles(0), + const Cycles firstResponseTime = Cycles(0)); + void unaddressedCallback(Addr unaddressedReqId, RubyRequestType requestType, const MachineType mach = MachineType_NUM,