gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model

Change-Id: Ibe46970f3ba25d62ca2ade5cbc2054ad746b2254
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/29912
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Tony Gutierrez
2018-05-01 16:59:35 -04:00
committed by Anthony Gutierrez
parent b0eac7857a
commit b8da9abba7
86 changed files with 10299 additions and 3734 deletions

View File

@@ -86,6 +86,14 @@ MemCmd::commandInfo[] =
WriteResp, "WriteReq" },
/* WriteResp */
{ SET2(IsWrite, IsResponse), InvalidCmd, "WriteResp" },
/* WriteCompleteResp - The WriteCompleteResp command is needed
* because in the GPU memory model we use a WriteResp to indicate
* that a write has reached the cache controller so we can free
* resources at the coalescer. Later, when the write successfully
* completes we send a WriteCompleteResp to the CU so its wait
* counters can be updated. Wait counters in the CU is how memory
* dependences are handled in the GPU ISA. */
{ SET2(IsWrite, IsResponse), InvalidCmd, "WriteCompleteResp" },
/* WritebackDirty */
{ SET5(IsWrite, IsRequest, IsEviction, HasData, FromCache),
InvalidCmd, "WritebackDirty" },

View File

@@ -83,6 +83,7 @@ class MemCmd
ReadRespWithInvalidate,
WriteReq,
WriteResp,
WriteCompleteResp,
WritebackDirty,
WritebackClean,
WriteClean, // writes dirty data below without evicting

View File

@@ -298,9 +298,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
} else {
if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
if (in_msg.segment == HSASegment:SPILL) {
trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry, tbe);
} else if (WB) {
if (WB) {
trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
} else {
trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);

View File

@@ -0,0 +1,68 @@
/*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
// Interface declaration for the C++ GPUCoalescer object. Marked
// external = "yes" so SLICC does not generate a definition; protocol
// state machines only need the callback signatures listed here.
structure (GPUCoalescer, external = "yes") {
// read callback overloads: (address, data), optionally with the
// responding machine type, three Cycles timing values, and a bool flag.
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
// write callback overloads, mirroring the read callback variants.
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
// notify the coalescer that a line was evicted from the cache.
void evictionCallback(Addr);
// record command-processor read/write hit/transfer statistics, keyed
// by the requesting and responding machine IDs.
void recordCPReadCallBack(MachineID, MachineID);
void recordCPWriteCallBack(MachineID, MachineID);
}
// Interface declaration for the C++ VIPERCoalescer object (VIPER
// protocol's coalescer). Marked external = "yes" so SLICC does not
// generate a definition; only these callback signatures are exposed.
structure (VIPERCoalescer, external = "yes") {
// read callback overloads: (address, data), optionally with the
// responding machine type, three Cycles timing values, and a bool flag.
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
// write callback overloads, mirroring the read callback variants.
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
// VIPER-specific callbacks for invalidation and writeback completion.
void invCallback(Addr);
void wbCallback(Addr);
// notify the coalescer that a line was evicted from the cache.
void evictionCallback(Addr);
}

View File

@@ -3,6 +3,7 @@ include "RubySlicc_interfaces.slicc";
include "MOESI_AMD_Base-msg.sm";
include "MOESI_AMD_Base-dir.sm";
include "MOESI_AMD_Base-CorePair.sm";
include "GPU_VIPER-msg.sm";
include "GPU_VIPER-TCP.sm";
include "GPU_VIPER-SQC.sm";
include "GPU_VIPER-TCC.sm";

View File

@@ -135,7 +135,6 @@ structure(CPURequestMsg, desc="...", interface="Message") {
CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer";
WriteMask writeMask, desc="Write Through Data";
MachineID WTRequestor, desc="Node who initiated the write through";
HSAScope scope, default="HSAScope_SYSTEM", desc="Request Scope";
int wfid, default="0", desc="wavefront id";
bool NoWriteConflict, default="true", desc="write collided with CAB entry";
int ProgramCounter, desc="PC that accesses to this block";

View File

@@ -103,26 +103,6 @@ enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent")
NotPresent, desc="block is NotPresent";
Busy, desc="block is in a transient state, currently invalid";
}
//HSA scopes
enumeration(HSAScope, desc="...", default="HSAScope_UNSPECIFIED") {
UNSPECIFIED, desc="Unspecified scope";
NOSCOPE, desc="Explictly unscoped";
WAVEFRONT, desc="Wavefront scope";
WORKGROUP, desc="Workgroup scope";
DEVICE, desc="Device scope";
SYSTEM, desc="System scope";
}
// HSA segment types
enumeration(HSASegment, desc="...", default="HSASegment_GLOBAL") {
GLOBAL, desc="Global segment";
GROUP, desc="Group segment";
PRIVATE, desc="Private segment";
KERNARG, desc="Kernarg segment";
READONLY, desc="Readonly segment";
SPILL, desc="Spill segment";
ARG, desc="Arg segment";
}
// TesterStatus
enumeration(TesterStatus, desc="...") {

View File

@@ -138,42 +138,6 @@ structure (Sequencer, external = "yes") {
bool checkResourceAvailable(CacheResourceType, Addr);
}
structure (GPUCoalescer, external = "yes") {
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void evictionCallback(Addr);
void recordCPReadCallBack(MachineID, MachineID);
void recordCPWriteCallBack(MachineID, MachineID);
}
structure (VIPERCoalescer, external = "yes") {
void readCallback(Addr, DataBlock);
void readCallback(Addr, MachineType, DataBlock);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void readCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void writeCallback(Addr, DataBlock);
void writeCallback(Addr, MachineType, DataBlock);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles);
void writeCallback(Addr, MachineType, DataBlock,
Cycles, Cycles, Cycles, bool);
void invCallback(Addr);
void wbCallback(Addr);
void evictionCallback(Addr);
}
structure(RubyRequest, desc="...", interface="Message", external="yes") {
Addr LineAddress, desc="Line address for this request";
Addr PhysicalAddress, desc="Physical address for this request";
@@ -186,8 +150,6 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") {
WriteMask writeMask, desc="Writethrough mask";
DataBlock WTData, desc="Writethrough data block";
int wfid, desc="Writethrough wavefront";
HSAScope scope, desc="HSA scope";
HSASegment segment, desc="HSA segment";
PacketPtr pkt, desc="Packet associated with this request";
}

View File

@@ -43,7 +43,6 @@
#include "debug/RubyQueue.hh"
#include "mem/ruby/network/Network.hh"
#include "mem/ruby/protocol/MemoryMsg.hh"
#include "mem/ruby/system/GPUCoalescer.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "mem/ruby/system/Sequencer.hh"
#include "sim/system.hh"

View File

@@ -35,8 +35,6 @@
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/WriteMask.hh"
#include "mem/ruby/protocol/HSAScope.hh"
#include "mem/ruby/protocol/HSASegment.hh"
#include "mem/ruby/protocol/Message.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"

View File

@@ -61,58 +61,6 @@
using namespace std;
GPUCoalescer *
RubyGPUCoalescerParams::create()
{
return new GPUCoalescer(this);
}
HSAScope
reqScopeToHSAScope(const RequestPtr &req)
{
HSAScope accessScope = HSAScope_UNSPECIFIED;
if (req->isScoped()) {
if (req->isWavefrontScope()) {
accessScope = HSAScope_WAVEFRONT;
} else if (req->isWorkgroupScope()) {
accessScope = HSAScope_WORKGROUP;
} else if (req->isDeviceScope()) {
accessScope = HSAScope_DEVICE;
} else if (req->isSystemScope()) {
accessScope = HSAScope_SYSTEM;
} else {
fatal("Bad scope type");
}
}
return accessScope;
}
HSASegment
reqSegmentToHSASegment(const RequestPtr &req)
{
HSASegment accessSegment = HSASegment_GLOBAL;
if (req->isGlobalSegment()) {
accessSegment = HSASegment_GLOBAL;
} else if (req->isGroupSegment()) {
accessSegment = HSASegment_GROUP;
} else if (req->isPrivateSegment()) {
accessSegment = HSASegment_PRIVATE;
} else if (req->isKernargSegment()) {
accessSegment = HSASegment_KERNARG;
} else if (req->isReadonlySegment()) {
accessSegment = HSASegment_READONLY;
} else if (req->isSpillSegment()) {
accessSegment = HSASegment_SPILL;
} else if (req->isArgSegment()) {
accessSegment = HSASegment_ARG;
} else {
fatal("Bad segment type");
}
return accessSegment;
}
UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
: coalescer(gc)
{
@@ -152,6 +100,7 @@ UncoalescedTable::updateResources()
{
for (auto iter = instMap.begin(); iter != instMap.end(); ) {
if (iter->second.empty()) {
DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first);
instMap.erase(iter++);
coalescer->getGMTokenPort().sendTokens(1);
} else {
@@ -160,15 +109,27 @@ UncoalescedTable::updateResources()
}
}
bool
UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
// iterate the instructions held in UncoalescedTable to see whether there
// are more requests to issue; if yes, not yet done; otherwise, done
for (auto& inst : instMap) {
DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n"
,inst.first, inst.second.size());
if (inst.first == instSeqNum) { return false; }
}
return true;
}
void
UncoalescedTable::printRequestTable(std::stringstream& ss)
{
ss << "UncoalescedTable contains " << instMap.size()
<< " address entries." << std::endl;
ss << "Listing pending packets from " << instMap.size() << " instructions";
for (auto& inst : instMap) {
ss << "Addr 0x" << std::hex << inst.first << std::dec
<< " with " << inst.second.size() << " packets"
<< std::endl;
ss << "\tAddr: " << printAddress(inst.first) << " with "
<< inst.second.size() << " pending packets" << std::endl;
}
}
@@ -227,7 +188,6 @@ GPUCoalescer::GPUCoalescer(const Params *p)
assert(m_dataCache_ptr);
m_runningGarnetStandalone = p->garnet_standalone;
assumingRfOCoherence = p->assume_rfo;
}
GPUCoalescer::~GPUCoalescer()
@@ -254,18 +214,9 @@ GPUCoalescer::wakeup()
if (current_time - req->getIssueTime() > m_deadlock_threshold) {
std::stringstream ss;
printRequestTable(ss);
ss << "Outstanding requests: " << m_outstanding_count
<< std::endl;
panic("Possible Deadlock detected. Aborting!\n"
"version: %d request.paddr: 0x%x coalescedTable: %d "
"current time: %u issue_time: %d difference: %d\n"
"Request Tables:\n %s", m_version,
req->getFirstPkt()->getAddr(),
coalescedTable.size(), cyclesToTicks(current_time),
cyclesToTicks(req->getIssueTime()),
cyclesToTicks(current_time - req->getIssueTime()),
ss.str());
warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
m_version, ss.str());
panic("Aborting due to deadlock!\n");
}
}
}
@@ -283,21 +234,27 @@ GPUCoalescer::wakeup()
void
GPUCoalescer::printRequestTable(std::stringstream& ss)
{
uncoalescedTable.printRequestTable(ss);
ss << "Printing out " << coalescedTable.size()
<< " outstanding requests in the coalesced table\n";
ss << "CoalescedTable contains " << coalescedTable.size()
<< " address entries." << std::endl;
for (auto& requestList : coalescedTable) {
ss << "Addr 0x" << std::hex << requestList.first << std::dec
<< ": type-";
for (auto& request : requestList.second) {
ss << RubyRequestType_to_string(request->getRubyType())
<< " pkts-" << request->getPackets().size()
<< " issued-" << request->getIssueTime() << " seqNum-"
<< request->getSeqNum() << "; ";
ss << "\tAddr: " << printAddress(requestList.first) << "\n"
<< "\tInstruction sequence number: "
<< request->getSeqNum() << "\n"
<< "\t\tType: "
<< RubyRequestType_to_string(request->getRubyType()) << "\n"
<< "\t\tNumber of associated packets: "
<< request->getPackets().size() << "\n"
<< "\t\tIssue time: "
<< request->getIssueTime() * clockPeriod() << "\n"
<< "\t\tDifference from current tick: "
<< (curCycle() - request->getIssueTime()) * clockPeriod();
}
ss << std::endl;
}
// print out packets waiting to be issued in uncoalesced table
uncoalescedTable.printRequestTable(ss);
}
void
@@ -387,6 +344,7 @@ GPUCoalescer::writeCallback(Addr address,
hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
forwardRequestTime, firstResponseTime, isRegion);
// remove this crequest in coalescedTable
delete crequest;
coalescedTable.at(address).pop_front();
@@ -398,6 +356,36 @@ GPUCoalescer::writeCallback(Addr address,
}
}
void
GPUCoalescer::writeCompleteCallback(Addr address,
                                    uint64_t instSeqNum,
                                    MachineType mach)
{
    DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
            " instSeqNum = %d\n", address, instSeqNum);

    // every write-complete ack must match a tracked write instruction
    assert(pendingWriteInsts.count(instSeqNum) == 1);
    PendingWriteInst& inst = pendingWriteInsts[instSeqNum];

    // check the uncoalescedTable to see whether all requests for the inst
    // have been issued or not
    bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
    // NOTE: fixed argument list — the first argument printed for
    // "instSeqNum" was previously reqsAllIssued (passed twice), so the
    // trace showed a bogus sequence number.
    DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
            "reqsAllIssued=%d\n", instSeqNum,
            inst.getNumPendingStores()-1, reqsAllIssued);

    if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
        // if the pending write instruction has received all write completion
        // callbacks for its issued Ruby requests, we can now respond to
        // the requesting CU in one response packet.
        inst.ackWriteCompletion(m_usingRubyTester);

        DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
                instSeqNum);
        pendingWriteInsts.erase(instSeqNum);
    }
}
void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
@@ -477,7 +465,7 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
{
PacketPtr pkt = crequest->getFirstPkt();
Addr request_address = pkt->getAddr();
Addr request_line_address = makeLineAddress(request_address);
Addr request_line_address M5_VAR_USED = makeLineAddress(request_address);
RubyRequestType type = crequest->getRubyType();
@@ -516,20 +504,6 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
"%s\n",
RubyRequestType_to_string(type));
}
// If using the RubyTester, update the RubyTester sender state's
// subBlock with the recieved data. The tester will later access
// this state.
// Note: RubyPort will access it's sender state before the
// RubyTester.
if (m_usingRubyTester) {
RubyPort::SenderState *requestSenderState =
safe_cast<RubyPort::SenderState*>(pkt->senderState);
RubyTester::SenderState* testerSenderState =
safe_cast<RubyTester::SenderState*>
(requestSenderState->predecessor);
testerSenderState->subBlock.mergeFrom(data);
}
}
@@ -566,8 +540,6 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
} else if (pkt->isWrite()) {
req_type = RubyRequestType_ST;
} else {
// Acquire and release packets will have been issued by
// makeRequest, so we do not need to check for it here.
panic("Unsupported ruby packet type\n");
}
@@ -579,71 +551,43 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
// Check for GPU Barrier Kernel End or Kernel Begin
// Leave these to be handled by the child class
// Kernel End/Barrier = isFlush + isRelease
// Kernel Begin = isFlush + isAcquire
if (pkt->req->isKernel()) {
if (pkt->req->isAcquire()){
// This is a Kernel Begin leave handling to
// virtual xCoalescer::makeRequest
return RequestStatus_Issued;
}else if (pkt->req->isRelease()) {
// This is a Kernel End leave handling to
// virtual xCoalescer::makeRequest
// If we are here then we didn't call
// a virtual version of this function
// so we will also schedule the callback
int wf_id = 0;
if (pkt->req->hasContextId()) {
wf_id = pkt->req->contextId();
}
insertKernel(wf_id, pkt);
newKernelEnds.push_back(wf_id);
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
return RequestStatus_Issued;
// all packets must have valid instruction sequence numbers
assert(pkt->req->hasInstSeqNum());
if (pkt->cmd == MemCmd::MemSyncReq) {
// issue mem_sync requests immediately to the cache system without
// going through uncoalescedTable like normal LD/ST/Atomic requests
issueMemSyncRequest(pkt);
} else {
// otherwise, this must be either read or write command
assert(pkt->isRead() || pkt->isWrite());
// the pkt is temporarily stored in the uncoalesced table until
// it's picked for coalescing process later in this cycle or in a
// future cycle
uncoalescedTable.insertPacket(pkt);
DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
pkt->getAddr());
// we schedule an issue event here to process the uncoalesced table
// and try to issue Ruby request to cache system
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
}
if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
!pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
(pkt->req->isRelease() || pkt->req->isAcquire())) {
if (assumingRfOCoherence) {
// If we reached here, this request must be a memFence
// and the protocol implements RfO, the coalescer can
// assume sequentially consistency and schedule the callback
// immediately.
// Currently the code implements fence callbacks
// by reusing the mechanism for kernel completions.
// This should be fixed.
int wf_id = 0;
if (pkt->req->hasContextId()) {
wf_id = pkt->req->contextId();
}
insertKernel(wf_id, pkt);
newKernelEnds.push_back(wf_id);
if (!issueEvent.scheduled()) {
schedule(issueEvent, curTick());
}
return RequestStatus_Issued;
} else {
// If not RfO, return issued here and let the child coalescer
// take care of it.
return RequestStatus_Issued;
}
}
uncoalescedTable.insertPacket(pkt);
DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());
if (!issueEvent.scheduled())
schedule(issueEvent, curTick());
// TODO: issue hardware prefetches here
// we always return RequestStatus_Issued in this coalescer
// b/c the coalescer's resource was checked earlier and the coalescer is
// queueing up aliased requests in its coalesced table
return RequestStatus_Issued;
}
/**
* TODO: Figure out what do with this code. This code may go away
* and/or be merged into the VIPER coalescer once the VIPER
* protocol is re-integrated with GCN3 codes.
*/
/*
void
GPUCoalescer::issueRequest(CoalescedRequest* crequest)
{
@@ -736,8 +680,8 @@ GPUCoalescer::issueRequest(CoalescedRequest* crequest)
}
assert(m_mandatory_q_ptr);
m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
}
m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
}*/
template <class KEY, class VALUE>
std::ostream &
@@ -760,12 +704,6 @@ GPUCoalescer::print(ostream& out) const
}
void
GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
DPRINTF(RubyStats, "Recorded statistic: %s\n",
SequencerRequestType_to_string(requestType));
}
bool
GPUCoalescer::coalescePacket(PacketPtr pkt)
{
@@ -819,6 +757,41 @@ GPUCoalescer::coalescePacket(PacketPtr pkt)
// be counted as outstanding requests.
m_outstanding_count++;
// We track all issued or to-be-issued Ruby requests associated with
// write instructions. An instruction may have multiple Ruby
// requests.
if (pkt->cmd == MemCmd::WriteReq) {
DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
" the pending write instruction list\n", seqNum,
line_addr);
RubyPort::SenderState* ss =
safe_cast<RubyPort::SenderState*>(pkt->senderState);
// we need to save this port because it will be used to call
// back the requesting CU when we receive write
// complete callbacks for all issued Ruby requests of this
// instruction.
RubyPort::MemSlavePort* mem_slave_port = ss->port;
GPUDynInstPtr gpuDynInst = nullptr;
if (!m_usingRubyTester) {
// If this coalescer is connected to a real CU, we need
// to save the corresponding gpu dynamic instruction.
// CU will use that instruction to decrement wait counters
// in the issuing wavefront.
// For Ruby tester, gpuDynInst == nullptr
ComputeUnit::DataPort::SenderState* cu_state =
safe_cast<ComputeUnit::DataPort::SenderState*>
(ss->predecessor);
gpuDynInst = cu_state->_gpuDynInst;
}
PendingWriteInst& inst = pendingWriteInsts[seqNum];
inst.addPendingReq(mem_slave_port, gpuDynInst, m_usingRubyTester);
}
return true;
}
@@ -907,34 +880,6 @@ GPUCoalescer::atomicCallback(Addr address,
}
}
void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
if (myMachID == senderMachID) {
CP_TCPLdHits++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
CP_TCPLdTransfers++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
CP_TCCLdHits++;
} else {
CP_LdMiss++;
}
}
void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
if (myMachID == senderMachID) {
CP_TCPStHits++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
CP_TCPStTransfers++;
} else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
CP_TCCStHits++;
} else {
CP_StMiss++;
}
}
void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
{
@@ -970,74 +915,6 @@ GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
Cycles firstResponseTime,
bool success, bool isRegion)
{
RubyRequestType type = crequest->getRubyType();
Cycles issued_time = crequest->getIssueTime();
Cycles completion_time = curCycle();
assert(completion_time >= issued_time);
Cycles total_lat = completion_time - issued_time;
// cache stats (valid for RfO protocol only)
if (mach == MachineType_TCP) {
if (type == RubyRequestType_LD) {
GPU_TCPLdHits++;
} else {
GPU_TCPStHits++;
}
} else if (mach == MachineType_L1Cache_wCC) {
if (type == RubyRequestType_LD) {
GPU_TCPLdTransfers++;
} else {
GPU_TCPStTransfers++;
}
} else if (mach == MachineType_TCC) {
if (type == RubyRequestType_LD) {
GPU_TCCLdHits++;
} else {
GPU_TCCStHits++;
}
} else {
if (type == RubyRequestType_LD) {
GPU_LdMiss++;
} else {
GPU_StMiss++;
}
}
// Profile all access latency, even zero latency accesses
m_latencyHist.sample(total_lat);
m_typeLatencyHist[type]->sample(total_lat);
// Profile the miss latency for all non-zero demand misses
if (total_lat != Cycles(0)) {
m_missLatencyHist.sample(total_lat);
m_missTypeLatencyHist[type]->sample(total_lat);
if (mach != MachineType_NUM) {
m_missMachLatencyHist[mach]->sample(total_lat);
m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
if ((issued_time <= initialRequestTime) &&
(initialRequestTime <= forwardRequestTime) &&
(forwardRequestTime <= firstResponseTime) &&
(firstResponseTime <= completion_time)) {
m_IssueToInitialDelayHist[mach]->sample(
initialRequestTime - issued_time);
m_InitialToForwardDelayHist[mach]->sample(
forwardRequestTime - initialRequestTime);
m_ForwardToFirstResponseDelayHist[mach]->sample(
firstResponseTime - forwardRequestTime);
m_FirstResponseToCompletionDelayHist[mach]->sample(
completion_time - firstResponseTime);
}
}
}
DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
curTick(), m_version, "Coal",
success ? "Done" : "SC_Failed", "", "",
printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
}
void
@@ -1085,74 +962,4 @@ GPUCoalescer::regStats()
m_missTypeMachLatencyHist[i][j]->init(10);
}
}
// GPU cache stats
GPU_TCPLdHits
.name(name() + ".gpu_tcp_ld_hits")
.desc("loads that hit in the TCP")
;
GPU_TCPLdTransfers
.name(name() + ".gpu_tcp_ld_transfers")
.desc("TCP to TCP load transfers")
;
GPU_TCCLdHits
.name(name() + ".gpu_tcc_ld_hits")
.desc("loads that hit in the TCC")
;
GPU_LdMiss
.name(name() + ".gpu_ld_misses")
.desc("loads that miss in the GPU")
;
GPU_TCPStHits
.name(name() + ".gpu_tcp_st_hits")
.desc("stores that hit in the TCP")
;
GPU_TCPStTransfers
.name(name() + ".gpu_tcp_st_transfers")
.desc("TCP to TCP store transfers")
;
GPU_TCCStHits
.name(name() + ".gpu_tcc_st_hits")
.desc("stores that hit in the TCC")
;
GPU_StMiss
.name(name() + ".gpu_st_misses")
.desc("stores that miss in the GPU")
;
// CP cache stats
CP_TCPLdHits
.name(name() + ".cp_tcp_ld_hits")
.desc("loads that hit in the TCP")
;
CP_TCPLdTransfers
.name(name() + ".cp_tcp_ld_transfers")
.desc("TCP to TCP load transfers")
;
CP_TCCLdHits
.name(name() + ".cp_tcc_ld_hits")
.desc("loads that hit in the TCC")
;
CP_LdMiss
.name(name() + ".cp_ld_misses")
.desc("loads that miss in the GPU")
;
CP_TCPStHits
.name(name() + ".cp_tcp_st_hits")
.desc("stores that hit in the TCP")
;
CP_TCPStTransfers
.name(name() + ".cp_tcp_st_transfers")
.desc("TCP to TCP store transfers")
;
CP_TCCStHits
.name(name() + ".cp_tcc_st_hits")
.desc("stores that hit in the TCC")
;
CP_StMiss
.name(name() + ".cp_st_misses")
.desc("stores that miss in the GPU")
;
}

View File

@@ -38,11 +38,11 @@
#include <unordered_map>
#include "base/statistics.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/misc.hh"
#include "mem/request.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/Consumer.hh"
#include "mem/ruby/protocol/HSAScope.hh"
#include "mem/ruby/protocol/HSASegment.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"
#include "mem/ruby/protocol/RubyRequestType.hh"
@@ -57,9 +57,6 @@ class CacheMemory;
class RubyGPUCoalescerParams;
HSAScope reqScopeToHSAScope(const RequestPtr &req);
HSASegment reqSegmentToHSASegment(const RequestPtr &req);
// List of packets that belongs to a specific instruction.
typedef std::list<PacketPtr> PerInstPackets;
@@ -78,6 +75,7 @@ class UncoalescedTable
// instructions at the offset.
PerInstPackets* getInstPackets(int offset);
void updateResources();
bool areRequestsDone(const uint64_t instSeqNum);
// Check if a packet hasn't been removed from instMap in too long.
// Panics if a deadlock is detected and returns nothing otherwise.
@@ -120,6 +118,86 @@ class CoalescedRequest
std::vector<PacketPtr> pkts;
};
// PendingWriteInst tracks the number of outstanding Ruby requests
// per write instruction. Once all requests associated with one instruction
// are completely done in Ruby, we call back the requester to mark
// that this instruction is complete.
// Tracks the outstanding Ruby requests of one write instruction. Once
// every request has received its write-complete ack, the requester (CU
// or Ruby tester port) is notified with a single WriteCompleteResp.
class PendingWriteInst
{
  public:
    PendingWriteInst()
        : numPendingStores(0),
          originalPort(nullptr),
          gpuDynInstPtr(nullptr)
    {}

    ~PendingWriteInst()
    {}

    // Register one more outstanding Ruby request for this instruction.
    // Saves the responding port and, when connected to a real CU, the
    // dynamic instruction used later to decrement the wavefront's wait
    // counters.
    void
    addPendingReq(RubyPort::MemSlavePort* port, GPUDynInstPtr inst,
                  bool usingRubyTester)
    {
        assert(port);
        originalPort = port;

        if (!usingRubyTester) {
            gpuDynInstPtr = inst;
        }

        numPendingStores++;
    }

    // return true if no more ack is expected
    bool
    receiveWriteCompleteAck()
    {
        assert(numPendingStores > 0);
        numPendingStores--;
        // direct boolean result instead of the redundant (x == 0) ? true
        // : false expression
        return numPendingStores == 0;
    }

    // ack the original requester that this write instruction is complete
    void
    ackWriteCompletion(bool usingRubyTester)
    {
        assert(numPendingStores == 0);

        // make a response packet
        PacketPtr pkt = new Packet(std::make_shared<Request>(),
                                   MemCmd::WriteCompleteResp);

        if (!usingRubyTester) {
            // attach the dynamic instruction so the CU can update its
            // wait counters when it receives this response
            assert(gpuDynInstPtr);
            ComputeUnit::DataPort::SenderState* ss =
                new ComputeUnit::DataPort::SenderState
                    (gpuDynInstPtr, 0, nullptr);
            pkt->senderState = ss;
        }

        // send the ack response to the requester
        originalPort->sendTimingResp(pkt);
    }

    int
    getNumPendingStores() {
        return numPendingStores;
    }

  private:
    // the number of stores waiting for writeCompleteCallback
    int numPendingStores;
    // The original port that sent one of packets associated with this
    // write instruction. We may have more than one packet per instruction,
    // which implies multiple ports per instruction. However, we need
    // only 1 of the ports to call back the CU. Therefore, here we keep
    // track of the port that sent the first packet of this instruction.
    RubyPort::MemSlavePort* originalPort;
    // similar to the originalPort, this gpuDynInstPtr is set only for
    // the first packet of this instruction.
    GPUDynInstPtr gpuDynInstPtr;
};
class GPUCoalescer : public RubyPort
{
public:
@@ -159,6 +237,17 @@ class GPUCoalescer : public RubyPort
void collateStats();
void regStats() override;
// each store request needs two callbacks:
// (1) writeCallback is called when the store is received and processed
// by TCP. This writeCallback does not guarantee the store is actually
// completed at its destination cache or memory. writeCallback helps
// release hardware resources (e.g., its entry in coalescedTable)
// allocated for the store so that subsequent requests will not be
// blocked unnecessarily due to hardware resource constraints.
// (2) writeCompleteCallback is called when the store is fully completed
// at its destination cache or memory. writeCompleteCallback
// guarantees that the store is fully completed. This callback
// will decrement hardware counters in CU
void writeCallback(Addr address, DataBlock& data);
void writeCallback(Addr address,
@@ -180,6 +269,10 @@ class GPUCoalescer : public RubyPort
Cycles forwardRequestTime,
Cycles firstResponseTime);
void writeCompleteCallback(Addr address,
uint64_t instSeqNum,
MachineType mach);
void readCallback(Addr address, DataBlock& data);
void readCallback(Addr address,
@@ -200,18 +293,12 @@ class GPUCoalescer : public RubyPort
Cycles forwardRequestTime,
Cycles firstResponseTime,
bool isRegion);
/* atomics need their own callback because the data
might be const coming from SLICC */
void atomicCallback(Addr address,
MachineType mach,
const DataBlock& data);
void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);
// Alternate implementations in VIPER Coalescer
virtual RequestStatus makeRequest(PacketPtr pkt) override;
RequestStatus makeRequest(PacketPtr pkt) override;
int outstandingCount() const override { return m_outstanding_count; }
bool
@@ -237,7 +324,6 @@ class GPUCoalescer : public RubyPort
GMTokenPort& getGMTokenPort() { return gmTokenPort; }
void recordRequestType(SequencerRequestType requestType);
Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
Stats::Histogram& getLatencyHist() { return m_latencyHist; }
@@ -271,15 +357,17 @@ class GPUCoalescer : public RubyPort
getFirstResponseToCompletionDelayHist(const MachineType t) const
{ return *m_FirstResponseToCompletionDelayHist[t]; }
// Changed to protected to enable inheritance by VIPER Coalescer
protected:
bool tryCacheAccess(Addr addr, RubyRequestType type,
Addr pc, RubyAccessMode access_mode,
int size, DataBlock*& data_ptr);
// Alternate implementations in VIPER Coalescer
virtual void issueRequest(CoalescedRequest* crequest);
void kernelCallback(int wavfront_id);
// since the two following issue functions are protocol-specific,
// they must be implemented in a derived coalescer
virtual void issueRequest(CoalescedRequest* crequest) = 0;
virtual void issueMemSyncRequest(PacketPtr pkt) = 0;
void kernelCallback(int wavefront_id);
void hitCallback(CoalescedRequest* crequest,
MachineType mach,
@@ -297,7 +385,6 @@ class GPUCoalescer : public RubyPort
bool success, bool isRegion);
void completeHitCallback(std::vector<PacketPtr> & mylist);
virtual RubyRequestType getRequestType(PacketPtr pkt);
// Attempt to remove a packet from the uncoalescedTable and coalesce
@@ -309,8 +396,6 @@ class GPUCoalescer : public RubyPort
EventFunctionWrapper issueEvent;
// Changed to protected to enable inheritance by VIPER Coalescer
protected:
int m_max_outstanding_requests;
Cycles m_deadlock_threshold;
@@ -334,6 +419,11 @@ class GPUCoalescer : public RubyPort
// an address, they are serviced in age order.
std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
// a map between an instruction sequence number and PendingWriteInst
// this is used to do a final call back for each write when it is
// completely done in the memory system
std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;
// Global outstanding request count, across all request tables
int m_outstanding_count;
bool m_deadlock_check_scheduled;
@@ -350,26 +440,28 @@ class GPUCoalescer : public RubyPort
EventFunctionWrapper deadlockCheckEvent;
bool assumingRfOCoherence;
// m5 style stats for TCP hit/miss counts
Stats::Scalar GPU_TCPLdHits;
Stats::Scalar GPU_TCPLdTransfers;
Stats::Scalar GPU_TCCLdHits;
Stats::Scalar GPU_LdMiss;
Stats::Scalar GPU_TCPStHits;
Stats::Scalar GPU_TCPStTransfers;
Stats::Scalar GPU_TCCStHits;
Stats::Scalar GPU_StMiss;
Stats::Scalar CP_TCPLdHits;
Stats::Scalar CP_TCPLdTransfers;
Stats::Scalar CP_TCCLdHits;
Stats::Scalar CP_LdMiss;
Stats::Scalar CP_TCPStHits;
Stats::Scalar CP_TCPStTransfers;
Stats::Scalar CP_TCCStHits;
Stats::Scalar CP_StMiss;
// TODO - Need to update the following stats once the VIPER protocol
// is re-integrated.
// // m5 style stats for TCP hit/miss counts
// Stats::Scalar GPU_TCPLdHits;
// Stats::Scalar GPU_TCPLdTransfers;
// Stats::Scalar GPU_TCCLdHits;
// Stats::Scalar GPU_LdMiss;
//
// Stats::Scalar GPU_TCPStHits;
// Stats::Scalar GPU_TCPStTransfers;
// Stats::Scalar GPU_TCCStHits;
// Stats::Scalar GPU_StMiss;
//
// Stats::Scalar CP_TCPLdHits;
// Stats::Scalar CP_TCPLdTransfers;
// Stats::Scalar CP_TCCLdHits;
// Stats::Scalar CP_LdMiss;
//
// Stats::Scalar CP_TCPStHits;
// Stats::Scalar CP_TCPStTransfers;
// Stats::Scalar CP_TCCStHits;
// Stats::Scalar CP_StMiss;
//! Histogram for number of outstanding requests per cycle.
Stats::Histogram m_outstandReqHist;
@@ -394,6 +486,21 @@ class GPUCoalescer : public RubyPort
std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
// TODO - Need to update the following stats once the VIPER protocol
// is re-integrated.
// Stats::Distribution numHopDelays;
// Stats::Distribution tcpToTccDelay;
// Stats::Distribution tccToSdDelay;
// Stats::Distribution sdToSdDelay;
// Stats::Distribution sdToTccDelay;
// Stats::Distribution tccToTcpDelay;
//
// Stats::Average avgTcpToTcc;
// Stats::Average avgTccToSd;
// Stats::Average avgSdToSd;
// Stats::Average avgSdToTcc;
// Stats::Average avgTccToTcp;
private:
// Token port is used to send/receive tokens to/from GPU's global memory
// pipeline across the port boundary. There is one per <wave size> data

View File

@@ -36,6 +36,7 @@ from m5.objects.Sequencer import *
class RubyGPUCoalescer(RubyPort):
type = 'RubyGPUCoalescer'
abstract = True
cxx_class = 'GPUCoalescer'
cxx_header = "mem/ruby/system/GPUCoalescer.hh"
@@ -44,8 +45,6 @@ class RubyGPUCoalescer(RubyPort):
"max requests (incl. prefetches) outstanding")
max_coalesces_per_cycle = Param.Int(1, "max instructions that can be " \
"coalesced in a single cycle")
assume_rfo = Param.Bool(True, "assume protocol implementes Read for "
"Ownership coherence");
icache = Param.RubyCache("")
dcache = Param.RubyCache("")

View File

@@ -58,7 +58,7 @@ class VIPERCoalescer : public GPUCoalescer
VIPERCoalescer(const Params *);
~VIPERCoalescer();
void issueMemSyncRequest(PacketPtr pkt);
void issueMemSyncRequest(PacketPtr pkt) override;
void issueRequest(CoalescedRequest* crequest) override;
void wbCallback(Addr address);
void invCallback(Addr address);

View File

@@ -39,4 +39,3 @@ class VIPERCoalescer(RubyGPUCoalescer):
cxx_header = "mem/ruby/system/VIPERCoalescer.hh"
max_inv_per_cycle = Param.Int(32, "max invalidations per cycle")
max_wb_per_cycle = Param.Int(32, "max writebacks per cycle")
assume_rfo = False