From 107e05266d9cd03b2f9ff5ba7ac4d8be430a3aa0 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Fri, 29 Sep 2023 14:29:47 -0500 Subject: [PATCH 1/8] dev-amdgpu: Add aql, hsa queue information to checkpoint-restore GPUFS uses aql information from PM4 queues to initialize doorbells. This commit adds aql information to the checkpoint so that it can be used during restoration to correctly initialize all doorbells. Additionally, this commit also sets the hsa queue correctly during checkpoint-restoration Change-Id: Ief3ef6dc973f70f27255234872a12c396df05d89 --- src/dev/amdgpu/pm4_packet_processor.cc | 29 +++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index e7b846529e..63a3bf8887 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -1044,6 +1044,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const int num_queues = queues.size(); Addr id[num_queues]; Addr mqd_base[num_queues]; + uint64_t mqd_read_index[num_queues]; Addr base[num_queues]; Addr rptr[num_queues]; Addr wptr[num_queues]; @@ -1060,6 +1061,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const uint32_t hqd_active[num_queues]; uint32_t hqd_vmid[num_queues]; Addr aql_rptr[num_queues]; + uint32_t aql[num_queues]; uint32_t doorbell[num_queues]; uint32_t hqd_pq_control[num_queues]; @@ -1068,9 +1070,10 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const PM4Queue *q = iter.second; id[i] = q->id(); mqd_base[i] = q->mqdBase(); + mqd_read_index[i] = q->getMQD()->mqdReadIndex; bool cur_state = q->ib(); q->ib(false); - base[i] = q->base() >> 8; + base[i] = q->base(); rptr[i] = q->getRptr(); wptr[i] = q->getWptr(); q->ib(true); @@ -1088,6 +1091,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const hqd_active[i] = q->getMQD()->hqd_active; hqd_vmid[i] = q->getMQD()->hqd_vmid; aql_rptr[i] = q->getMQD()->aqlRptr; + aql[i] = 
q->getMQD()->aql; doorbell[i] = q->getMQD()->doorbell; hqd_pq_control[i] = q->getMQD()->hqd_pq_control; i++; @@ -1096,6 +1100,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const SERIALIZE_SCALAR(num_queues); SERIALIZE_ARRAY(id, num_queues); SERIALIZE_ARRAY(mqd_base, num_queues); + SERIALIZE_ARRAY(mqd_read_index, num_queues); SERIALIZE_ARRAY(base, num_queues); SERIALIZE_ARRAY(rptr, num_queues); SERIALIZE_ARRAY(wptr, num_queues); @@ -1112,6 +1117,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const SERIALIZE_ARRAY(hqd_active, num_queues); SERIALIZE_ARRAY(hqd_vmid, num_queues); SERIALIZE_ARRAY(aql_rptr, num_queues); + SERIALIZE_ARRAY(aql, num_queues); SERIALIZE_ARRAY(doorbell, num_queues); SERIALIZE_ARRAY(hqd_pq_control, num_queues); } @@ -1127,6 +1133,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) Addr id[num_queues]; Addr mqd_base[num_queues]; + uint64_t mqd_read_index[num_queues]; Addr base[num_queues]; Addr rptr[num_queues]; Addr wptr[num_queues]; @@ -1143,11 +1150,13 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) uint32_t hqd_active[num_queues]; uint32_t hqd_vmid[num_queues]; Addr aql_rptr[num_queues]; + uint32_t aql[num_queues]; uint32_t doorbell[num_queues]; uint32_t hqd_pq_control[num_queues]; UNSERIALIZE_ARRAY(id, num_queues); UNSERIALIZE_ARRAY(mqd_base, num_queues); + UNSERIALIZE_ARRAY(mqd_read_index, num_queues); UNSERIALIZE_ARRAY(base, num_queues); UNSERIALIZE_ARRAY(rptr, num_queues); UNSERIALIZE_ARRAY(wptr, num_queues); @@ -1164,6 +1173,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) UNSERIALIZE_ARRAY(hqd_active, num_queues); UNSERIALIZE_ARRAY(hqd_vmid, num_queues); UNSERIALIZE_ARRAY(aql_rptr, num_queues); + UNSERIALIZE_ARRAY(aql, num_queues); UNSERIALIZE_ARRAY(doorbell, num_queues); UNSERIALIZE_ARRAY(hqd_pq_control, num_queues); @@ -1172,19 +1182,20 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) memset(mqd, 0, sizeof(QueueDesc)); mqd->mqdBase = mqd_base[i] >> 8; - mqd->base = base[i]; - mqd->rptr = rptr[i]; - 
mqd->ibBase = ib_base[i]; - mqd->ibRptr = ib_rptr[i]; + mqd->mqdReadIndex = mqd_read_index[i]; + mqd->base = base[i] >> 8; + mqd->aql = aql[i]; PM4MapQueues* pkt = new PM4MapQueues; memset(pkt, 0, sizeof(PM4MapQueues)); newQueue(mqd, offset[i], pkt, id[i]); queues[id[i]]->ib(false); + queues[id[i]]->rptr(rptr[i]); queues[id[i]]->wptr(wptr[i]); queues[id[i]]->ib(true); queues[id[i]]->wptr(ib_wptr[i]); + queues[id[i]]->rptr(ib_rptr[i]); queues[id[i]]->offset(offset[i]); queues[id[i]]->processing(processing[i]); queues[id[i]]->ib(ib[i]); @@ -1195,6 +1206,14 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) queues[id[i]]->getMQD()->doorbell = doorbell[i]; queues[id[i]]->getMQD()->hqd_pq_control = hqd_pq_control[i]; + if (mqd->aql) { + int mqd_size = (1 << ((hqd_pq_control[i] & 0x3f) + 1)) * 4; + auto &hsa_pp = gpuDevice->CP()->hsaPacketProc(); + hsa_pp.setDeviceQueueDesc(aql_rptr[i], base[i], id[i], + mqd_size, 8, GfxVersion::gfx900, offset[i], + mqd_read_index[i]); + } + DPRINTF(PM4PacketProcessor, "PM4 queue %d, rptr: %p wptr: %p\n", queues[id[i]]->id(), queues[id[i]]->rptr(), queues[id[i]]->wptr()); From a50ead5907771d4b38bcfed615069d9e3ad9283d Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Fri, 29 Sep 2023 16:32:22 -0500 Subject: [PATCH 2/8] mem-ruby: Add Flush as a supported memory type in VIPERCoalescer This commit adds flush as a recognized memory type in VIPERCoalescer. 
Change-Id: I0f1b6f4518548e8e893ef681955b12a49293d8b4 --- src/mem/ruby/system/VIPERCoalescer.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc index ea95129841..a5198cce63 100644 --- a/src/mem/ruby/system/VIPERCoalescer.cc +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -75,12 +75,14 @@ VIPERCoalescer::makeRequest(PacketPtr pkt) // ReadReq : cache read // WriteReq : cache write // AtomicOp : cache atomic + // Flush : flush and invalidate cache // // VIPER does not expect MemSyncReq & Release since in GCN3, compute unit // does not specify an equivalent type of memory request. assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) || pkt->cmd == MemCmd::ReadReq || pkt->cmd == MemCmd::WriteReq || + pkt->cmd == MemCmd::FlushReq || pkt->isAtomicOp()); if (pkt->req->isInvL1() && m_cache_inv_pkt) { From 61e39d5b26465ce362249bd544cc68a725af1fdf Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Fri, 29 Sep 2023 16:37:41 -0500 Subject: [PATCH 3/8] mem-ruby: Add cache cooldown and warmup support to GPUCoalescer The GPU Coalescer does not contain cache cooldown and warmup support. This commit updates the coalescer to support cache cooldown during flush and warmup during checkpoint restore. 
Change-Id: I5459471dec20ff304fd5954af1079a7486ee860a --- src/mem/ruby/system/GPUCoalescer.cc | 94 ++++++++++++++++++++++------- src/mem/ruby/system/GPUCoalescer.hh | 3 + 2 files changed, 75 insertions(+), 22 deletions(-) diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index beb8da3f9c..a70af07467 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -73,6 +73,14 @@ UncoalescedTable::insertPacket(PacketPtr pkt) pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size()); } +void +UncoalescedTable::insertReqType(PacketPtr pkt, RubyRequestType type) +{ + uint64_t seqNum = pkt->req->getReqInstSeqNum(); + + reqTypeMap[seqNum] = type; +} + bool UncoalescedTable::packetAvailable() { @@ -128,9 +136,21 @@ UncoalescedTable::updateResources() instMap.erase(iter++); instPktsRemaining.erase(seq_num); - // Release the token - DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", seq_num); - coalescer->getGMTokenPort().sendTokens(1); + // Release the token if the Ruby system is not in cooldown + // or warmup phases. When in these phases, the RubyPorts + // are accessed directly using the makeRequest() command + // instead of accessing through the port. This makes + // sending tokens through the port unnecessary + if (!RubySystem::getWarmupEnabled() + && !RubySystem::getCooldownEnabled()) { + if (reqTypeMap[seq_num] != RubyRequestType_FLUSH) { + DPRINTF(GPUCoalescer, + "Returning token seqNum %d\n", seq_num); + coalescer->getGMTokenPort().sendTokens(1); + } + } + + reqTypeMap.erase(seq_num); } else { ++iter; } @@ -565,6 +585,14 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest, for (auto& pkt : pktList) { offset = getOffset(pkt->getAddr()); pkt_size = pkt->getSize(); + request_address = pkt->getAddr(); + + // When the Ruby system is cooldown phase, the requests come from + // the cache recorder. These requests do not get coalesced and + // do not return valid data. 
+ if (RubySystem::getCooldownEnabled()) + continue; + if (pkt->getPtr()) { switch(type) { // Store and AtomicNoReturns follow the same path, as the @@ -627,7 +655,6 @@ GPUCoalescer::getRequestType(PacketPtr pkt) assert(!pkt->req->isLLSC()); assert(!pkt->req->isLockedRMW()); assert(!pkt->req->isInstFetch()); - assert(!pkt->isFlush()); if (pkt->req->isAtomicReturn()) { req_type = RubyRequestType_ATOMIC_RETURN; @@ -637,6 +664,8 @@ GPUCoalescer::getRequestType(PacketPtr pkt) req_type = RubyRequestType_LD; } else if (pkt->isWrite()) { req_type = RubyRequestType_ST; + } else if (pkt->isFlush()) { + req_type = RubyRequestType_FLUSH; } else { panic("Unsupported ruby packet type\n"); } @@ -658,7 +687,7 @@ GPUCoalescer::makeRequest(PacketPtr pkt) issueMemSyncRequest(pkt); } else { // otherwise, this must be either read or write command - assert(pkt->isRead() || pkt->isWrite()); + assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush()); InstSeqNum seq_num = pkt->req->getReqInstSeqNum(); @@ -667,10 +696,17 @@ GPUCoalescer::makeRequest(PacketPtr pkt) // number of lanes actives for that vmem request (i.e., the popcnt // of the exec_mask. int num_packets = 1; - if (!m_usingRubyTester) { - num_packets = 0; - for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) { - num_packets += getDynInst(pkt)->getLaneStatus(i); + + // When Ruby is in warmup or cooldown phase, the requests come from + // the cache recorder. There is no dynamic instruction associated + // with these requests either + if (!RubySystem::getWarmupEnabled() + && !RubySystem::getCooldownEnabled()) { + if (!m_usingRubyTester) { + num_packets = 0; + for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) { + num_packets += getDynInst(pkt)->getLaneStatus(i); + } } } @@ -679,6 +715,7 @@ GPUCoalescer::makeRequest(PacketPtr pkt) // future cycle. Packets remaining is set to the number of excepted // requests from the instruction based on its exec_mask. 
uncoalescedTable.insertPacket(pkt); + uncoalescedTable.insertReqType(pkt, getRequestType(pkt)); uncoalescedTable.initPacketsRemaining(seq_num, num_packets); DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n", pkt->getAddr()); @@ -945,21 +982,27 @@ void GPUCoalescer::completeHitCallback(std::vector & mylist) { for (auto& pkt : mylist) { - RubyPort::SenderState *ss = - safe_cast(pkt->senderState); - MemResponsePort *port = ss->port; - assert(port != NULL); + // When Ruby is in warmup or cooldown phase, the requests come + // from the cache recorder. They do not track which port to use + // and do not need to send the response back + if (!RubySystem::getWarmupEnabled() + && !RubySystem::getCooldownEnabled()) { + RubyPort::SenderState *ss = + safe_cast(pkt->senderState); + MemResponsePort *port = ss->port; + assert(port != NULL); - pkt->senderState = ss->predecessor; + pkt->senderState = ss->predecessor; - if (pkt->cmd != MemCmd::WriteReq) { - // for WriteReq, we keep the original senderState until - // writeCompleteCallback - delete ss; + if (pkt->cmd != MemCmd::WriteReq) { + // for WriteReq, we keep the original senderState until + // writeCompleteCallback + delete ss; + } + + port->hitCallback(pkt); + trySendRetries(); } - - port->hitCallback(pkt); - trySendRetries(); } // We schedule an event in the same tick as hitCallback (similar to @@ -971,7 +1014,14 @@ GPUCoalescer::completeHitCallback(std::vector & mylist) schedule(issueEvent, curTick()); } - testDrainComplete(); + RubySystem *rs = m_ruby_system; + if (RubySystem::getWarmupEnabled()) { + rs->m_cache_recorder->enqueueNextFetchRequest(); + } else if (RubySystem::getCooldownEnabled()) { + rs->m_cache_recorder->enqueueNextFlushRequest(); + } else { + testDrainComplete(); + } } void diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh index dd28855547..d6db5c00ba 100644 --- a/src/mem/ruby/system/GPUCoalescer.hh +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -71,6 
+71,7 @@ class UncoalescedTable ~UncoalescedTable() {} void insertPacket(PacketPtr pkt); + void insertReqType(PacketPtr pkt, RubyRequestType type); bool packetAvailable(); void printRequestTable(std::stringstream& ss); @@ -101,6 +102,8 @@ class UncoalescedTable std::map instMap; std::map instPktsRemaining; + + std::map reqTypeMap; }; class CoalescedRequest From 085789d00c4391b6b863981fb25e9cb8a7e7a445 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Fri, 29 Sep 2023 18:19:37 -0500 Subject: [PATCH 4/8] mem-ruby: Add flush support to GPU_VIPER protocol This commit adds flush support to the GPU VIPER coherence protocol. The L1 cache will now initiate a flush request if the packet it receives is of type RubyRequestType_FLUSH. During the flush process, the L1 cache will send a request to L2 if it is in either V or I state. L2 will issue a flush request to the directory if its cache line is in the valid state before invalidating its copy. The directory, on receiving this request, writes data to memory and sends an ack back to the L2. 
L2 forwards this ack back to the L1, which then ends the flush by calling the write callback Change-Id: I9dfc0c7b71a1e9f6d5e9e6ed4977c1e6a3b5ba46 --- src/mem/ruby/protocol/GPU_VIPER-TCC.sm | 52 ++++++++++++++++++- src/mem/ruby/protocol/GPU_VIPER-TCP.sm | 46 ++++++++++++++++- src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm | 55 +++++++++++++++++++++ 3 files changed, 151 insertions(+), 2 deletions(-) diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index 20a0979af1..be1243aaa5 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -65,7 +65,8 @@ machine(MachineType:TCC, "TCC Cache") AtomicPassOn, desc="Atomic Op Passed on to Directory"; AtomicDone, desc="AtomicOps Complete"; AtomicNotDone, desc="AtomicOps not Complete"; - Data, desc="data messgae"; + Data, desc="Data message"; + Flush, desc="Flush cache entry"; // Coming from this TCC L2_Repl, desc="L2 Replacement"; // Probes @@ -376,6 +377,8 @@ machine(MachineType:TCC, "TCC Cache") } else { trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); } + } else if (in_msg.Type == CoherenceRequestType:WriteFlush) { + trigger(Event:Flush, in_msg.addr, cache_entry, tbe); } else { DPRINTF(RubySlicc, "%s\n", in_msg); error("Unexpected Response Message to Core"); @@ -509,6 +512,20 @@ machine(MachineType:TCC, "TCC Cache") } } + action(fw_sendFlushResponse, "fw", desc="send Flush Response") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.Requestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.instSeqNum := in_msg.instSeqNum; + } + } + } + action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") { peek(coreRequestNetwork_in, CPURequestMsg) { enqueue(responseToCore_out, ResponseMsg, 
l2_response_latency + glc_atomic_latency, true) { @@ -628,6 +645,22 @@ machine(MachineType:TCC, "TCC Cache") } } + action(f_flush, "f", desc="write back data") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.Requestor; + out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteFlush; + out_msg.Dirty := true; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.writeMask.orMask(cache_entry.writeMask); + } + } + } + action(at_atomicThrough, "at", desc="write back data") { peek(coreRequestNetwork_in, CPURequestMsg) { enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { @@ -1075,4 +1108,21 @@ machine(MachineType:TCC, "TCC Cache") transition(WIB, WBAck,I) { pr_popResponseQueue; } + + transition({A, IV, WI, WIB}, Flush) { + st_stallAndWaitRequest; + } + + transition(I, Flush) { + fw_sendFlushResponse; + p_popRequestQueue; + } + + transition({V, W}, Flush, I) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + ut_updateTag; + f_flush; + i_invL2; + p_popRequestQueue; + } } diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm index 7e0ad4ed96..8244879c55 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm @@ -55,6 +55,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") I, AccessPermission:Invalid, desc="Invalid"; V, AccessPermission:Read_Only, desc="Valid"; A, AccessPermission:Invalid, desc="Waiting on Atomic"; + + F, AccessPermission:Invalid, desc="Flushing; Waiting for Ack"; } enumeration(Event, desc="TCP Events") { @@ -256,6 +258,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") peek(responseToTCP_in, ResponseMsg, block_on="addr") { Entry cache_entry := getCacheEntry(in_msg.addr); TBE tbe := 
TBEs.lookup(in_msg.addr); + DPRINTF(RubySlicc, "In responseToTCP_in with %s\n", in_msg); + if (in_msg.Type == CoherenceResponseType:TDSysResp) { if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) { // If L1 is disabled or requests have GLC or SLC flag set, @@ -273,6 +277,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck || in_msg.Type == CoherenceResponseType:NBSysWBAck) { trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe); + DPRINTF(RubySlicc, "Issuing TCC_AckWB\n"); } else { error("Unexpected Response Message to Core"); } @@ -469,6 +474,24 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") action(sf_setFlush, "sf", desc="set flush") { inFlush := true; APPEND_TRANSITION_COMMENT(" inFlush is true"); + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + assert(is_valid(cache_entry)); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.writeMask.clear(); + out_msg.writeMask.orMask(cache_entry.writeMask); + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteFlush; + out_msg.InitialRequestTime := curCycle(); + out_msg.Shared := false; + out_msg.isSLCSet := false; + peek(mandatoryQueue_in, RubyRequest) { + out_msg.instSeqNum := in_msg.instSeqNum; + } + } } action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { @@ -524,6 +547,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") cache_entry.Dirty := true; } + action(f_flushDone, "f", desc="flush done") { + assert(is_valid(cache_entry)); + + if (use_seq_not_coal) { + sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache); + } else { + coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk); + } + } + action(inv_invDone, "inv", desc="local inv done") { if 
(use_seq_not_coal) { DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n"); @@ -695,11 +728,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") ic_invCache; } - transition({V, I, A},Flush) {TagArrayFlash} { + transition({V,I}, Flush, F) {TagArrayFlash} { + a_allocate; sf_setFlush; p_popMandatoryQueue; } + transition(A, Flush) { + z_stall; + } + transition({I, V}, Evict, I) {TagArrayFlash} { inv_invDone; p_popMandatoryQueue; @@ -716,4 +754,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") wd_wtDone; pr_popResponseQueue; } + + transition(F, TCC_AckWB, I) { + f_flushDone; + pr_popResponseQueue; + ic_invCache; + } } diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm index 774b54a432..eed750832f 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm @@ -83,6 +83,8 @@ machine(MachineType:Directory, "AMD Baseline protocol") BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; + + F, AccessPermission:Busy, desc="sent Flus, blocked till ack"; } // Events @@ -120,6 +122,9 @@ machine(MachineType:Directory, "AMD Baseline protocol") // DMA DmaRead, desc="DMA read"; DmaWrite, desc="DMA write"; + + // Flush + Flush, desc="Flush entry"; } enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { @@ -411,6 +416,9 @@ machine(MachineType:Directory, "AMD Baseline protocol") DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr); trigger(Event:VicClean, in_msg.addr, entry, tbe); } + } else if (in_msg.Type == CoherenceRequestType:WriteFlush) { + DPRINTF(RubySlicc, "Got Flush from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:Flush, in_msg.addr, entry, tbe); } else { error("Bad request 
message type"); } @@ -562,6 +570,23 @@ machine(MachineType:Directory, "AMD Baseline protocol") } } + action(rf_sendResponseFlush, "rf", desc="send Flush Ack") { + peek(memQueue_in, MemoryMsg) { + enqueue(responseNetwork_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.WTRequestor := tbe.WTRequestor; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := curCycle(); + //out_msg.instSeqNum := in_msg.instSeqNum; + } + } + } + action(l_queueMemWBReq, "lq", desc="Write WB data to memory") { peek(responseNetwork_in, ResponseMsg) { enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) { @@ -933,6 +958,23 @@ machine(MachineType:Directory, "AMD Baseline protocol") } } + action(f_writeFlushDataToMemory, "f", desc="Write flush data to memory") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) { + out_msg.addr := address; + out_msg.Type := MemoryRequestType:MEMORY_WB; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Data; + out_msg.DataBlk := in_msg.DataBlk; + } + if (tbe.Dirty == false) { + // have to update the TBE, too, because of how this + // directory deals with functional writes + tbe.DataBlk := in_msg.DataBlk; + } + } + } + action(atd_allocateTBEforDMA, "atd", desc="allocate TBE Entry for DMA") { check_allocate(TBEs); peek(dmaRequestQueue_in, DMARequestMsg) { @@ -1553,4 +1595,17 @@ machine(MachineType:Directory, "AMD Baseline protocol") dt_deallocateTBE; pt_popTriggerQueue; } + + transition(U, Flush, F) {L3TagArrayRead, L3TagArrayWrite} { + t_allocateTBE; + f_writeFlushDataToMemory; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(F, WBAck, U) { + pm_popMemQueue; + 
dt_deallocateTBE; + } + } From ae5a51994c112d04bfe0eb84189ca9ec5a46102e Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Fri, 29 Sep 2023 18:27:46 -0500 Subject: [PATCH 5/8] mem-ruby: Update cache recorder to use GPUCoalescer port for GPUs Previously, the cache recorder used the Sequencer to issue flush requests and cache warmup requests. The GPU however uses GPUCoalescer to access the cache, and not the Sequencer. This commit adds a GPUCoalescer map to the cache recorder and uses it to send flushes and cache warmup requests to any GPU caches in the system Change-Id: I10490cf5e561c8559a98d4eb0550c62eefe769c9 --- src/mem/ruby/system/CacheRecorder.cc | 34 ++++++++++++++++++++++++---- src/mem/ruby/system/CacheRecorder.hh | 3 +++ src/mem/ruby/system/RubySystem.cc | 17 +++++++++++++- 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/src/mem/ruby/system/CacheRecorder.cc b/src/mem/ruby/system/CacheRecorder.cc index 20a8a30ebc..ec552c07c5 100644 --- a/src/mem/ruby/system/CacheRecorder.cc +++ b/src/mem/ruby/system/CacheRecorder.cc @@ -30,8 +30,11 @@ #include "mem/ruby/system/CacheRecorder.hh" #include "debug/RubyCacheTrace.hh" +#include "mem/packet.hh" +#include "mem/ruby/system/GPUCoalescer.hh" #include "mem/ruby/system/RubySystem.hh" #include "mem/ruby/system/Sequencer.hh" +#include "sim/sim_exit.hh" namespace gem5 { @@ -57,11 +60,13 @@ CacheRecorder::CacheRecorder() CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace, uint64_t uncompressed_trace_size, std::vector& seq_map, + std::vector& coal_map, uint64_t block_size_bytes) : m_uncompressed_trace(uncompressed_trace), m_uncompressed_trace_size(uncompressed_trace_size), - m_seq_map(seq_map), m_bytes_read(0), m_records_read(0), - m_records_flushed(0), m_block_size_bytes(block_size_bytes) + m_seq_map(seq_map), m_coalescer_map(coal_map), m_bytes_read(0), + m_records_read(0), m_records_flushed(0), + m_block_size_bytes(block_size_bytes) { if (m_uncompressed_trace != NULL) { if (m_block_size_bytes < 
RubySystem::getBlockSizeBytes()) { @@ -81,6 +86,7 @@ CacheRecorder::~CacheRecorder() m_uncompressed_trace = NULL; } m_seq_map.clear(); + m_coalescer_map.clear(); } void @@ -96,11 +102,21 @@ CacheRecorder::enqueueNextFlushRequest() Packet *pkt = new Packet(req, requestType); Sequencer* m_sequencer_ptr = m_seq_map[rec->m_cntrl_id]; + GPUCoalescer* m_coal_ptr = m_coalescer_map[rec->m_cntrl_id]; assert(m_sequencer_ptr != NULL); - m_sequencer_ptr->makeRequest(pkt); + if (m_coal_ptr == NULL) + m_sequencer_ptr->makeRequest(pkt); + else { + pkt->req->setReqInstSeqNum(m_records_flushed - 1); + m_coal_ptr->makeRequest(pkt); + } DPRINTF(RubyCacheTrace, "Flushing %s\n", *rec); + } else { + if (m_records_flushed > 0) { + exitSimLoop("Finished Drain", 0); + } DPRINTF(RubyCacheTrace, "Flushed all %d records\n", m_records_flushed); } } @@ -143,13 +159,21 @@ CacheRecorder::enqueueNextFetchRequest() pkt->dataStatic(traceRecord->m_data + rec_bytes_read); Sequencer* m_sequencer_ptr = m_seq_map[traceRecord->m_cntrl_id]; + GPUCoalescer* m_coal_ptr; + m_coal_ptr = m_coalescer_map[traceRecord->m_cntrl_id]; assert(m_sequencer_ptr != NULL); - m_sequencer_ptr->makeRequest(pkt); + if (m_coal_ptr == NULL) + m_sequencer_ptr->makeRequest(pkt); + else { + pkt->req->setReqInstSeqNum(m_records_read); + m_coal_ptr->makeRequest(pkt); + } } m_bytes_read += (sizeof(TraceRecord) + m_block_size_bytes); m_records_read++; } else { + exitSimLoop("Finished Warmup", 0); DPRINTF(RubyCacheTrace, "Fetched all %d records\n", m_records_read); } } @@ -168,6 +192,8 @@ CacheRecorder::addRecord(int cntrl, Addr data_addr, Addr pc_addr, memcpy(rec->m_data, data.getData(0, m_block_size_bytes), m_block_size_bytes); + DPRINTF(RubyCacheTrace, "Inside addRecord with cntrl id %d and type %d\n", + cntrl, type); m_records.push_back(rec); } diff --git a/src/mem/ruby/system/CacheRecorder.hh b/src/mem/ruby/system/CacheRecorder.hh index be95590313..9363e2fde7 100644 --- a/src/mem/ruby/system/CacheRecorder.hh +++ 
b/src/mem/ruby/system/CacheRecorder.hh @@ -50,6 +50,7 @@ namespace ruby { class Sequencer; +class GPUCoalescer; /*! * Class for recording cache contents. Note that the last element of the @@ -79,6 +80,7 @@ class CacheRecorder CacheRecorder(uint8_t* uncompressed_trace, uint64_t uncompressed_trace_size, std::vector& SequencerMap, + std::vector& CoalescerMap, uint64_t block_size_bytes); void addRecord(int cntrl, Addr data_addr, Addr pc_addr, RubyRequestType type, Tick time, DataBlock& data); @@ -115,6 +117,7 @@ class CacheRecorder uint8_t* m_uncompressed_trace; uint64_t m_uncompressed_trace_size; std::vector m_seq_map; + std::vector m_coalescer_map; uint64_t m_bytes_read; uint64_t m_records_read; uint64_t m_records_flushed; diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc index b38c903b09..232e337752 100644 --- a/src/mem/ruby/system/RubySystem.cc +++ b/src/mem/ruby/system/RubySystem.cc @@ -178,13 +178,22 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, uint64_t block_size_bytes) { std::vector sequencer_map; + std::vector coalescer_map; Sequencer* sequencer_ptr = NULL; + GPUCoalescer* coalescer_ptr = NULL; for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) { sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer()); + coalescer_map.push_back(m_abs_cntrl_vec[cntrl]->getGPUCoalescer()); + if (sequencer_ptr == NULL) { sequencer_ptr = sequencer_map[cntrl]; } + + if (coalescer_ptr == NULL) { + coalescer_ptr = coalescer_map[cntrl]; + } + } assert(sequencer_ptr != NULL); @@ -193,6 +202,11 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, if (sequencer_map[cntrl] == NULL) { sequencer_map[cntrl] = sequencer_ptr; } + + if (coalescer_map[cntrl] == NULL) { + coalescer_map[cntrl] = coalescer_ptr; + } + } // Remove the old CacheRecorder if it's still hanging about. 
@@ -202,7 +216,8 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, // Create the CacheRecorder and record the cache trace m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size, - sequencer_map, block_size_bytes); + sequencer_map, coalescer_map, + block_size_bytes); } void From f69191a31d091562201f544c28462b87d6c46206 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Mon, 2 Oct 2023 19:37:46 -0500 Subject: [PATCH 6/8] dev-amdgpu: Remove duplicate writes to PM4 queue pointers During checkpoint restoration, the unserialize() function writes rptr, wptr, and indirect buffer rptr, wptr to PM4 queue's rptr, wptr fields. This commit updates this to write only the relevant pointers to the queue structure. If indirect buffers are used, then it writes only the indirect buffer pointers to the queue. If they are not used, then it writes rptr, wptr values to the queue. Change-Id: Iedb25a726112e1af99cc1e7bc012de51c4ebfd45 --- src/dev/amdgpu/pm4_packet_processor.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index 63a3bf8887..fdb6f9d7ce 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -1190,15 +1190,16 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) memset(pkt, 0, sizeof(PM4MapQueues)); newQueue(mqd, offset[i], pkt, id[i]); - queues[id[i]]->ib(false); - queues[id[i]]->rptr(rptr[i]); - queues[id[i]]->wptr(wptr[i]); - queues[id[i]]->ib(true); - queues[id[i]]->wptr(ib_wptr[i]); - queues[id[i]]->rptr(ib_rptr[i]); + if (ib[i]) { + queues[id[i]]->wptr(ib_wptr[i]); + queues[id[i]]->rptr(ib_rptr[i]); + } else { + queues[id[i]]->rptr(rptr[i]); + queues[id[i]]->wptr(wptr[i]); + } + queues[id[i]]->ib(ib[i]); queues[id[i]]->offset(offset[i]); queues[id[i]]->processing(processing[i]); - queues[id[i]]->ib(ib[i]); queues[id[i]]->setPkt(me[i], pipe[i], queue[i], privileged[i]); 
queues[id[i]]->getMQD()->hqd_active = hqd_active[i]; queues[id[i]]->getMQD()->hqd_vmid = hqd_vmid[i]; From d3637a489d6ddcc8ca5d99f20b53a1ea64bbc422 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Tue, 3 Oct 2023 12:10:42 -0500 Subject: [PATCH 7/8] configs: Add option to disable AVX in GPUFS GPUFS+KVM simulations automatically enable AVX. This commit adds a command line option to disable AVX if it's not needed for a GPUFS simulation. Change-Id: Ic22592767dbdca86f3718eca9c837a8e29b6b781 --- configs/example/gpufs/runfs.py | 10 ++++++++++ configs/example/gpufs/system/system.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py index 0f090e2f89..5d29959eff 100644 --- a/configs/example/gpufs/runfs.py +++ b/configs/example/gpufs/runfs.py @@ -158,6 +158,16 @@ def addRunFSOptions(parser): help="Root partition of disk image", ) + parser.add_argument( + "--disable-avx", + action="store_true", + default=False, + help="Disables AVX. AVX is used in some ROCm libraries but " + "does not have checkpointing support yet. If simulation either " + "creates a checkpoint or restores from one, then AVX needs to " + "be disabled for correct functionality ", + ) + def runGpuFSSystem(args): """ diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index 7ddc4f0752..7cb0ce1aa5 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -234,7 +234,7 @@ def makeGpuFSSystem(args): # If we are using KVM cpu, enable AVX. AVX is used in some ROCm libraries # such as rocBLAS which is used in higher level libraries like PyTorch. use_avx = False - if ObjectList.is_kvm_cpu(TestCPUClass): + if ObjectList.is_kvm_cpu(TestCPUClass) and not args.disable_avx: # AVX also requires CR4.osxsave to be 1. These must be set together # or KVM will error out. 
system.workload.enable_osxsave = 1 From a19667427a96c560b243ffbadbba3b7f4f6db2b4 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Thu, 5 Oct 2023 18:59:54 -0500 Subject: [PATCH 8/8] mem-ruby: Add BUILD_GPU guard to ruby cooldown and warmup phases Ruby was recently updated to support flushes and warmup for GPUs. Since this support uses the GPUCoalescer, non-GPU builds face a compile time issue. This is because GPU code is not built for non-GPU builds. This commit adds "#if BUILD_GPU" guards around the GPU-related code in common files like AbstractController.hh, CacheRecorder.*, RubySystem.cc, GPUCoalescer.hh, and VIPERCoalescer.hh. This support allows GPU builds to use flushing while non-GPU builds compile without problems Change-Id: If8ee4ff881fe154553289e8c00881ee1b6e3f113 --- .../slicc_interface/AbstractController.hh | 2 ++ src/mem/ruby/system/CacheRecorder.cc | 27 +++++++++++++++++++ src/mem/ruby/system/CacheRecorder.hh | 12 +++++++++ src/mem/ruby/system/GPUCoalescer.hh | 5 ++++ src/mem/ruby/system/RubySystem.cc | 16 ++++++++++- src/mem/ruby/system/VIPERCoalescer.hh | 5 ++++ 6 files changed, 66 insertions(+), 1 deletion(-) diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh index 72b679d6cf..7d93644bd8 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.hh +++ b/src/mem/ruby/slicc_interface/AbstractController.hh @@ -70,7 +70,9 @@ namespace ruby { class Network; +#if BUILD_GPU class GPUCoalescer; +#endif class DMASequencer; // used to communicate that an in_port peeked the wrong message type diff --git a/src/mem/ruby/system/CacheRecorder.cc b/src/mem/ruby/system/CacheRecorder.cc index ec552c07c5..057b6aa041 100644 --- a/src/mem/ruby/system/CacheRecorder.cc +++ b/src/mem/ruby/system/CacheRecorder.cc @@ -57,6 +57,7 @@ CacheRecorder::CacheRecorder() { } +#if BUILD_GPU CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace, uint64_t uncompressed_trace_size, std::vector& seq_map, @@ 
-67,6 +68,18 @@ CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace, m_seq_map(seq_map), m_coalescer_map(coal_map), m_bytes_read(0), m_records_read(0), m_records_flushed(0), m_block_size_bytes(block_size_bytes) +#else +CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace, + uint64_t uncompressed_trace_size, + std::vector& seq_map, + uint64_t block_size_bytes) + : m_uncompressed_trace(uncompressed_trace), + m_uncompressed_trace_size(uncompressed_trace_size), + m_seq_map(seq_map), m_bytes_read(0), + m_records_read(0), m_records_flushed(0), + m_block_size_bytes(block_size_bytes) + +#endif { if (m_uncompressed_trace != NULL) { if (m_block_size_bytes < RubySystem::getBlockSizeBytes()) { @@ -86,7 +99,9 @@ CacheRecorder::~CacheRecorder() m_uncompressed_trace = NULL; } m_seq_map.clear(); +#if BUILD_GPU m_coalescer_map.clear(); +#endif } void @@ -102,14 +117,20 @@ CacheRecorder::enqueueNextFlushRequest() Packet *pkt = new Packet(req, requestType); Sequencer* m_sequencer_ptr = m_seq_map[rec->m_cntrl_id]; +#if BUILD_GPU GPUCoalescer* m_coal_ptr = m_coalescer_map[rec->m_cntrl_id]; +#endif assert(m_sequencer_ptr != NULL); +#if BUILD_GPU if (m_coal_ptr == NULL) m_sequencer_ptr->makeRequest(pkt); else { pkt->req->setReqInstSeqNum(m_records_flushed - 1); m_coal_ptr->makeRequest(pkt); } +#else + m_sequencer_ptr->makeRequest(pkt); +#endif DPRINTF(RubyCacheTrace, "Flushing %s\n", *rec); @@ -159,15 +180,21 @@ CacheRecorder::enqueueNextFetchRequest() pkt->dataStatic(traceRecord->m_data + rec_bytes_read); Sequencer* m_sequencer_ptr = m_seq_map[traceRecord->m_cntrl_id]; +#if BUILD_GPU GPUCoalescer* m_coal_ptr; m_coal_ptr = m_coalescer_map[traceRecord->m_cntrl_id]; +#endif assert(m_sequencer_ptr != NULL); +#if BUILD_GPU if (m_coal_ptr == NULL) m_sequencer_ptr->makeRequest(pkt); else { pkt->req->setReqInstSeqNum(m_records_read); m_coal_ptr->makeRequest(pkt); } +#else + m_sequencer_ptr->makeRequest(pkt); +#endif } m_bytes_read += (sizeof(TraceRecord) + m_block_size_bytes); diff 
--git a/src/mem/ruby/system/CacheRecorder.hh b/src/mem/ruby/system/CacheRecorder.hh index 9363e2fde7..e94dfad97a 100644 --- a/src/mem/ruby/system/CacheRecorder.hh +++ b/src/mem/ruby/system/CacheRecorder.hh @@ -38,6 +38,7 @@ #include #include "base/types.hh" +#include "config/build_gpu.hh" #include "mem/ruby/common/Address.hh" #include "mem/ruby/common/DataBlock.hh" #include "mem/ruby/common/TypeDefines.hh" @@ -50,7 +51,9 @@ namespace ruby { class Sequencer; +#if BUILD_GPU class GPUCoalescer; +#endif /*! * Class for recording cache contents. Note that the last element of the @@ -77,11 +80,18 @@ class CacheRecorder CacheRecorder(); ~CacheRecorder(); +#if BUILD_GPU CacheRecorder(uint8_t* uncompressed_trace, uint64_t uncompressed_trace_size, std::vector& SequencerMap, std::vector& CoalescerMap, uint64_t block_size_bytes); +#else + CacheRecorder(uint8_t* uncompressed_trace, + uint64_t uncompressed_trace_size, + std::vector& SequencerMap, + uint64_t block_size_bytes); +#endif void addRecord(int cntrl, Addr data_addr, Addr pc_addr, RubyRequestType type, Tick time, DataBlock& data); @@ -117,7 +127,9 @@ class CacheRecorder uint8_t* m_uncompressed_trace; uint64_t m_uncompressed_trace_size; std::vector m_seq_map; +#if BUILD_GPU std::vector m_coalescer_map; +#endif uint64_t m_bytes_read; uint64_t m_records_read; uint64_t m_records_flushed; diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh index d6db5c00ba..3f936b4b41 100644 --- a/src/mem/ruby/system/GPUCoalescer.hh +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -32,6 +32,10 @@ #ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ #define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ +#include "config/build_gpu.hh" + +#if BUILD_GPU + #include #include @@ -546,4 +550,5 @@ operator<<(std::ostream& out, const GPUCoalescer& obj) } // namespace ruby } // namespace gem5 +#endif // BUILD_GPU #endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ diff --git a/src/mem/ruby/system/RubySystem.cc 
b/src/mem/ruby/system/RubySystem.cc index 232e337752..32dec7b9e0 100644 --- a/src/mem/ruby/system/RubySystem.cc +++ b/src/mem/ruby/system/RubySystem.cc @@ -178,21 +178,27 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, uint64_t block_size_bytes) { std::vector sequencer_map; +#if BUILD_GPU std::vector coalescer_map; - Sequencer* sequencer_ptr = NULL; GPUCoalescer* coalescer_ptr = NULL; +#endif + Sequencer* sequencer_ptr = NULL; for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) { sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer()); +#if BUILD_GPU coalescer_map.push_back(m_abs_cntrl_vec[cntrl]->getGPUCoalescer()); +#endif if (sequencer_ptr == NULL) { sequencer_ptr = sequencer_map[cntrl]; } +#if BUILD_GPU if (coalescer_ptr == NULL) { coalescer_ptr = coalescer_map[cntrl]; } +#endif } @@ -203,9 +209,11 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, sequencer_map[cntrl] = sequencer_ptr; } +#if BUILD_GPU if (coalescer_map[cntrl] == NULL) { coalescer_map[cntrl] = coalescer_ptr; } +#endif } @@ -215,9 +223,15 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, } // Create the CacheRecorder and record the cache trace +#if BUILD_GPU m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size, sequencer_map, coalescer_map, block_size_bytes); +#else + m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size, + sequencer_map, + block_size_bytes); +#endif } void diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh index c7e21e946b..d185620244 100644 --- a/src/mem/ruby/system/VIPERCoalescer.hh +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -32,6 +32,10 @@ #ifndef __MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__ #define __MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__ +#include "config/build_gpu.hh" + +#if BUILD_GPU + #include #include "mem/ruby/common/Address.hh" @@ -92,4 +96,5 @@ class VIPERCoalescer : public GPUCoalescer } // namespace ruby } // namespace gem5 +#endif // 
BUILD_GPU #endif //__MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__