diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
index beb8da3f9c..a70af07467 100644
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -73,6 +73,14 @@ UncoalescedTable::insertPacket(PacketPtr pkt)
             pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
 }
 
+void
+UncoalescedTable::insertReqType(PacketPtr pkt, RubyRequestType type)
+{
+    uint64_t seqNum = pkt->req->getReqInstSeqNum();
+
+    reqTypeMap[seqNum] = type;
+}
+
 bool
 UncoalescedTable::packetAvailable()
 {
@@ -128,9 +136,21 @@ UncoalescedTable::updateResources()
             instMap.erase(iter++);
             instPktsRemaining.erase(seq_num);
 
-            // Release the token
-            DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", seq_num);
-            coalescer->getGMTokenPort().sendTokens(1);
+            // Release the token if the Ruby system is not in cooldown
+            // or warmup phases. When in these phases, the RubyPorts
+            // are accessed directly using the makeRequest() command
+            // instead of accessing through the port. This makes
+            // sending tokens through the port unnecessary
+            if (!RubySystem::getWarmupEnabled()
+                    && !RubySystem::getCooldownEnabled()) {
+                if (reqTypeMap[seq_num] != RubyRequestType_FLUSH) {
+                    DPRINTF(GPUCoalescer,
+                            "Returning token seqNum %d\n", seq_num);
+                    coalescer->getGMTokenPort().sendTokens(1);
+                }
+            }
+
+            reqTypeMap.erase(seq_num);
         } else {
             ++iter;
         }
@@ -565,6 +585,14 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
     for (auto& pkt : pktList) {
         offset = getOffset(pkt->getAddr());
         pkt_size = pkt->getSize();
+        request_address = pkt->getAddr();
+
+        // When the Ruby system is cooldown phase, the requests come from
+        // the cache recorder. These requests do not get coalesced and
+        // do not return valid data.
+        if (RubySystem::getCooldownEnabled())
+            continue;
+
         if (pkt->getPtr<uint8_t>()) {
             switch(type) {
                 // Store and AtomicNoReturns follow the same path, as the
@@ -627,7 +655,6 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
     assert(!pkt->req->isLLSC());
     assert(!pkt->req->isLockedRMW());
     assert(!pkt->req->isInstFetch());
-    assert(!pkt->isFlush());
 
     if (pkt->req->isAtomicReturn()) {
         req_type = RubyRequestType_ATOMIC_RETURN;
@@ -637,6 +664,8 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
         req_type = RubyRequestType_LD;
     } else if (pkt->isWrite()) {
         req_type = RubyRequestType_ST;
+    } else if (pkt->isFlush()) {
+        req_type = RubyRequestType_FLUSH;
     } else {
         panic("Unsupported ruby packet type\n");
     }
@@ -658,7 +687,7 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
         issueMemSyncRequest(pkt);
     } else {
         // otherwise, this must be either read or write command
-        assert(pkt->isRead() || pkt->isWrite());
+        assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush());
 
         InstSeqNum seq_num = pkt->req->getReqInstSeqNum();
 
@@ -667,10 +696,17 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
         // number of lanes actives for that vmem request (i.e., the popcnt
         // of the exec_mask.
         int num_packets = 1;
-        if (!m_usingRubyTester) {
-            num_packets = 0;
-            for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
-                num_packets += getDynInst(pkt)->getLaneStatus(i);
+
+        // When Ruby is in warmup or cooldown phase, the requests come from
+        // the cache recorder. There is no dynamic instruction associated
+        // with these requests either
+        if (!RubySystem::getWarmupEnabled()
+                && !RubySystem::getCooldownEnabled()) {
+            if (!m_usingRubyTester) {
+                num_packets = 0;
+                for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
+                    num_packets += getDynInst(pkt)->getLaneStatus(i);
+                }
             }
         }
 
@@ -679,6 +715,7 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
         // future cycle. Packets remaining is set to the number of excepted
         // requests from the instruction based on its exec_mask.
         uncoalescedTable.insertPacket(pkt);
+        uncoalescedTable.insertReqType(pkt, getRequestType(pkt));
         uncoalescedTable.initPacketsRemaining(seq_num, num_packets);
         DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
                 pkt->getAddr());
@@ -945,21 +982,27 @@ void
 GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
 {
     for (auto& pkt : mylist) {
-        RubyPort::SenderState *ss =
-            safe_cast<RubyPort::SenderState *>(pkt->senderState);
-        MemResponsePort *port = ss->port;
-        assert(port != NULL);
+        // When Ruby is in warmup or cooldown phase, the requests come
+        // from the cache recorder. They do not track which port to use
+        // and do not need to send the response back
+        if (!RubySystem::getWarmupEnabled()
+                && !RubySystem::getCooldownEnabled()) {
+            RubyPort::SenderState *ss =
+                safe_cast<RubyPort::SenderState *>(pkt->senderState);
+            MemResponsePort *port = ss->port;
+            assert(port != NULL);
 
-        pkt->senderState = ss->predecessor;
+            pkt->senderState = ss->predecessor;
 
-        if (pkt->cmd != MemCmd::WriteReq) {
-            // for WriteReq, we keep the original senderState until
-            // writeCompleteCallback
-            delete ss;
+            if (pkt->cmd != MemCmd::WriteReq) {
+                // for WriteReq, we keep the original senderState until
+                // writeCompleteCallback
+                delete ss;
+            }
+
+            port->hitCallback(pkt);
+            trySendRetries();
         }
-
-        port->hitCallback(pkt);
-        trySendRetries();
     }
 
     // We schedule an event in the same tick as hitCallback (similar to
@@ -971,7 +1014,14 @@ GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
         schedule(issueEvent, curTick());
     }
 
-    testDrainComplete();
+    RubySystem *rs = m_ruby_system;
+    if (RubySystem::getWarmupEnabled()) {
+        rs->m_cache_recorder->enqueueNextFetchRequest();
+    } else if (RubySystem::getCooldownEnabled()) {
+        rs->m_cache_recorder->enqueueNextFlushRequest();
+    } else {
+        testDrainComplete();
+    }
 }
 
 void
diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh
index dd28855547..d6db5c00ba 100644
--- a/src/mem/ruby/system/GPUCoalescer.hh
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -71,6 +71,7 @@ class UncoalescedTable
     ~UncoalescedTable() {}
 
     void insertPacket(PacketPtr pkt);
+    void insertReqType(PacketPtr pkt, RubyRequestType type);
     bool packetAvailable();
     void printRequestTable(std::stringstream& ss);
 
@@ -101,6 +102,8 @@ class UncoalescedTable
     std::map<InstSeqNum, PerInstPackets> instMap;
 
     std::map<InstSeqNum, int> instPktsRemaining;
+
+    std::map<InstSeqNum, RubyRequestType> reqTypeMap;
 };
 
 class CoalescedRequest