dev-amdgpu,mem-ruby: Add support to checkpoint and restore between kernels in GPUFS (#377)

Earlier, GPU checkpointing was working only if a checkpoint was created before the first kernel execution. This pull request adds support to checkpoint in-between any two kernel calls. It does so by doing the following. - Adds flush support in the GPU_VIPER protocol - Adds flush support in the GPUCoalescer - Updates cache recorder to use the GPUCoalescer during simulation cooldown and cache warmup times.
2023-10-10 09:41:21 -05:00
parent d9fe0cfe1c a19667427a
commit ec633b3d68
14 changed files with 381 additions and 38 deletions
--- a/configs/example/gpufs/runfs.py
+++ b/configs/example/gpufs/runfs.py
@@ -158,6 +158,16 @@ def addRunFSOptions(parser):
        help="Root partition of disk image",
    )

+    parser.add_argument(
+        "--disable-avx",
+        action="store_true",
+        default=False,
+        help="Disables AVX. AVX is used in some ROCm libraries but "
+        "does not have checkpointing support yet. If simulation either "
+        "creates a checkpoint or restores from one, then AVX needs to "
+        "be disabled for correct functionality ",
+    )
+

 def runGpuFSSystem(args):
    """
--- a/configs/example/gpufs/system/system.py
+++ b/configs/example/gpufs/system/system.py
@@ -234,7 +234,7 @@ def makeGpuFSSystem(args):
    # If we are using KVM cpu, enable AVX. AVX is used in some ROCm libraries
    # such as rocBLAS which is used in higher level libraries like PyTorch.
    use_avx = False
-    if ObjectList.is_kvm_cpu(TestCPUClass):
+    if ObjectList.is_kvm_cpu(TestCPUClass) and not args.disable_avx:
        # AVX also requires CR4.osxsave to be 1. These must be set together
        # of KVM will error out.
        system.workload.enable_osxsave = 1
--- a/src/dev/amdgpu/pm4_packet_processor.cc
+++ b/src/dev/amdgpu/pm4_packet_processor.cc
@@ -1044,6 +1044,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
    int num_queues = queues.size();
    Addr id[num_queues];
    Addr mqd_base[num_queues];
+    uint64_t mqd_read_index[num_queues];
    Addr base[num_queues];
    Addr rptr[num_queues];
    Addr wptr[num_queues];
@@ -1060,6 +1061,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
    uint32_t hqd_active[num_queues];
    uint32_t hqd_vmid[num_queues];
    Addr aql_rptr[num_queues];
+    uint32_t aql[num_queues];
    uint32_t doorbell[num_queues];
    uint32_t hqd_pq_control[num_queues];

@@ -1068,9 +1070,10 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
        PM4Queue *q = iter.second;
        id[i] = q->id();
        mqd_base[i] = q->mqdBase();
+        mqd_read_index[i] = q->getMQD()->mqdReadIndex;
        bool cur_state = q->ib();
        q->ib(false);
-        base[i] = q->base() >> 8;
+        base[i] = q->base();
        rptr[i] = q->getRptr();
        wptr[i] = q->getWptr();
        q->ib(true);
@@ -1088,6 +1091,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
        hqd_active[i] = q->getMQD()->hqd_active;
        hqd_vmid[i] = q->getMQD()->hqd_vmid;
        aql_rptr[i] = q->getMQD()->aqlRptr;
+        aql[i] = q->getMQD()->aql;
        doorbell[i] = q->getMQD()->doorbell;
        hqd_pq_control[i] = q->getMQD()->hqd_pq_control;
        i++;
@@ -1096,6 +1100,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
    SERIALIZE_SCALAR(num_queues);
    SERIALIZE_ARRAY(id, num_queues);
    SERIALIZE_ARRAY(mqd_base, num_queues);
+    SERIALIZE_ARRAY(mqd_read_index, num_queues);
    SERIALIZE_ARRAY(base, num_queues);
    SERIALIZE_ARRAY(rptr, num_queues);
    SERIALIZE_ARRAY(wptr, num_queues);
@@ -1112,6 +1117,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
    SERIALIZE_ARRAY(hqd_active, num_queues);
    SERIALIZE_ARRAY(hqd_vmid, num_queues);
    SERIALIZE_ARRAY(aql_rptr, num_queues);
+    SERIALIZE_ARRAY(aql, num_queues);
    SERIALIZE_ARRAY(doorbell, num_queues);
    SERIALIZE_ARRAY(hqd_pq_control, num_queues);
 }
@@ -1127,6 +1133,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)

    Addr id[num_queues];
    Addr mqd_base[num_queues];
+    uint64_t mqd_read_index[num_queues];
    Addr base[num_queues];
    Addr rptr[num_queues];
    Addr wptr[num_queues];
@@ -1143,11 +1150,13 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
    uint32_t hqd_active[num_queues];
    uint32_t hqd_vmid[num_queues];
    Addr aql_rptr[num_queues];
+    uint32_t aql[num_queues];
    uint32_t doorbell[num_queues];
    uint32_t hqd_pq_control[num_queues];

    UNSERIALIZE_ARRAY(id, num_queues);
    UNSERIALIZE_ARRAY(mqd_base, num_queues);
+    UNSERIALIZE_ARRAY(mqd_read_index, num_queues);
    UNSERIALIZE_ARRAY(base, num_queues);
    UNSERIALIZE_ARRAY(rptr, num_queues);
    UNSERIALIZE_ARRAY(wptr, num_queues);
@@ -1164,6 +1173,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
    UNSERIALIZE_ARRAY(hqd_active, num_queues);
    UNSERIALIZE_ARRAY(hqd_vmid, num_queues);
    UNSERIALIZE_ARRAY(aql_rptr, num_queues);
+    UNSERIALIZE_ARRAY(aql, num_queues);
    UNSERIALIZE_ARRAY(doorbell, num_queues);
    UNSERIALIZE_ARRAY(hqd_pq_control, num_queues);

@@ -1172,22 +1182,24 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
        memset(mqd, 0, sizeof(QueueDesc));

        mqd->mqdBase = mqd_base[i] >> 8;
-        mqd->base = base[i];
-        mqd->rptr = rptr[i];
-        mqd->ibBase = ib_base[i];
-        mqd->ibRptr = ib_rptr[i];
+        mqd->mqdReadIndex = mqd_read_index[i];
+        mqd->base = base[i] >> 8;
+        mqd->aql = aql[i];

        PM4MapQueues* pkt = new PM4MapQueues;
        memset(pkt, 0, sizeof(PM4MapQueues));
        newQueue(mqd, offset[i], pkt, id[i]);

-        queues[id[i]]->ib(false);
-        queues[id[i]]->wptr(wptr[i]);
-        queues[id[i]]->ib(true);
-        queues[id[i]]->wptr(ib_wptr[i]);
+        if (ib[i]) {
+            queues[id[i]]->wptr(ib_wptr[i]);
+            queues[id[i]]->rptr(ib_rptr[i]);
+        } else {
+            queues[id[i]]->rptr(rptr[i]);
+            queues[id[i]]->wptr(wptr[i]);
+        }
+        queues[id[i]]->ib(ib[i]);
        queues[id[i]]->offset(offset[i]);
        queues[id[i]]->processing(processing[i]);
-        queues[id[i]]->ib(ib[i]);
        queues[id[i]]->setPkt(me[i], pipe[i], queue[i], privileged[i]);
        queues[id[i]]->getMQD()->hqd_active = hqd_active[i];
        queues[id[i]]->getMQD()->hqd_vmid = hqd_vmid[i];
@@ -1195,6 +1207,14 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
        queues[id[i]]->getMQD()->doorbell = doorbell[i];
        queues[id[i]]->getMQD()->hqd_pq_control = hqd_pq_control[i];

+        if (mqd->aql) {
+            int mqd_size = (1 << ((hqd_pq_control[i] & 0x3f) + 1)) * 4;
+            auto &hsa_pp = gpuDevice->CP()->hsaPacketProc();
+            hsa_pp.setDeviceQueueDesc(aql_rptr[i], base[i], id[i],
+                                  mqd_size, 8, GfxVersion::gfx900, offset[i],
+                                  mqd_read_index[i]);
+        }
+
        DPRINTF(PM4PacketProcessor, "PM4 queue %d, rptr: %p wptr: %p\n",
                queues[id[i]]->id(), queues[id[i]]->rptr(),
                queues[id[i]]->wptr());
--- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm
@@ -65,7 +65,8 @@ machine(MachineType:TCC, "TCC Cache")
    AtomicPassOn,           desc="Atomic Op Passed on to Directory";
    AtomicDone,             desc="AtomicOps Complete";
    AtomicNotDone,          desc="AtomicOps not Complete";
-    Data,                   desc="data messgae";
+    Data,                   desc="Data message";
+    Flush,                  desc="Flush cache entry";
    // Coming from this TCC
    L2_Repl,                desc="L2 Replacement";
    // Probes
@@ -376,6 +377,8 @@ machine(MachineType:TCC, "TCC Cache")
          } else {
            trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
          }
+        } else if (in_msg.Type == CoherenceRequestType:WriteFlush) {
+            trigger(Event:Flush, in_msg.addr, cache_entry, tbe);
        } else {
          DPRINTF(RubySlicc, "%s\n", in_msg);
          error("Unexpected Response Message to Core");
@@ -509,6 +512,20 @@ machine(MachineType:TCC, "TCC Cache")
    }
  }

+  action(fw_sendFlushResponse, "fw", desc="send Flush Response") {
+    peek(coreRequestNetwork_in, CPURequestMsg) {
+      enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:TDSysWBAck;
+        out_msg.Destination.clear();
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.Sender := machineID;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+        out_msg.instSeqNum := in_msg.instSeqNum;
+      }
+    }
+  }
+
  action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") {
    peek(coreRequestNetwork_in, CPURequestMsg) {
        enqueue(responseToCore_out, ResponseMsg, l2_response_latency + glc_atomic_latency, true) {
@@ -628,6 +645,22 @@ machine(MachineType:TCC, "TCC Cache")
    }
  }

+  action(f_flush, "f", desc="write back data") {
+    peek(coreRequestNetwork_in, CPURequestMsg) {
+      enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+        out_msg.addr := address;
+        out_msg.Requestor := machineID;
+        out_msg.WTRequestor := in_msg.Requestor;
+        out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
+        out_msg.MessageSize := MessageSizeType:Data;
+        out_msg.Type := CoherenceRequestType:WriteFlush;
+        out_msg.Dirty := true;
+        out_msg.DataBlk := cache_entry.DataBlk;
+        out_msg.writeMask.orMask(cache_entry.writeMask);
+      }
+    }
+  }
+
  action(at_atomicThrough, "at", desc="write back data") {
    peek(coreRequestNetwork_in, CPURequestMsg) {
      enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
@@ -1075,4 +1108,21 @@ machine(MachineType:TCC, "TCC Cache")
  transition(WIB, WBAck,I) {
    pr_popResponseQueue;
  }
+
+  transition({A, IV, WI, WIB}, Flush) {
+    st_stallAndWaitRequest;
+  }
+
+  transition(I, Flush) {
+    fw_sendFlushResponse;
+    p_popRequestQueue;
+  }
+
+  transition({V, W}, Flush, I) {TagArrayRead, TagArrayWrite} {
+    t_allocateTBE;
+    ut_updateTag;
+    f_flush;
+    i_invL2;
+    p_popRequestQueue;
+   }
 }
--- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
+++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm
@@ -55,6 +55,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
    I, AccessPermission:Invalid, desc="Invalid";
    V, AccessPermission:Read_Only, desc="Valid";
    A, AccessPermission:Invalid, desc="Waiting on Atomic";
+
+    F, AccessPermission:Invalid, desc="Flushing; Waiting for Ack";
  }

  enumeration(Event, desc="TCP Events") {
@@ -256,6 +258,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
      peek(responseToTCP_in, ResponseMsg, block_on="addr") {
        Entry cache_entry := getCacheEntry(in_msg.addr);
        TBE tbe := TBEs.lookup(in_msg.addr);
+        DPRINTF(RubySlicc, "In responseToTCP_in with %s\n", in_msg);
+
        if (in_msg.Type == CoherenceResponseType:TDSysResp) {
          if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
              // If L1 is disabled or requests have GLC or SLC flag set,
@@ -273,6 +277,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
        } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck ||
                     in_msg.Type == CoherenceResponseType:NBSysWBAck) {
            trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
+            DPRINTF(RubySlicc, "Issuing TCC_AckWB\n");
          } else {
            error("Unexpected Response Message to Core");
          }
@@ -469,6 +474,24 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
  action(sf_setFlush, "sf", desc="set flush") {
    inFlush := true;
    APPEND_TRANSITION_COMMENT(" inFlush is true");
+    enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+      out_msg.addr := address;
+      out_msg.Requestor := machineID;
+      assert(is_valid(cache_entry));
+      out_msg.DataBlk := cache_entry.DataBlk;
+      out_msg.writeMask.clear();
+      out_msg.writeMask.orMask(cache_entry.writeMask);
+      out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+                              TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.MessageSize := MessageSizeType:Data;
+      out_msg.Type := CoherenceRequestType:WriteFlush;
+      out_msg.InitialRequestTime := curCycle();
+      out_msg.Shared := false;
+      out_msg.isSLCSet := false;
+      peek(mandatoryQueue_in, RubyRequest) {
+        out_msg.instSeqNum := in_msg.instSeqNum;
+      }
+    }
  }

  action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
@@ -524,6 +547,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
    cache_entry.Dirty := true;
  }

+  action(f_flushDone, "f", desc="flush done") {
+    assert(is_valid(cache_entry));
+
+    if (use_seq_not_coal) {
+        sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
+    } else {
+        coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
+    }
+  }
+
  action(inv_invDone, "inv", desc="local inv done") {
    if (use_seq_not_coal) {
        DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n");
@@ -695,11 +728,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
    ic_invCache;
  }

-  transition({V, I, A},Flush) {TagArrayFlash} {
+  transition({V,I}, Flush, F) {TagArrayFlash} {
+    a_allocate;
    sf_setFlush;
    p_popMandatoryQueue;
  }

+  transition(A, Flush) {
+    z_stall;
+  }
+
  transition({I, V}, Evict, I) {TagArrayFlash} {
    inv_invDone;
    p_popMandatoryQueue;
@@ -716,4 +754,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
    wd_wtDone;
    pr_popResponseQueue;
  }
+
+  transition(F, TCC_AckWB, I) {
+    f_flushDone;
+    pr_popResponseQueue;
+    ic_invCache;
+  }
 }
--- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
+++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm
@@ -83,6 +83,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
    BM_Pm, AccessPermission:Backing_Store,      desc="blocked waiting for probes, already got memory";
    B_Pm, AccessPermission:Backing_Store,       desc="blocked waiting for probes, already got memory";
    B, AccessPermission:Backing_Store,          desc="sent response, Blocked til ack";
+
+    F, AccessPermission:Busy, desc="sent Flus, blocked till ack";
  }

  // Events
@@ -120,6 +122,9 @@ machine(MachineType:Directory, "AMD Baseline protocol")
    // DMA
    DmaRead,            desc="DMA read";
    DmaWrite,           desc="DMA write";
+
+    // Flush
+    Flush,              desc="Flush entry";
  }

  enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
@@ -411,6 +416,9 @@ machine(MachineType:Directory, "AMD Baseline protocol")
            DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr);
            trigger(Event:VicClean, in_msg.addr, entry, tbe);
          }
+        } else if (in_msg.Type == CoherenceRequestType:WriteFlush) {
+            DPRINTF(RubySlicc, "Got Flush from %s on %s\n", in_msg.Requestor, in_msg.addr);
+            trigger(Event:Flush, in_msg.addr, entry, tbe);
        } else {
          error("Bad request message type");
        }
@@ -562,6 +570,23 @@ machine(MachineType:Directory, "AMD Baseline protocol")
    }
  }

+  action(rf_sendResponseFlush, "rf", desc="send Flush Ack") {
+    peek(memQueue_in, MemoryMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, 1) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:NBSysWBAck;
+        out_msg.Destination.add(tbe.OriginalRequestor);
+        out_msg.WTRequestor := tbe.WTRequestor;
+        out_msg.Sender := machineID;
+        out_msg.MessageSize := MessageSizeType:Writeback_Control;
+        out_msg.InitialRequestTime := tbe.InitialRequestTime;
+        out_msg.ForwardRequestTime := curCycle();
+        out_msg.ProbeRequestStartTime := curCycle();
+        //out_msg.instSeqNum := in_msg.instSeqNum;
+      }
+    }
+  }
+
  action(l_queueMemWBReq, "lq", desc="Write WB data to memory") {
    peek(responseNetwork_in, ResponseMsg) {
      enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
@@ -933,6 +958,23 @@ machine(MachineType:Directory, "AMD Baseline protocol")
    }
  }

+  action(f_writeFlushDataToMemory, "f", desc="Write flush data to memory") {
+    peek(requestNetwork_in, CPURequestMsg) {
+      enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
+        out_msg.addr := address;
+        out_msg.Type := MemoryRequestType:MEMORY_WB;
+        out_msg.Sender := machineID;
+        out_msg.MessageSize := MessageSizeType:Writeback_Data;
+        out_msg.DataBlk := in_msg.DataBlk;
+      }
+      if (tbe.Dirty == false) {
+          // have to update the TBE, too, because of how this
+          // directory deals with functional writes
+        tbe.DataBlk := in_msg.DataBlk;
+      }
+    }
+  }
+
  action(atd_allocateTBEforDMA, "atd", desc="allocate TBE Entry for DMA") {
    check_allocate(TBEs);
    peek(dmaRequestQueue_in, DMARequestMsg) {
@@ -1553,4 +1595,17 @@ machine(MachineType:Directory, "AMD Baseline protocol")
    dt_deallocateTBE;
    pt_popTriggerQueue;
  }
+
+ transition(U, Flush, F) {L3TagArrayRead, L3TagArrayWrite} {
+    t_allocateTBE;
+    f_writeFlushDataToMemory;
+    w_sendResponseWBAck;
+    p_popRequestQueue;
+ }
+
+ transition(F, WBAck, U) {
+    pm_popMemQueue;
+    dt_deallocateTBE;
+ }
+
 }
--- a/src/mem/ruby/slicc_interface/AbstractController.hh
+++ b/src/mem/ruby/slicc_interface/AbstractController.hh
@@ -70,7 +70,9 @@ namespace ruby
 {

 class Network;
+#ifdef BUILD_GPU
 class GPUCoalescer;
+#endif
 class DMASequencer;

 // used to communicate that an in_port peeked the wrong message type
--- a/src/mem/ruby/system/CacheRecorder.cc
+++ b/src/mem/ruby/system/CacheRecorder.cc
@@ -30,8 +30,11 @@
 #include "mem/ruby/system/CacheRecorder.hh"

 #include "debug/RubyCacheTrace.hh"
+#include "mem/packet.hh"
+#include "mem/ruby/system/GPUCoalescer.hh"
 #include "mem/ruby/system/RubySystem.hh"
 #include "mem/ruby/system/Sequencer.hh"
+#include "sim/sim_exit.hh"

 namespace gem5
 {
@@ -54,14 +57,29 @@ CacheRecorder::CacheRecorder()
 {
 }

+#if BUILD_GPU
+CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace,
+                             uint64_t uncompressed_trace_size,
+                             std::vector<Sequencer*>& seq_map,
+                             std::vector<GPUCoalescer*>& coal_map,
+                             uint64_t block_size_bytes)
+    : m_uncompressed_trace(uncompressed_trace),
+      m_uncompressed_trace_size(uncompressed_trace_size),
+      m_seq_map(seq_map), m_coalescer_map(coal_map), m_bytes_read(0),
+      m_records_read(0), m_records_flushed(0),
+      m_block_size_bytes(block_size_bytes)
+#else
 CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace,
                             uint64_t uncompressed_trace_size,
                             std::vector<Sequencer*>& seq_map,
                             uint64_t block_size_bytes)
    : m_uncompressed_trace(uncompressed_trace),
      m_uncompressed_trace_size(uncompressed_trace_size),
-      m_seq_map(seq_map),  m_bytes_read(0), m_records_read(0),
-      m_records_flushed(0), m_block_size_bytes(block_size_bytes)
+      m_seq_map(seq_map), m_bytes_read(0),
+      m_records_read(0), m_records_flushed(0),
+      m_block_size_bytes(block_size_bytes)
+
+#endif
 {
    if (m_uncompressed_trace != NULL) {
        if (m_block_size_bytes < RubySystem::getBlockSizeBytes()) {
@@ -81,6 +99,9 @@ CacheRecorder::~CacheRecorder()
        m_uncompressed_trace = NULL;
    }
    m_seq_map.clear();
+#if BUILD_GPU
+    m_coalescer_map.clear();
+#endif
 }

 void
@@ -96,11 +117,27 @@ CacheRecorder::enqueueNextFlushRequest()
        Packet *pkt = new Packet(req, requestType);

        Sequencer* m_sequencer_ptr = m_seq_map[rec->m_cntrl_id];
+#if BUILD_GPU
+        GPUCoalescer* m_coal_ptr = m_coalescer_map[rec->m_cntrl_id];
+#endif
        assert(m_sequencer_ptr != NULL);
+#if BUILD_GPU
+        if (m_coal_ptr == NULL)
+            m_sequencer_ptr->makeRequest(pkt);
+        else {
+            pkt->req->setReqInstSeqNum(m_records_flushed - 1);
+            m_coal_ptr->makeRequest(pkt);
+        }
+#else
        m_sequencer_ptr->makeRequest(pkt);
+#endif

        DPRINTF(RubyCacheTrace, "Flushing %s\n", *rec);
+
    } else {
+        if (m_records_flushed > 0) {
+            exitSimLoop("Finished Drain", 0);
+        }
        DPRINTF(RubyCacheTrace, "Flushed all %d records\n", m_records_flushed);
    }
 }
@@ -143,13 +180,27 @@ CacheRecorder::enqueueNextFetchRequest()
            pkt->dataStatic(traceRecord->m_data + rec_bytes_read);

            Sequencer* m_sequencer_ptr = m_seq_map[traceRecord->m_cntrl_id];
+#if BUILD_GPU
+            GPUCoalescer* m_coal_ptr;
+            m_coal_ptr = m_coalescer_map[traceRecord->m_cntrl_id];
+#endif
            assert(m_sequencer_ptr != NULL);
+#if BUILD_GPU
+            if (m_coal_ptr == NULL)
+                m_sequencer_ptr->makeRequest(pkt);
+            else {
+                pkt->req->setReqInstSeqNum(m_records_read);
+                m_coal_ptr->makeRequest(pkt);
+            }
+#else
            m_sequencer_ptr->makeRequest(pkt);
+#endif
        }

        m_bytes_read += (sizeof(TraceRecord) + m_block_size_bytes);
        m_records_read++;
    } else {
+        exitSimLoop("Finished Warmup", 0);
        DPRINTF(RubyCacheTrace, "Fetched all %d records\n", m_records_read);
    }
 }
@@ -168,6 +219,8 @@ CacheRecorder::addRecord(int cntrl, Addr data_addr, Addr pc_addr,
    memcpy(rec->m_data, data.getData(0, m_block_size_bytes),
           m_block_size_bytes);

+    DPRINTF(RubyCacheTrace, "Inside addRecord with cntrl id %d and type %d\n",
+            cntrl, type);
    m_records.push_back(rec);
 }

--- a/src/mem/ruby/system/CacheRecorder.hh
+++ b/src/mem/ruby/system/CacheRecorder.hh
@@ -38,6 +38,7 @@
 #include <vector>

 #include "base/types.hh"
+#include "config/build_gpu.hh"
 #include "mem/ruby/common/Address.hh"
 #include "mem/ruby/common/DataBlock.hh"
 #include "mem/ruby/common/TypeDefines.hh"
@@ -50,6 +51,9 @@ namespace ruby
 {

 class Sequencer;
+#if BUILD_GPU
+class GPUCoalescer;
+#endif

 /*!
 * Class for recording cache contents. Note that the last element of the
@@ -76,10 +80,18 @@ class CacheRecorder
    CacheRecorder();
    ~CacheRecorder();

+#if BUILD_GPU
+    CacheRecorder(uint8_t* uncompressed_trace,
+                  uint64_t uncompressed_trace_size,
+                  std::vector<Sequencer*>& SequencerMap,
+                  std::vector<GPUCoalescer*>& CoalescerMap,
+                  uint64_t block_size_bytes);
+#else
    CacheRecorder(uint8_t* uncompressed_trace,
                  uint64_t uncompressed_trace_size,
                  std::vector<Sequencer*>& SequencerMap,
                  uint64_t block_size_bytes);
+#endif
    void addRecord(int cntrl, Addr data_addr, Addr pc_addr,
                   RubyRequestType type, Tick time, DataBlock& data);

@@ -115,6 +127,9 @@ class CacheRecorder
    uint8_t* m_uncompressed_trace;
    uint64_t m_uncompressed_trace_size;
    std::vector<Sequencer*> m_seq_map;
+#if BUILD_GPU
+    std::vector<GPUCoalescer*> m_coalescer_map;
+#endif
    uint64_t m_bytes_read;
    uint64_t m_records_read;
    uint64_t m_records_flushed;
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -73,6 +73,14 @@ UncoalescedTable::insertPacket(PacketPtr pkt)
            pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
 }

+void
+UncoalescedTable::insertReqType(PacketPtr pkt, RubyRequestType type)
+{
+    uint64_t seqNum = pkt->req->getReqInstSeqNum();
+
+    reqTypeMap[seqNum] = type;
+}
+
 bool
 UncoalescedTable::packetAvailable()
 {
@@ -128,9 +136,21 @@ UncoalescedTable::updateResources()
            instMap.erase(iter++);
            instPktsRemaining.erase(seq_num);

-            // Release the token
-            DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", seq_num);
-            coalescer->getGMTokenPort().sendTokens(1);
+            // Release the token if the Ruby system is not in cooldown
+            // or warmup phases. When in these phases, the RubyPorts
+            // are accessed directly using the makeRequest() command
+            // instead of accessing through the port. This makes
+            // sending tokens through the port unnecessary
+            if (!RubySystem::getWarmupEnabled()
+                    && !RubySystem::getCooldownEnabled()) {
+                if (reqTypeMap[seq_num] != RubyRequestType_FLUSH) {
+                    DPRINTF(GPUCoalescer,
+                            "Returning token seqNum %d\n", seq_num);
+                    coalescer->getGMTokenPort().sendTokens(1);
+                }
+            }
+
+            reqTypeMap.erase(seq_num);
        } else {
            ++iter;
        }
@@ -565,6 +585,14 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
    for (auto& pkt : pktList) {
        offset = getOffset(pkt->getAddr());
        pkt_size = pkt->getSize();
+        request_address = pkt->getAddr();
+
+        // When the Ruby system is cooldown phase, the requests come from
+        // the cache recorder. These requests do not get coalesced and
+        // do not return valid data.
+        if (RubySystem::getCooldownEnabled())
+            continue;
+
        if (pkt->getPtr<uint8_t>()) {
            switch(type) {
                // Store and AtomicNoReturns follow the same path, as the
@@ -627,7 +655,6 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
    assert(!pkt->req->isLLSC());
    assert(!pkt->req->isLockedRMW());
    assert(!pkt->req->isInstFetch());
-    assert(!pkt->isFlush());

    if (pkt->req->isAtomicReturn()) {
        req_type = RubyRequestType_ATOMIC_RETURN;
@@ -637,6 +664,8 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
        req_type = RubyRequestType_LD;
    } else if (pkt->isWrite()) {
        req_type = RubyRequestType_ST;
+    } else if (pkt->isFlush()) {
+        req_type = RubyRequestType_FLUSH;
    } else {
        panic("Unsupported ruby packet type\n");
    }
@@ -658,7 +687,7 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
        issueMemSyncRequest(pkt);
    } else {
        // otherwise, this must be either read or write command
-        assert(pkt->isRead() || pkt->isWrite());
+        assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush());

        InstSeqNum seq_num = pkt->req->getReqInstSeqNum();

@@ -667,10 +696,17 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
        // number of lanes actives for that vmem request (i.e., the popcnt
        // of the exec_mask.
        int num_packets = 1;
-        if (!m_usingRubyTester) {
-            num_packets = 0;
-            for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
-                num_packets += getDynInst(pkt)->getLaneStatus(i);
+
+        // When Ruby is in warmup or cooldown phase, the requests come from
+        // the cache recorder. There is no dynamic instruction associated
+        // with these requests either
+        if (!RubySystem::getWarmupEnabled()
+                && !RubySystem::getCooldownEnabled()) {
+            if (!m_usingRubyTester) {
+                num_packets = 0;
+                for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
+                    num_packets += getDynInst(pkt)->getLaneStatus(i);
+                }
            }
        }

@@ -679,6 +715,7 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
        // future cycle. Packets remaining is set to the number of excepted
        // requests from the instruction based on its exec_mask.
        uncoalescedTable.insertPacket(pkt);
+        uncoalescedTable.insertReqType(pkt, getRequestType(pkt));
        uncoalescedTable.initPacketsRemaining(seq_num, num_packets);
        DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
                pkt->getAddr());
@@ -945,21 +982,27 @@ void
 GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
 {
    for (auto& pkt : mylist) {
-        RubyPort::SenderState *ss =
-            safe_cast<RubyPort::SenderState *>(pkt->senderState);
-        MemResponsePort *port = ss->port;
-        assert(port != NULL);
+        // When Ruby is in warmup or cooldown phase, the requests come
+        // from the cache recorder. They do not track which port to use
+        // and do not need to send the response back
+        if (!RubySystem::getWarmupEnabled()
+                && !RubySystem::getCooldownEnabled()) {
+            RubyPort::SenderState *ss =
+                safe_cast<RubyPort::SenderState *>(pkt->senderState);
+            MemResponsePort *port = ss->port;
+            assert(port != NULL);

-        pkt->senderState = ss->predecessor;
+            pkt->senderState = ss->predecessor;

-        if (pkt->cmd != MemCmd::WriteReq) {
-            // for WriteReq, we keep the original senderState until
-            // writeCompleteCallback
-            delete ss;
+            if (pkt->cmd != MemCmd::WriteReq) {
+                // for WriteReq, we keep the original senderState until
+                // writeCompleteCallback
+                delete ss;
+            }
+
+            port->hitCallback(pkt);
+            trySendRetries();
        }
-
-        port->hitCallback(pkt);
-        trySendRetries();
    }

    // We schedule an event in the same tick as hitCallback (similar to
@@ -971,7 +1014,14 @@ GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
        schedule(issueEvent, curTick());
    }

-    testDrainComplete();
+    RubySystem *rs = m_ruby_system;
+    if (RubySystem::getWarmupEnabled()) {
+        rs->m_cache_recorder->enqueueNextFetchRequest();
+    } else if (RubySystem::getCooldownEnabled()) {
+        rs->m_cache_recorder->enqueueNextFlushRequest();
+    } else {
+        testDrainComplete();
+    }
 }

 void
--- a/src/mem/ruby/system/GPUCoalescer.hh
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -32,6 +32,10 @@
 #ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
 #define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__

+#include "config/build_gpu.hh"
+
+#if BUILD_GPU
+
 #include <iostream>
 #include <unordered_map>

@@ -71,6 +75,7 @@ class UncoalescedTable
    ~UncoalescedTable() {}

    void insertPacket(PacketPtr pkt);
+    void insertReqType(PacketPtr pkt, RubyRequestType type);
    bool packetAvailable();
    void printRequestTable(std::stringstream& ss);

@@ -101,6 +106,8 @@ class UncoalescedTable
    std::map<InstSeqNum, PerInstPackets> instMap;

    std::map<InstSeqNum, int> instPktsRemaining;
+
+    std::map<InstSeqNum, RubyRequestType> reqTypeMap;
 };

 class CoalescedRequest
@@ -543,4 +550,5 @@ operator<<(std::ostream& out, const GPUCoalescer& obj)
 } // namespace ruby
 } // namespace gem5

+#endif // BUILD_GPU
 #endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
--- a/src/mem/ruby/system/RubySystem.cc
+++ b/src/mem/ruby/system/RubySystem.cc
@@ -178,13 +178,28 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace,
                              uint64_t block_size_bytes)
 {
    std::vector<Sequencer*> sequencer_map;
+#if BUILD_GPU
+    std::vector<GPUCoalescer*> coalescer_map;
+    GPUCoalescer* coalescer_ptr = NULL;
+#endif
    Sequencer* sequencer_ptr = NULL;

    for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
        sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer());
+#if BUILD_GPU
+        coalescer_map.push_back(m_abs_cntrl_vec[cntrl]->getGPUCoalescer());
+#endif
+
        if (sequencer_ptr == NULL) {
            sequencer_ptr = sequencer_map[cntrl];
        }
+
+#if BUILD_GPU
+        if (coalescer_ptr == NULL) {
+            coalescer_ptr = coalescer_map[cntrl];
+        }
+#endif
+
    }

    assert(sequencer_ptr != NULL);
@@ -193,6 +208,13 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace,
        if (sequencer_map[cntrl] == NULL) {
            sequencer_map[cntrl] = sequencer_ptr;
        }
+
+#if BUILD_GPU
+        if (coalescer_map[cntrl] == NULL) {
+            coalescer_map[cntrl] = coalescer_ptr;
+        }
+#endif
+
    }

    // Remove the old CacheRecorder if it's still hanging about.
@@ -201,8 +223,15 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace,
    }

    // Create the CacheRecorder and record the cache trace
+#if BUILD_GPU
    m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size,
-                                         sequencer_map, block_size_bytes);
+                                         sequencer_map, coalescer_map,
+                                         block_size_bytes);
+#else
+    m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size,
+                                         sequencer_map,
+                                         block_size_bytes);
+#endif
 }

 void
--- a/src/mem/ruby/system/VIPERCoalescer.cc
+++ b/src/mem/ruby/system/VIPERCoalescer.cc
@@ -75,12 +75,14 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
    //    ReadReq             : cache read
    //    WriteReq            : cache write
    //    AtomicOp            : cache atomic
+    //    Flush               : flush and invalidate cache
    //
    // VIPER does not expect MemSyncReq & Release since in GCN3, compute unit
    // does not specify an equivalent type of memory request.
    assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) ||
            pkt->cmd == MemCmd::ReadReq ||
            pkt->cmd == MemCmd::WriteReq ||
+            pkt->cmd == MemCmd::FlushReq ||
            pkt->isAtomicOp());

    if (pkt->req->isInvL1() && m_cache_inv_pkt) {
--- a/src/mem/ruby/system/VIPERCoalescer.hh
+++ b/src/mem/ruby/system/VIPERCoalescer.hh
@@ -32,6 +32,10 @@
 #ifndef __MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__
 #define __MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__

+#include "config/build_gpu.hh"
+
+#if BUILD_GPU
+
 #include <iostream>

 #include "mem/ruby/common/Address.hh"
@@ -92,4 +96,5 @@ class VIPERCoalescer : public GPUCoalescer
 } // namespace ruby
 } // namespace gem5

+#endif // BUILD_GPU
 #endif //__MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__