dev-amdgpu,mem-ruby: Add support to checkpoint and restore between kernels in GPUFS (#377)

Earlier, GPU checkpointing was working only if a checkpoint was created
before the first kernel execution. This pull request adds support to
checkpoint in-between any two kernel calls. It does so by doing the
following.

- Adds flush support in the GPU_VIPER protocol
- Adds flush support in the GPUCoalescer
- Updates cache recorder to use the GPUCoalescer during simulation
cooldown and cache warmup times.
This commit is contained in:
Matt Sinclair
2023-10-10 09:41:21 -05:00
committed by GitHub
14 changed files with 381 additions and 38 deletions

View File

@@ -158,6 +158,16 @@ def addRunFSOptions(parser):
help="Root partition of disk image",
)
parser.add_argument(
"--disable-avx",
action="store_true",
default=False,
help="Disables AVX. AVX is used in some ROCm libraries but "
"does not have checkpointing support yet. If simulation either "
"creates a checkpoint or restores from one, then AVX needs to "
"be disabled for correct functionality ",
)
def runGpuFSSystem(args):
"""

View File

@@ -234,7 +234,7 @@ def makeGpuFSSystem(args):
# If we are using KVM cpu, enable AVX. AVX is used in some ROCm libraries
# such as rocBLAS which is used in higher level libraries like PyTorch.
use_avx = False
if ObjectList.is_kvm_cpu(TestCPUClass):
if ObjectList.is_kvm_cpu(TestCPUClass) and not args.disable_avx:
# AVX also requires CR4.osxsave to be 1. These must be set together
or KVM will error out.
system.workload.enable_osxsave = 1

View File

@@ -1044,6 +1044,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
int num_queues = queues.size();
Addr id[num_queues];
Addr mqd_base[num_queues];
uint64_t mqd_read_index[num_queues];
Addr base[num_queues];
Addr rptr[num_queues];
Addr wptr[num_queues];
@@ -1060,6 +1061,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
uint32_t hqd_active[num_queues];
uint32_t hqd_vmid[num_queues];
Addr aql_rptr[num_queues];
uint32_t aql[num_queues];
uint32_t doorbell[num_queues];
uint32_t hqd_pq_control[num_queues];
@@ -1068,9 +1070,10 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
PM4Queue *q = iter.second;
id[i] = q->id();
mqd_base[i] = q->mqdBase();
mqd_read_index[i] = q->getMQD()->mqdReadIndex;
bool cur_state = q->ib();
q->ib(false);
base[i] = q->base() >> 8;
base[i] = q->base();
rptr[i] = q->getRptr();
wptr[i] = q->getWptr();
q->ib(true);
@@ -1088,6 +1091,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
hqd_active[i] = q->getMQD()->hqd_active;
hqd_vmid[i] = q->getMQD()->hqd_vmid;
aql_rptr[i] = q->getMQD()->aqlRptr;
aql[i] = q->getMQD()->aql;
doorbell[i] = q->getMQD()->doorbell;
hqd_pq_control[i] = q->getMQD()->hqd_pq_control;
i++;
@@ -1096,6 +1100,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
SERIALIZE_SCALAR(num_queues);
SERIALIZE_ARRAY(id, num_queues);
SERIALIZE_ARRAY(mqd_base, num_queues);
SERIALIZE_ARRAY(mqd_read_index, num_queues);
SERIALIZE_ARRAY(base, num_queues);
SERIALIZE_ARRAY(rptr, num_queues);
SERIALIZE_ARRAY(wptr, num_queues);
@@ -1112,6 +1117,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
SERIALIZE_ARRAY(hqd_active, num_queues);
SERIALIZE_ARRAY(hqd_vmid, num_queues);
SERIALIZE_ARRAY(aql_rptr, num_queues);
SERIALIZE_ARRAY(aql, num_queues);
SERIALIZE_ARRAY(doorbell, num_queues);
SERIALIZE_ARRAY(hqd_pq_control, num_queues);
}
@@ -1127,6 +1133,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
Addr id[num_queues];
Addr mqd_base[num_queues];
uint64_t mqd_read_index[num_queues];
Addr base[num_queues];
Addr rptr[num_queues];
Addr wptr[num_queues];
@@ -1143,11 +1150,13 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
uint32_t hqd_active[num_queues];
uint32_t hqd_vmid[num_queues];
Addr aql_rptr[num_queues];
uint32_t aql[num_queues];
uint32_t doorbell[num_queues];
uint32_t hqd_pq_control[num_queues];
UNSERIALIZE_ARRAY(id, num_queues);
UNSERIALIZE_ARRAY(mqd_base, num_queues);
UNSERIALIZE_ARRAY(mqd_read_index, num_queues);
UNSERIALIZE_ARRAY(base, num_queues);
UNSERIALIZE_ARRAY(rptr, num_queues);
UNSERIALIZE_ARRAY(wptr, num_queues);
@@ -1164,6 +1173,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
UNSERIALIZE_ARRAY(hqd_active, num_queues);
UNSERIALIZE_ARRAY(hqd_vmid, num_queues);
UNSERIALIZE_ARRAY(aql_rptr, num_queues);
UNSERIALIZE_ARRAY(aql, num_queues);
UNSERIALIZE_ARRAY(doorbell, num_queues);
UNSERIALIZE_ARRAY(hqd_pq_control, num_queues);
@@ -1172,22 +1182,24 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
memset(mqd, 0, sizeof(QueueDesc));
mqd->mqdBase = mqd_base[i] >> 8;
mqd->base = base[i];
mqd->rptr = rptr[i];
mqd->ibBase = ib_base[i];
mqd->ibRptr = ib_rptr[i];
mqd->mqdReadIndex = mqd_read_index[i];
mqd->base = base[i] >> 8;
mqd->aql = aql[i];
PM4MapQueues* pkt = new PM4MapQueues;
memset(pkt, 0, sizeof(PM4MapQueues));
newQueue(mqd, offset[i], pkt, id[i]);
queues[id[i]]->ib(false);
queues[id[i]]->wptr(wptr[i]);
queues[id[i]]->ib(true);
queues[id[i]]->wptr(ib_wptr[i]);
if (ib[i]) {
queues[id[i]]->wptr(ib_wptr[i]);
queues[id[i]]->rptr(ib_rptr[i]);
} else {
queues[id[i]]->rptr(rptr[i]);
queues[id[i]]->wptr(wptr[i]);
}
queues[id[i]]->ib(ib[i]);
queues[id[i]]->offset(offset[i]);
queues[id[i]]->processing(processing[i]);
queues[id[i]]->ib(ib[i]);
queues[id[i]]->setPkt(me[i], pipe[i], queue[i], privileged[i]);
queues[id[i]]->getMQD()->hqd_active = hqd_active[i];
queues[id[i]]->getMQD()->hqd_vmid = hqd_vmid[i];
@@ -1195,6 +1207,14 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
queues[id[i]]->getMQD()->doorbell = doorbell[i];
queues[id[i]]->getMQD()->hqd_pq_control = hqd_pq_control[i];
if (mqd->aql) {
int mqd_size = (1 << ((hqd_pq_control[i] & 0x3f) + 1)) * 4;
auto &hsa_pp = gpuDevice->CP()->hsaPacketProc();
hsa_pp.setDeviceQueueDesc(aql_rptr[i], base[i], id[i],
mqd_size, 8, GfxVersion::gfx900, offset[i],
mqd_read_index[i]);
}
DPRINTF(PM4PacketProcessor, "PM4 queue %d, rptr: %p wptr: %p\n",
queues[id[i]]->id(), queues[id[i]]->rptr(),
queues[id[i]]->wptr());

View File

@@ -65,7 +65,8 @@ machine(MachineType:TCC, "TCC Cache")
AtomicPassOn, desc="Atomic Op Passed on to Directory";
AtomicDone, desc="AtomicOps Complete";
AtomicNotDone, desc="AtomicOps not Complete";
Data, desc="data messgae";
Data, desc="Data message";
Flush, desc="Flush cache entry";
// Coming from this TCC
L2_Repl, desc="L2 Replacement";
// Probes
@@ -376,6 +377,8 @@ machine(MachineType:TCC, "TCC Cache")
} else {
trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
}
} else if (in_msg.Type == CoherenceRequestType:WriteFlush) {
trigger(Event:Flush, in_msg.addr, cache_entry, tbe);
} else {
DPRINTF(RubySlicc, "%s\n", in_msg);
error("Unexpected Response Message to Core");
@@ -509,6 +512,20 @@ machine(MachineType:TCC, "TCC Cache")
}
}
action(fw_sendFlushResponse, "fw", desc="send Flush Response") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:TDSysWBAck;
out_msg.Destination.clear();
out_msg.Destination.add(in_msg.Requestor);
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(responseToCore_out, ResponseMsg, l2_response_latency + glc_atomic_latency, true) {
@@ -628,6 +645,22 @@ machine(MachineType:TCC, "TCC Cache")
}
}
action(f_flush, "f", desc="write back data") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
out_msg.addr := address;
out_msg.Requestor := machineID;
out_msg.WTRequestor := in_msg.Requestor;
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
out_msg.MessageSize := MessageSizeType:Data;
out_msg.Type := CoherenceRequestType:WriteFlush;
out_msg.Dirty := true;
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.writeMask.orMask(cache_entry.writeMask);
}
}
}
action(at_atomicThrough, "at", desc="write back data") {
peek(coreRequestNetwork_in, CPURequestMsg) {
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
@@ -1075,4 +1108,21 @@ machine(MachineType:TCC, "TCC Cache")
transition(WIB, WBAck,I) {
pr_popResponseQueue;
}
transition({A, IV, WI, WIB}, Flush) {
st_stallAndWaitRequest;
}
transition(I, Flush) {
fw_sendFlushResponse;
p_popRequestQueue;
}
transition({V, W}, Flush, I) {TagArrayRead, TagArrayWrite} {
t_allocateTBE;
ut_updateTag;
f_flush;
i_invL2;
p_popRequestQueue;
}
}

View File

@@ -55,6 +55,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
I, AccessPermission:Invalid, desc="Invalid";
V, AccessPermission:Read_Only, desc="Valid";
A, AccessPermission:Invalid, desc="Waiting on Atomic";
F, AccessPermission:Invalid, desc="Flushing; Waiting for Ack";
}
enumeration(Event, desc="TCP Events") {
@@ -256,6 +258,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
peek(responseToTCP_in, ResponseMsg, block_on="addr") {
Entry cache_entry := getCacheEntry(in_msg.addr);
TBE tbe := TBEs.lookup(in_msg.addr);
DPRINTF(RubySlicc, "In responseToTCP_in with %s\n", in_msg);
if (in_msg.Type == CoherenceResponseType:TDSysResp) {
if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
// If L1 is disabled or requests have GLC or SLC flag set,
@@ -273,6 +277,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
} else if (in_msg.Type == CoherenceResponseType:TDSysWBAck ||
in_msg.Type == CoherenceResponseType:NBSysWBAck) {
trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
DPRINTF(RubySlicc, "Issuing TCC_AckWB\n");
} else {
error("Unexpected Response Message to Core");
}
@@ -469,6 +474,24 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
action(sf_setFlush, "sf", desc="set flush") {
inFlush := true;
APPEND_TRANSITION_COMMENT(" inFlush is true");
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
out_msg.addr := address;
out_msg.Requestor := machineID;
assert(is_valid(cache_entry));
out_msg.DataBlk := cache_entry.DataBlk;
out_msg.writeMask.clear();
out_msg.writeMask.orMask(cache_entry.writeMask);
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
TCC_select_low_bit, TCC_select_num_bits));
out_msg.MessageSize := MessageSizeType:Data;
out_msg.Type := CoherenceRequestType:WriteFlush;
out_msg.InitialRequestTime := curCycle();
out_msg.Shared := false;
out_msg.isSLCSet := false;
peek(mandatoryQueue_in, RubyRequest) {
out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
@@ -524,6 +547,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
cache_entry.Dirty := true;
}
action(f_flushDone, "f", desc="flush done") {
assert(is_valid(cache_entry));
if (use_seq_not_coal) {
sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
} else {
coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
}
}
action(inv_invDone, "inv", desc="local inv done") {
if (use_seq_not_coal) {
DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n");
@@ -695,11 +728,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
ic_invCache;
}
transition({V, I, A},Flush) {TagArrayFlash} {
transition({V,I}, Flush, F) {TagArrayFlash} {
a_allocate;
sf_setFlush;
p_popMandatoryQueue;
}
transition(A, Flush) {
z_stall;
}
transition({I, V}, Evict, I) {TagArrayFlash} {
inv_invDone;
p_popMandatoryQueue;
@@ -716,4 +754,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
wd_wtDone;
pr_popResponseQueue;
}
transition(F, TCC_AckWB, I) {
f_flushDone;
pr_popResponseQueue;
ic_invCache;
}
}

View File

@@ -83,6 +83,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack";
F, AccessPermission:Busy, desc="sent Flush, blocked till ack";
}
// Events
@@ -120,6 +122,9 @@ machine(MachineType:Directory, "AMD Baseline protocol")
// DMA
DmaRead, desc="DMA read";
DmaWrite, desc="DMA write";
// Flush
Flush, desc="Flush entry";
}
enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
@@ -411,6 +416,9 @@ machine(MachineType:Directory, "AMD Baseline protocol")
DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr);
trigger(Event:VicClean, in_msg.addr, entry, tbe);
}
} else if (in_msg.Type == CoherenceRequestType:WriteFlush) {
DPRINTF(RubySlicc, "Got Flush from %s on %s\n", in_msg.Requestor, in_msg.addr);
trigger(Event:Flush, in_msg.addr, entry, tbe);
} else {
error("Bad request message type");
}
@@ -562,6 +570,23 @@ machine(MachineType:Directory, "AMD Baseline protocol")
}
}
action(rf_sendResponseFlush, "rf", desc="send Flush Ack") {
peek(memQueue_in, MemoryMsg) {
enqueue(responseNetwork_out, ResponseMsg, 1) {
out_msg.addr := address;
out_msg.Type := CoherenceResponseType:NBSysWBAck;
out_msg.Destination.add(tbe.OriginalRequestor);
out_msg.WTRequestor := tbe.WTRequestor;
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Control;
out_msg.InitialRequestTime := tbe.InitialRequestTime;
out_msg.ForwardRequestTime := curCycle();
out_msg.ProbeRequestStartTime := curCycle();
//out_msg.instSeqNum := in_msg.instSeqNum;
}
}
}
action(l_queueMemWBReq, "lq", desc="Write WB data to memory") {
peek(responseNetwork_in, ResponseMsg) {
enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
@@ -933,6 +958,23 @@ machine(MachineType:Directory, "AMD Baseline protocol")
}
}
action(f_writeFlushDataToMemory, "f", desc="Write flush data to memory") {
peek(requestNetwork_in, CPURequestMsg) {
enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
out_msg.addr := address;
out_msg.Type := MemoryRequestType:MEMORY_WB;
out_msg.Sender := machineID;
out_msg.MessageSize := MessageSizeType:Writeback_Data;
out_msg.DataBlk := in_msg.DataBlk;
}
if (tbe.Dirty == false) {
// have to update the TBE, too, because of how this
// directory deals with functional writes
tbe.DataBlk := in_msg.DataBlk;
}
}
}
action(atd_allocateTBEforDMA, "atd", desc="allocate TBE Entry for DMA") {
check_allocate(TBEs);
peek(dmaRequestQueue_in, DMARequestMsg) {
@@ -1553,4 +1595,17 @@ machine(MachineType:Directory, "AMD Baseline protocol")
dt_deallocateTBE;
pt_popTriggerQueue;
}
transition(U, Flush, F) {L3TagArrayRead, L3TagArrayWrite} {
t_allocateTBE;
f_writeFlushDataToMemory;
w_sendResponseWBAck;
p_popRequestQueue;
}
transition(F, WBAck, U) {
pm_popMemQueue;
dt_deallocateTBE;
}
}

View File

@@ -70,7 +70,9 @@ namespace ruby
{
class Network;
#ifdef BUILD_GPU
class GPUCoalescer;
#endif
class DMASequencer;
// used to communicate that an in_port peeked the wrong message type

View File

@@ -30,8 +30,11 @@
#include "mem/ruby/system/CacheRecorder.hh"
#include "debug/RubyCacheTrace.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/GPUCoalescer.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "mem/ruby/system/Sequencer.hh"
#include "sim/sim_exit.hh"
namespace gem5
{
@@ -54,14 +57,29 @@ CacheRecorder::CacheRecorder()
{
}
#if BUILD_GPU
CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace,
uint64_t uncompressed_trace_size,
std::vector<Sequencer*>& seq_map,
std::vector<GPUCoalescer*>& coal_map,
uint64_t block_size_bytes)
: m_uncompressed_trace(uncompressed_trace),
m_uncompressed_trace_size(uncompressed_trace_size),
m_seq_map(seq_map), m_coalescer_map(coal_map), m_bytes_read(0),
m_records_read(0), m_records_flushed(0),
m_block_size_bytes(block_size_bytes)
#else
CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace,
uint64_t uncompressed_trace_size,
std::vector<Sequencer*>& seq_map,
uint64_t block_size_bytes)
: m_uncompressed_trace(uncompressed_trace),
m_uncompressed_trace_size(uncompressed_trace_size),
m_seq_map(seq_map), m_bytes_read(0), m_records_read(0),
m_records_flushed(0), m_block_size_bytes(block_size_bytes)
m_seq_map(seq_map), m_bytes_read(0),
m_records_read(0), m_records_flushed(0),
m_block_size_bytes(block_size_bytes)
#endif
{
if (m_uncompressed_trace != NULL) {
if (m_block_size_bytes < RubySystem::getBlockSizeBytes()) {
@@ -81,6 +99,9 @@ CacheRecorder::~CacheRecorder()
m_uncompressed_trace = NULL;
}
m_seq_map.clear();
#if BUILD_GPU
m_coalescer_map.clear();
#endif
}
void
@@ -96,11 +117,27 @@ CacheRecorder::enqueueNextFlushRequest()
Packet *pkt = new Packet(req, requestType);
Sequencer* m_sequencer_ptr = m_seq_map[rec->m_cntrl_id];
#if BUILD_GPU
GPUCoalescer* m_coal_ptr = m_coalescer_map[rec->m_cntrl_id];
#endif
assert(m_sequencer_ptr != NULL);
#if BUILD_GPU
if (m_coal_ptr == NULL)
m_sequencer_ptr->makeRequest(pkt);
else {
pkt->req->setReqInstSeqNum(m_records_flushed - 1);
m_coal_ptr->makeRequest(pkt);
}
#else
m_sequencer_ptr->makeRequest(pkt);
#endif
DPRINTF(RubyCacheTrace, "Flushing %s\n", *rec);
} else {
if (m_records_flushed > 0) {
exitSimLoop("Finished Drain", 0);
}
DPRINTF(RubyCacheTrace, "Flushed all %d records\n", m_records_flushed);
}
}
@@ -143,13 +180,27 @@ CacheRecorder::enqueueNextFetchRequest()
pkt->dataStatic(traceRecord->m_data + rec_bytes_read);
Sequencer* m_sequencer_ptr = m_seq_map[traceRecord->m_cntrl_id];
#if BUILD_GPU
GPUCoalescer* m_coal_ptr;
m_coal_ptr = m_coalescer_map[traceRecord->m_cntrl_id];
#endif
assert(m_sequencer_ptr != NULL);
#if BUILD_GPU
if (m_coal_ptr == NULL)
m_sequencer_ptr->makeRequest(pkt);
else {
pkt->req->setReqInstSeqNum(m_records_read);
m_coal_ptr->makeRequest(pkt);
}
#else
m_sequencer_ptr->makeRequest(pkt);
#endif
}
m_bytes_read += (sizeof(TraceRecord) + m_block_size_bytes);
m_records_read++;
} else {
exitSimLoop("Finished Warmup", 0);
DPRINTF(RubyCacheTrace, "Fetched all %d records\n", m_records_read);
}
}
@@ -168,6 +219,8 @@ CacheRecorder::addRecord(int cntrl, Addr data_addr, Addr pc_addr,
memcpy(rec->m_data, data.getData(0, m_block_size_bytes),
m_block_size_bytes);
DPRINTF(RubyCacheTrace, "Inside addRecord with cntrl id %d and type %d\n",
cntrl, type);
m_records.push_back(rec);
}

View File

@@ -38,6 +38,7 @@
#include <vector>
#include "base/types.hh"
#include "config/build_gpu.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/TypeDefines.hh"
@@ -50,6 +51,9 @@ namespace ruby
{
class Sequencer;
#if BUILD_GPU
class GPUCoalescer;
#endif
/*!
* Class for recording cache contents. Note that the last element of the
@@ -76,10 +80,18 @@ class CacheRecorder
CacheRecorder();
~CacheRecorder();
#if BUILD_GPU
CacheRecorder(uint8_t* uncompressed_trace,
uint64_t uncompressed_trace_size,
std::vector<Sequencer*>& SequencerMap,
std::vector<GPUCoalescer*>& CoalescerMap,
uint64_t block_size_bytes);
#else
CacheRecorder(uint8_t* uncompressed_trace,
uint64_t uncompressed_trace_size,
std::vector<Sequencer*>& SequencerMap,
uint64_t block_size_bytes);
#endif
void addRecord(int cntrl, Addr data_addr, Addr pc_addr,
RubyRequestType type, Tick time, DataBlock& data);
@@ -115,6 +127,9 @@ class CacheRecorder
uint8_t* m_uncompressed_trace;
uint64_t m_uncompressed_trace_size;
std::vector<Sequencer*> m_seq_map;
#if BUILD_GPU
std::vector<GPUCoalescer*> m_coalescer_map;
#endif
uint64_t m_bytes_read;
uint64_t m_records_read;
uint64_t m_records_flushed;

View File

@@ -73,6 +73,14 @@ UncoalescedTable::insertPacket(PacketPtr pkt)
pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
}
void
UncoalescedTable::insertReqType(PacketPtr pkt, RubyRequestType type)
{
uint64_t seqNum = pkt->req->getReqInstSeqNum();
reqTypeMap[seqNum] = type;
}
bool
UncoalescedTable::packetAvailable()
{
@@ -128,9 +136,21 @@ UncoalescedTable::updateResources()
instMap.erase(iter++);
instPktsRemaining.erase(seq_num);
// Release the token
DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", seq_num);
coalescer->getGMTokenPort().sendTokens(1);
// Release the token if the Ruby system is not in cooldown
// or warmup phases. When in these phases, the RubyPorts
// are accessed directly using the makeRequest() command
// instead of accessing through the port. This makes
// sending tokens through the port unnecessary
if (!RubySystem::getWarmupEnabled()
&& !RubySystem::getCooldownEnabled()) {
if (reqTypeMap[seq_num] != RubyRequestType_FLUSH) {
DPRINTF(GPUCoalescer,
"Returning token seqNum %d\n", seq_num);
coalescer->getGMTokenPort().sendTokens(1);
}
}
reqTypeMap.erase(seq_num);
} else {
++iter;
}
@@ -565,6 +585,14 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
for (auto& pkt : pktList) {
offset = getOffset(pkt->getAddr());
pkt_size = pkt->getSize();
request_address = pkt->getAddr();
// When the Ruby system is in cooldown phase, the requests come from
// the cache recorder. These requests do not get coalesced and
// do not return valid data.
if (RubySystem::getCooldownEnabled())
continue;
if (pkt->getPtr<uint8_t>()) {
switch(type) {
// Store and AtomicNoReturns follow the same path, as the
@@ -627,7 +655,6 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
assert(!pkt->req->isLLSC());
assert(!pkt->req->isLockedRMW());
assert(!pkt->req->isInstFetch());
assert(!pkt->isFlush());
if (pkt->req->isAtomicReturn()) {
req_type = RubyRequestType_ATOMIC_RETURN;
@@ -637,6 +664,8 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
req_type = RubyRequestType_LD;
} else if (pkt->isWrite()) {
req_type = RubyRequestType_ST;
} else if (pkt->isFlush()) {
req_type = RubyRequestType_FLUSH;
} else {
panic("Unsupported ruby packet type\n");
}
@@ -658,7 +687,7 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
issueMemSyncRequest(pkt);
} else {
// otherwise, this must be either read or write command
assert(pkt->isRead() || pkt->isWrite());
assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush());
InstSeqNum seq_num = pkt->req->getReqInstSeqNum();
@@ -667,10 +696,17 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
// number of lanes actives for that vmem request (i.e., the popcnt
// of the exec_mask.
int num_packets = 1;
if (!m_usingRubyTester) {
num_packets = 0;
for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
num_packets += getDynInst(pkt)->getLaneStatus(i);
// When Ruby is in warmup or cooldown phase, the requests come from
// the cache recorder. There is no dynamic instruction associated
// with these requests either
if (!RubySystem::getWarmupEnabled()
&& !RubySystem::getCooldownEnabled()) {
if (!m_usingRubyTester) {
num_packets = 0;
for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
num_packets += getDynInst(pkt)->getLaneStatus(i);
}
}
}
@@ -679,6 +715,7 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
// future cycle. Packets remaining is set to the number of excepted
// requests from the instruction based on its exec_mask.
uncoalescedTable.insertPacket(pkt);
uncoalescedTable.insertReqType(pkt, getRequestType(pkt));
uncoalescedTable.initPacketsRemaining(seq_num, num_packets);
DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
pkt->getAddr());
@@ -945,21 +982,27 @@ void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
{
for (auto& pkt : mylist) {
RubyPort::SenderState *ss =
safe_cast<RubyPort::SenderState *>(pkt->senderState);
MemResponsePort *port = ss->port;
assert(port != NULL);
// When Ruby is in warmup or cooldown phase, the requests come
// from the cache recorder. They do not track which port to use
// and do not need to send the response back
if (!RubySystem::getWarmupEnabled()
&& !RubySystem::getCooldownEnabled()) {
RubyPort::SenderState *ss =
safe_cast<RubyPort::SenderState *>(pkt->senderState);
MemResponsePort *port = ss->port;
assert(port != NULL);
pkt->senderState = ss->predecessor;
pkt->senderState = ss->predecessor;
if (pkt->cmd != MemCmd::WriteReq) {
// for WriteReq, we keep the original senderState until
// writeCompleteCallback
delete ss;
if (pkt->cmd != MemCmd::WriteReq) {
// for WriteReq, we keep the original senderState until
// writeCompleteCallback
delete ss;
}
port->hitCallback(pkt);
trySendRetries();
}
port->hitCallback(pkt);
trySendRetries();
}
// We schedule an event in the same tick as hitCallback (similar to
@@ -971,7 +1014,14 @@ GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
schedule(issueEvent, curTick());
}
testDrainComplete();
RubySystem *rs = m_ruby_system;
if (RubySystem::getWarmupEnabled()) {
rs->m_cache_recorder->enqueueNextFetchRequest();
} else if (RubySystem::getCooldownEnabled()) {
rs->m_cache_recorder->enqueueNextFlushRequest();
} else {
testDrainComplete();
}
}
void

View File

@@ -32,6 +32,10 @@
#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
#include "config/build_gpu.hh"
#if BUILD_GPU
#include <iostream>
#include <unordered_map>
@@ -71,6 +75,7 @@ class UncoalescedTable
~UncoalescedTable() {}
void insertPacket(PacketPtr pkt);
void insertReqType(PacketPtr pkt, RubyRequestType type);
bool packetAvailable();
void printRequestTable(std::stringstream& ss);
@@ -101,6 +106,8 @@ class UncoalescedTable
std::map<InstSeqNum, PerInstPackets> instMap;
std::map<InstSeqNum, int> instPktsRemaining;
std::map<InstSeqNum, RubyRequestType> reqTypeMap;
};
class CoalescedRequest
@@ -543,4 +550,5 @@ operator<<(std::ostream& out, const GPUCoalescer& obj)
} // namespace ruby
} // namespace gem5
#endif // BUILD_GPU
#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__

View File

@@ -178,13 +178,28 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace,
uint64_t block_size_bytes)
{
std::vector<Sequencer*> sequencer_map;
#if BUILD_GPU
std::vector<GPUCoalescer*> coalescer_map;
GPUCoalescer* coalescer_ptr = NULL;
#endif
Sequencer* sequencer_ptr = NULL;
for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer());
#if BUILD_GPU
coalescer_map.push_back(m_abs_cntrl_vec[cntrl]->getGPUCoalescer());
#endif
if (sequencer_ptr == NULL) {
sequencer_ptr = sequencer_map[cntrl];
}
#if BUILD_GPU
if (coalescer_ptr == NULL) {
coalescer_ptr = coalescer_map[cntrl];
}
#endif
}
assert(sequencer_ptr != NULL);
@@ -193,6 +208,13 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace,
if (sequencer_map[cntrl] == NULL) {
sequencer_map[cntrl] = sequencer_ptr;
}
#if BUILD_GPU
if (coalescer_map[cntrl] == NULL) {
coalescer_map[cntrl] = coalescer_ptr;
}
#endif
}
// Remove the old CacheRecorder if it's still hanging about.
@@ -201,8 +223,15 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace,
}
// Create the CacheRecorder and record the cache trace
#if BUILD_GPU
m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size,
sequencer_map, block_size_bytes);
sequencer_map, coalescer_map,
block_size_bytes);
#else
m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size,
sequencer_map,
block_size_bytes);
#endif
}
void

View File

@@ -75,12 +75,14 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
// ReadReq : cache read
// WriteReq : cache write
// AtomicOp : cache atomic
// Flush : flush and invalidate cache
//
// VIPER does not expect MemSyncReq & Release since in GCN3, compute unit
// does not specify an equivalent type of memory request.
assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) ||
pkt->cmd == MemCmd::ReadReq ||
pkt->cmd == MemCmd::WriteReq ||
pkt->cmd == MemCmd::FlushReq ||
pkt->isAtomicOp());
if (pkt->req->isInvL1() && m_cache_inv_pkt) {

View File

@@ -32,6 +32,10 @@
#ifndef __MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__
#define __MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__
#include "config/build_gpu.hh"
#if BUILD_GPU
#include <iostream>
#include "mem/ruby/common/Address.hh"
@@ -92,4 +96,5 @@ class VIPERCoalescer : public GPUCoalescer
} // namespace ruby
} // namespace gem5
#endif // BUILD_GPU
#endif //__MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__