dev-amdgpu,mem-ruby: Add support to checkpoint and restore between kernels in GPUFS (#377)
Previously, GPU checkpointing worked only if the checkpoint was created before the first kernel execution. This pull request adds support for checkpointing between any two kernel calls. It does so by: - adding flush support to the GPU_VIPER protocol; - adding flush support to the GPUCoalescer; - updating the cache recorder to use the GPUCoalescer during the simulation cooldown and cache warmup phases.
This commit is contained in:
@@ -158,6 +158,16 @@ def addRunFSOptions(parser):
|
||||
help="Root partition of disk image",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--disable-avx",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Disables AVX. AVX is used in some ROCm libraries but "
|
||||
"does not have checkpointing support yet. If simulation either "
|
||||
"creates a checkpoint or restores from one, then AVX needs to "
|
||||
"be disabled for correct functionality ",
|
||||
)
|
||||
|
||||
|
||||
def runGpuFSSystem(args):
|
||||
"""
|
||||
|
||||
@@ -234,7 +234,7 @@ def makeGpuFSSystem(args):
|
||||
# If we are using KVM cpu, enable AVX. AVX is used in some ROCm libraries
|
||||
# such as rocBLAS which is used in higher level libraries like PyTorch.
|
||||
use_avx = False
|
||||
if ObjectList.is_kvm_cpu(TestCPUClass):
|
||||
if ObjectList.is_kvm_cpu(TestCPUClass) and not args.disable_avx:
|
||||
# AVX also requires CR4.osxsave to be 1. These must be set together
|
||||
# or KVM will error out.
|
||||
system.workload.enable_osxsave = 1
|
||||
|
||||
@@ -1044,6 +1044,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
|
||||
int num_queues = queues.size();
|
||||
Addr id[num_queues];
|
||||
Addr mqd_base[num_queues];
|
||||
uint64_t mqd_read_index[num_queues];
|
||||
Addr base[num_queues];
|
||||
Addr rptr[num_queues];
|
||||
Addr wptr[num_queues];
|
||||
@@ -1060,6 +1061,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
|
||||
uint32_t hqd_active[num_queues];
|
||||
uint32_t hqd_vmid[num_queues];
|
||||
Addr aql_rptr[num_queues];
|
||||
uint32_t aql[num_queues];
|
||||
uint32_t doorbell[num_queues];
|
||||
uint32_t hqd_pq_control[num_queues];
|
||||
|
||||
@@ -1068,9 +1070,10 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
|
||||
PM4Queue *q = iter.second;
|
||||
id[i] = q->id();
|
||||
mqd_base[i] = q->mqdBase();
|
||||
mqd_read_index[i] = q->getMQD()->mqdReadIndex;
|
||||
bool cur_state = q->ib();
|
||||
q->ib(false);
|
||||
base[i] = q->base() >> 8;
|
||||
base[i] = q->base();
|
||||
rptr[i] = q->getRptr();
|
||||
wptr[i] = q->getWptr();
|
||||
q->ib(true);
|
||||
@@ -1088,6 +1091,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
|
||||
hqd_active[i] = q->getMQD()->hqd_active;
|
||||
hqd_vmid[i] = q->getMQD()->hqd_vmid;
|
||||
aql_rptr[i] = q->getMQD()->aqlRptr;
|
||||
aql[i] = q->getMQD()->aql;
|
||||
doorbell[i] = q->getMQD()->doorbell;
|
||||
hqd_pq_control[i] = q->getMQD()->hqd_pq_control;
|
||||
i++;
|
||||
@@ -1096,6 +1100,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
|
||||
SERIALIZE_SCALAR(num_queues);
|
||||
SERIALIZE_ARRAY(id, num_queues);
|
||||
SERIALIZE_ARRAY(mqd_base, num_queues);
|
||||
SERIALIZE_ARRAY(mqd_read_index, num_queues);
|
||||
SERIALIZE_ARRAY(base, num_queues);
|
||||
SERIALIZE_ARRAY(rptr, num_queues);
|
||||
SERIALIZE_ARRAY(wptr, num_queues);
|
||||
@@ -1112,6 +1117,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const
|
||||
SERIALIZE_ARRAY(hqd_active, num_queues);
|
||||
SERIALIZE_ARRAY(hqd_vmid, num_queues);
|
||||
SERIALIZE_ARRAY(aql_rptr, num_queues);
|
||||
SERIALIZE_ARRAY(aql, num_queues);
|
||||
SERIALIZE_ARRAY(doorbell, num_queues);
|
||||
SERIALIZE_ARRAY(hqd_pq_control, num_queues);
|
||||
}
|
||||
@@ -1127,6 +1133,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
|
||||
|
||||
Addr id[num_queues];
|
||||
Addr mqd_base[num_queues];
|
||||
uint64_t mqd_read_index[num_queues];
|
||||
Addr base[num_queues];
|
||||
Addr rptr[num_queues];
|
||||
Addr wptr[num_queues];
|
||||
@@ -1143,11 +1150,13 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
|
||||
uint32_t hqd_active[num_queues];
|
||||
uint32_t hqd_vmid[num_queues];
|
||||
Addr aql_rptr[num_queues];
|
||||
uint32_t aql[num_queues];
|
||||
uint32_t doorbell[num_queues];
|
||||
uint32_t hqd_pq_control[num_queues];
|
||||
|
||||
UNSERIALIZE_ARRAY(id, num_queues);
|
||||
UNSERIALIZE_ARRAY(mqd_base, num_queues);
|
||||
UNSERIALIZE_ARRAY(mqd_read_index, num_queues);
|
||||
UNSERIALIZE_ARRAY(base, num_queues);
|
||||
UNSERIALIZE_ARRAY(rptr, num_queues);
|
||||
UNSERIALIZE_ARRAY(wptr, num_queues);
|
||||
@@ -1164,6 +1173,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
|
||||
UNSERIALIZE_ARRAY(hqd_active, num_queues);
|
||||
UNSERIALIZE_ARRAY(hqd_vmid, num_queues);
|
||||
UNSERIALIZE_ARRAY(aql_rptr, num_queues);
|
||||
UNSERIALIZE_ARRAY(aql, num_queues);
|
||||
UNSERIALIZE_ARRAY(doorbell, num_queues);
|
||||
UNSERIALIZE_ARRAY(hqd_pq_control, num_queues);
|
||||
|
||||
@@ -1172,22 +1182,24 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
|
||||
memset(mqd, 0, sizeof(QueueDesc));
|
||||
|
||||
mqd->mqdBase = mqd_base[i] >> 8;
|
||||
mqd->base = base[i];
|
||||
mqd->rptr = rptr[i];
|
||||
mqd->ibBase = ib_base[i];
|
||||
mqd->ibRptr = ib_rptr[i];
|
||||
mqd->mqdReadIndex = mqd_read_index[i];
|
||||
mqd->base = base[i] >> 8;
|
||||
mqd->aql = aql[i];
|
||||
|
||||
PM4MapQueues* pkt = new PM4MapQueues;
|
||||
memset(pkt, 0, sizeof(PM4MapQueues));
|
||||
newQueue(mqd, offset[i], pkt, id[i]);
|
||||
|
||||
queues[id[i]]->ib(false);
|
||||
queues[id[i]]->wptr(wptr[i]);
|
||||
queues[id[i]]->ib(true);
|
||||
queues[id[i]]->wptr(ib_wptr[i]);
|
||||
if (ib[i]) {
|
||||
queues[id[i]]->wptr(ib_wptr[i]);
|
||||
queues[id[i]]->rptr(ib_rptr[i]);
|
||||
} else {
|
||||
queues[id[i]]->rptr(rptr[i]);
|
||||
queues[id[i]]->wptr(wptr[i]);
|
||||
}
|
||||
queues[id[i]]->ib(ib[i]);
|
||||
queues[id[i]]->offset(offset[i]);
|
||||
queues[id[i]]->processing(processing[i]);
|
||||
queues[id[i]]->ib(ib[i]);
|
||||
queues[id[i]]->setPkt(me[i], pipe[i], queue[i], privileged[i]);
|
||||
queues[id[i]]->getMQD()->hqd_active = hqd_active[i];
|
||||
queues[id[i]]->getMQD()->hqd_vmid = hqd_vmid[i];
|
||||
@@ -1195,6 +1207,14 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp)
|
||||
queues[id[i]]->getMQD()->doorbell = doorbell[i];
|
||||
queues[id[i]]->getMQD()->hqd_pq_control = hqd_pq_control[i];
|
||||
|
||||
if (mqd->aql) {
|
||||
int mqd_size = (1 << ((hqd_pq_control[i] & 0x3f) + 1)) * 4;
|
||||
auto &hsa_pp = gpuDevice->CP()->hsaPacketProc();
|
||||
hsa_pp.setDeviceQueueDesc(aql_rptr[i], base[i], id[i],
|
||||
mqd_size, 8, GfxVersion::gfx900, offset[i],
|
||||
mqd_read_index[i]);
|
||||
}
|
||||
|
||||
DPRINTF(PM4PacketProcessor, "PM4 queue %d, rptr: %p wptr: %p\n",
|
||||
queues[id[i]]->id(), queues[id[i]]->rptr(),
|
||||
queues[id[i]]->wptr());
|
||||
|
||||
@@ -65,7 +65,8 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
AtomicPassOn, desc="Atomic Op Passed on to Directory";
|
||||
AtomicDone, desc="AtomicOps Complete";
|
||||
AtomicNotDone, desc="AtomicOps not Complete";
|
||||
Data, desc="data messgae";
|
||||
Data, desc="Data message";
|
||||
Flush, desc="Flush cache entry";
|
||||
// Coming from this TCC
|
||||
L2_Repl, desc="L2 Replacement";
|
||||
// Probes
|
||||
@@ -376,6 +377,8 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
} else {
|
||||
trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
|
||||
}
|
||||
} else if (in_msg.Type == CoherenceRequestType:WriteFlush) {
|
||||
trigger(Event:Flush, in_msg.addr, cache_entry, tbe);
|
||||
} else {
|
||||
DPRINTF(RubySlicc, "%s\n", in_msg);
|
||||
error("Unexpected Response Message to Core");
|
||||
@@ -509,6 +512,20 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
}
|
||||
}
|
||||
|
||||
// Acknowledge a flush request. Peeks the request at the head of
// coreRequestNetwork_in and enqueues a TDSysWBAck control response on
// responseToCore_out (after l2_response_latency) for the same address,
// addressed only to the Requestor of the peeked request.
action(fw_sendFlushResponse, "fw", desc="send Flush Response") {
|
||||
peek(coreRequestNetwork_in, CPURequestMsg) {
|
||||
enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:TDSysWBAck;
|
||||
out_msg.Destination.clear();
|
||||
out_msg.Destination.add(in_msg.Requestor);
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.MessageSize := MessageSizeType:Writeback_Control;
// Echo the request's instSeqNum back in the response.
out_msg.instSeqNum := in_msg.instSeqNum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") {
|
||||
peek(coreRequestNetwork_in, CPURequestMsg) {
|
||||
enqueue(responseToCore_out, ResponseMsg, l2_response_latency + glc_atomic_latency, true) {
|
||||
@@ -628,6 +645,22 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
}
|
||||
}
|
||||
|
||||
// Forward a flush to the Directory. Peeks the core request at the head of
// coreRequestNetwork_in and enqueues a WriteFlush CPURequestMsg on
// requestToNB_out (after l2_request_latency) carrying this cache entry's
// data block and write mask. This controller is the Requestor; the
// original core-side requestor is recorded in WTRequestor.
// Reads cache_entry without an is_valid check — NOTE(review): callers
// presumably only invoke this with a valid entry; confirm.
action(f_flush, "f", desc="write back data") {
|
||||
peek(coreRequestNetwork_in, CPURequestMsg) {
|
||||
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Requestor := machineID;
|
||||
out_msg.WTRequestor := in_msg.Requestor;
|
||||
out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
|
||||
out_msg.MessageSize := MessageSizeType:Data;
|
||||
out_msg.Type := CoherenceRequestType:WriteFlush;
|
||||
// The message is always marked dirty, regardless of the entry's state.
out_msg.Dirty := true;
|
||||
out_msg.DataBlk := cache_entry.DataBlk;
|
||||
out_msg.writeMask.orMask(cache_entry.writeMask);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(at_atomicThrough, "at", desc="write back data") {
|
||||
peek(coreRequestNetwork_in, CPURequestMsg) {
|
||||
enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
|
||||
@@ -1075,4 +1108,21 @@ machine(MachineType:TCC, "TCC Cache")
|
||||
transition(WIB, WBAck,I) {
|
||||
pr_popResponseQueue;
|
||||
}
|
||||
|
||||
// Flush arriving in states A, IV, WI or WIB: no state change; the request
// is handled by st_stallAndWaitRequest (defined outside this hunk).
transition({A, IV, WI, WIB}, Flush) {
|
||||
st_stallAndWaitRequest;
|
||||
}
|
||||
|
||||
// Flush in I: nothing is forwarded; send the flush response back to the
// requestor and pop the request. The state stays I.
transition(I, Flush) {
|
||||
fw_sendFlushResponse;
|
||||
p_popRequestQueue;
|
||||
}
|
||||
|
||||
// Flush in V or W: forward the line to the Directory as a WriteFlush
// (f_flush), invalidate the L2 entry and move to I.
// NOTE(review): a TBE is allocated here; where it is deallocated is not
// visible in this hunk — confirm.
transition({V, W}, Flush, I) {TagArrayRead, TagArrayWrite} {
|
||||
t_allocateTBE;
|
||||
ut_updateTag;
|
||||
f_flush;
|
||||
i_invL2;
|
||||
p_popRequestQueue;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -55,6 +55,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
I, AccessPermission:Invalid, desc="Invalid";
|
||||
V, AccessPermission:Read_Only, desc="Valid";
|
||||
A, AccessPermission:Invalid, desc="Waiting on Atomic";
|
||||
|
||||
F, AccessPermission:Invalid, desc="Flushing; Waiting for Ack";
|
||||
}
|
||||
|
||||
enumeration(Event, desc="TCP Events") {
|
||||
@@ -256,6 +258,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
peek(responseToTCP_in, ResponseMsg, block_on="addr") {
|
||||
Entry cache_entry := getCacheEntry(in_msg.addr);
|
||||
TBE tbe := TBEs.lookup(in_msg.addr);
|
||||
DPRINTF(RubySlicc, "In responseToTCP_in with %s\n", in_msg);
|
||||
|
||||
if (in_msg.Type == CoherenceResponseType:TDSysResp) {
|
||||
if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) {
|
||||
// If L1 is disabled or requests have GLC or SLC flag set,
|
||||
@@ -273,6 +277,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
} else if (in_msg.Type == CoherenceResponseType:TDSysWBAck ||
|
||||
in_msg.Type == CoherenceResponseType:NBSysWBAck) {
|
||||
trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
|
||||
DPRINTF(RubySlicc, "Issuing TCC_AckWB\n");
|
||||
} else {
|
||||
error("Unexpected Response Message to Core");
|
||||
}
|
||||
@@ -469,6 +474,24 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
// Start a flush of this line: sets inFlush and enqueues a WriteFlush
// request on requestNetwork_out (after issue_latency) addressed to the TCC
// selected by mapAddressToRange. The message carries the cache entry's
// data block and write mask; the cache entry must be valid (asserted).
action(sf_setFlush, "sf", desc="set flush") {
|
||||
inFlush := true;
|
||||
APPEND_TRANSITION_COMMENT(" inFlush is true");
|
||||
enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Requestor := machineID;
|
||||
assert(is_valid(cache_entry));
|
||||
out_msg.DataBlk := cache_entry.DataBlk;
|
||||
// Replace the message's write mask with the cache entry's mask.
out_msg.writeMask.clear();
|
||||
out_msg.writeMask.orMask(cache_entry.writeMask);
|
||||
out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
|
||||
TCC_select_low_bit, TCC_select_num_bits));
|
||||
out_msg.MessageSize := MessageSizeType:Data;
|
||||
out_msg.Type := CoherenceRequestType:WriteFlush;
|
||||
out_msg.InitialRequestTime := curCycle();
|
||||
out_msg.Shared := false;
|
||||
out_msg.isSLCSet := false;
|
||||
// instSeqNum is copied from the request at the head of the mandatory
// queue, so this action must run before that request is popped.
peek(mandatoryQueue_in, RubyRequest) {
|
||||
out_msg.instSeqNum := in_msg.instSeqNum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
|
||||
@@ -524,6 +547,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
cache_entry.Dirty := true;
|
||||
}
|
||||
|
||||
// Report completion of a flush to the requester by invoking writeCallback
// for this address with the cache entry's data block: on the sequencer
// when use_seq_not_coal is set, otherwise on the coalescer.
// The cache entry must be valid (asserted).
action(f_flushDone, "f", desc="flush done") {
|
||||
assert(is_valid(cache_entry));
|
||||
|
||||
if (use_seq_not_coal) {
|
||||
sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
|
||||
} else {
|
||||
coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
|
||||
}
|
||||
}
|
||||
|
||||
action(inv_invDone, "inv", desc="local inv done") {
|
||||
if (use_seq_not_coal) {
|
||||
DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n");
|
||||
@@ -695,11 +728,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
ic_invCache;
|
||||
}
|
||||
|
||||
transition({V, I, A},Flush) {TagArrayFlash} {
|
||||
transition({V,I}, Flush, F) {TagArrayFlash} {
|
||||
a_allocate;
|
||||
sf_setFlush;
|
||||
p_popMandatoryQueue;
|
||||
}
|
||||
|
||||
transition(A, Flush) {
|
||||
z_stall;
|
||||
}
|
||||
|
||||
transition({I, V}, Evict, I) {TagArrayFlash} {
|
||||
inv_invDone;
|
||||
p_popMandatoryQueue;
|
||||
@@ -716,4 +754,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
|
||||
wd_wtDone;
|
||||
pr_popResponseQueue;
|
||||
}
|
||||
|
||||
// Flush acknowledged while in F: signal flush completion (f_flushDone), pop
// the response and move to I. f_flushDone asserts a valid cache entry and
// runs before ic_invCache.
transition(F, TCC_AckWB, I) {
|
||||
f_flushDone;
|
||||
pr_popResponseQueue;
|
||||
ic_invCache;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -83,6 +83,8 @@ machine(MachineType:Directory, "AMD Baseline protocol")
|
||||
BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
|
||||
B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
|
||||
B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack";
|
||||
|
||||
F, AccessPermission:Busy, desc="sent Flus, blocked till ack";
|
||||
}
|
||||
|
||||
// Events
|
||||
@@ -120,6 +122,9 @@ machine(MachineType:Directory, "AMD Baseline protocol")
|
||||
// DMA
|
||||
DmaRead, desc="DMA read";
|
||||
DmaWrite, desc="DMA write";
|
||||
|
||||
// Flush
|
||||
Flush, desc="Flush entry";
|
||||
}
|
||||
|
||||
enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
|
||||
@@ -411,6 +416,9 @@ machine(MachineType:Directory, "AMD Baseline protocol")
|
||||
DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr);
|
||||
trigger(Event:VicClean, in_msg.addr, entry, tbe);
|
||||
}
|
||||
} else if (in_msg.Type == CoherenceRequestType:WriteFlush) {
|
||||
DPRINTF(RubySlicc, "Got Flush from %s on %s\n", in_msg.Requestor, in_msg.addr);
|
||||
trigger(Event:Flush, in_msg.addr, entry, tbe);
|
||||
} else {
|
||||
error("Bad request message type");
|
||||
}
|
||||
@@ -562,6 +570,23 @@ machine(MachineType:Directory, "AMD Baseline protocol")
|
||||
}
|
||||
}
|
||||
|
||||
// Send a flush acknowledgement: enqueues an NBSysWBAck control response on
// responseNetwork_out (latency 1) addressed to the TBE's OriginalRequestor,
// with WTRequestor and InitialRequestTime also taken from the TBE.
// The memQueue_in message is peeked but its fields are not used (the only
// in_msg reference is commented out).
// NOTE(review): no transition visible in this diff invokes this action (the
// U -> F Flush transition uses w_sendResponseWBAck) — confirm it is needed.
action(rf_sendResponseFlush, "rf", desc="send Flush Ack") {
|
||||
peek(memQueue_in, MemoryMsg) {
|
||||
enqueue(responseNetwork_out, ResponseMsg, 1) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := CoherenceResponseType:NBSysWBAck;
|
||||
out_msg.Destination.add(tbe.OriginalRequestor);
|
||||
out_msg.WTRequestor := tbe.WTRequestor;
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.MessageSize := MessageSizeType:Writeback_Control;
|
||||
out_msg.InitialRequestTime := tbe.InitialRequestTime;
|
||||
out_msg.ForwardRequestTime := curCycle();
|
||||
out_msg.ProbeRequestStartTime := curCycle();
|
||||
//out_msg.instSeqNum := in_msg.instSeqNum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(l_queueMemWBReq, "lq", desc="Write WB data to memory") {
|
||||
peek(responseNetwork_in, ResponseMsg) {
|
||||
enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
|
||||
@@ -933,6 +958,23 @@ machine(MachineType:Directory, "AMD Baseline protocol")
|
||||
}
|
||||
}
|
||||
|
||||
// Write the data of an incoming flush request to memory. Peeks the request
// at the head of requestNetwork_in and enqueues a MEMORY_WB message with the
// request's data block on memQueue_out (after to_memory_controller_latency).
// Also copies the data into the TBE when the TBE is not marked dirty.
// Reads tbe, so a TBE must already exist for this address.
action(f_writeFlushDataToMemory, "f", desc="Write flush data to memory") {
|
||||
peek(requestNetwork_in, CPURequestMsg) {
|
||||
enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) {
|
||||
out_msg.addr := address;
|
||||
out_msg.Type := MemoryRequestType:MEMORY_WB;
|
||||
out_msg.Sender := machineID;
|
||||
out_msg.MessageSize := MessageSizeType:Writeback_Data;
|
||||
out_msg.DataBlk := in_msg.DataBlk;
|
||||
}
|
||||
if (tbe.Dirty == false) {
|
||||
// have to update the TBE, too, because of how this
|
||||
// directory deals with functional writes
|
||||
tbe.DataBlk := in_msg.DataBlk;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
action(atd_allocateTBEforDMA, "atd", desc="allocate TBE Entry for DMA") {
|
||||
check_allocate(TBEs);
|
||||
peek(dmaRequestQueue_in, DMARequestMsg) {
|
||||
@@ -1553,4 +1595,17 @@ machine(MachineType:Directory, "AMD Baseline protocol")
|
||||
dt_deallocateTBE;
|
||||
pt_popTriggerQueue;
|
||||
}
|
||||
|
||||
// Flush in U: allocate a TBE, write the flushed data to memory, send a
// write-back ack (w_sendResponseWBAck, defined outside this hunk), pop the
// request and move to F.
transition(U, Flush, F) {L3TagArrayRead, L3TagArrayWrite} {
|
||||
t_allocateTBE;
|
||||
f_writeFlushDataToMemory;
|
||||
w_sendResponseWBAck;
|
||||
p_popRequestQueue;
|
||||
}
|
||||
|
||||
// WBAck while in F: pop the memory queue, release the TBE allocated by the
// Flush transition and return to U.
transition(F, WBAck, U) {
|
||||
pm_popMemQueue;
|
||||
dt_deallocateTBE;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -70,7 +70,9 @@ namespace ruby
|
||||
{
|
||||
|
||||
class Network;
|
||||
#ifdef BUILD_GPU
|
||||
class GPUCoalescer;
|
||||
#endif
|
||||
class DMASequencer;
|
||||
|
||||
// used to communicate that an in_port peeked the wrong message type
|
||||
|
||||
@@ -30,8 +30,11 @@
|
||||
#include "mem/ruby/system/CacheRecorder.hh"
|
||||
|
||||
#include "debug/RubyCacheTrace.hh"
|
||||
#include "mem/packet.hh"
|
||||
#include "mem/ruby/system/GPUCoalescer.hh"
|
||||
#include "mem/ruby/system/RubySystem.hh"
|
||||
#include "mem/ruby/system/Sequencer.hh"
|
||||
#include "sim/sim_exit.hh"
|
||||
|
||||
namespace gem5
|
||||
{
|
||||
@@ -54,14 +57,29 @@ CacheRecorder::CacheRecorder()
|
||||
{
|
||||
}
|
||||
|
||||
#if BUILD_GPU
|
||||
CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace,
|
||||
uint64_t uncompressed_trace_size,
|
||||
std::vector<Sequencer*>& seq_map,
|
||||
std::vector<GPUCoalescer*>& coal_map,
|
||||
uint64_t block_size_bytes)
|
||||
: m_uncompressed_trace(uncompressed_trace),
|
||||
m_uncompressed_trace_size(uncompressed_trace_size),
|
||||
m_seq_map(seq_map), m_coalescer_map(coal_map), m_bytes_read(0),
|
||||
m_records_read(0), m_records_flushed(0),
|
||||
m_block_size_bytes(block_size_bytes)
|
||||
#else
|
||||
CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace,
|
||||
uint64_t uncompressed_trace_size,
|
||||
std::vector<Sequencer*>& seq_map,
|
||||
uint64_t block_size_bytes)
|
||||
: m_uncompressed_trace(uncompressed_trace),
|
||||
m_uncompressed_trace_size(uncompressed_trace_size),
|
||||
m_seq_map(seq_map), m_bytes_read(0), m_records_read(0),
|
||||
m_records_flushed(0), m_block_size_bytes(block_size_bytes)
|
||||
m_seq_map(seq_map), m_bytes_read(0),
|
||||
m_records_read(0), m_records_flushed(0),
|
||||
m_block_size_bytes(block_size_bytes)
|
||||
|
||||
#endif
|
||||
{
|
||||
if (m_uncompressed_trace != NULL) {
|
||||
if (m_block_size_bytes < RubySystem::getBlockSizeBytes()) {
|
||||
@@ -81,6 +99,9 @@ CacheRecorder::~CacheRecorder()
|
||||
m_uncompressed_trace = NULL;
|
||||
}
|
||||
m_seq_map.clear();
|
||||
#if BUILD_GPU
|
||||
m_coalescer_map.clear();
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
@@ -96,11 +117,27 @@ CacheRecorder::enqueueNextFlushRequest()
|
||||
Packet *pkt = new Packet(req, requestType);
|
||||
|
||||
Sequencer* m_sequencer_ptr = m_seq_map[rec->m_cntrl_id];
|
||||
#if BUILD_GPU
|
||||
GPUCoalescer* m_coal_ptr = m_coalescer_map[rec->m_cntrl_id];
|
||||
#endif
|
||||
assert(m_sequencer_ptr != NULL);
|
||||
#if BUILD_GPU
|
||||
if (m_coal_ptr == NULL)
|
||||
m_sequencer_ptr->makeRequest(pkt);
|
||||
else {
|
||||
pkt->req->setReqInstSeqNum(m_records_flushed - 1);
|
||||
m_coal_ptr->makeRequest(pkt);
|
||||
}
|
||||
#else
|
||||
m_sequencer_ptr->makeRequest(pkt);
|
||||
#endif
|
||||
|
||||
DPRINTF(RubyCacheTrace, "Flushing %s\n", *rec);
|
||||
|
||||
} else {
|
||||
if (m_records_flushed > 0) {
|
||||
exitSimLoop("Finished Drain", 0);
|
||||
}
|
||||
DPRINTF(RubyCacheTrace, "Flushed all %d records\n", m_records_flushed);
|
||||
}
|
||||
}
|
||||
@@ -143,13 +180,27 @@ CacheRecorder::enqueueNextFetchRequest()
|
||||
pkt->dataStatic(traceRecord->m_data + rec_bytes_read);
|
||||
|
||||
Sequencer* m_sequencer_ptr = m_seq_map[traceRecord->m_cntrl_id];
|
||||
#if BUILD_GPU
|
||||
GPUCoalescer* m_coal_ptr;
|
||||
m_coal_ptr = m_coalescer_map[traceRecord->m_cntrl_id];
|
||||
#endif
|
||||
assert(m_sequencer_ptr != NULL);
|
||||
#if BUILD_GPU
|
||||
if (m_coal_ptr == NULL)
|
||||
m_sequencer_ptr->makeRequest(pkt);
|
||||
else {
|
||||
pkt->req->setReqInstSeqNum(m_records_read);
|
||||
m_coal_ptr->makeRequest(pkt);
|
||||
}
|
||||
#else
|
||||
m_sequencer_ptr->makeRequest(pkt);
|
||||
#endif
|
||||
}
|
||||
|
||||
m_bytes_read += (sizeof(TraceRecord) + m_block_size_bytes);
|
||||
m_records_read++;
|
||||
} else {
|
||||
exitSimLoop("Finished Warmup", 0);
|
||||
DPRINTF(RubyCacheTrace, "Fetched all %d records\n", m_records_read);
|
||||
}
|
||||
}
|
||||
@@ -168,6 +219,8 @@ CacheRecorder::addRecord(int cntrl, Addr data_addr, Addr pc_addr,
|
||||
memcpy(rec->m_data, data.getData(0, m_block_size_bytes),
|
||||
m_block_size_bytes);
|
||||
|
||||
DPRINTF(RubyCacheTrace, "Inside addRecord with cntrl id %d and type %d\n",
|
||||
cntrl, type);
|
||||
m_records.push_back(rec);
|
||||
}
|
||||
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "base/types.hh"
|
||||
#include "config/build_gpu.hh"
|
||||
#include "mem/ruby/common/Address.hh"
|
||||
#include "mem/ruby/common/DataBlock.hh"
|
||||
#include "mem/ruby/common/TypeDefines.hh"
|
||||
@@ -50,6 +51,9 @@ namespace ruby
|
||||
{
|
||||
|
||||
class Sequencer;
|
||||
#if BUILD_GPU
|
||||
class GPUCoalescer;
|
||||
#endif
|
||||
|
||||
/*!
|
||||
* Class for recording cache contents. Note that the last element of the
|
||||
@@ -76,10 +80,18 @@ class CacheRecorder
|
||||
CacheRecorder();
|
||||
~CacheRecorder();
|
||||
|
||||
#if BUILD_GPU
|
||||
CacheRecorder(uint8_t* uncompressed_trace,
|
||||
uint64_t uncompressed_trace_size,
|
||||
std::vector<Sequencer*>& SequencerMap,
|
||||
std::vector<GPUCoalescer*>& CoalescerMap,
|
||||
uint64_t block_size_bytes);
|
||||
#else
|
||||
CacheRecorder(uint8_t* uncompressed_trace,
|
||||
uint64_t uncompressed_trace_size,
|
||||
std::vector<Sequencer*>& SequencerMap,
|
||||
uint64_t block_size_bytes);
|
||||
#endif
|
||||
void addRecord(int cntrl, Addr data_addr, Addr pc_addr,
|
||||
RubyRequestType type, Tick time, DataBlock& data);
|
||||
|
||||
@@ -115,6 +127,9 @@ class CacheRecorder
|
||||
uint8_t* m_uncompressed_trace;
|
||||
uint64_t m_uncompressed_trace_size;
|
||||
std::vector<Sequencer*> m_seq_map;
|
||||
#if BUILD_GPU
|
||||
std::vector<GPUCoalescer*> m_coalescer_map;
|
||||
#endif
|
||||
uint64_t m_bytes_read;
|
||||
uint64_t m_records_read;
|
||||
uint64_t m_records_flushed;
|
||||
|
||||
@@ -73,6 +73,14 @@ UncoalescedTable::insertPacket(PacketPtr pkt)
|
||||
pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
|
||||
}
|
||||
|
||||
// Record the Ruby request type of a packet, keyed by the instruction
// sequence number carried in the packet's request. An existing entry for
// the same sequence number is overwritten.
//
// pkt:  packet whose request supplies the sequence number
// type: Ruby request type to store in reqTypeMap for that sequence number
void
|
||||
UncoalescedTable::insertReqType(PacketPtr pkt, RubyRequestType type)
|
||||
{
|
||||
uint64_t seqNum = pkt->req->getReqInstSeqNum();
|
||||
|
||||
reqTypeMap[seqNum] = type;
|
||||
}
|
||||
|
||||
bool
|
||||
UncoalescedTable::packetAvailable()
|
||||
{
|
||||
@@ -128,9 +136,21 @@ UncoalescedTable::updateResources()
|
||||
instMap.erase(iter++);
|
||||
instPktsRemaining.erase(seq_num);
|
||||
|
||||
// Release the token
|
||||
DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", seq_num);
|
||||
coalescer->getGMTokenPort().sendTokens(1);
|
||||
// Release the token if the Ruby system is not in cooldown
|
||||
// or warmup phases. When in these phases, the RubyPorts
|
||||
// are accessed directly using the makeRequest() command
|
||||
// instead of accessing through the port. This makes
|
||||
// sending tokens through the port unnecessary
|
||||
if (!RubySystem::getWarmupEnabled()
|
||||
&& !RubySystem::getCooldownEnabled()) {
|
||||
if (reqTypeMap[seq_num] != RubyRequestType_FLUSH) {
|
||||
DPRINTF(GPUCoalescer,
|
||||
"Returning token seqNum %d\n", seq_num);
|
||||
coalescer->getGMTokenPort().sendTokens(1);
|
||||
}
|
||||
}
|
||||
|
||||
reqTypeMap.erase(seq_num);
|
||||
} else {
|
||||
++iter;
|
||||
}
|
||||
@@ -565,6 +585,14 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest,
|
||||
for (auto& pkt : pktList) {
|
||||
offset = getOffset(pkt->getAddr());
|
||||
pkt_size = pkt->getSize();
|
||||
request_address = pkt->getAddr();
|
||||
|
||||
// When the Ruby system is in the cooldown phase, the requests come from
|
||||
// the cache recorder. These requests do not get coalesced and
|
||||
// do not return valid data.
|
||||
if (RubySystem::getCooldownEnabled())
|
||||
continue;
|
||||
|
||||
if (pkt->getPtr<uint8_t>()) {
|
||||
switch(type) {
|
||||
// Store and AtomicNoReturns follow the same path, as the
|
||||
@@ -627,7 +655,6 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
|
||||
assert(!pkt->req->isLLSC());
|
||||
assert(!pkt->req->isLockedRMW());
|
||||
assert(!pkt->req->isInstFetch());
|
||||
assert(!pkt->isFlush());
|
||||
|
||||
if (pkt->req->isAtomicReturn()) {
|
||||
req_type = RubyRequestType_ATOMIC_RETURN;
|
||||
@@ -637,6 +664,8 @@ GPUCoalescer::getRequestType(PacketPtr pkt)
|
||||
req_type = RubyRequestType_LD;
|
||||
} else if (pkt->isWrite()) {
|
||||
req_type = RubyRequestType_ST;
|
||||
} else if (pkt->isFlush()) {
|
||||
req_type = RubyRequestType_FLUSH;
|
||||
} else {
|
||||
panic("Unsupported ruby packet type\n");
|
||||
}
|
||||
@@ -658,7 +687,7 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
|
||||
issueMemSyncRequest(pkt);
|
||||
} else {
|
||||
// otherwise, this must be either read or write command
|
||||
assert(pkt->isRead() || pkt->isWrite());
|
||||
assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush());
|
||||
|
||||
InstSeqNum seq_num = pkt->req->getReqInstSeqNum();
|
||||
|
||||
@@ -667,10 +696,17 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
|
||||
// number of active lanes for that vmem request (i.e., the popcnt
|
||||
// of the exec_mask).
|
||||
int num_packets = 1;
|
||||
if (!m_usingRubyTester) {
|
||||
num_packets = 0;
|
||||
for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
|
||||
num_packets += getDynInst(pkt)->getLaneStatus(i);
|
||||
|
||||
// When Ruby is in warmup or cooldown phase, the requests come from
|
||||
// the cache recorder. There is no dynamic instruction associated
|
||||
// with these requests either
|
||||
if (!RubySystem::getWarmupEnabled()
|
||||
&& !RubySystem::getCooldownEnabled()) {
|
||||
if (!m_usingRubyTester) {
|
||||
num_packets = 0;
|
||||
for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) {
|
||||
num_packets += getDynInst(pkt)->getLaneStatus(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -679,6 +715,7 @@ GPUCoalescer::makeRequest(PacketPtr pkt)
|
||||
// future cycle. Packets remaining is set to the number of expected
|
||||
// requests from the instruction based on its exec_mask.
|
||||
uncoalescedTable.insertPacket(pkt);
|
||||
uncoalescedTable.insertReqType(pkt, getRequestType(pkt));
|
||||
uncoalescedTable.initPacketsRemaining(seq_num, num_packets);
|
||||
DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
|
||||
pkt->getAddr());
|
||||
@@ -945,21 +982,27 @@ void
|
||||
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
|
||||
{
|
||||
for (auto& pkt : mylist) {
|
||||
RubyPort::SenderState *ss =
|
||||
safe_cast<RubyPort::SenderState *>(pkt->senderState);
|
||||
MemResponsePort *port = ss->port;
|
||||
assert(port != NULL);
|
||||
// When Ruby is in warmup or cooldown phase, the requests come
|
||||
// from the cache recorder. They do not track which port to use
|
||||
// and do not need to send the response back
|
||||
if (!RubySystem::getWarmupEnabled()
|
||||
&& !RubySystem::getCooldownEnabled()) {
|
||||
RubyPort::SenderState *ss =
|
||||
safe_cast<RubyPort::SenderState *>(pkt->senderState);
|
||||
MemResponsePort *port = ss->port;
|
||||
assert(port != NULL);
|
||||
|
||||
pkt->senderState = ss->predecessor;
|
||||
pkt->senderState = ss->predecessor;
|
||||
|
||||
if (pkt->cmd != MemCmd::WriteReq) {
|
||||
// for WriteReq, we keep the original senderState until
|
||||
// writeCompleteCallback
|
||||
delete ss;
|
||||
if (pkt->cmd != MemCmd::WriteReq) {
|
||||
// for WriteReq, we keep the original senderState until
|
||||
// writeCompleteCallback
|
||||
delete ss;
|
||||
}
|
||||
|
||||
port->hitCallback(pkt);
|
||||
trySendRetries();
|
||||
}
|
||||
|
||||
port->hitCallback(pkt);
|
||||
trySendRetries();
|
||||
}
|
||||
|
||||
// We schedule an event in the same tick as hitCallback (similar to
|
||||
@@ -971,7 +1014,14 @@ GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
|
||||
schedule(issueEvent, curTick());
|
||||
}
|
||||
|
||||
testDrainComplete();
|
||||
RubySystem *rs = m_ruby_system;
|
||||
if (RubySystem::getWarmupEnabled()) {
|
||||
rs->m_cache_recorder->enqueueNextFetchRequest();
|
||||
} else if (RubySystem::getCooldownEnabled()) {
|
||||
rs->m_cache_recorder->enqueueNextFlushRequest();
|
||||
} else {
|
||||
testDrainComplete();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
||||
@@ -32,6 +32,10 @@
|
||||
#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
|
||||
#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
|
||||
|
||||
#include "config/build_gpu.hh"
|
||||
|
||||
#if BUILD_GPU
|
||||
|
||||
#include <iostream>
|
||||
#include <unordered_map>
|
||||
|
||||
@@ -71,6 +75,7 @@ class UncoalescedTable
|
||||
~UncoalescedTable() {}
|
||||
|
||||
void insertPacket(PacketPtr pkt);
|
||||
void insertReqType(PacketPtr pkt, RubyRequestType type);
|
||||
bool packetAvailable();
|
||||
void printRequestTable(std::stringstream& ss);
|
||||
|
||||
@@ -101,6 +106,8 @@ class UncoalescedTable
|
||||
std::map<InstSeqNum, PerInstPackets> instMap;
|
||||
|
||||
std::map<InstSeqNum, int> instPktsRemaining;
|
||||
|
||||
std::map<InstSeqNum, RubyRequestType> reqTypeMap;
|
||||
};
|
||||
|
||||
class CoalescedRequest
|
||||
@@ -543,4 +550,5 @@ operator<<(std::ostream& out, const GPUCoalescer& obj)
|
||||
} // namespace ruby
|
||||
} // namespace gem5
|
||||
|
||||
#endif // BUILD_GPU
|
||||
#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
|
||||
|
||||
@@ -178,13 +178,28 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace,
|
||||
uint64_t block_size_bytes)
|
||||
{
|
||||
std::vector<Sequencer*> sequencer_map;
|
||||
#if BUILD_GPU
|
||||
std::vector<GPUCoalescer*> coalescer_map;
|
||||
GPUCoalescer* coalescer_ptr = NULL;
|
||||
#endif
|
||||
Sequencer* sequencer_ptr = NULL;
|
||||
|
||||
for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
|
||||
sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer());
|
||||
#if BUILD_GPU
|
||||
coalescer_map.push_back(m_abs_cntrl_vec[cntrl]->getGPUCoalescer());
|
||||
#endif
|
||||
|
||||
if (sequencer_ptr == NULL) {
|
||||
sequencer_ptr = sequencer_map[cntrl];
|
||||
}
|
||||
|
||||
#if BUILD_GPU
|
||||
if (coalescer_ptr == NULL) {
|
||||
coalescer_ptr = coalescer_map[cntrl];
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
assert(sequencer_ptr != NULL);
|
||||
@@ -193,6 +208,13 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace,
|
||||
if (sequencer_map[cntrl] == NULL) {
|
||||
sequencer_map[cntrl] = sequencer_ptr;
|
||||
}
|
||||
|
||||
#if BUILD_GPU
|
||||
if (coalescer_map[cntrl] == NULL) {
|
||||
coalescer_map[cntrl] = coalescer_ptr;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// Remove the old CacheRecorder if it's still hanging about.
|
||||
@@ -201,8 +223,15 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace,
|
||||
}
|
||||
|
||||
// Create the CacheRecorder and record the cache trace
|
||||
#if BUILD_GPU
|
||||
m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size,
|
||||
sequencer_map, block_size_bytes);
|
||||
sequencer_map, coalescer_map,
|
||||
block_size_bytes);
|
||||
#else
|
||||
m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size,
|
||||
sequencer_map,
|
||||
block_size_bytes);
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
|
||||
@@ -75,12 +75,14 @@ VIPERCoalescer::makeRequest(PacketPtr pkt)
|
||||
// ReadReq : cache read
|
||||
// WriteReq : cache write
|
||||
// AtomicOp : cache atomic
|
||||
// Flush : flush and invalidate cache
|
||||
//
|
||||
// VIPER does not expect MemSyncReq & Release since in GCN3, compute unit
|
||||
// does not specify an equivalent type of memory request.
|
||||
assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) ||
|
||||
pkt->cmd == MemCmd::ReadReq ||
|
||||
pkt->cmd == MemCmd::WriteReq ||
|
||||
pkt->cmd == MemCmd::FlushReq ||
|
||||
pkt->isAtomicOp());
|
||||
|
||||
if (pkt->req->isInvL1() && m_cache_inv_pkt) {
|
||||
|
||||
@@ -32,6 +32,10 @@
|
||||
#ifndef __MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__
|
||||
#define __MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__
|
||||
|
||||
#include "config/build_gpu.hh"
|
||||
|
||||
#if BUILD_GPU
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "mem/ruby/common/Address.hh"
|
||||
@@ -92,4 +96,5 @@ class VIPERCoalescer : public GPUCoalescer
|
||||
} // namespace ruby
|
||||
} // namespace gem5
|
||||
|
||||
#endif // BUILD_GPU
|
||||
#endif //__MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__
|
||||
|
||||
Reference in New Issue
Block a user