diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py index 0f090e2f89..5d29959eff 100644 --- a/configs/example/gpufs/runfs.py +++ b/configs/example/gpufs/runfs.py @@ -158,6 +158,16 @@ def addRunFSOptions(parser): help="Root partition of disk image", ) + parser.add_argument( + "--disable-avx", + action="store_true", + default=False, + help="Disables AVX. AVX is used in some ROCm libraries but " + "does not have checkpointing support yet. If simulation either " + "creates a checkpoint or restores from one, then AVX needs to " + "be disabled for correct functionality ", + ) + def runGpuFSSystem(args): """ diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index 7ddc4f0752..7cb0ce1aa5 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -234,7 +234,7 @@ def makeGpuFSSystem(args): # If we are using KVM cpu, enable AVX. AVX is used in some ROCm libraries # such as rocBLAS which is used in higher level libraries like PyTorch. use_avx = False - if ObjectList.is_kvm_cpu(TestCPUClass): + if ObjectList.is_kvm_cpu(TestCPUClass) and not args.disable_avx: # AVX also requires CR4.osxsave to be 1. These must be set together # of KVM will error out. 
system.workload.enable_osxsave = 1 diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index e7b846529e..fdb6f9d7ce 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -1044,6 +1044,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const int num_queues = queues.size(); Addr id[num_queues]; Addr mqd_base[num_queues]; + uint64_t mqd_read_index[num_queues]; Addr base[num_queues]; Addr rptr[num_queues]; Addr wptr[num_queues]; @@ -1060,6 +1061,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const uint32_t hqd_active[num_queues]; uint32_t hqd_vmid[num_queues]; Addr aql_rptr[num_queues]; + uint32_t aql[num_queues]; uint32_t doorbell[num_queues]; uint32_t hqd_pq_control[num_queues]; @@ -1068,9 +1070,10 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const PM4Queue *q = iter.second; id[i] = q->id(); mqd_base[i] = q->mqdBase(); + mqd_read_index[i] = q->getMQD()->mqdReadIndex; bool cur_state = q->ib(); q->ib(false); - base[i] = q->base() >> 8; + base[i] = q->base(); rptr[i] = q->getRptr(); wptr[i] = q->getWptr(); q->ib(true); @@ -1088,6 +1091,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const hqd_active[i] = q->getMQD()->hqd_active; hqd_vmid[i] = q->getMQD()->hqd_vmid; aql_rptr[i] = q->getMQD()->aqlRptr; + aql[i] = q->getMQD()->aql; doorbell[i] = q->getMQD()->doorbell; hqd_pq_control[i] = q->getMQD()->hqd_pq_control; i++; @@ -1096,6 +1100,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const SERIALIZE_SCALAR(num_queues); SERIALIZE_ARRAY(id, num_queues); SERIALIZE_ARRAY(mqd_base, num_queues); + SERIALIZE_ARRAY(mqd_read_index, num_queues); SERIALIZE_ARRAY(base, num_queues); SERIALIZE_ARRAY(rptr, num_queues); SERIALIZE_ARRAY(wptr, num_queues); @@ -1112,6 +1117,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const SERIALIZE_ARRAY(hqd_active, num_queues); SERIALIZE_ARRAY(hqd_vmid, num_queues); SERIALIZE_ARRAY(aql_rptr, num_queues); + SERIALIZE_ARRAY(aql, 
num_queues); SERIALIZE_ARRAY(doorbell, num_queues); SERIALIZE_ARRAY(hqd_pq_control, num_queues); } @@ -1127,6 +1133,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) Addr id[num_queues]; Addr mqd_base[num_queues]; + uint64_t mqd_read_index[num_queues]; Addr base[num_queues]; Addr rptr[num_queues]; Addr wptr[num_queues]; @@ -1143,11 +1150,13 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) uint32_t hqd_active[num_queues]; uint32_t hqd_vmid[num_queues]; Addr aql_rptr[num_queues]; + uint32_t aql[num_queues]; uint32_t doorbell[num_queues]; uint32_t hqd_pq_control[num_queues]; UNSERIALIZE_ARRAY(id, num_queues); UNSERIALIZE_ARRAY(mqd_base, num_queues); + UNSERIALIZE_ARRAY(mqd_read_index, num_queues); UNSERIALIZE_ARRAY(base, num_queues); UNSERIALIZE_ARRAY(rptr, num_queues); UNSERIALIZE_ARRAY(wptr, num_queues); @@ -1164,6 +1173,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) UNSERIALIZE_ARRAY(hqd_active, num_queues); UNSERIALIZE_ARRAY(hqd_vmid, num_queues); UNSERIALIZE_ARRAY(aql_rptr, num_queues); + UNSERIALIZE_ARRAY(aql, num_queues); UNSERIALIZE_ARRAY(doorbell, num_queues); UNSERIALIZE_ARRAY(hqd_pq_control, num_queues); @@ -1172,22 +1182,24 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) memset(mqd, 0, sizeof(QueueDesc)); mqd->mqdBase = mqd_base[i] >> 8; - mqd->base = base[i]; - mqd->rptr = rptr[i]; - mqd->ibBase = ib_base[i]; - mqd->ibRptr = ib_rptr[i]; + mqd->mqdReadIndex = mqd_read_index[i]; + mqd->base = base[i] >> 8; + mqd->aql = aql[i]; PM4MapQueues* pkt = new PM4MapQueues; memset(pkt, 0, sizeof(PM4MapQueues)); newQueue(mqd, offset[i], pkt, id[i]); - queues[id[i]]->ib(false); - queues[id[i]]->wptr(wptr[i]); - queues[id[i]]->ib(true); - queues[id[i]]->wptr(ib_wptr[i]); + if (ib[i]) { + queues[id[i]]->wptr(ib_wptr[i]); + queues[id[i]]->rptr(ib_rptr[i]); + } else { + queues[id[i]]->rptr(rptr[i]); + queues[id[i]]->wptr(wptr[i]); + } + queues[id[i]]->ib(ib[i]); queues[id[i]]->offset(offset[i]); queues[id[i]]->processing(processing[i]); - 
queues[id[i]]->ib(ib[i]); queues[id[i]]->setPkt(me[i], pipe[i], queue[i], privileged[i]); queues[id[i]]->getMQD()->hqd_active = hqd_active[i]; queues[id[i]]->getMQD()->hqd_vmid = hqd_vmid[i]; @@ -1195,6 +1207,14 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) queues[id[i]]->getMQD()->doorbell = doorbell[i]; queues[id[i]]->getMQD()->hqd_pq_control = hqd_pq_control[i]; + if (mqd->aql) { + int mqd_size = (1 << ((hqd_pq_control[i] & 0x3f) + 1)) * 4; + auto &hsa_pp = gpuDevice->CP()->hsaPacketProc(); + hsa_pp.setDeviceQueueDesc(aql_rptr[i], base[i], id[i], + mqd_size, 8, GfxVersion::gfx900, offset[i], + mqd_read_index[i]); + } + DPRINTF(PM4PacketProcessor, "PM4 queue %d, rptr: %p wptr: %p\n", queues[id[i]]->id(), queues[id[i]]->rptr(), queues[id[i]]->wptr()); diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index 20a0979af1..be1243aaa5 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -65,7 +65,8 @@ machine(MachineType:TCC, "TCC Cache") AtomicPassOn, desc="Atomic Op Passed on to Directory"; AtomicDone, desc="AtomicOps Complete"; AtomicNotDone, desc="AtomicOps not Complete"; - Data, desc="data messgae"; + Data, desc="Data message"; + Flush, desc="Flush cache entry"; // Coming from this TCC L2_Repl, desc="L2 Replacement"; // Probes @@ -376,6 +377,8 @@ machine(MachineType:TCC, "TCC Cache") } else { trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); } + } else if (in_msg.Type == CoherenceRequestType:WriteFlush) { + trigger(Event:Flush, in_msg.addr, cache_entry, tbe); } else { DPRINTF(RubySlicc, "%s\n", in_msg); error("Unexpected Response Message to Core"); @@ -509,6 +512,20 @@ machine(MachineType:TCC, "TCC Cache") } } + action(fw_sendFlushResponse, "fw", desc="send Flush Response") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + 
out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.Requestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.instSeqNum := in_msg.instSeqNum; + } + } + } + action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") { peek(coreRequestNetwork_in, CPURequestMsg) { enqueue(responseToCore_out, ResponseMsg, l2_response_latency + glc_atomic_latency, true) { @@ -628,6 +645,22 @@ machine(MachineType:TCC, "TCC Cache") } } + action(f_flush, "f", desc="write back data") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.Requestor; + out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteFlush; + out_msg.Dirty := true; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.writeMask.orMask(cache_entry.writeMask); + } + } + } + action(at_atomicThrough, "at", desc="write back data") { peek(coreRequestNetwork_in, CPURequestMsg) { enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { @@ -1075,4 +1108,21 @@ machine(MachineType:TCC, "TCC Cache") transition(WIB, WBAck,I) { pr_popResponseQueue; } + + transition({A, IV, WI, WIB}, Flush) { + st_stallAndWaitRequest; + } + + transition(I, Flush) { + fw_sendFlushResponse; + p_popRequestQueue; + } + + transition({V, W}, Flush, I) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + ut_updateTag; + f_flush; + i_invL2; + p_popRequestQueue; + } } diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm index 7e0ad4ed96..8244879c55 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm @@ -55,6 +55,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") I, AccessPermission:Invalid, desc="Invalid"; V, AccessPermission:Read_Only, 
desc="Valid"; A, AccessPermission:Invalid, desc="Waiting on Atomic"; + + F, AccessPermission:Invalid, desc="Flushing; Waiting for Ack"; } enumeration(Event, desc="TCP Events") { @@ -256,6 +258,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") peek(responseToTCP_in, ResponseMsg, block_on="addr") { Entry cache_entry := getCacheEntry(in_msg.addr); TBE tbe := TBEs.lookup(in_msg.addr); + DPRINTF(RubySlicc, "In responseToTCP_in with %s\n", in_msg); + if (in_msg.Type == CoherenceResponseType:TDSysResp) { if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) { // If L1 is disabled or requests have GLC or SLC flag set, @@ -273,6 +277,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck || in_msg.Type == CoherenceResponseType:NBSysWBAck) { trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe); + DPRINTF(RubySlicc, "Issuing TCC_AckWB\n"); } else { error("Unexpected Response Message to Core"); } @@ -469,6 +474,24 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") action(sf_setFlush, "sf", desc="set flush") { inFlush := true; APPEND_TRANSITION_COMMENT(" inFlush is true"); + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + assert(is_valid(cache_entry)); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.writeMask.clear(); + out_msg.writeMask.orMask(cache_entry.writeMask); + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteFlush; + out_msg.InitialRequestTime := curCycle(); + out_msg.Shared := false; + out_msg.isSLCSet := false; + peek(mandatoryQueue_in, RubyRequest) { + out_msg.instSeqNum := in_msg.instSeqNum; + } + } } action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { @@ -524,6 +547,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") cache_entry.Dirty := true; 
} + action(f_flushDone, "f", desc="flush done") { + assert(is_valid(cache_entry)); + + if (use_seq_not_coal) { + sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache); + } else { + coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk); + } + } + action(inv_invDone, "inv", desc="local inv done") { if (use_seq_not_coal) { DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n"); @@ -695,11 +728,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") ic_invCache; } - transition({V, I, A},Flush) {TagArrayFlash} { + transition({V,I}, Flush, F) {TagArrayFlash} { + a_allocate; sf_setFlush; p_popMandatoryQueue; } + transition(A, Flush) { + z_stall; + } + transition({I, V}, Evict, I) {TagArrayFlash} { inv_invDone; p_popMandatoryQueue; @@ -716,4 +754,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") wd_wtDone; pr_popResponseQueue; } + + transition(F, TCC_AckWB, I) { + f_flushDone; + pr_popResponseQueue; + ic_invCache; + } } diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm index 774b54a432..eed750832f 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm @@ -83,6 +83,8 @@ machine(MachineType:Directory, "AMD Baseline protocol") BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; + + F, AccessPermission:Busy, desc="sent Flush, blocked till ack"; } // Events @@ -120,6 +122,9 @@ machine(MachineType:Directory, "AMD Baseline protocol") // DMA DmaRead, desc="DMA read"; DmaWrite, desc="DMA write"; + + // Flush + Flush, desc="Flush entry"; } enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { @@ -411,6 +416,9 @@ machine(MachineType:Directory, "AMD Baseline protocol") 
DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr); trigger(Event:VicClean, in_msg.addr, entry, tbe); } + } else if (in_msg.Type == CoherenceRequestType:WriteFlush) { + DPRINTF(RubySlicc, "Got Flush from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:Flush, in_msg.addr, entry, tbe); } else { error("Bad request message type"); } @@ -562,6 +570,23 @@ machine(MachineType:Directory, "AMD Baseline protocol") } } + action(rf_sendResponseFlush, "rf", desc="send Flush Ack") { + peek(memQueue_in, MemoryMsg) { + enqueue(responseNetwork_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.WTRequestor := tbe.WTRequestor; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := curCycle(); + //out_msg.instSeqNum := in_msg.instSeqNum; + } + } + } + action(l_queueMemWBReq, "lq", desc="Write WB data to memory") { peek(responseNetwork_in, ResponseMsg) { enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) { @@ -933,6 +958,23 @@ machine(MachineType:Directory, "AMD Baseline protocol") } } + action(f_writeFlushDataToMemory, "f", desc="Write flush data to memory") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) { + out_msg.addr := address; + out_msg.Type := MemoryRequestType:MEMORY_WB; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Data; + out_msg.DataBlk := in_msg.DataBlk; + } + if (tbe.Dirty == false) { + // have to update the TBE, too, because of how this + // directory deals with functional writes + tbe.DataBlk := in_msg.DataBlk; + } + } + } + action(atd_allocateTBEforDMA, "atd", desc="allocate TBE Entry for DMA") { check_allocate(TBEs); 
peek(dmaRequestQueue_in, DMARequestMsg) { @@ -1553,4 +1595,17 @@ machine(MachineType:Directory, "AMD Baseline protocol") dt_deallocateTBE; pt_popTriggerQueue; } + + transition(U, Flush, F) {L3TagArrayRead, L3TagArrayWrite} { + t_allocateTBE; + f_writeFlushDataToMemory; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(F, WBAck, U) { + pm_popMemQueue; + dt_deallocateTBE; + } + } diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh index 72b679d6cf..7d93644bd8 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.hh +++ b/src/mem/ruby/slicc_interface/AbstractController.hh @@ -70,7 +70,9 @@ namespace ruby { class Network; +#ifdef BUILD_GPU class GPUCoalescer; +#endif class DMASequencer; // used to communicate that an in_port peeked the wrong message type diff --git a/src/mem/ruby/system/CacheRecorder.cc b/src/mem/ruby/system/CacheRecorder.cc index 20a8a30ebc..057b6aa041 100644 --- a/src/mem/ruby/system/CacheRecorder.cc +++ b/src/mem/ruby/system/CacheRecorder.cc @@ -30,8 +30,11 @@ #include "mem/ruby/system/CacheRecorder.hh" #include "debug/RubyCacheTrace.hh" +#include "mem/packet.hh" +#include "mem/ruby/system/GPUCoalescer.hh" #include "mem/ruby/system/RubySystem.hh" #include "mem/ruby/system/Sequencer.hh" +#include "sim/sim_exit.hh" namespace gem5 { @@ -54,14 +57,29 @@ CacheRecorder::CacheRecorder() { } +#if BUILD_GPU +CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace, + uint64_t uncompressed_trace_size, + std::vector& seq_map, + std::vector& coal_map, + uint64_t block_size_bytes) + : m_uncompressed_trace(uncompressed_trace), + m_uncompressed_trace_size(uncompressed_trace_size), + m_seq_map(seq_map), m_coalescer_map(coal_map), m_bytes_read(0), + m_records_read(0), m_records_flushed(0), + m_block_size_bytes(block_size_bytes) +#else CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace, uint64_t uncompressed_trace_size, std::vector& seq_map, uint64_t block_size_bytes) : 
m_uncompressed_trace(uncompressed_trace), m_uncompressed_trace_size(uncompressed_trace_size), - m_seq_map(seq_map), m_bytes_read(0), m_records_read(0), - m_records_flushed(0), m_block_size_bytes(block_size_bytes) + m_seq_map(seq_map), m_bytes_read(0), + m_records_read(0), m_records_flushed(0), + m_block_size_bytes(block_size_bytes) + +#endif { if (m_uncompressed_trace != NULL) { if (m_block_size_bytes < RubySystem::getBlockSizeBytes()) { @@ -81,6 +99,9 @@ CacheRecorder::~CacheRecorder() m_uncompressed_trace = NULL; } m_seq_map.clear(); +#if BUILD_GPU + m_coalescer_map.clear(); +#endif } void @@ -96,11 +117,27 @@ CacheRecorder::enqueueNextFlushRequest() Packet *pkt = new Packet(req, requestType); Sequencer* m_sequencer_ptr = m_seq_map[rec->m_cntrl_id]; +#if BUILD_GPU + GPUCoalescer* m_coal_ptr = m_coalescer_map[rec->m_cntrl_id]; +#endif assert(m_sequencer_ptr != NULL); +#if BUILD_GPU + if (m_coal_ptr == NULL) + m_sequencer_ptr->makeRequest(pkt); + else { + pkt->req->setReqInstSeqNum(m_records_flushed - 1); + m_coal_ptr->makeRequest(pkt); + } +#else m_sequencer_ptr->makeRequest(pkt); +#endif DPRINTF(RubyCacheTrace, "Flushing %s\n", *rec); + } else { + if (m_records_flushed > 0) { + exitSimLoop("Finished Drain", 0); + } DPRINTF(RubyCacheTrace, "Flushed all %d records\n", m_records_flushed); } } @@ -143,13 +180,27 @@ CacheRecorder::enqueueNextFetchRequest() pkt->dataStatic(traceRecord->m_data + rec_bytes_read); Sequencer* m_sequencer_ptr = m_seq_map[traceRecord->m_cntrl_id]; +#if BUILD_GPU + GPUCoalescer* m_coal_ptr; + m_coal_ptr = m_coalescer_map[traceRecord->m_cntrl_id]; +#endif assert(m_sequencer_ptr != NULL); +#if BUILD_GPU + if (m_coal_ptr == NULL) + m_sequencer_ptr->makeRequest(pkt); + else { + pkt->req->setReqInstSeqNum(m_records_read); + m_coal_ptr->makeRequest(pkt); + } +#else m_sequencer_ptr->makeRequest(pkt); +#endif } m_bytes_read += (sizeof(TraceRecord) + m_block_size_bytes); m_records_read++; } else { + exitSimLoop("Finished Warmup", 0); 
DPRINTF(RubyCacheTrace, "Fetched all %d records\n", m_records_read); } } @@ -168,6 +219,8 @@ CacheRecorder::addRecord(int cntrl, Addr data_addr, Addr pc_addr, memcpy(rec->m_data, data.getData(0, m_block_size_bytes), m_block_size_bytes); + DPRINTF(RubyCacheTrace, "Inside addRecord with cntrl id %d and type %d\n", + cntrl, type); m_records.push_back(rec); } diff --git a/src/mem/ruby/system/CacheRecorder.hh b/src/mem/ruby/system/CacheRecorder.hh index be95590313..e94dfad97a 100644 --- a/src/mem/ruby/system/CacheRecorder.hh +++ b/src/mem/ruby/system/CacheRecorder.hh @@ -38,6 +38,7 @@ #include #include "base/types.hh" +#include "config/build_gpu.hh" #include "mem/ruby/common/Address.hh" #include "mem/ruby/common/DataBlock.hh" #include "mem/ruby/common/TypeDefines.hh" @@ -50,6 +51,9 @@ namespace ruby { class Sequencer; +#if BUILD_GPU +class GPUCoalescer; +#endif /*! * Class for recording cache contents. Note that the last element of the @@ -76,10 +80,18 @@ class CacheRecorder CacheRecorder(); ~CacheRecorder(); +#if BUILD_GPU + CacheRecorder(uint8_t* uncompressed_trace, + uint64_t uncompressed_trace_size, + std::vector& SequencerMap, + std::vector& CoalescerMap, + uint64_t block_size_bytes); +#else CacheRecorder(uint8_t* uncompressed_trace, uint64_t uncompressed_trace_size, std::vector& SequencerMap, uint64_t block_size_bytes); +#endif void addRecord(int cntrl, Addr data_addr, Addr pc_addr, RubyRequestType type, Tick time, DataBlock& data); @@ -115,6 +127,9 @@ class CacheRecorder uint8_t* m_uncompressed_trace; uint64_t m_uncompressed_trace_size; std::vector m_seq_map; +#if BUILD_GPU + std::vector m_coalescer_map; +#endif uint64_t m_bytes_read; uint64_t m_records_read; uint64_t m_records_flushed; diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index beb8da3f9c..a70af07467 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -73,6 +73,14 @@ UncoalescedTable::insertPacket(PacketPtr pkt) 
pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size()); } +void +UncoalescedTable::insertReqType(PacketPtr pkt, RubyRequestType type) +{ + uint64_t seqNum = pkt->req->getReqInstSeqNum(); + + reqTypeMap[seqNum] = type; +} + bool UncoalescedTable::packetAvailable() { @@ -128,9 +136,21 @@ UncoalescedTable::updateResources() instMap.erase(iter++); instPktsRemaining.erase(seq_num); - // Release the token - DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", seq_num); - coalescer->getGMTokenPort().sendTokens(1); + // Release the token if the Ruby system is not in cooldown + // or warmup phases. When in these phases, the RubyPorts + // are accessed directly using the makeRequest() command + // instead of accessing through the port. This makes + // sending tokens through the port unnecessary + if (!RubySystem::getWarmupEnabled() + && !RubySystem::getCooldownEnabled()) { + if (reqTypeMap[seq_num] != RubyRequestType_FLUSH) { + DPRINTF(GPUCoalescer, + "Returning token seqNum %d\n", seq_num); + coalescer->getGMTokenPort().sendTokens(1); + } + } + + reqTypeMap.erase(seq_num); } else { ++iter; } @@ -565,6 +585,14 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest, for (auto& pkt : pktList) { offset = getOffset(pkt->getAddr()); pkt_size = pkt->getSize(); + request_address = pkt->getAddr(); + + // When the Ruby system is in cooldown phase, the requests come from + // the cache recorder. These requests do not get coalesced and + // do not return valid data. 
+ if (RubySystem::getCooldownEnabled()) + continue; + if (pkt->getPtr()) { switch(type) { // Store and AtomicNoReturns follow the same path, as the @@ -627,7 +655,6 @@ GPUCoalescer::getRequestType(PacketPtr pkt) assert(!pkt->req->isLLSC()); assert(!pkt->req->isLockedRMW()); assert(!pkt->req->isInstFetch()); - assert(!pkt->isFlush()); if (pkt->req->isAtomicReturn()) { req_type = RubyRequestType_ATOMIC_RETURN; @@ -637,6 +664,8 @@ GPUCoalescer::getRequestType(PacketPtr pkt) req_type = RubyRequestType_LD; } else if (pkt->isWrite()) { req_type = RubyRequestType_ST; + } else if (pkt->isFlush()) { + req_type = RubyRequestType_FLUSH; } else { panic("Unsupported ruby packet type\n"); } @@ -658,7 +687,7 @@ GPUCoalescer::makeRequest(PacketPtr pkt) issueMemSyncRequest(pkt); } else { // otherwise, this must be either read or write command - assert(pkt->isRead() || pkt->isWrite()); + assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush()); InstSeqNum seq_num = pkt->req->getReqInstSeqNum(); @@ -667,10 +696,17 @@ GPUCoalescer::makeRequest(PacketPtr pkt) // number of lanes actives for that vmem request (i.e., the popcnt // of the exec_mask. int num_packets = 1; - if (!m_usingRubyTester) { - num_packets = 0; - for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) { - num_packets += getDynInst(pkt)->getLaneStatus(i); + + // When Ruby is in warmup or cooldown phase, the requests come from + // the cache recorder. There is no dynamic instruction associated + // with these requests either + if (!RubySystem::getWarmupEnabled() + && !RubySystem::getCooldownEnabled()) { + if (!m_usingRubyTester) { + num_packets = 0; + for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) { + num_packets += getDynInst(pkt)->getLaneStatus(i); + } } } @@ -679,6 +715,7 @@ GPUCoalescer::makeRequest(PacketPtr pkt) // future cycle. Packets remaining is set to the number of excepted // requests from the instruction based on its exec_mask. 
uncoalescedTable.insertPacket(pkt); + uncoalescedTable.insertReqType(pkt, getRequestType(pkt)); uncoalescedTable.initPacketsRemaining(seq_num, num_packets); DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n", pkt->getAddr()); @@ -945,21 +982,27 @@ void GPUCoalescer::completeHitCallback(std::vector & mylist) { for (auto& pkt : mylist) { - RubyPort::SenderState *ss = - safe_cast(pkt->senderState); - MemResponsePort *port = ss->port; - assert(port != NULL); + // When Ruby is in warmup or cooldown phase, the requests come + // from the cache recorder. They do not track which port to use + // and do not need to send the response back + if (!RubySystem::getWarmupEnabled() + && !RubySystem::getCooldownEnabled()) { + RubyPort::SenderState *ss = + safe_cast(pkt->senderState); + MemResponsePort *port = ss->port; + assert(port != NULL); - pkt->senderState = ss->predecessor; + pkt->senderState = ss->predecessor; - if (pkt->cmd != MemCmd::WriteReq) { - // for WriteReq, we keep the original senderState until - // writeCompleteCallback - delete ss; + if (pkt->cmd != MemCmd::WriteReq) { + // for WriteReq, we keep the original senderState until + // writeCompleteCallback + delete ss; + } + + port->hitCallback(pkt); + trySendRetries(); } - - port->hitCallback(pkt); - trySendRetries(); } // We schedule an event in the same tick as hitCallback (similar to @@ -971,7 +1014,14 @@ GPUCoalescer::completeHitCallback(std::vector & mylist) schedule(issueEvent, curTick()); } - testDrainComplete(); + RubySystem *rs = m_ruby_system; + if (RubySystem::getWarmupEnabled()) { + rs->m_cache_recorder->enqueueNextFetchRequest(); + } else if (RubySystem::getCooldownEnabled()) { + rs->m_cache_recorder->enqueueNextFlushRequest(); + } else { + testDrainComplete(); + } } void diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh index dd28855547..3f936b4b41 100644 --- a/src/mem/ruby/system/GPUCoalescer.hh +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -32,6 
+32,10 @@ #ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ #define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ +#include "config/build_gpu.hh" + +#if BUILD_GPU + #include #include @@ -71,6 +75,7 @@ class UncoalescedTable ~UncoalescedTable() {} void insertPacket(PacketPtr pkt); + void insertReqType(PacketPtr pkt, RubyRequestType type); bool packetAvailable(); void printRequestTable(std::stringstream& ss); @@ -101,6 +106,8 @@ class UncoalescedTable std::map instMap; std::map instPktsRemaining; + + std::map reqTypeMap; }; class CoalescedRequest @@ -543,4 +550,5 @@ operator<<(std::ostream& out, const GPUCoalescer& obj) } // namespace ruby } // namespace gem5 +#endif // BUILD_GPU #endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc index b38c903b09..32dec7b9e0 100644 --- a/src/mem/ruby/system/RubySystem.cc +++ b/src/mem/ruby/system/RubySystem.cc @@ -178,13 +178,28 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, uint64_t block_size_bytes) { std::vector sequencer_map; +#if BUILD_GPU + std::vector coalescer_map; + GPUCoalescer* coalescer_ptr = NULL; +#endif Sequencer* sequencer_ptr = NULL; for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) { sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer()); +#if BUILD_GPU + coalescer_map.push_back(m_abs_cntrl_vec[cntrl]->getGPUCoalescer()); +#endif + if (sequencer_ptr == NULL) { sequencer_ptr = sequencer_map[cntrl]; } + +#if BUILD_GPU + if (coalescer_ptr == NULL) { + coalescer_ptr = coalescer_map[cntrl]; + } +#endif + } assert(sequencer_ptr != NULL); @@ -193,6 +208,13 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, if (sequencer_map[cntrl] == NULL) { sequencer_map[cntrl] = sequencer_ptr; } + +#if BUILD_GPU + if (coalescer_map[cntrl] == NULL) { + coalescer_map[cntrl] = coalescer_ptr; + } +#endif + } // Remove the old CacheRecorder if it's still hanging about. 
@@ -201,8 +223,15 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, } // Create the CacheRecorder and record the cache trace +#if BUILD_GPU m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size, - sequencer_map, block_size_bytes); + sequencer_map, coalescer_map, + block_size_bytes); +#else + m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size, + sequencer_map, + block_size_bytes); +#endif } void diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc index ea95129841..a5198cce63 100644 --- a/src/mem/ruby/system/VIPERCoalescer.cc +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -75,12 +75,14 @@ VIPERCoalescer::makeRequest(PacketPtr pkt) // ReadReq : cache read // WriteReq : cache write // AtomicOp : cache atomic + // Flush : flush and invalidate cache // // VIPER does not expect MemSyncReq & Release since in GCN3, compute unit // does not specify an equivalent type of memory request. assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) || pkt->cmd == MemCmd::ReadReq || pkt->cmd == MemCmd::WriteReq || + pkt->cmd == MemCmd::FlushReq || pkt->isAtomicOp()); if (pkt->req->isInvL1() && m_cache_inv_pkt) { diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh index c7e21e946b..d185620244 100644 --- a/src/mem/ruby/system/VIPERCoalescer.hh +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -32,6 +32,10 @@ #ifndef __MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__ #define __MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__ +#include "config/build_gpu.hh" + +#if BUILD_GPU + #include #include "mem/ruby/common/Address.hh" @@ -92,4 +96,5 @@ class VIPERCoalescer : public GPUCoalescer } // namespace ruby } // namespace gem5 +#endif // BUILD_GPU #endif //__MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__