From 107e05266d9cd03b2f9ff5ba7ac4d8be430a3aa0 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Fri, 29 Sep 2023 14:29:47 -0500 Subject: [PATCH 1/8] dev-amdgpu: Add aql, hsa queue information to checkpoint-restore GPUFS uses aql information from PM4 queues to initialize doorbells. This commit adds aql information to the checkpoint so that it can be used during restoration to correctly initialize all doorbells. Additionally, this commit also sets the hsa queue correctly during checkpoint-restoration Change-Id: Ief3ef6dc973f70f27255234872a12c396df05d89 --- src/dev/amdgpu/pm4_packet_processor.cc | 29 +++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index e7b846529e..63a3bf8887 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -1044,6 +1044,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const int num_queues = queues.size(); Addr id[num_queues]; Addr mqd_base[num_queues]; + uint64_t mqd_read_index[num_queues]; Addr base[num_queues]; Addr rptr[num_queues]; Addr wptr[num_queues]; @@ -1060,6 +1061,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const uint32_t hqd_active[num_queues]; uint32_t hqd_vmid[num_queues]; Addr aql_rptr[num_queues]; + uint32_t aql[num_queues]; uint32_t doorbell[num_queues]; uint32_t hqd_pq_control[num_queues]; @@ -1068,9 +1070,10 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const PM4Queue *q = iter.second; id[i] = q->id(); mqd_base[i] = q->mqdBase(); + mqd_read_index[i] = q->getMQD()->mqdReadIndex; bool cur_state = q->ib(); q->ib(false); - base[i] = q->base() >> 8; + base[i] = q->base(); rptr[i] = q->getRptr(); wptr[i] = q->getWptr(); q->ib(true); @@ -1088,6 +1091,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const hqd_active[i] = q->getMQD()->hqd_active; hqd_vmid[i] = q->getMQD()->hqd_vmid; aql_rptr[i] = q->getMQD()->aqlRptr; + aql[i] = 
q->getMQD()->aql; doorbell[i] = q->getMQD()->doorbell; hqd_pq_control[i] = q->getMQD()->hqd_pq_control; i++; @@ -1096,6 +1100,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const SERIALIZE_SCALAR(num_queues); SERIALIZE_ARRAY(id, num_queues); SERIALIZE_ARRAY(mqd_base, num_queues); + SERIALIZE_ARRAY(mqd_read_index, num_queues); SERIALIZE_ARRAY(base, num_queues); SERIALIZE_ARRAY(rptr, num_queues); SERIALIZE_ARRAY(wptr, num_queues); @@ -1112,6 +1117,7 @@ PM4PacketProcessor::serialize(CheckpointOut &cp) const SERIALIZE_ARRAY(hqd_active, num_queues); SERIALIZE_ARRAY(hqd_vmid, num_queues); SERIALIZE_ARRAY(aql_rptr, num_queues); + SERIALIZE_ARRAY(aql, num_queues); SERIALIZE_ARRAY(doorbell, num_queues); SERIALIZE_ARRAY(hqd_pq_control, num_queues); } @@ -1127,6 +1133,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) Addr id[num_queues]; Addr mqd_base[num_queues]; + uint64_t mqd_read_index[num_queues]; Addr base[num_queues]; Addr rptr[num_queues]; Addr wptr[num_queues]; @@ -1143,11 +1150,13 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) uint32_t hqd_active[num_queues]; uint32_t hqd_vmid[num_queues]; Addr aql_rptr[num_queues]; + uint32_t aql[num_queues]; uint32_t doorbell[num_queues]; uint32_t hqd_pq_control[num_queues]; UNSERIALIZE_ARRAY(id, num_queues); UNSERIALIZE_ARRAY(mqd_base, num_queues); + UNSERIALIZE_ARRAY(mqd_read_index, num_queues); UNSERIALIZE_ARRAY(base, num_queues); UNSERIALIZE_ARRAY(rptr, num_queues); UNSERIALIZE_ARRAY(wptr, num_queues); @@ -1164,6 +1173,7 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) UNSERIALIZE_ARRAY(hqd_active, num_queues); UNSERIALIZE_ARRAY(hqd_vmid, num_queues); UNSERIALIZE_ARRAY(aql_rptr, num_queues); + UNSERIALIZE_ARRAY(aql, num_queues); UNSERIALIZE_ARRAY(doorbell, num_queues); UNSERIALIZE_ARRAY(hqd_pq_control, num_queues); @@ -1172,19 +1182,20 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) memset(mqd, 0, sizeof(QueueDesc)); mqd->mqdBase = mqd_base[i] >> 8; - mqd->base = base[i]; - mqd->rptr = rptr[i]; - 
mqd->ibBase = ib_base[i]; - mqd->ibRptr = ib_rptr[i]; + mqd->mqdReadIndex = mqd_read_index[i]; + mqd->base = base[i] >> 8; + mqd->aql = aql[i]; PM4MapQueues* pkt = new PM4MapQueues; memset(pkt, 0, sizeof(PM4MapQueues)); newQueue(mqd, offset[i], pkt, id[i]); queues[id[i]]->ib(false); + queues[id[i]]->rptr(rptr[i]); queues[id[i]]->wptr(wptr[i]); queues[id[i]]->ib(true); queues[id[i]]->wptr(ib_wptr[i]); + queues[id[i]]->rptr(ib_rptr[i]); queues[id[i]]->offset(offset[i]); queues[id[i]]->processing(processing[i]); queues[id[i]]->ib(ib[i]); @@ -1195,6 +1206,14 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) queues[id[i]]->getMQD()->doorbell = doorbell[i]; queues[id[i]]->getMQD()->hqd_pq_control = hqd_pq_control[i]; + if (mqd->aql) { + int mqd_size = (1 << ((hqd_pq_control[i] & 0x3f) + 1)) * 4; + auto &hsa_pp = gpuDevice->CP()->hsaPacketProc(); + hsa_pp.setDeviceQueueDesc(aql_rptr[i], base[i], id[i], + mqd_size, 8, GfxVersion::gfx900, offset[i], + mqd_read_index[i]); + } + DPRINTF(PM4PacketProcessor, "PM4 queue %d, rptr: %p wptr: %p\n", queues[id[i]]->id(), queues[id[i]]->rptr(), queues[id[i]]->wptr()); From a50ead5907771d4b38bcfed615069d9e3ad9283d Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Fri, 29 Sep 2023 16:32:22 -0500 Subject: [PATCH 2/8] mem-ruby: Add Flush as a supported memory type in VIPERCoalescer This commit adds flush as a recognized memory type in VIPERCoalescer. 
Change-Id: I0f1b6f4518548e8e893ef681955b12a49293d8b4 --- src/mem/ruby/system/VIPERCoalescer.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc index ea95129841..a5198cce63 100644 --- a/src/mem/ruby/system/VIPERCoalescer.cc +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -75,12 +75,14 @@ VIPERCoalescer::makeRequest(PacketPtr pkt) // ReadReq : cache read // WriteReq : cache write // AtomicOp : cache atomic + // Flush : flush and invalidate cache // // VIPER does not expect MemSyncReq & Release since in GCN3, compute unit // does not specify an equivalent type of memory request. assert((pkt->cmd == MemCmd::MemSyncReq && pkt->req->isInvL1()) || pkt->cmd == MemCmd::ReadReq || pkt->cmd == MemCmd::WriteReq || + pkt->cmd == MemCmd::FlushReq || pkt->isAtomicOp()); if (pkt->req->isInvL1() && m_cache_inv_pkt) { From 61e39d5b26465ce362249bd544cc68a725af1fdf Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Fri, 29 Sep 2023 16:37:41 -0500 Subject: [PATCH 3/8] mem-ruby: Add cache cooldown and warmup support to GPUCoalescer The GPU Coalescer does not contain cache cooldown and warmup support. This commit updates the coalescer to support cache cooldown during flush and warmup during checkpoint restore. 
Change-Id: I5459471dec20ff304fd5954af1079a7486ee860a --- src/mem/ruby/system/GPUCoalescer.cc | 94 ++++++++++++++++++++++------- src/mem/ruby/system/GPUCoalescer.hh | 3 + 2 files changed, 75 insertions(+), 22 deletions(-) diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index beb8da3f9c..a70af07467 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -73,6 +73,14 @@ UncoalescedTable::insertPacket(PacketPtr pkt) pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size()); } +void +UncoalescedTable::insertReqType(PacketPtr pkt, RubyRequestType type) +{ + uint64_t seqNum = pkt->req->getReqInstSeqNum(); + + reqTypeMap[seqNum] = type; +} + bool UncoalescedTable::packetAvailable() { @@ -128,9 +136,21 @@ UncoalescedTable::updateResources() instMap.erase(iter++); instPktsRemaining.erase(seq_num); - // Release the token - DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", seq_num); - coalescer->getGMTokenPort().sendTokens(1); + // Release the token if the Ruby system is not in cooldown + // or warmup phases. When in these phases, the RubyPorts + // are accessed directly using the makeRequest() command + // instead of accessing through the port. This makes + // sending tokens through the port unnecessary + if (!RubySystem::getWarmupEnabled() + && !RubySystem::getCooldownEnabled()) { + if (reqTypeMap[seq_num] != RubyRequestType_FLUSH) { + DPRINTF(GPUCoalescer, + "Returning token seqNum %d\n", seq_num); + coalescer->getGMTokenPort().sendTokens(1); + } + } + + reqTypeMap.erase(seq_num); } else { ++iter; } @@ -565,6 +585,14 @@ GPUCoalescer::hitCallback(CoalescedRequest* crequest, for (auto& pkt : pktList) { offset = getOffset(pkt->getAddr()); pkt_size = pkt->getSize(); + request_address = pkt->getAddr(); + + // When the Ruby system is cooldown phase, the requests come from + // the cache recorder. These requests do not get coalesced and + // do not return valid data. 
+ if (RubySystem::getCooldownEnabled()) + continue; + if (pkt->getPtr()) { switch(type) { // Store and AtomicNoReturns follow the same path, as the @@ -627,7 +655,6 @@ GPUCoalescer::getRequestType(PacketPtr pkt) assert(!pkt->req->isLLSC()); assert(!pkt->req->isLockedRMW()); assert(!pkt->req->isInstFetch()); - assert(!pkt->isFlush()); if (pkt->req->isAtomicReturn()) { req_type = RubyRequestType_ATOMIC_RETURN; @@ -637,6 +664,8 @@ GPUCoalescer::getRequestType(PacketPtr pkt) req_type = RubyRequestType_LD; } else if (pkt->isWrite()) { req_type = RubyRequestType_ST; + } else if (pkt->isFlush()) { + req_type = RubyRequestType_FLUSH; } else { panic("Unsupported ruby packet type\n"); } @@ -658,7 +687,7 @@ GPUCoalescer::makeRequest(PacketPtr pkt) issueMemSyncRequest(pkt); } else { // otherwise, this must be either read or write command - assert(pkt->isRead() || pkt->isWrite()); + assert(pkt->isRead() || pkt->isWrite() || pkt->isFlush()); InstSeqNum seq_num = pkt->req->getReqInstSeqNum(); @@ -667,10 +696,17 @@ GPUCoalescer::makeRequest(PacketPtr pkt) // number of lanes actives for that vmem request (i.e., the popcnt // of the exec_mask. int num_packets = 1; - if (!m_usingRubyTester) { - num_packets = 0; - for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) { - num_packets += getDynInst(pkt)->getLaneStatus(i); + + // When Ruby is in warmup or cooldown phase, the requests come from + // the cache recorder. There is no dynamic instruction associated + // with these requests either + if (!RubySystem::getWarmupEnabled() + && !RubySystem::getCooldownEnabled()) { + if (!m_usingRubyTester) { + num_packets = 0; + for (int i = 0; i < TheGpuISA::NumVecElemPerVecReg; i++) { + num_packets += getDynInst(pkt)->getLaneStatus(i); + } } } @@ -679,6 +715,7 @@ GPUCoalescer::makeRequest(PacketPtr pkt) // future cycle. Packets remaining is set to the number of excepted // requests from the instruction based on its exec_mask. 
uncoalescedTable.insertPacket(pkt); + uncoalescedTable.insertReqType(pkt, getRequestType(pkt)); uncoalescedTable.initPacketsRemaining(seq_num, num_packets); DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n", pkt->getAddr()); @@ -945,21 +982,27 @@ void GPUCoalescer::completeHitCallback(std::vector & mylist) { for (auto& pkt : mylist) { - RubyPort::SenderState *ss = - safe_cast(pkt->senderState); - MemResponsePort *port = ss->port; - assert(port != NULL); + // When Ruby is in warmup or cooldown phase, the requests come + // from the cache recorder. They do not track which port to use + // and do not need to send the response back + if (!RubySystem::getWarmupEnabled() + && !RubySystem::getCooldownEnabled()) { + RubyPort::SenderState *ss = + safe_cast(pkt->senderState); + MemResponsePort *port = ss->port; + assert(port != NULL); - pkt->senderState = ss->predecessor; + pkt->senderState = ss->predecessor; - if (pkt->cmd != MemCmd::WriteReq) { - // for WriteReq, we keep the original senderState until - // writeCompleteCallback - delete ss; + if (pkt->cmd != MemCmd::WriteReq) { + // for WriteReq, we keep the original senderState until + // writeCompleteCallback + delete ss; + } + + port->hitCallback(pkt); + trySendRetries(); } - - port->hitCallback(pkt); - trySendRetries(); } // We schedule an event in the same tick as hitCallback (similar to @@ -971,7 +1014,14 @@ GPUCoalescer::completeHitCallback(std::vector & mylist) schedule(issueEvent, curTick()); } - testDrainComplete(); + RubySystem *rs = m_ruby_system; + if (RubySystem::getWarmupEnabled()) { + rs->m_cache_recorder->enqueueNextFetchRequest(); + } else if (RubySystem::getCooldownEnabled()) { + rs->m_cache_recorder->enqueueNextFlushRequest(); + } else { + testDrainComplete(); + } } void diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh index dd28855547..d6db5c00ba 100644 --- a/src/mem/ruby/system/GPUCoalescer.hh +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -71,6 
+71,7 @@ class UncoalescedTable ~UncoalescedTable() {} void insertPacket(PacketPtr pkt); + void insertReqType(PacketPtr pkt, RubyRequestType type); bool packetAvailable(); void printRequestTable(std::stringstream& ss); @@ -101,6 +102,8 @@ class UncoalescedTable std::map instMap; std::map instPktsRemaining; + + std::map reqTypeMap; }; class CoalescedRequest From 085789d00c4391b6b863981fb25e9cb8a7e7a445 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Fri, 29 Sep 2023 18:19:37 -0500 Subject: [PATCH 4/8] mem-ruby: Add flush support to GPU_VIPER protocol This commit adds flush support to the GPU VIPER coherence protocol. The L1 cache will now initiate a flush request if the packet it receives is of type RubyRequestType_FLUSH. During the flush process, the L1 cache will send a request to L2 if it is in either V or I state. L2 will issue a flush request to the directory if its cache line is in the valid state before invalidating its copy. The directory, on receiving this request, writes data to memory and sends an ack back to the L2. 
L2 forwards this ack back to the L1, which then ends the flush by calling the write callback Change-Id: I9dfc0c7b71a1e9f6d5e9e6ed4977c1e6a3b5ba46 --- src/mem/ruby/protocol/GPU_VIPER-TCC.sm | 52 ++++++++++++++++++- src/mem/ruby/protocol/GPU_VIPER-TCP.sm | 46 ++++++++++++++++- src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm | 55 +++++++++++++++++++++ 3 files changed, 151 insertions(+), 2 deletions(-) diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm index 20a0979af1..be1243aaa5 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCC.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCC.sm @@ -65,7 +65,8 @@ machine(MachineType:TCC, "TCC Cache") AtomicPassOn, desc="Atomic Op Passed on to Directory"; AtomicDone, desc="AtomicOps Complete"; AtomicNotDone, desc="AtomicOps not Complete"; - Data, desc="data messgae"; + Data, desc="Data message"; + Flush, desc="Flush cache entry"; // Coming from this TCC L2_Repl, desc="L2 Replacement"; // Probes @@ -376,6 +377,8 @@ machine(MachineType:TCC, "TCC Cache") } else { trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe); } + } else if (in_msg.Type == CoherenceRequestType:WriteFlush) { + trigger(Event:Flush, in_msg.addr, cache_entry, tbe); } else { DPRINTF(RubySlicc, "%s\n", in_msg); error("Unexpected Response Message to Core"); @@ -509,6 +512,20 @@ machine(MachineType:TCC, "TCC Cache") } } + action(fw_sendFlushResponse, "fw", desc="send Flush Response") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(responseToCore_out, ResponseMsg, l2_response_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:TDSysWBAck; + out_msg.Destination.clear(); + out_msg.Destination.add(in_msg.Requestor); + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.instSeqNum := in_msg.instSeqNum; + } + } + } + action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") { peek(coreRequestNetwork_in, CPURequestMsg) { enqueue(responseToCore_out, ResponseMsg, 
l2_response_latency + glc_atomic_latency, true) { @@ -628,6 +645,22 @@ machine(MachineType:TCC, "TCC Cache") } } + action(f_flush, "f", desc="write back data") { + peek(coreRequestNetwork_in, CPURequestMsg) { + enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + out_msg.WTRequestor := in_msg.Requestor; + out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteFlush; + out_msg.Dirty := true; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.writeMask.orMask(cache_entry.writeMask); + } + } + } + action(at_atomicThrough, "at", desc="write back data") { peek(coreRequestNetwork_in, CPURequestMsg) { enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) { @@ -1075,4 +1108,21 @@ machine(MachineType:TCC, "TCC Cache") transition(WIB, WBAck,I) { pr_popResponseQueue; } + + transition({A, IV, WI, WIB}, Flush) { + st_stallAndWaitRequest; + } + + transition(I, Flush) { + fw_sendFlushResponse; + p_popRequestQueue; + } + + transition({V, W}, Flush, I) {TagArrayRead, TagArrayWrite} { + t_allocateTBE; + ut_updateTag; + f_flush; + i_invL2; + p_popRequestQueue; + } } diff --git a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm index 7e0ad4ed96..8244879c55 100644 --- a/src/mem/ruby/protocol/GPU_VIPER-TCP.sm +++ b/src/mem/ruby/protocol/GPU_VIPER-TCP.sm @@ -55,6 +55,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") I, AccessPermission:Invalid, desc="Invalid"; V, AccessPermission:Read_Only, desc="Valid"; A, AccessPermission:Invalid, desc="Waiting on Atomic"; + + F, AccessPermission:Invalid, desc="Flushing; Waiting for Ack"; } enumeration(Event, desc="TCP Events") { @@ -256,6 +258,8 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") peek(responseToTCP_in, ResponseMsg, block_on="addr") { Entry cache_entry := getCacheEntry(in_msg.addr); TBE tbe := 
TBEs.lookup(in_msg.addr); + DPRINTF(RubySlicc, "In responseToTCP_in with %s\n", in_msg); + if (in_msg.Type == CoherenceResponseType:TDSysResp) { if (disableL1 || in_msg.isGLCSet || in_msg.isSLCSet) { // If L1 is disabled or requests have GLC or SLC flag set, @@ -273,6 +277,7 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck || in_msg.Type == CoherenceResponseType:NBSysWBAck) { trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe); + DPRINTF(RubySlicc, "Issuing TCC_AckWB\n"); } else { error("Unexpected Response Message to Core"); } @@ -469,6 +474,24 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") action(sf_setFlush, "sf", desc="set flush") { inFlush := true; APPEND_TRANSITION_COMMENT(" inFlush is true"); + enqueue(requestNetwork_out, CPURequestMsg, issue_latency) { + out_msg.addr := address; + out_msg.Requestor := machineID; + assert(is_valid(cache_entry)); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.writeMask.clear(); + out_msg.writeMask.orMask(cache_entry.writeMask); + out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC, + TCC_select_low_bit, TCC_select_num_bits)); + out_msg.MessageSize := MessageSizeType:Data; + out_msg.Type := CoherenceRequestType:WriteFlush; + out_msg.InitialRequestTime := curCycle(); + out_msg.Shared := false; + out_msg.isSLCSet := false; + peek(mandatoryQueue_in, RubyRequest) { + out_msg.instSeqNum := in_msg.instSeqNum; + } + } } action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") { @@ -524,6 +547,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") cache_entry.Dirty := true; } + action(f_flushDone, "f", desc="flush done") { + assert(is_valid(cache_entry)); + + if (use_seq_not_coal) { + sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache); + } else { + coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk); + } + } + action(inv_invDone, "inv", desc="local inv done") { if 
(use_seq_not_coal) { DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n"); @@ -695,11 +728,16 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") ic_invCache; } - transition({V, I, A},Flush) {TagArrayFlash} { + transition({V,I}, Flush, F) {TagArrayFlash} { + a_allocate; sf_setFlush; p_popMandatoryQueue; } + transition(A, Flush) { + z_stall; + } + transition({I, V}, Evict, I) {TagArrayFlash} { inv_invDone; p_popMandatoryQueue; @@ -716,4 +754,10 @@ machine(MachineType:TCP, "GPU TCP (L1 Data Cache)") wd_wtDone; pr_popResponseQueue; } + + transition(F, TCC_AckWB, I) { + f_flushDone; + pr_popResponseQueue; + ic_invCache; + } } diff --git a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm index 774b54a432..eed750832f 100644 --- a/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm +++ b/src/mem/ruby/protocol/MOESI_AMD_Base-dir.sm @@ -83,6 +83,8 @@ machine(MachineType:Directory, "AMD Baseline protocol") BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory"; B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack"; + + F, AccessPermission:Busy, desc="sent Flus, blocked till ack"; } // Events @@ -120,6 +122,9 @@ machine(MachineType:Directory, "AMD Baseline protocol") // DMA DmaRead, desc="DMA read"; DmaWrite, desc="DMA write"; + + // Flush + Flush, desc="Flush entry"; } enumeration(RequestType, desc="To communicate stats from transitions to recordStats") { @@ -411,6 +416,9 @@ machine(MachineType:Directory, "AMD Baseline protocol") DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr); trigger(Event:VicClean, in_msg.addr, entry, tbe); } + } else if (in_msg.Type == CoherenceRequestType:WriteFlush) { + DPRINTF(RubySlicc, "Got Flush from %s on %s\n", in_msg.Requestor, in_msg.addr); + trigger(Event:Flush, in_msg.addr, entry, tbe); } else { error("Bad request 
message type"); } @@ -562,6 +570,23 @@ machine(MachineType:Directory, "AMD Baseline protocol") } } + action(rf_sendResponseFlush, "rf", desc="send Flush Ack") { + peek(memQueue_in, MemoryMsg) { + enqueue(responseNetwork_out, ResponseMsg, 1) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:NBSysWBAck; + out_msg.Destination.add(tbe.OriginalRequestor); + out_msg.WTRequestor := tbe.WTRequestor; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Control; + out_msg.InitialRequestTime := tbe.InitialRequestTime; + out_msg.ForwardRequestTime := curCycle(); + out_msg.ProbeRequestStartTime := curCycle(); + //out_msg.instSeqNum := in_msg.instSeqNum; + } + } + } + action(l_queueMemWBReq, "lq", desc="Write WB data to memory") { peek(responseNetwork_in, ResponseMsg) { enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) { @@ -933,6 +958,23 @@ machine(MachineType:Directory, "AMD Baseline protocol") } } + action(f_writeFlushDataToMemory, "f", desc="Write flush data to memory") { + peek(requestNetwork_in, CPURequestMsg) { + enqueue(memQueue_out, MemoryMsg, to_memory_controller_latency) { + out_msg.addr := address; + out_msg.Type := MemoryRequestType:MEMORY_WB; + out_msg.Sender := machineID; + out_msg.MessageSize := MessageSizeType:Writeback_Data; + out_msg.DataBlk := in_msg.DataBlk; + } + if (tbe.Dirty == false) { + // have to update the TBE, too, because of how this + // directory deals with functional writes + tbe.DataBlk := in_msg.DataBlk; + } + } + } + action(atd_allocateTBEforDMA, "atd", desc="allocate TBE Entry for DMA") { check_allocate(TBEs); peek(dmaRequestQueue_in, DMARequestMsg) { @@ -1553,4 +1595,17 @@ machine(MachineType:Directory, "AMD Baseline protocol") dt_deallocateTBE; pt_popTriggerQueue; } + + transition(U, Flush, F) {L3TagArrayRead, L3TagArrayWrite} { + t_allocateTBE; + f_writeFlushDataToMemory; + w_sendResponseWBAck; + p_popRequestQueue; + } + + transition(F, WBAck, U) { + pm_popMemQueue; + 
dt_deallocateTBE; + } + } From ae5a51994c112d04bfe0eb84189ca9ec5a46102e Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Fri, 29 Sep 2023 18:27:46 -0500 Subject: [PATCH 5/8] mem-ruby: Update cache recorder to use GPUCoalescer port for GPUs Previously, the cache recorder used the Sequencer to issue flush requests and cache warmup requests. The GPU however uses GPUCoalescer to access the cache, and not the Sequencer. This commit adds a GPUCoalescer map to the cache recorder and uses it to send flushes and cache warmup requests to any GPU caches in the system Change-Id: I10490cf5e561c8559a98d4eb0550c62eefe769c9 --- src/mem/ruby/system/CacheRecorder.cc | 34 ++++++++++++++++++++++++---- src/mem/ruby/system/CacheRecorder.hh | 3 +++ src/mem/ruby/system/RubySystem.cc | 17 +++++++++++++- 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/src/mem/ruby/system/CacheRecorder.cc b/src/mem/ruby/system/CacheRecorder.cc index 20a8a30ebc..ec552c07c5 100644 --- a/src/mem/ruby/system/CacheRecorder.cc +++ b/src/mem/ruby/system/CacheRecorder.cc @@ -30,8 +30,11 @@ #include "mem/ruby/system/CacheRecorder.hh" #include "debug/RubyCacheTrace.hh" +#include "mem/packet.hh" +#include "mem/ruby/system/GPUCoalescer.hh" #include "mem/ruby/system/RubySystem.hh" #include "mem/ruby/system/Sequencer.hh" +#include "sim/sim_exit.hh" namespace gem5 { @@ -57,11 +60,13 @@ CacheRecorder::CacheRecorder() CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace, uint64_t uncompressed_trace_size, std::vector& seq_map, + std::vector& coal_map, uint64_t block_size_bytes) : m_uncompressed_trace(uncompressed_trace), m_uncompressed_trace_size(uncompressed_trace_size), - m_seq_map(seq_map), m_bytes_read(0), m_records_read(0), - m_records_flushed(0), m_block_size_bytes(block_size_bytes) + m_seq_map(seq_map), m_coalescer_map(coal_map), m_bytes_read(0), + m_records_read(0), m_records_flushed(0), + m_block_size_bytes(block_size_bytes) { if (m_uncompressed_trace != NULL) { if (m_block_size_bytes < 
RubySystem::getBlockSizeBytes()) { @@ -81,6 +86,7 @@ CacheRecorder::~CacheRecorder() m_uncompressed_trace = NULL; } m_seq_map.clear(); + m_coalescer_map.clear(); } void @@ -96,11 +102,21 @@ CacheRecorder::enqueueNextFlushRequest() Packet *pkt = new Packet(req, requestType); Sequencer* m_sequencer_ptr = m_seq_map[rec->m_cntrl_id]; + GPUCoalescer* m_coal_ptr = m_coalescer_map[rec->m_cntrl_id]; assert(m_sequencer_ptr != NULL); - m_sequencer_ptr->makeRequest(pkt); + if (m_coal_ptr == NULL) + m_sequencer_ptr->makeRequest(pkt); + else { + pkt->req->setReqInstSeqNum(m_records_flushed - 1); + m_coal_ptr->makeRequest(pkt); + } DPRINTF(RubyCacheTrace, "Flushing %s\n", *rec); + } else { + if (m_records_flushed > 0) { + exitSimLoop("Finished Drain", 0); + } DPRINTF(RubyCacheTrace, "Flushed all %d records\n", m_records_flushed); } } @@ -143,13 +159,21 @@ CacheRecorder::enqueueNextFetchRequest() pkt->dataStatic(traceRecord->m_data + rec_bytes_read); Sequencer* m_sequencer_ptr = m_seq_map[traceRecord->m_cntrl_id]; + GPUCoalescer* m_coal_ptr; + m_coal_ptr = m_coalescer_map[traceRecord->m_cntrl_id]; assert(m_sequencer_ptr != NULL); - m_sequencer_ptr->makeRequest(pkt); + if (m_coal_ptr == NULL) + m_sequencer_ptr->makeRequest(pkt); + else { + pkt->req->setReqInstSeqNum(m_records_read); + m_coal_ptr->makeRequest(pkt); + } } m_bytes_read += (sizeof(TraceRecord) + m_block_size_bytes); m_records_read++; } else { + exitSimLoop("Finished Warmup", 0); DPRINTF(RubyCacheTrace, "Fetched all %d records\n", m_records_read); } } @@ -168,6 +192,8 @@ CacheRecorder::addRecord(int cntrl, Addr data_addr, Addr pc_addr, memcpy(rec->m_data, data.getData(0, m_block_size_bytes), m_block_size_bytes); + DPRINTF(RubyCacheTrace, "Inside addRecord with cntrl id %d and type %d\n", + cntrl, type); m_records.push_back(rec); } diff --git a/src/mem/ruby/system/CacheRecorder.hh b/src/mem/ruby/system/CacheRecorder.hh index be95590313..9363e2fde7 100644 --- a/src/mem/ruby/system/CacheRecorder.hh +++ 
b/src/mem/ruby/system/CacheRecorder.hh @@ -50,6 +50,7 @@ namespace ruby { class Sequencer; +class GPUCoalescer; /*! * Class for recording cache contents. Note that the last element of the @@ -79,6 +80,7 @@ class CacheRecorder CacheRecorder(uint8_t* uncompressed_trace, uint64_t uncompressed_trace_size, std::vector& SequencerMap, + std::vector& CoalescerMap, uint64_t block_size_bytes); void addRecord(int cntrl, Addr data_addr, Addr pc_addr, RubyRequestType type, Tick time, DataBlock& data); @@ -115,6 +117,7 @@ class CacheRecorder uint8_t* m_uncompressed_trace; uint64_t m_uncompressed_trace_size; std::vector m_seq_map; + std::vector m_coalescer_map; uint64_t m_bytes_read; uint64_t m_records_read; uint64_t m_records_flushed; diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc index b38c903b09..232e337752 100644 --- a/src/mem/ruby/system/RubySystem.cc +++ b/src/mem/ruby/system/RubySystem.cc @@ -178,13 +178,22 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, uint64_t block_size_bytes) { std::vector sequencer_map; + std::vector coalescer_map; Sequencer* sequencer_ptr = NULL; + GPUCoalescer* coalescer_ptr = NULL; for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) { sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer()); + coalescer_map.push_back(m_abs_cntrl_vec[cntrl]->getGPUCoalescer()); + if (sequencer_ptr == NULL) { sequencer_ptr = sequencer_map[cntrl]; } + + if (coalescer_ptr == NULL) { + coalescer_ptr = coalescer_map[cntrl]; + } + } assert(sequencer_ptr != NULL); @@ -193,6 +202,11 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, if (sequencer_map[cntrl] == NULL) { sequencer_map[cntrl] = sequencer_ptr; } + + if (coalescer_map[cntrl] == NULL) { + coalescer_map[cntrl] = coalescer_ptr; + } + } // Remove the old CacheRecorder if it's still hanging about. 
@@ -202,7 +216,8 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, // Create the CacheRecorder and record the cache trace m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size, - sequencer_map, block_size_bytes); + sequencer_map, coalescer_map, + block_size_bytes); } void From f69191a31d091562201f544c28462b87d6c46206 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Mon, 2 Oct 2023 19:37:46 -0500 Subject: [PATCH 6/8] dev-amdgpu: Remove duplicate writes to PM4 queue pointers During checkpoint restoration, the unserialize() function writes rptr, wptr, and indirect buffer rptr, wptr to PM4 queue's rptr, wptr fields. This commit updates this to write only the relevant pointers to the queue structure. If indirect buffers are used, then it writes only the indirect buffer pointers to the queue. If they are not used, then it writes rptr, wptr values to the queue. Change-Id: Iedb25a726112e1af99cc1e7bc012de51c4ebfd45 --- src/dev/amdgpu/pm4_packet_processor.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index 63a3bf8887..fdb6f9d7ce 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -1190,15 +1190,16 @@ PM4PacketProcessor::unserialize(CheckpointIn &cp) memset(pkt, 0, sizeof(PM4MapQueues)); newQueue(mqd, offset[i], pkt, id[i]); - queues[id[i]]->ib(false); - queues[id[i]]->rptr(rptr[i]); - queues[id[i]]->wptr(wptr[i]); - queues[id[i]]->ib(true); - queues[id[i]]->wptr(ib_wptr[i]); - queues[id[i]]->rptr(ib_rptr[i]); + if (ib[i]) { + queues[id[i]]->wptr(ib_wptr[i]); + queues[id[i]]->rptr(ib_rptr[i]); + } else { + queues[id[i]]->rptr(rptr[i]); + queues[id[i]]->wptr(wptr[i]); + } + queues[id[i]]->ib(ib[i]); queues[id[i]]->offset(offset[i]); queues[id[i]]->processing(processing[i]); - queues[id[i]]->ib(ib[i]); queues[id[i]]->setPkt(me[i], pipe[i], queue[i], privileged[i]); 
queues[id[i]]->getMQD()->hqd_active = hqd_active[i]; queues[id[i]]->getMQD()->hqd_vmid = hqd_vmid[i]; From d3637a489d6ddcc8ca5d99f20b53a1ea64bbc422 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Tue, 3 Oct 2023 12:10:42 -0500 Subject: [PATCH 7/8] configs: Add option to disable AVX in GPUFS GPUFS+KVM simulations automatically enable AVX. This commit adds a command line option to disable AVX if it's not needed for a GPUFS simulation. Change-Id: Ic22592767dbdca86f3718eca9c837a8e29b6b781 --- configs/example/gpufs/runfs.py | 10 ++++++++++ configs/example/gpufs/system/system.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py index 0f090e2f89..5d29959eff 100644 --- a/configs/example/gpufs/runfs.py +++ b/configs/example/gpufs/runfs.py @@ -158,6 +158,16 @@ def addRunFSOptions(parser): help="Root partition of disk image", ) + parser.add_argument( + "--disable-avx", + action="store_true", + default=False, + help="Disables AVX. AVX is used in some ROCm libraries but " + "does not have checkpointing support yet. If simulation either " + "creates a checkpoint or restores from one, then AVX needs to " + "be disabled for correct functionality ", + ) + def runGpuFSSystem(args): """ diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index 7ddc4f0752..7cb0ce1aa5 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -234,7 +234,7 @@ def makeGpuFSSystem(args): # If we are using KVM cpu, enable AVX. AVX is used in some ROCm libraries # such as rocBLAS which is used in higher level libraries like PyTorch. use_avx = False - if ObjectList.is_kvm_cpu(TestCPUClass): + if ObjectList.is_kvm_cpu(TestCPUClass) and not args.disable_avx: # AVX also requires CR4.osxsave to be 1. These must be set together # or KVM will error out. 
system.workload.enable_osxsave = 1 From a19667427a96c560b243ffbadbba3b7f4f6db2b4 Mon Sep 17 00:00:00 2001 From: Vishnu Ramadas Date: Thu, 5 Oct 2023 18:59:54 -0500 Subject: [PATCH 8/8] mem-ruby: Add BUILD_GPU guard to ruby cooldown and warmup phases Ruby was recently updated to support flushes and warmup for GPUs. Since this support uses the GPUCoalescer, non-GPU builds face a compile time issue. This is because GPU code is not built for non-GPU builds. This commit adds "#if BUILD_GPU" guards around the GPU-related code in common files like AbstractController.hh, CacheRecorder.*, RubySystem.cc, GPUCoalescer.hh, and VIPERCoalescer.hh. This support allows GPU builds to use flushing while non-GPU builds compile without problems Change-Id: If8ee4ff881fe154553289e8c00881ee1b6e3f113 --- .../slicc_interface/AbstractController.hh | 2 ++ src/mem/ruby/system/CacheRecorder.cc | 27 +++++++++++++++++++ src/mem/ruby/system/CacheRecorder.hh | 12 +++++++++ src/mem/ruby/system/GPUCoalescer.hh | 5 ++++ src/mem/ruby/system/RubySystem.cc | 16 ++++++++++- src/mem/ruby/system/VIPERCoalescer.hh | 5 ++++ 6 files changed, 66 insertions(+), 1 deletion(-) diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh index 72b679d6cf..7d93644bd8 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.hh +++ b/src/mem/ruby/slicc_interface/AbstractController.hh @@ -70,7 +70,9 @@ namespace ruby { class Network; +#if BUILD_GPU class GPUCoalescer; +#endif class DMASequencer; // used to communicate that an in_port peeked the wrong message type diff --git a/src/mem/ruby/system/CacheRecorder.cc b/src/mem/ruby/system/CacheRecorder.cc index ec552c07c5..057b6aa041 100644 --- a/src/mem/ruby/system/CacheRecorder.cc +++ b/src/mem/ruby/system/CacheRecorder.cc @@ -57,6 +57,7 @@ CacheRecorder::CacheRecorder() { } +#if BUILD_GPU CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace, uint64_t uncompressed_trace_size, std::vector& seq_map, @@ 
-67,6 +68,18 @@ CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace, m_seq_map(seq_map), m_coalescer_map(coal_map), m_bytes_read(0), m_records_read(0), m_records_flushed(0), m_block_size_bytes(block_size_bytes) +#else +CacheRecorder::CacheRecorder(uint8_t* uncompressed_trace, + uint64_t uncompressed_trace_size, + std::vector& seq_map, + uint64_t block_size_bytes) + : m_uncompressed_trace(uncompressed_trace), + m_uncompressed_trace_size(uncompressed_trace_size), + m_seq_map(seq_map), m_bytes_read(0), + m_records_read(0), m_records_flushed(0), + m_block_size_bytes(block_size_bytes) + +#endif { if (m_uncompressed_trace != NULL) { if (m_block_size_bytes < RubySystem::getBlockSizeBytes()) { @@ -86,7 +99,9 @@ CacheRecorder::~CacheRecorder() m_uncompressed_trace = NULL; } m_seq_map.clear(); +#if BUILD_GPU m_coalescer_map.clear(); +#endif } void @@ -102,14 +117,20 @@ CacheRecorder::enqueueNextFlushRequest() Packet *pkt = new Packet(req, requestType); Sequencer* m_sequencer_ptr = m_seq_map[rec->m_cntrl_id]; +#if BUILD_GPU GPUCoalescer* m_coal_ptr = m_coalescer_map[rec->m_cntrl_id]; +#endif assert(m_sequencer_ptr != NULL); +#if BUILD_GPU if (m_coal_ptr == NULL) m_sequencer_ptr->makeRequest(pkt); else { pkt->req->setReqInstSeqNum(m_records_flushed - 1); m_coal_ptr->makeRequest(pkt); } +#else + m_sequencer_ptr->makeRequest(pkt); +#endif DPRINTF(RubyCacheTrace, "Flushing %s\n", *rec); @@ -159,15 +180,21 @@ CacheRecorder::enqueueNextFetchRequest() pkt->dataStatic(traceRecord->m_data + rec_bytes_read); Sequencer* m_sequencer_ptr = m_seq_map[traceRecord->m_cntrl_id]; +#if BUILD_GPU GPUCoalescer* m_coal_ptr; m_coal_ptr = m_coalescer_map[traceRecord->m_cntrl_id]; +#endif assert(m_sequencer_ptr != NULL); +#if BUILD_GPU if (m_coal_ptr == NULL) m_sequencer_ptr->makeRequest(pkt); else { pkt->req->setReqInstSeqNum(m_records_read); m_coal_ptr->makeRequest(pkt); } +#else + m_sequencer_ptr->makeRequest(pkt); +#endif } m_bytes_read += (sizeof(TraceRecord) + m_block_size_bytes); diff 
--git a/src/mem/ruby/system/CacheRecorder.hh b/src/mem/ruby/system/CacheRecorder.hh index 9363e2fde7..e94dfad97a 100644 --- a/src/mem/ruby/system/CacheRecorder.hh +++ b/src/mem/ruby/system/CacheRecorder.hh @@ -38,6 +38,7 @@ #include #include "base/types.hh" +#include "config/build_gpu.hh" #include "mem/ruby/common/Address.hh" #include "mem/ruby/common/DataBlock.hh" #include "mem/ruby/common/TypeDefines.hh" @@ -50,7 +51,9 @@ namespace ruby { class Sequencer; +#if BUILD_GPU class GPUCoalescer; +#endif /*! * Class for recording cache contents. Note that the last element of the @@ -77,11 +80,18 @@ class CacheRecorder CacheRecorder(); ~CacheRecorder(); +#if BUILD_GPU CacheRecorder(uint8_t* uncompressed_trace, uint64_t uncompressed_trace_size, std::vector& SequencerMap, std::vector& CoalescerMap, uint64_t block_size_bytes); +#else + CacheRecorder(uint8_t* uncompressed_trace, + uint64_t uncompressed_trace_size, + std::vector& SequencerMap, + uint64_t block_size_bytes); +#endif void addRecord(int cntrl, Addr data_addr, Addr pc_addr, RubyRequestType type, Tick time, DataBlock& data); @@ -117,7 +127,9 @@ class CacheRecorder uint8_t* m_uncompressed_trace; uint64_t m_uncompressed_trace_size; std::vector m_seq_map; +#if BUILD_GPU std::vector m_coalescer_map; +#endif uint64_t m_bytes_read; uint64_t m_records_read; uint64_t m_records_flushed; diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh index d6db5c00ba..3f936b4b41 100644 --- a/src/mem/ruby/system/GPUCoalescer.hh +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -32,6 +32,10 @@ #ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ #define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ +#include "config/build_gpu.hh" + +#if BUILD_GPU + #include #include @@ -546,4 +550,5 @@ operator<<(std::ostream& out, const GPUCoalescer& obj) } // namespace ruby } // namespace gem5 +#endif // BUILD_GPU #endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ diff --git a/src/mem/ruby/system/RubySystem.cc 
b/src/mem/ruby/system/RubySystem.cc index 232e337752..32dec7b9e0 100644 --- a/src/mem/ruby/system/RubySystem.cc +++ b/src/mem/ruby/system/RubySystem.cc @@ -178,21 +178,27 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, uint64_t block_size_bytes) { std::vector sequencer_map; +#if BUILD_GPU std::vector coalescer_map; - Sequencer* sequencer_ptr = NULL; GPUCoalescer* coalescer_ptr = NULL; +#endif + Sequencer* sequencer_ptr = NULL; for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) { sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer()); +#if BUILD_GPU coalescer_map.push_back(m_abs_cntrl_vec[cntrl]->getGPUCoalescer()); +#endif if (sequencer_ptr == NULL) { sequencer_ptr = sequencer_map[cntrl]; } +#if BUILD_GPU if (coalescer_ptr == NULL) { coalescer_ptr = coalescer_map[cntrl]; } +#endif } @@ -203,9 +209,11 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, sequencer_map[cntrl] = sequencer_ptr; } +#if BUILD_GPU if (coalescer_map[cntrl] == NULL) { coalescer_map[cntrl] = coalescer_ptr; } +#endif } @@ -215,9 +223,15 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, } // Create the CacheRecorder and record the cache trace +#if BUILD_GPU m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size, sequencer_map, coalescer_map, block_size_bytes); +#else + m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size, + sequencer_map, + block_size_bytes); +#endif } void diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh index c7e21e946b..d185620244 100644 --- a/src/mem/ruby/system/VIPERCoalescer.hh +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -32,6 +32,10 @@ #ifndef __MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__ #define __MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__ +#include "config/build_gpu.hh" + +#if BUILD_GPU + #include #include "mem/ruby/common/Address.hh" @@ -92,4 +96,5 @@ class VIPERCoalescer : public GPUCoalescer } // namespace ruby } // namespace gem5 +#endif // 
BUILD_GPU #endif //__MEM_RUBY_SYSTEM_VIPERCOALESCER_HH__