diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index d15c4328bd..c39dec8430 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -862,33 +862,6 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) delete pkt->senderState; delete pkt; - return true; - } else if (pkt->cmd == MemCmd::WriteCompleteResp) { - // this is for writeComplete callback - // we simply get decrement write-related wait counters - assert(gpuDynInst); - M5_VAR_USED Wavefront *w = - computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId]; - assert(w); - DPRINTF(GPUExec, "WriteCompleteResp: WF[%d][%d] WV%d %s decrementing " - "outstanding reqs %d => %d\n", gpuDynInst->simdId, - gpuDynInst->wfSlotId, gpuDynInst->wfDynId, - gpuDynInst->disassemble(), w->outstandingReqs, - w->outstandingReqs - 1); - if (gpuDynInst->allLanesZero()) { - // ask gm pipe to decrement request counters, instead of directly - // performing here, to avoid asynchronous counter update and - // instruction retirement (which may hurt waincnt effects) - computeUnit->globalMemoryPipe.handleResponse(gpuDynInst); - - DPRINTF(GPUMem, "CU%d: WF[%d][%d]: write totally complete\n", - computeUnit->cu_id, gpuDynInst->simdId, - gpuDynInst->wfSlotId); - } - - delete pkt->senderState; - delete pkt; - return true; } @@ -1319,10 +1292,16 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt) Addr paddr = pkt->req->getPaddr(); - // mem sync resp and write-complete callback must be handled already in + // mem sync resp callback must be handled already in // DataPort::recvTimingResp assert(pkt->cmd != MemCmd::MemSyncResp); - assert(pkt->cmd != MemCmd::WriteCompleteResp); + + // The status vector and global memory response for WriteResp packets get + // handled by the WriteCompleteResp packets. + if (pkt->cmd == MemCmd::WriteResp) { + delete pkt; + return; + } // this is for read, write and atomic int index = gpuDynInst->memStatusVector[paddr].back(); @@ -1356,17 +1335,13 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt) gpuDynInst->memStatusVector.clear(); - // note: only handle read response here; for write, the response - // is separately handled when writeComplete callback is received - if (pkt->isRead()) { - gpuDynInst-> - profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue); - compute_unit->globalMemoryPipe.handleResponse(gpuDynInst); + gpuDynInst-> + profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue); + compute_unit->globalMemoryPipe.handleResponse(gpuDynInst); - DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n", - compute_unit->cu_id, gpuDynInst->simdId, - gpuDynInst->wfSlotId); - } + DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n", + compute_unit->cu_id, gpuDynInst->simdId, + gpuDynInst->wfSlotId); } else { if (pkt->isRead()) { if (!compute_unit->headTailMap.count(gpuDynInst)) { diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc index d9df1d8931..3f73568396 100644 --- a/src/mem/ruby/system/GPUCoalescer.cc +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -682,10 +682,11 @@ GPUCoalescer::coalescePacket(PacketPtr pkt) // create a new coalecsed request and issue it immediately. auto reqList = std::deque { creq }; coalescedTable.insert(std::make_pair(line_addr, reqList)); - - DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n", - RubyRequestType_to_string(creq->getRubyType()), seqNum); - issueRequest(creq); + if (!coalescedReqs.count(seqNum)) { + coalescedReqs.insert(std::make_pair(seqNum, reqList)); + } else { + coalescedReqs.at(seqNum).push_back(creq); + } } else { // The request is for a line address that is already outstanding // but for a different instruction. Add it as a new request to be @@ -773,6 +774,17 @@ GPUCoalescer::completeIssue() [&](PacketPtr pkt) { return coalescePacket(pkt); } ); + if (coalescedReqs.count(seq_num)) { + auto& creqs = coalescedReqs.at(seq_num); + for (auto creq : creqs) { + DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n", + RubyRequestType_to_string(creq->getRubyType()), + seq_num); + issueRequest(creq); + } + coalescedReqs.erase(seq_num); + } + assert(pkt_list_size >= pkt_list->size()); size_t pkt_list_diff = pkt_list_size - pkt_list->size(); diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh index 086cc6da3b..709b491a83 100644 --- a/src/mem/ruby/system/GPUCoalescer.hh +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -430,6 +430,10 @@ class GPUCoalescer : public RubyPort // (typically the number of blocks in TCP). If there are duplicates of // an address, the are serviced in age order. std::map> coalescedTable; + // Map of instruction sequence number to coalesced requests that get + // created in coalescePacket, used in completeIssue to send the fully + // coalesced request + std::unordered_map> coalescedReqs; // a map btw an instruction sequence number and PendingWriteInst // this is used to do a final call back for each write when it is diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc index 9a6434ab50..b47aaefca4 100644 --- a/src/mem/ruby/system/RubyPort.cc +++ b/src/mem/ruby/system/RubyPort.cc @@ -546,7 +546,8 @@ RubyPort::MemResponsePort::hitCallback(PacketPtr pkt) } // Flush, acquire, release requests don't access physical memory - if (pkt->isFlush() || pkt->cmd == MemCmd::MemSyncReq) { + if (pkt->isFlush() || pkt->cmd == MemCmd::MemSyncReq + || pkt->cmd == MemCmd::WriteCompleteResp) { accessPhysMem = false; } diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc index 6589a7d764..f0873a42a4 100644 --- a/src/mem/ruby/system/VIPERCoalescer.cc +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -238,19 +238,28 @@ VIPERCoalescer::writeCompleteCallback(Addr addr, uint64_t instSeqNum) assert(m_writeCompletePktMap.count(key) == 1 && !m_writeCompletePktMap[key].empty()); - for (auto writeCompletePkt : m_writeCompletePktMap[key]) { - if (makeLineAddress(writeCompletePkt->getAddr()) == addr) { - RubyPort::SenderState *ss = - safe_cast - (writeCompletePkt->senderState); - MemResponsePort *port = ss->port; - assert(port != NULL); + m_writeCompletePktMap[key].erase( + std::remove_if( + m_writeCompletePktMap[key].begin(), + m_writeCompletePktMap[key].end(), + [addr](PacketPtr writeCompletePkt) -> bool { + if (makeLineAddress(writeCompletePkt->getAddr()) == addr) { + RubyPort::SenderState *ss = + safe_cast + (writeCompletePkt->senderState); + MemResponsePort *port = ss->port; + assert(port != NULL); - writeCompletePkt->senderState = ss->predecessor; - delete ss; - port->hitCallback(writeCompletePkt); - } - } + writeCompletePkt->senderState = ss->predecessor; + delete ss; + port->hitCallback(writeCompletePkt); + return true; + } + return false; + } + ), + m_writeCompletePktMap[key].end() + ); trySendRetries();