diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index d15c4328bd..c39dec8430 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -862,33 +862,6 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
 
         delete pkt->senderState;
         delete pkt;
-        return true;
-    } else if (pkt->cmd == MemCmd::WriteCompleteResp) {
-        // this is for writeComplete callback
-        // we simply get decrement write-related wait counters
-        assert(gpuDynInst);
-        M5_VAR_USED Wavefront *w =
-            computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
-        assert(w);
-        DPRINTF(GPUExec, "WriteCompleteResp: WF[%d][%d] WV%d %s decrementing "
-                        "outstanding reqs %d => %d\n", gpuDynInst->simdId,
-                        gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
-                        gpuDynInst->disassemble(), w->outstandingReqs,
-                        w->outstandingReqs - 1);
-        if (gpuDynInst->allLanesZero()) {
-            // ask gm pipe to decrement request counters, instead of directly
-            // performing here, to avoid asynchronous counter update and
-            // instruction retirement (which may hurt waincnt effects)
-            computeUnit->globalMemoryPipe.handleResponse(gpuDynInst);
-
-            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: write totally complete\n",
-                            computeUnit->cu_id, gpuDynInst->simdId,
-                            gpuDynInst->wfSlotId);
-        }
-
-        delete pkt->senderState;
-        delete pkt;
-
         return true;
     }
 
@@ -1319,10 +1292,16 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
 
     Addr paddr = pkt->req->getPaddr();
 
-    // mem sync resp and write-complete callback must be handled already in
+    // mem sync resp callback must be handled already in
     // DataPort::recvTimingResp
     assert(pkt->cmd != MemCmd::MemSyncResp);
-    assert(pkt->cmd != MemCmd::WriteCompleteResp);
+
+    // The status vector and global memory response for WriteResp packets get
+    // handled by the WriteCompleteResp packets.
+    if (pkt->cmd == MemCmd::WriteResp) {
+        delete pkt;
+        return;
+    }
 
     // this is for read, write and atomic
     int index = gpuDynInst->memStatusVector[paddr].back();
@@ -1356,17 +1335,13 @@ ComputeUnit::DataPort::processMemRespEvent(PacketPtr pkt)
 
         gpuDynInst->memStatusVector.clear();
 
-        // note: only handle read response here; for write, the response
-        // is separately handled when writeComplete callback is received
-        if (pkt->isRead()) {
-            gpuDynInst->
-                profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
-            compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
+        gpuDynInst->
+            profileRoundTripTime(curTick(), InstMemoryHop::GMEnqueue);
+        compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
 
-            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
-                    compute_unit->cu_id, gpuDynInst->simdId,
-                    gpuDynInst->wfSlotId);
-        }
+        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
+                compute_unit->cu_id, gpuDynInst->simdId,
+                gpuDynInst->wfSlotId);
     } else {
         if (pkt->isRead()) {
             if (!compute_unit->headTailMap.count(gpuDynInst)) {
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
index d9df1d8931..3f73568396 100644
--- a/src/mem/ruby/system/GPUCoalescer.cc
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -682,10 +682,11 @@ GPUCoalescer::coalescePacket(PacketPtr pkt)
-            // create a new coalecsed request and issue it immediately.
+            // create a new coalesced request; completeIssue issues it later.
             auto reqList = std::deque<CoalescedRequest*> { creq };
             coalescedTable.insert(std::make_pair(line_addr, reqList));
-
-            DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
-                    RubyRequestType_to_string(creq->getRubyType()), seqNum);
-            issueRequest(creq);
+            if (!coalescedReqs.count(seqNum)) {
+                coalescedReqs.insert(std::make_pair(seqNum, reqList));
+            } else {
+                coalescedReqs.at(seqNum).push_back(creq);
+            }
         } else {
             // The request is for a line address that is already outstanding
             // but for a different instruction. Add it as a new request to be
@@ -773,6 +774,17 @@ GPUCoalescer::completeIssue()
                 [&](PacketPtr pkt) { return coalescePacket(pkt); }
             );
 
+            if (coalescedReqs.count(seq_num)) {
+                auto& creqs = coalescedReqs.at(seq_num);
+                for (auto creq : creqs) {
+                    DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
+                            RubyRequestType_to_string(creq->getRubyType()),
+                                                      seq_num);
+                    issueRequest(creq);
+                }
+                coalescedReqs.erase(seq_num);
+            }
+
             assert(pkt_list_size >= pkt_list->size());
             size_t pkt_list_diff = pkt_list_size - pkt_list->size();
 
diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh
index 086cc6da3b..709b491a83 100644
--- a/src/mem/ruby/system/GPUCoalescer.hh
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -430,6 +430,10 @@ class GPUCoalescer : public RubyPort
     // (typically the number of blocks in TCP). If there are duplicates of
     // an address, the are serviced in age order.
     std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
+    // Maps an instruction sequence number to the coalesced requests created
+    // for it in coalescePacket. completeIssue issues every request in the
+    // entry after its coalescing pass and then erases the entry.
+    std::unordered_map<uint64_t, std::deque<CoalescedRequest*>> coalescedReqs;
 
     // a map btw an instruction sequence number and PendingWriteInst
     // this is used to do a final call back for each write when it is
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index 9a6434ab50..b47aaefca4 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -546,7 +546,8 @@ RubyPort::MemResponsePort::hitCallback(PacketPtr pkt)
     }
 
     // Flush, acquire, release requests don't access physical memory
-    if (pkt->isFlush() || pkt->cmd == MemCmd::MemSyncReq) {
+    if (pkt->isFlush() || pkt->cmd == MemCmd::MemSyncReq
+        || pkt->cmd == MemCmd::WriteCompleteResp) {
         accessPhysMem = false;
     }
 
diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc
index 6589a7d764..f0873a42a4 100644
--- a/src/mem/ruby/system/VIPERCoalescer.cc
+++ b/src/mem/ruby/system/VIPERCoalescer.cc
@@ -238,19 +238,28 @@ VIPERCoalescer::writeCompleteCallback(Addr addr, uint64_t instSeqNum)
     assert(m_writeCompletePktMap.count(key) == 1 &&
            !m_writeCompletePktMap[key].empty());
 
-    for (auto writeCompletePkt : m_writeCompletePktMap[key]) {
-        if (makeLineAddress(writeCompletePkt->getAddr()) == addr) {
-            RubyPort::SenderState *ss =
-                safe_cast<RubyPort::SenderState *>
-                    (writeCompletePkt->senderState);
-            MemResponsePort *port = ss->port;
-            assert(port != NULL);
+    m_writeCompletePktMap[key].erase(
+        std::remove_if(
+            m_writeCompletePktMap[key].begin(),
+            m_writeCompletePktMap[key].end(),
+            [addr](PacketPtr writeCompletePkt) -> bool {
+                if (makeLineAddress(writeCompletePkt->getAddr()) == addr) {
+                    RubyPort::SenderState *ss =
+                        safe_cast<RubyPort::SenderState *>
+                            (writeCompletePkt->senderState);
+                    MemResponsePort *port = ss->port;
+                    assert(port != NULL);
 
-            writeCompletePkt->senderState = ss->predecessor;
-            delete ss;
-            port->hitCallback(writeCompletePkt);
-        }
-    }
+                    writeCompletePkt->senderState = ss->predecessor;
+                    delete ss;
+                    port->hitCallback(writeCompletePkt);
+                    return true;
+                }
+                return false;
+            }
+        ),
+        m_writeCompletePktMap[key].end()
+    );
 
     trySendRetries();