Revert "mem-cache: Remove writebacks packet list"
This reverts commit bf0a722acd.
Reason for revert: This patch introduces a bug:
The problem here is that the insertion of block A may cause the
eviction of block B, which on the lower level may cause the
eviction of block A. Since A is not marked as present yet, A is
"safely" removed from the snoop filter.
However, by reverting it, using a Tags sub-class that can
generate multiple evictions at once becomes broken in Atomic
mode; this shall be fixed in a future patch.
Change-Id: I5b27e54b54ae5b50255588835c1a2ebf3015f002
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/19088
Reviewed-by: Nikos Nikoleris <nikos.nikoleris@arm.com>
Maintainer: Nikos Nikoleris <nikos.nikoleris@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
86
src/mem/cache/base.cc
vendored
86
src/mem/cache/base.cc
vendored
@@ -342,11 +342,20 @@ BaseCache::recvTimingReq(PacketPtr pkt)
|
||||
// the delay provided by the crossbar
|
||||
Tick forward_time = clockEdge(forwardLatency) + pkt->headerDelay;
|
||||
|
||||
// Note that lat is passed by reference here. The function
|
||||
// access() will set the lat value.
|
||||
Cycles lat;
|
||||
CacheBlk *blk = nullptr;
|
||||
bool satisfied = access(pkt, blk, lat);
|
||||
bool satisfied = false;
|
||||
{
|
||||
PacketList writebacks;
|
||||
// Note that lat is passed by reference here. The function
|
||||
// access() will set the lat value.
|
||||
satisfied = access(pkt, blk, lat, writebacks);
|
||||
|
||||
// After the evicted blocks are selected, they must be forwarded
|
||||
// to the write buffer to ensure they logically precede anything
|
||||
// happening below
|
||||
doWritebacks(writebacks, clockEdge(lat + forwardLatency));
|
||||
}
|
||||
|
||||
// Here we charge the headerDelay that takes into account the latencies
|
||||
// of the bus, if the packet comes from it.
|
||||
@@ -448,6 +457,8 @@ BaseCache::recvTimingResp(PacketPtr pkt)
|
||||
miss_latency;
|
||||
}
|
||||
|
||||
PacketList writebacks;
|
||||
|
||||
bool is_fill = !mshr->isForward &&
|
||||
(pkt->isRead() || pkt->cmd == MemCmd::UpgradeResp ||
|
||||
mshr->wasWholeLineWrite);
|
||||
@@ -464,7 +475,7 @@ BaseCache::recvTimingResp(PacketPtr pkt)
|
||||
|
||||
const bool allocate = (writeAllocator && mshr->wasWholeLineWrite) ?
|
||||
writeAllocator->allocate() : mshr->allocOnFill();
|
||||
blk = handleFill(pkt, blk, allocate);
|
||||
blk = handleFill(pkt, blk, writebacks, allocate);
|
||||
assert(blk != nullptr);
|
||||
ppFill->notify(pkt);
|
||||
}
|
||||
@@ -520,9 +531,13 @@ BaseCache::recvTimingResp(PacketPtr pkt)
|
||||
|
||||
// if we used temp block, check to see if its valid and then clear it out
|
||||
if (blk == tempBlock && tempBlock->isValid()) {
|
||||
evictBlock(blk, clockEdge(forwardLatency) + pkt->headerDelay);
|
||||
evictBlock(blk, writebacks);
|
||||
}
|
||||
|
||||
const Tick forward_time = clockEdge(forwardLatency) + pkt->headerDelay;
|
||||
// copy writebacks to write buffer
|
||||
doWritebacks(writebacks, forward_time);
|
||||
|
||||
DPRINTF(CacheVerbose, "%s: Leaving with %s\n", __func__, pkt->print());
|
||||
delete pkt;
|
||||
}
|
||||
@@ -540,7 +555,8 @@ BaseCache::recvAtomic(PacketPtr pkt)
|
||||
Cycles lat = lookupLatency;
|
||||
|
||||
CacheBlk *blk = nullptr;
|
||||
bool satisfied = access(pkt, blk, lat);
|
||||
PacketList writebacks;
|
||||
bool satisfied = access(pkt, blk, lat, writebacks);
|
||||
|
||||
if (pkt->isClean() && blk && blk->isDirty()) {
|
||||
// A cache clean opearation is looking for a dirty
|
||||
@@ -550,12 +566,17 @@ BaseCache::recvAtomic(PacketPtr pkt)
|
||||
DPRINTF(CacheVerbose, "%s: packet %s found block: %s\n",
|
||||
__func__, pkt->print(), blk->print());
|
||||
PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest(), pkt->id);
|
||||
writebacks.push_back(wb_pkt);
|
||||
pkt->setSatisfied();
|
||||
doWritebacksAtomic(wb_pkt);
|
||||
}
|
||||
|
||||
// handle writebacks resulting from the access here to ensure they
|
||||
// logically precede anything happening below
|
||||
doWritebacksAtomic(writebacks);
|
||||
assert(writebacks.empty());
|
||||
|
||||
if (!satisfied) {
|
||||
lat += handleAtomicReqMiss(pkt, blk);
|
||||
lat += handleAtomicReqMiss(pkt, blk, writebacks);
|
||||
}
|
||||
|
||||
// Note that we don't invoke the prefetcher at all in atomic mode.
|
||||
@@ -569,6 +590,9 @@ BaseCache::recvAtomic(PacketPtr pkt)
|
||||
// immediately rather than calling requestMemSideBus() as we do
|
||||
// there).
|
||||
|
||||
// do any writebacks resulting from the response handling
|
||||
doWritebacksAtomic(writebacks);
|
||||
|
||||
// if we used temp block, check to see if its valid and if so
|
||||
// clear it out, but only do so after the call to recvAtomic is
|
||||
// finished so that any downstream observers (such as a snoop
|
||||
@@ -776,7 +800,7 @@ BaseCache::getNextQueueEntry()
|
||||
|
||||
bool
|
||||
BaseCache::updateCompressionData(CacheBlk *blk, const uint64_t* data,
|
||||
uint32_t delay, Cycles tag_latency)
|
||||
PacketList &writebacks)
|
||||
{
|
||||
// tempBlock does not exist in the tags, so don't do anything for it.
|
||||
if (blk == tempBlock) {
|
||||
@@ -866,8 +890,7 @@ BaseCache::updateCompressionData(CacheBlk *blk, const uint64_t* data,
|
||||
if (evict_blk->wasPrefetched()) {
|
||||
unusedPrefetches++;
|
||||
}
|
||||
Cycles lat = calculateAccessLatency(evict_blk, delay, tag_latency);
|
||||
evictBlock(evict_blk, clockEdge(lat + forwardLatency));
|
||||
evictBlock(evict_blk, writebacks);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1001,7 +1024,8 @@ BaseCache::calculateAccessLatency(const CacheBlk* blk, const uint32_t delay,
|
||||
}
|
||||
|
||||
bool
|
||||
BaseCache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat)
|
||||
BaseCache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat,
|
||||
PacketList &writebacks)
|
||||
{
|
||||
// sanity check
|
||||
assert(pkt->isRequest());
|
||||
@@ -1100,7 +1124,7 @@ BaseCache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat)
|
||||
|
||||
if (!blk) {
|
||||
// need to do a replacement
|
||||
blk = allocateBlock(pkt, tag_latency);
|
||||
blk = allocateBlock(pkt, writebacks);
|
||||
if (!blk) {
|
||||
// no replaceable block available: give up, fwd to next level.
|
||||
incMissCount(pkt);
|
||||
@@ -1119,7 +1143,7 @@ BaseCache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat)
|
||||
// a smaller size, and now it doesn't fit the entry anymore).
|
||||
// If that is the case we might need to evict blocks.
|
||||
if (!updateCompressionData(blk, pkt->getConstPtr<uint64_t>(),
|
||||
pkt->headerDelay, tag_latency)) {
|
||||
writebacks)) {
|
||||
// This is a failed data expansion (write), which happened
|
||||
// after finding the replacement entries and accessing the
|
||||
// block's data. There were no replaceable entries available
|
||||
@@ -1195,7 +1219,7 @@ BaseCache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat)
|
||||
return false;
|
||||
} else {
|
||||
// a writeback that misses needs to allocate a new block
|
||||
blk = allocateBlock(pkt, tag_latency);
|
||||
blk = allocateBlock(pkt, writebacks);
|
||||
if (!blk) {
|
||||
// no replaceable block available: give up, fwd to
|
||||
// next level.
|
||||
@@ -1218,7 +1242,7 @@ BaseCache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat)
|
||||
// a smaller size, and now it doesn't fit the entry anymore).
|
||||
// If that is the case we might need to evict blocks.
|
||||
if (!updateCompressionData(blk, pkt->getConstPtr<uint64_t>(),
|
||||
pkt->headerDelay, tag_latency)) {
|
||||
writebacks)) {
|
||||
// This is a failed data expansion (write), which happened
|
||||
// after finding the replacement entries and accessing the
|
||||
// block's data. There were no replaceable entries available
|
||||
@@ -1311,7 +1335,8 @@ BaseCache::maintainClusivity(bool from_cache, CacheBlk *blk)
|
||||
}
|
||||
|
||||
CacheBlk*
|
||||
BaseCache::handleFill(PacketPtr pkt, CacheBlk *blk, bool allocate)
|
||||
BaseCache::handleFill(PacketPtr pkt, CacheBlk *blk, PacketList &writebacks,
|
||||
bool allocate)
|
||||
{
|
||||
assert(pkt->isResponse());
|
||||
Addr addr = pkt->getAddr();
|
||||
@@ -1328,12 +1353,9 @@ BaseCache::handleFill(PacketPtr pkt, CacheBlk *blk, bool allocate)
|
||||
// better have read new data...
|
||||
assert(pkt->hasData() || pkt->cmd == MemCmd::InvalidateResp);
|
||||
|
||||
// Need to do a replacement if allocating, otherwise we stick
|
||||
// with the temporary storage. The tag lookup has already been
|
||||
// done to decide the eviction victims, so it is set to 0 here.
|
||||
// The eviction itself, however, is delayed until the new data
|
||||
// for the block that is requesting the replacement arrives.
|
||||
blk = allocate ? allocateBlock(pkt, Cycles(0)) : nullptr;
|
||||
// need to do a replacement if allocating, otherwise we stick
|
||||
// with the temporary storage
|
||||
blk = allocate ? allocateBlock(pkt, writebacks) : nullptr;
|
||||
|
||||
if (!blk) {
|
||||
// No replaceable block or a mostly exclusive
|
||||
@@ -1434,7 +1456,7 @@ BaseCache::handleFill(PacketPtr pkt, CacheBlk *blk, bool allocate)
|
||||
}
|
||||
|
||||
CacheBlk*
|
||||
BaseCache::allocateBlock(const PacketPtr pkt, Cycles tag_latency)
|
||||
BaseCache::allocateBlock(const PacketPtr pkt, PacketList &writebacks)
|
||||
{
|
||||
// Get address
|
||||
const Addr addr = pkt->getAddr();
|
||||
@@ -1507,9 +1529,7 @@ BaseCache::allocateBlock(const PacketPtr pkt, Cycles tag_latency)
|
||||
unusedPrefetches++;
|
||||
}
|
||||
|
||||
Cycles lat =
|
||||
calculateAccessLatency(blk, pkt->headerDelay, tag_latency);
|
||||
evictBlock(blk, clockEdge(lat + forwardLatency));
|
||||
evictBlock(blk, writebacks);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1542,15 +1562,11 @@ BaseCache::invalidateBlock(CacheBlk *blk)
|
||||
}
|
||||
|
||||
void
|
||||
BaseCache::evictBlock(CacheBlk *blk, Tick forward_timing)
|
||||
BaseCache::evictBlock(CacheBlk *blk, PacketList &writebacks)
|
||||
{
|
||||
PacketPtr pkt = evictBlock(blk);
|
||||
if (pkt) {
|
||||
if (system->isTimingMode()) {
|
||||
doWritebacks(pkt, forward_timing);
|
||||
} else {
|
||||
doWritebacksAtomic(pkt);
|
||||
}
|
||||
writebacks.push_back(pkt);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1819,7 +1835,9 @@ BaseCache::sendMSHRQueuePacket(MSHR* mshr)
|
||||
__func__, pkt->print(), blk->print());
|
||||
PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest(),
|
||||
pkt->id);
|
||||
doWritebacks(wb_pkt, 0);
|
||||
PacketList writebacks;
|
||||
writebacks.push_back(wb_pkt);
|
||||
doWritebacks(writebacks, 0);
|
||||
}
|
||||
|
||||
return false;
|
||||
|
||||
57
src/mem/cache/base.hh
vendored
57
src/mem/cache/base.hh
vendored
@@ -454,9 +454,11 @@ class BaseCache : public ClockedObject
|
||||
* @param pkt The memory request to perform.
|
||||
* @param blk The cache block to be updated.
|
||||
* @param lat The latency of the access.
|
||||
* @param writebacks List for any writebacks that need to be performed.
|
||||
* @return Boolean indicating whether the request was satisfied.
|
||||
*/
|
||||
virtual bool access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat);
|
||||
virtual bool access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat,
|
||||
PacketList &writebacks);
|
||||
|
||||
/*
|
||||
* Handle a timing request that hit in the cache
|
||||
@@ -549,9 +551,11 @@ class BaseCache : public ClockedObject
|
||||
*
|
||||
* @param pkt The packet with the requests
|
||||
* @param blk The referenced block
|
||||
* @param writebacks A list with packets for any performed writebacks
|
||||
* @return Cycles for handling the request
|
||||
*/
|
||||
virtual Cycles handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk) = 0;
|
||||
virtual Cycles handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk,
|
||||
PacketList &writebacks) = 0;
|
||||
|
||||
/**
|
||||
* Performs the access specified by the request.
|
||||
@@ -591,18 +595,13 @@ class BaseCache : public ClockedObject
|
||||
|
||||
/**
|
||||
* Insert writebacks into the write buffer
|
||||
*
|
||||
* @param pkt The writeback packet.
|
||||
* @param forward_time Tick to which the writeback should be scheduled.
|
||||
*/
|
||||
virtual void doWritebacks(PacketPtr pkt, Tick forward_time) = 0;
|
||||
virtual void doWritebacks(PacketList& writebacks, Tick forward_time) = 0;
|
||||
|
||||
/**
|
||||
* Send writebacks down the memory hierarchy in atomic mode.
|
||||
*
|
||||
* @param pkt The writeback packet.
|
||||
* Send writebacks down the memory hierarchy in atomic mode
|
||||
*/
|
||||
virtual void doWritebacksAtomic(PacketPtr pkt) = 0;
|
||||
virtual void doWritebacksAtomic(PacketList& writebacks) = 0;
|
||||
|
||||
/**
|
||||
* Create an appropriate downstream bus request packet.
|
||||
@@ -648,7 +647,8 @@ class BaseCache : public ClockedObject
|
||||
*/
|
||||
void writebackTempBlockAtomic() {
|
||||
assert(tempBlockWriteback != nullptr);
|
||||
doWritebacksAtomic(tempBlockWriteback);
|
||||
PacketList writebacks{tempBlockWriteback};
|
||||
doWritebacksAtomic(writebacks);
|
||||
tempBlockWriteback = nullptr;
|
||||
}
|
||||
|
||||
@@ -680,12 +680,11 @@ class BaseCache : public ClockedObject
|
||||
*
|
||||
* @param blk The block to be overwriten.
|
||||
* @param data A pointer to the data to be compressed (blk's new data).
|
||||
* @param delay The delay until the packet's metadata is present.
|
||||
* @param tag_latency Latency to access the tags of the replacement victim.
|
||||
* @param writebacks List for any writebacks that need to be performed.
|
||||
* @return Whether operation is successful or not.
|
||||
*/
|
||||
bool updateCompressionData(CacheBlk *blk, const uint64_t* data,
|
||||
uint32_t delay, Cycles tag_latency);
|
||||
PacketList &writebacks);
|
||||
|
||||
/**
|
||||
* Perform any necessary updates to the block and perform any data
|
||||
@@ -718,27 +717,34 @@ class BaseCache : public ClockedObject
|
||||
* Populates a cache block and handles all outstanding requests for the
|
||||
* satisfied fill request. This version takes two memory requests. One
|
||||
* contains the fill data, the other is an optional target to satisfy.
|
||||
* Note that the reason we return a list of writebacks rather than
|
||||
* inserting them directly in the write buffer is that this function
|
||||
* is called by both atomic and timing-mode accesses, and in atomic
|
||||
* mode we don't mess with the write buffer (we just perform the
|
||||
* writebacks atomically once the original request is complete).
|
||||
*
|
||||
* @param pkt The memory request with the fill data.
|
||||
* @param blk The cache block if it already exists.
|
||||
* @param writebacks List for any writebacks that need to be performed.
|
||||
* @param allocate Whether to allocate a block or use the temp block
|
||||
* @return Pointer to the new cache block.
|
||||
*/
|
||||
CacheBlk *handleFill(PacketPtr pkt, CacheBlk *blk, bool allocate);
|
||||
CacheBlk *handleFill(PacketPtr pkt, CacheBlk *blk,
|
||||
PacketList &writebacks, bool allocate);
|
||||
|
||||
/**
|
||||
* Allocate a new block for the packet's data. The victim block might be
|
||||
* valid, and thus the necessary writebacks are done. May return nullptr
|
||||
* if there are no replaceable blocks. If a replaceable block is found,
|
||||
* it inserts the new block in its place. The new block, however, is not
|
||||
* set as valid yet.
|
||||
* Allocate a new block and perform any necessary writebacks
|
||||
*
|
||||
* Find a victim block and if necessary prepare writebacks for any
|
||||
* existing data. May return nullptr if there are no replaceable
|
||||
* blocks. If a replaceable block is found, it inserts the new block in
|
||||
* its place. The new block, however, is not set as valid yet.
|
||||
*
|
||||
* @param pkt Packet holding the address to update
|
||||
* @param tag_latency Latency to access the tags of the replacement victim.
|
||||
* @param writebacks A list of writeback packets for the evicted blocks
|
||||
* @return the allocated block
|
||||
*/
|
||||
CacheBlk *allocateBlock(const PacketPtr pkt, Cycles tag_latency);
|
||||
|
||||
CacheBlk *allocateBlock(const PacketPtr pkt, PacketList &writebacks);
|
||||
/**
|
||||
* Evict a cache block.
|
||||
*
|
||||
@@ -755,10 +761,9 @@ class BaseCache : public ClockedObject
|
||||
* Performs a writeback if necesssary and invalidates the block
|
||||
*
|
||||
* @param blk Block to invalidate
|
||||
* @param forward_time Tick to which the writeback should be scheduled if
|
||||
* in timing mode.
|
||||
* @param writebacks Return a list of packets with writebacks
|
||||
*/
|
||||
void evictBlock(CacheBlk *blk, Tick forward_time);
|
||||
void evictBlock(CacheBlk *blk, PacketList &writebacks);
|
||||
|
||||
/**
|
||||
* Invalidate a cache block.
|
||||
|
||||
144
src/mem/cache/cache.cc
vendored
144
src/mem/cache/cache.cc
vendored
@@ -161,7 +161,8 @@ Cache::satisfyRequest(PacketPtr pkt, CacheBlk *blk,
|
||||
/////////////////////////////////////////////////////
|
||||
|
||||
bool
|
||||
Cache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat)
|
||||
Cache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat,
|
||||
PacketList &writebacks)
|
||||
{
|
||||
|
||||
if (pkt->req->isUncacheable()) {
|
||||
@@ -173,91 +174,98 @@ Cache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat)
|
||||
|
||||
DPRINTF(Cache, "%s for %s\n", __func__, pkt->print());
|
||||
|
||||
// lookupLatency is the latency in case the request is uncacheable.
|
||||
lat = lookupLatency;
|
||||
|
||||
// flush and invalidate any existing block
|
||||
CacheBlk *old_blk(tags->findBlock(pkt->getAddr(), pkt->isSecure()));
|
||||
if (old_blk && old_blk->isValid()) {
|
||||
BaseCache::evictBlock(old_blk, clockEdge(lat + forwardLatency));
|
||||
BaseCache::evictBlock(old_blk, writebacks);
|
||||
}
|
||||
|
||||
blk = nullptr;
|
||||
// lookupLatency is the latency in case the request is uncacheable.
|
||||
lat = lookupLatency;
|
||||
return false;
|
||||
}
|
||||
|
||||
return BaseCache::access(pkt, blk, lat);
|
||||
return BaseCache::access(pkt, blk, lat, writebacks);
|
||||
}
|
||||
|
||||
void
|
||||
Cache::doWritebacks(PacketPtr pkt, Tick forward_time)
|
||||
Cache::doWritebacks(PacketList& writebacks, Tick forward_time)
|
||||
{
|
||||
// We use forwardLatency here because we are copying writebacks to
|
||||
// write buffer.
|
||||
while (!writebacks.empty()) {
|
||||
PacketPtr wbPkt = writebacks.front();
|
||||
// We use forwardLatency here because we are copying writebacks to
|
||||
// write buffer.
|
||||
|
||||
// Call isCachedAbove for Writebacks, CleanEvicts and
|
||||
// WriteCleans to discover if the block is cached above.
|
||||
if (isCachedAbove(pkt)) {
|
||||
if (pkt->cmd == MemCmd::CleanEvict) {
|
||||
// Delete CleanEvict because cached copies exist above. The
|
||||
// packet destructor will delete the request object because
|
||||
// this is a non-snoop request packet which does not require a
|
||||
// response.
|
||||
delete pkt;
|
||||
} else if (pkt->cmd == MemCmd::WritebackClean) {
|
||||
// clean writeback, do not send since the block is
|
||||
// still cached above
|
||||
assert(writebackClean);
|
||||
delete pkt;
|
||||
// Call isCachedAbove for Writebacks, CleanEvicts and
|
||||
// WriteCleans to discover if the block is cached above.
|
||||
if (isCachedAbove(wbPkt)) {
|
||||
if (wbPkt->cmd == MemCmd::CleanEvict) {
|
||||
// Delete CleanEvict because cached copies exist above. The
|
||||
// packet destructor will delete the request object because
|
||||
// this is a non-snoop request packet which does not require a
|
||||
// response.
|
||||
delete wbPkt;
|
||||
} else if (wbPkt->cmd == MemCmd::WritebackClean) {
|
||||
// clean writeback, do not send since the block is
|
||||
// still cached above
|
||||
assert(writebackClean);
|
||||
delete wbPkt;
|
||||
} else {
|
||||
assert(wbPkt->cmd == MemCmd::WritebackDirty ||
|
||||
wbPkt->cmd == MemCmd::WriteClean);
|
||||
// Set BLOCK_CACHED flag in Writeback and send below, so that
|
||||
// the Writeback does not reset the bit corresponding to this
|
||||
// address in the snoop filter below.
|
||||
wbPkt->setBlockCached();
|
||||
allocateWriteBuffer(wbPkt, forward_time);
|
||||
}
|
||||
} else {
|
||||
assert(pkt->cmd == MemCmd::WritebackDirty ||
|
||||
pkt->cmd == MemCmd::WriteClean);
|
||||
// Set BLOCK_CACHED flag in Writeback and send below, so that
|
||||
// the Writeback does not reset the bit corresponding to this
|
||||
// address in the snoop filter below.
|
||||
pkt->setBlockCached();
|
||||
allocateWriteBuffer(pkt, forward_time);
|
||||
// If the block is not cached above, send packet below. Both
|
||||
// CleanEvict and Writeback with BLOCK_CACHED flag cleared will
|
||||
// reset the bit corresponding to this address in the snoop filter
|
||||
// below.
|
||||
allocateWriteBuffer(wbPkt, forward_time);
|
||||
}
|
||||
} else {
|
||||
// If the block is not cached above, send packet below. Both
|
||||
// CleanEvict and Writeback with BLOCK_CACHED flag cleared will
|
||||
// reset the bit corresponding to this address in the snoop filter
|
||||
// below.
|
||||
allocateWriteBuffer(pkt, forward_time);
|
||||
writebacks.pop_front();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
Cache::doWritebacksAtomic(PacketPtr pkt)
|
||||
Cache::doWritebacksAtomic(PacketList& writebacks)
|
||||
{
|
||||
// Call isCachedAbove for both Writebacks and CleanEvicts. If
|
||||
// isCachedAbove returns true we set BLOCK_CACHED flag in Writebacks
|
||||
// and discard CleanEvicts.
|
||||
if (isCachedAbove(pkt, false)) {
|
||||
if (pkt->cmd == MemCmd::WritebackDirty ||
|
||||
pkt->cmd == MemCmd::WriteClean) {
|
||||
// Set BLOCK_CACHED flag in Writeback and send below,
|
||||
// so that the Writeback does not reset the bit
|
||||
// corresponding to this address in the snoop filter
|
||||
// below. We can discard CleanEvicts because cached
|
||||
// copies exist above. Atomic mode isCachedAbove
|
||||
// modifies packet to set BLOCK_CACHED flag
|
||||
memSidePort.sendAtomic(pkt);
|
||||
while (!writebacks.empty()) {
|
||||
PacketPtr wbPkt = writebacks.front();
|
||||
// Call isCachedAbove for both Writebacks and CleanEvicts. If
|
||||
// isCachedAbove returns true we set BLOCK_CACHED flag in Writebacks
|
||||
// and discard CleanEvicts.
|
||||
if (isCachedAbove(wbPkt, false)) {
|
||||
if (wbPkt->cmd == MemCmd::WritebackDirty ||
|
||||
wbPkt->cmd == MemCmd::WriteClean) {
|
||||
// Set BLOCK_CACHED flag in Writeback and send below,
|
||||
// so that the Writeback does not reset the bit
|
||||
// corresponding to this address in the snoop filter
|
||||
// below. We can discard CleanEvicts because cached
|
||||
// copies exist above. Atomic mode isCachedAbove
|
||||
// modifies packet to set BLOCK_CACHED flag
|
||||
memSidePort.sendAtomic(wbPkt);
|
||||
}
|
||||
} else {
|
||||
// If the block is not cached above, send packet below. Both
|
||||
// CleanEvict and Writeback with BLOCK_CACHED flag cleared will
|
||||
// reset the bit corresponding to this address in the snoop filter
|
||||
// below.
|
||||
memSidePort.sendAtomic(wbPkt);
|
||||
}
|
||||
} else {
|
||||
// If the block is not cached above, send packet below. Both
|
||||
// CleanEvict and Writeback with BLOCK_CACHED flag cleared will
|
||||
// reset the bit corresponding to this address in the snoop filter
|
||||
// below.
|
||||
memSidePort.sendAtomic(pkt);
|
||||
writebacks.pop_front();
|
||||
// In case of CleanEvicts, the packet destructor will delete the
|
||||
// request object because this is a non-snoop request packet which
|
||||
// does not require a response.
|
||||
delete wbPkt;
|
||||
}
|
||||
|
||||
// In case of CleanEvicts, the packet destructor will delete the
|
||||
// request object because this is a non-snoop request packet which
|
||||
// does not require a response.
|
||||
delete pkt;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Cache::recvTimingSnoopResp(PacketPtr pkt)
|
||||
{
|
||||
@@ -555,7 +563,8 @@ Cache::createMissPacket(PacketPtr cpu_pkt, CacheBlk *blk,
|
||||
|
||||
|
||||
Cycles
|
||||
Cache::handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk)
|
||||
Cache::handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk,
|
||||
PacketList &writebacks)
|
||||
{
|
||||
// deal with the packets that go through the write path of
|
||||
// the cache, i.e. any evictions and writes
|
||||
@@ -617,7 +626,7 @@ Cache::handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk)
|
||||
// the write to a whole line
|
||||
const bool allocate = allocOnFill(pkt->cmd) &&
|
||||
(!writeAllocator || writeAllocator->allocate());
|
||||
blk = handleFill(bus_pkt, blk, allocate);
|
||||
blk = handleFill(bus_pkt, blk, writebacks, allocate);
|
||||
assert(blk != NULL);
|
||||
is_invalidate = false;
|
||||
satisfyRequest(pkt, blk);
|
||||
@@ -625,7 +634,8 @@ Cache::handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk)
|
||||
bus_pkt->cmd == MemCmd::UpgradeResp) {
|
||||
// we're updating cache state to allow us to
|
||||
// satisfy the upstream request from the cache
|
||||
blk = handleFill(bus_pkt, blk, allocOnFill(pkt->cmd));
|
||||
blk = handleFill(bus_pkt, blk, writebacks,
|
||||
allocOnFill(pkt->cmd));
|
||||
satisfyRequest(pkt, blk);
|
||||
maintainClusivity(pkt->fromCache(), blk);
|
||||
} else {
|
||||
@@ -1011,15 +1021,17 @@ Cache::handleSnoop(PacketPtr pkt, CacheBlk *blk, bool is_timing,
|
||||
DPRINTF(CacheVerbose, "%s: packet (snoop) %s found block: %s\n",
|
||||
__func__, pkt->print(), blk->print());
|
||||
PacketPtr wb_pkt = writecleanBlk(blk, pkt->req->getDest(), pkt->id);
|
||||
PacketList writebacks;
|
||||
writebacks.push_back(wb_pkt);
|
||||
|
||||
if (is_timing) {
|
||||
// anything that is merely forwarded pays for the forward
|
||||
// latency and the delay provided by the crossbar
|
||||
Tick forward_time = clockEdge(forwardLatency) +
|
||||
pkt->headerDelay;
|
||||
doWritebacks(wb_pkt, forward_time);
|
||||
doWritebacks(writebacks, forward_time);
|
||||
} else {
|
||||
doWritebacksAtomic(wb_pkt);
|
||||
doWritebacksAtomic(writebacks);
|
||||
}
|
||||
pkt->setSatisfied();
|
||||
}
|
||||
|
||||
10
src/mem/cache/cache.hh
vendored
10
src/mem/cache/cache.hh
vendored
@@ -87,7 +87,8 @@ class Cache : public BaseCache
|
||||
*/
|
||||
void promoteWholeLineWrites(PacketPtr pkt);
|
||||
|
||||
bool access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat) override;
|
||||
bool access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat,
|
||||
PacketList &writebacks) override;
|
||||
|
||||
void handleTimingReqHit(PacketPtr pkt, CacheBlk *blk,
|
||||
Tick request_time) override;
|
||||
@@ -98,9 +99,9 @@ class Cache : public BaseCache
|
||||
|
||||
void recvTimingReq(PacketPtr pkt) override;
|
||||
|
||||
void doWritebacks(PacketPtr pkt, Tick forward_time) override;
|
||||
void doWritebacks(PacketList& writebacks, Tick forward_time) override;
|
||||
|
||||
void doWritebacksAtomic(PacketPtr pkt) override;
|
||||
void doWritebacksAtomic(PacketList& writebacks) override;
|
||||
|
||||
void serviceMSHRTargets(MSHR *mshr, const PacketPtr pkt,
|
||||
CacheBlk *blk) override;
|
||||
@@ -109,7 +110,8 @@ class Cache : public BaseCache
|
||||
|
||||
void recvTimingSnoopResp(PacketPtr pkt) override;
|
||||
|
||||
Cycles handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk) override;
|
||||
Cycles handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk,
|
||||
PacketList &writebacks) override;
|
||||
|
||||
Tick recvAtomic(PacketPtr pkt) override;
|
||||
|
||||
|
||||
28
src/mem/cache/noncoherent_cache.cc
vendored
28
src/mem/cache/noncoherent_cache.cc
vendored
@@ -80,9 +80,10 @@ NoncoherentCache::satisfyRequest(PacketPtr pkt, CacheBlk *blk, bool, bool)
|
||||
}
|
||||
|
||||
bool
|
||||
NoncoherentCache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat)
|
||||
NoncoherentCache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat,
|
||||
PacketList &writebacks)
|
||||
{
|
||||
bool success = BaseCache::access(pkt, blk, lat);
|
||||
bool success = BaseCache::access(pkt, blk, lat, writebacks);
|
||||
|
||||
if (pkt->isWriteback() || pkt->cmd == MemCmd::WriteClean) {
|
||||
assert(blk && blk->isValid());
|
||||
@@ -97,16 +98,24 @@ NoncoherentCache::access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat)
|
||||
}
|
||||
|
||||
void
|
||||
NoncoherentCache::doWritebacks(PacketPtr pkt, Tick forward_time)
|
||||
NoncoherentCache::doWritebacks(PacketList& writebacks, Tick forward_time)
|
||||
{
|
||||
allocateWriteBuffer(pkt, forward_time);
|
||||
while (!writebacks.empty()) {
|
||||
PacketPtr wb_pkt = writebacks.front();
|
||||
allocateWriteBuffer(wb_pkt, forward_time);
|
||||
writebacks.pop_front();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
NoncoherentCache::doWritebacksAtomic(PacketPtr pkt)
|
||||
NoncoherentCache::doWritebacksAtomic(PacketList& writebacks)
|
||||
{
|
||||
memSidePort.sendAtomic(pkt);
|
||||
delete pkt;
|
||||
while (!writebacks.empty()) {
|
||||
PacketPtr wb_pkt = writebacks.front();
|
||||
memSidePort.sendAtomic(wb_pkt);
|
||||
writebacks.pop_front();
|
||||
delete wb_pkt;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
@@ -162,7 +171,8 @@ NoncoherentCache::createMissPacket(PacketPtr cpu_pkt, CacheBlk *blk,
|
||||
|
||||
|
||||
Cycles
|
||||
NoncoherentCache::handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk)
|
||||
NoncoherentCache::handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk,
|
||||
PacketList &writebacks)
|
||||
{
|
||||
PacketPtr bus_pkt = createMissPacket(pkt, blk, true,
|
||||
pkt->isWholeLineWrite(blkSize));
|
||||
@@ -187,7 +197,7 @@ NoncoherentCache::handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk)
|
||||
// afterall it is a read response
|
||||
DPRINTF(Cache, "Block for addr %#llx being updated in Cache\n",
|
||||
bus_pkt->getAddr());
|
||||
blk = handleFill(bus_pkt, blk, allocOnFill(bus_pkt->cmd));
|
||||
blk = handleFill(bus_pkt, blk, writebacks, allocOnFill(bus_pkt->cmd));
|
||||
assert(blk);
|
||||
}
|
||||
satisfyRequest(pkt, blk);
|
||||
|
||||
10
src/mem/cache/noncoherent_cache.hh
vendored
10
src/mem/cache/noncoherent_cache.hh
vendored
@@ -71,7 +71,8 @@ struct NoncoherentCacheParams;
|
||||
class NoncoherentCache : public BaseCache
|
||||
{
|
||||
protected:
|
||||
bool access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat) override;
|
||||
bool access(PacketPtr pkt, CacheBlk *&blk, Cycles &lat,
|
||||
PacketList &writebacks) override;
|
||||
|
||||
void handleTimingReqMiss(PacketPtr pkt, CacheBlk *blk,
|
||||
Tick forward_time,
|
||||
@@ -79,10 +80,10 @@ class NoncoherentCache : public BaseCache
|
||||
|
||||
void recvTimingReq(PacketPtr pkt) override;
|
||||
|
||||
void doWritebacks(PacketPtr pkt,
|
||||
void doWritebacks(PacketList& writebacks,
|
||||
Tick forward_time) override;
|
||||
|
||||
void doWritebacksAtomic(PacketPtr pkt) override;
|
||||
void doWritebacksAtomic(PacketList& writebacks) override;
|
||||
|
||||
void serviceMSHRTargets(MSHR *mshr, const PacketPtr pkt,
|
||||
CacheBlk *blk) override;
|
||||
@@ -97,7 +98,8 @@ class NoncoherentCache : public BaseCache
|
||||
panic("Unexpected timing snoop response %s", pkt->print());
|
||||
}
|
||||
|
||||
Cycles handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk) override;
|
||||
Cycles handleAtomicReqMiss(PacketPtr pkt, CacheBlk *&blk,
|
||||
PacketList &writebacks) override;
|
||||
|
||||
Tick recvAtomic(PacketPtr pkt) override;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user