diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh
index 4825facf13..ec39cbd113 100644
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@@ -119,7 +119,7 @@ class FullO3CPU : public BaseO3CPU
     };
 
     BaseMMU *mmu;
-    using LSQRequest = typename LSQ<Impl>::LSQRequest;
+    using LSQRequest = LSQ::LSQRequest;
 
     /** Overall CPU status. */
     Status _status;
diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh
index 5a11af4ca4..53a94de197 100644
--- a/src/cpu/o3/dyn_inst.hh
+++ b/src/cpu/o3/dyn_inst.hh
@@ -396,7 +396,7 @@ class BaseO3DynInst : public ExecContext, public RefCounted
      * Saved memory request (needed when the DTB address translation is
      * delayed due to a hw page table walk).
      */
-    typename ::LSQ<O3CPUImpl>::LSQ::LSQRequest *savedReq;
+    LSQ::LSQRequest *savedReq;
 
     /////////////////////// Checker //////////////////////
     // Need a copy of main request pointer to verify on writes.
diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh
index 7bcfec63ef..51e7a23fcd 100644
--- a/src/cpu/o3/iew.hh
+++ b/src/cpu/o3/iew.hh
@@ -354,7 +354,7 @@ class DefaultIEW
     InstructionQueue instQueue;
 
     /** Load / store queue. */
-    LSQ<Impl> ldstQueue;
+    LSQ ldstQueue;
 
     /** Pointer to the functional unit pool. */
     FUPool *fuPool;
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index 8ad0ad5d8a..24039c8bf6 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -1,5 +1,18 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2011-2012, 2014, 2017-2019 ARM Limited
+ * Copyright (c) 2013 Advanced Micro Devices, Inc.
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Copyright (c) 2005-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -26,9 +39,1393 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "cpu/o3/isa_specific.hh"
-#include "cpu/o3/lsq_impl.hh"
+#include "cpu/o3/lsq.hh"
 
-// Force the instantiation of LDSTQ for all the implementations we care about.
-template class LSQ<O3CPUImpl>;
+#include <algorithm>
+#include <list>
+#include <string>
 
+#include "base/compiler.hh"
+#include "base/logging.hh"
+#include "cpu/o3/cpu.hh"
+#include "cpu/o3/dyn_inst.hh"
+#include "cpu/o3/iew.hh"
+#include "cpu/o3/limits.hh"
+#include "debug/Drain.hh"
+#include "debug/Fetch.hh"
+#include "debug/HtmCpu.hh"
+#include "debug/LSQ.hh"
+#include "debug/Writeback.hh"
+#include "params/DerivO3CPU.hh"
+
+LSQ::LSQSenderState::LSQSenderState(LSQRequest *request, bool is_load) :
+    _request(request), isLoad(is_load), needWB(is_load)
+{}
+
+ContextID
+LSQ::LSQSenderState::contextId()
+{
+    return inst->contextId();
+}
+
+LSQ::DcachePort::DcachePort(LSQ *_lsq, FullO3CPU<O3CPUImpl> *_cpu) :
+    RequestPort(_cpu->name() + ".dcache_port", _cpu), lsq(_lsq), cpu(_cpu)
+{}
+
+LSQ::LSQ(FullO3CPU<O3CPUImpl> *cpu_ptr, DefaultIEW<O3CPUImpl> *iew_ptr,
+        const DerivO3CPUParams &params)
+    : cpu(cpu_ptr), iewStage(iew_ptr),
+      _cacheBlocked(false),
+      cacheStorePorts(params.cacheStorePorts), usedStorePorts(0),
+      cacheLoadPorts(params.cacheLoadPorts), usedLoadPorts(0),
+      lsqPolicy(params.smtLSQPolicy),
+      LQEntries(params.LQEntries),
+      SQEntries(params.SQEntries),
+      maxLQEntries(maxLSQAllocation(lsqPolicy, LQEntries, params.numThreads,
+                  params.smtLSQThreshold)),
+      maxSQEntries(maxLSQAllocation(lsqPolicy, SQEntries, params.numThreads,
+                  params.smtLSQThreshold)),
+      dcachePort(this, cpu_ptr),
+      numThreads(params.numThreads)
+{
+    assert(numThreads > 0 && numThreads <= O3MaxThreads);
+
+    //**********************************************
+    //************ Handle SMT Parameters ***********
+    //**********************************************
+
+    /* Run SMT olicy checks. */
+        if (lsqPolicy == SMTQueuePolicy::Dynamic) {
+        DPRINTF(LSQ, "LSQ sharing policy set to Dynamic\n");
+    } else if (lsqPolicy == SMTQueuePolicy::Partitioned) {
+        DPRINTF(Fetch, "LSQ sharing policy set to Partitioned: "
+                "%i entries per LQ | %i entries per SQ\n",
+                maxLQEntries,maxSQEntries);
+    } else if (lsqPolicy == SMTQueuePolicy::Threshold) {
+
+        assert(params.smtLSQThreshold > params.LQEntries);
+        assert(params.smtLSQThreshold > params.SQEntries);
+
+        DPRINTF(LSQ, "LSQ sharing policy set to Threshold: "
+                "%i entries per LQ | %i entries per SQ\n",
+                maxLQEntries,maxSQEntries);
+    } else {
+        panic("Invalid LSQ sharing policy. Options are: Dynamic, "
+                    "Partitioned, Threshold");
+    }
+
+    thread.reserve(numThreads);
+    for (ThreadID tid = 0; tid < numThreads; tid++) {
+        thread.emplace_back(maxLQEntries, maxSQEntries);
+        thread[tid].init(cpu, iew_ptr, params, this, tid);
+        thread[tid].setDcachePort(&dcachePort);
+    }
+}
+
+
+std::string
+LSQ::name() const
+{
+    return iewStage->name() + ".lsq";
+}
+
+void
+LSQ::setActiveThreads(std::list<ThreadID> *at_ptr)
+{
+    activeThreads = at_ptr;
+    assert(activeThreads != 0);
+}
+
+void
+LSQ::drainSanityCheck() const
+{
+    assert(isDrained());
+
+    for (ThreadID tid = 0; tid < numThreads; tid++)
+        thread[tid].drainSanityCheck();
+}
+
+bool
+LSQ::isDrained() const
+{
+    bool drained(true);
+
+    if (!lqEmpty()) {
+        DPRINTF(Drain, "Not drained, LQ not empty.\n");
+        drained = false;
+    }
+
+    if (!sqEmpty()) {
+        DPRINTF(Drain, "Not drained, SQ not empty.\n");
+        drained = false;
+    }
+
+    return drained;
+}
+
+void
+LSQ::takeOverFrom()
+{
+    usedStorePorts = 0;
+    _cacheBlocked = false;
+
+    for (ThreadID tid = 0; tid < numThreads; tid++) {
+        thread[tid].takeOverFrom();
+    }
+}
+
+void
+LSQ::tick()
+{
+    // Re-issue loads which got blocked on the per-cycle load ports limit.
+    if (usedLoadPorts == cacheLoadPorts && !_cacheBlocked)
+        iewStage->cacheUnblocked();
+
+    usedLoadPorts = 0;
+    usedStorePorts = 0;
+}
+
+bool
+LSQ::cacheBlocked() const
+{
+    return _cacheBlocked;
+}
+
+void
+LSQ::cacheBlocked(bool v)
+{
+    _cacheBlocked = v;
+}
+
+bool
+LSQ::cachePortAvailable(bool is_load) const
+{
+    bool ret;
+    if (is_load) {
+        ret  = usedLoadPorts < cacheLoadPorts;
+    } else {
+        ret  = usedStorePorts < cacheStorePorts;
+    }
+    return ret;
+}
+
+void
+LSQ::cachePortBusy(bool is_load)
+{
+    assert(cachePortAvailable(is_load));
+    if (is_load) {
+        usedLoadPorts++;
+    } else {
+        usedStorePorts++;
+    }
+}
+
+void
+LSQ::insertLoad(const O3DynInstPtr &load_inst)
+{
+    ThreadID tid = load_inst->threadNumber;
+
+    thread[tid].insertLoad(load_inst);
+}
+
+void
+LSQ::insertStore(const O3DynInstPtr &store_inst)
+{
+    ThreadID tid = store_inst->threadNumber;
+
+    thread[tid].insertStore(store_inst);
+}
+
+Fault
+LSQ::executeLoad(const O3DynInstPtr &inst)
+{
+    ThreadID tid = inst->threadNumber;
+
+    return thread[tid].executeLoad(inst);
+}
+
+Fault
+LSQ::executeStore(const O3DynInstPtr &inst)
+{
+    ThreadID tid = inst->threadNumber;
+
+    return thread[tid].executeStore(inst);
+}
+
+void
+LSQ::commitLoads(InstSeqNum &youngest_inst, ThreadID tid)
+{
+    thread.at(tid).commitLoads(youngest_inst);
+}
+
+void
+LSQ::commitStores(InstSeqNum &youngest_inst, ThreadID tid)
+{
+    thread.at(tid).commitStores(youngest_inst);
+}
+
+void
+LSQ::writebackStores()
+{
+    std::list<ThreadID>::iterator threads = activeThreads->begin();
+    std::list<ThreadID>::iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        if (numStoresToWB(tid) > 0) {
+            DPRINTF(Writeback,"[tid:%i] Writing back stores. %i stores "
+                "available for Writeback.\n", tid, numStoresToWB(tid));
+        }
+
+        thread[tid].writebackStores();
+    }
+}
+
+void
+LSQ::squash(const InstSeqNum &squashed_num, ThreadID tid)
+{
+    thread.at(tid).squash(squashed_num);
+}
+
+bool
+LSQ::violation()
+{
+    /* Answers: Does Anybody Have a Violation?*/
+    std::list<ThreadID>::iterator threads = activeThreads->begin();
+    std::list<ThreadID>::iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        if (thread[tid].violation())
+            return true;
+    }
+
+    return false;
+}
+
+bool LSQ::violation(ThreadID tid) { return thread.at(tid).violation(); }
+
+O3DynInstPtr
+LSQ::getMemDepViolator(ThreadID tid)
+{
+    return thread.at(tid).getMemDepViolator();
+}
+
+int
+LSQ::getLoadHead(ThreadID tid)
+{
+    return thread.at(tid).getLoadHead();
+}
+
+InstSeqNum
+LSQ::getLoadHeadSeqNum(ThreadID tid)
+{
+    return thread.at(tid).getLoadHeadSeqNum();
+}
+
+int
+LSQ::getStoreHead(ThreadID tid)
+{
+    return thread.at(tid).getStoreHead();
+}
+
+InstSeqNum
+LSQ::getStoreHeadSeqNum(ThreadID tid)
+{
+    return thread.at(tid).getStoreHeadSeqNum();
+}
+
+int LSQ::getCount(ThreadID tid) { return thread.at(tid).getCount(); }
+
+int LSQ::numLoads(ThreadID tid) { return thread.at(tid).numLoads(); }
+
+int LSQ::numStores(ThreadID tid) { return thread.at(tid).numStores(); }
+
+int
+LSQ::numHtmStarts(ThreadID tid) const
+{
+    if (tid == InvalidThreadID)
+        return 0;
+    else
+        return thread[tid].numHtmStarts();
+}
+int
+LSQ::numHtmStops(ThreadID tid) const
+{
+    if (tid == InvalidThreadID)
+        return 0;
+    else
+        return thread[tid].numHtmStops();
+}
+
+void
+LSQ::resetHtmStartsStops(ThreadID tid)
+{
+    if (tid != InvalidThreadID)
+        thread[tid].resetHtmStartsStops();
+}
+
+uint64_t
+LSQ::getLatestHtmUid(ThreadID tid) const
+{
+    if (tid == InvalidThreadID)
+        return 0;
+    else
+        return thread[tid].getLatestHtmUid();
+}
+
+void
+LSQ::setLastRetiredHtmUid(ThreadID tid, uint64_t htmUid)
+{
+    if (tid != InvalidThreadID)
+        thread[tid].setLastRetiredHtmUid(htmUid);
+}
+
+void
+LSQ::recvReqRetry()
+{
+    iewStage->cacheUnblocked();
+    cacheBlocked(false);
+
+    for (ThreadID tid : *activeThreads) {
+        thread[tid].recvRetry();
+    }
+}
+
+void
+LSQ::completeDataAccess(PacketPtr pkt)
+{
+    auto senderState = dynamic_cast<LSQSenderState*>(pkt->senderState);
+    thread[cpu->contextToThread(senderState->contextId())]
+        .completeDataAccess(pkt);
+}
+
+bool
+LSQ::recvTimingResp(PacketPtr pkt)
+{
+    if (pkt->isError())
+        DPRINTF(LSQ, "Got error packet back for address: %#X\n",
+                pkt->getAddr());
+
+    auto senderState = dynamic_cast<LSQSenderState*>(pkt->senderState);
+    panic_if(!senderState, "Got packet back with unknown sender state\n");
+
+    thread[cpu->contextToThread(senderState->contextId())].recvTimingResp(pkt);
+
+    if (pkt->isInvalidate()) {
+        // This response also contains an invalidate; e.g. this can be the case
+        // if cmd is ReadRespWithInvalidate.
+        //
+        // The calling order between completeDataAccess and checkSnoop matters.
+        // By calling checkSnoop after completeDataAccess, we ensure that the
+        // fault set by checkSnoop is not lost. Calling writeback (more
+        // specifically inst->completeAcc) in completeDataAccess overwrites
+        // fault, and in case this instruction requires squashing (as
+        // determined by checkSnoop), the ReExec fault set by checkSnoop would
+        // be lost otherwise.
+
+        DPRINTF(LSQ, "received invalidation with response for addr:%#x\n",
+                pkt->getAddr());
+
+        for (ThreadID tid = 0; tid < numThreads; tid++) {
+            thread[tid].checkSnoop(pkt);
+        }
+    }
+    // Update the LSQRequest state (this may delete the request)
+    senderState->request()->packetReplied();
+
+    return true;
+}
+
+void
+LSQ::recvTimingSnoopReq(PacketPtr pkt)
+{
+    DPRINTF(LSQ, "received pkt for addr:%#x %s\n", pkt->getAddr(),
+            pkt->cmdString());
+
+    // must be a snoop
+    if (pkt->isInvalidate()) {
+        DPRINTF(LSQ, "received invalidation for addr:%#x\n",
+                pkt->getAddr());
+        for (ThreadID tid = 0; tid < numThreads; tid++) {
+            thread[tid].checkSnoop(pkt);
+        }
+    }
+}
+
+int
+LSQ::getCount()
+{
+    unsigned total = 0;
+
+    std::list<ThreadID>::iterator threads = activeThreads->begin();
+    std::list<ThreadID>::iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        total += getCount(tid);
+    }
+
+    return total;
+}
+
+int
+LSQ::numLoads()
+{
+    unsigned total = 0;
+
+    std::list<ThreadID>::iterator threads = activeThreads->begin();
+    std::list<ThreadID>::iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        total += numLoads(tid);
+    }
+
+    return total;
+}
+
+int
+LSQ::numStores()
+{
+    unsigned total = 0;
+
+    std::list<ThreadID>::iterator threads = activeThreads->begin();
+    std::list<ThreadID>::iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        total += thread[tid].numStores();
+    }
+
+    return total;
+}
+
+unsigned
+LSQ::numFreeLoadEntries()
+{
+    unsigned total = 0;
+
+    std::list<ThreadID>::iterator threads = activeThreads->begin();
+    std::list<ThreadID>::iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        total += thread[tid].numFreeLoadEntries();
+    }
+
+    return total;
+}
+
+unsigned
+LSQ::numFreeStoreEntries()
+{
+    unsigned total = 0;
+
+    std::list<ThreadID>::iterator threads = activeThreads->begin();
+    std::list<ThreadID>::iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        total += thread[tid].numFreeStoreEntries();
+    }
+
+    return total;
+}
+
+unsigned
+LSQ::numFreeLoadEntries(ThreadID tid)
+{
+        return thread[tid].numFreeLoadEntries();
+}
+
+unsigned
+LSQ::numFreeStoreEntries(ThreadID tid)
+{
+        return thread[tid].numFreeStoreEntries();
+}
+
+bool
+LSQ::isFull()
+{
+    std::list<ThreadID>::iterator threads = activeThreads->begin();
+    std::list<ThreadID>::iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        if (!(thread[tid].lqFull() || thread[tid].sqFull()))
+            return false;
+    }
+
+    return true;
+}
+
+bool
+LSQ::isFull(ThreadID tid)
+{
+    //@todo: Change to Calculate All Entries for
+    //Dynamic Policy
+    if (lsqPolicy == SMTQueuePolicy::Dynamic)
+        return isFull();
+    else
+        return thread[tid].lqFull() || thread[tid].sqFull();
+}
+
+bool
+LSQ::isEmpty() const
+{
+    return lqEmpty() && sqEmpty();
+}
+
+bool
+LSQ::lqEmpty() const
+{
+    std::list<ThreadID>::const_iterator threads = activeThreads->begin();
+    std::list<ThreadID>::const_iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        if (!thread[tid].lqEmpty())
+            return false;
+    }
+
+    return true;
+}
+
+bool
+LSQ::sqEmpty() const
+{
+    std::list<ThreadID>::const_iterator threads = activeThreads->begin();
+    std::list<ThreadID>::const_iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        if (!thread[tid].sqEmpty())
+            return false;
+    }
+
+    return true;
+}
+
+bool
+LSQ::lqFull()
+{
+    std::list<ThreadID>::iterator threads = activeThreads->begin();
+    std::list<ThreadID>::iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        if (!thread[tid].lqFull())
+            return false;
+    }
+
+    return true;
+}
+
+bool
+LSQ::lqFull(ThreadID tid)
+{
+    //@todo: Change to Calculate All Entries for
+    //Dynamic Policy
+    if (lsqPolicy == SMTQueuePolicy::Dynamic)
+        return lqFull();
+    else
+        return thread[tid].lqFull();
+}
+
+bool
+LSQ::sqFull()
+{
+    std::list<ThreadID>::iterator threads = activeThreads->begin();
+    std::list<ThreadID>::iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        if (!sqFull(tid))
+            return false;
+    }
+
+    return true;
+}
+
+bool
+LSQ::sqFull(ThreadID tid)
+{
+     //@todo: Change to Calculate All Entries for
+    //Dynamic Policy
+    if (lsqPolicy == SMTQueuePolicy::Dynamic)
+        return sqFull();
+    else
+        return thread[tid].sqFull();
+}
+
+bool
+LSQ::isStalled()
+{
+    std::list<ThreadID>::iterator threads = activeThreads->begin();
+    std::list<ThreadID>::iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        if (!thread[tid].isStalled())
+            return false;
+    }
+
+    return true;
+}
+
+bool
+LSQ::isStalled(ThreadID tid)
+{
+    if (lsqPolicy == SMTQueuePolicy::Dynamic)
+        return isStalled();
+    else
+        return thread[tid].isStalled();
+}
+
+bool
+LSQ::hasStoresToWB()
+{
+    std::list<ThreadID>::iterator threads = activeThreads->begin();
+    std::list<ThreadID>::iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        if (hasStoresToWB(tid))
+            return true;
+    }
+
+    return false;
+}
+
+bool
+LSQ::hasStoresToWB(ThreadID tid)
+{
+    return thread.at(tid).hasStoresToWB();
+}
+
+int
+LSQ::numStoresToWB(ThreadID tid)
+{
+    return thread.at(tid).numStoresToWB();
+}
+
+bool
+LSQ::willWB()
+{
+    std::list<ThreadID>::iterator threads = activeThreads->begin();
+    std::list<ThreadID>::iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        if (willWB(tid))
+            return true;
+    }
+
+    return false;
+}
+
+bool
+LSQ::willWB(ThreadID tid)
+{
+    return thread.at(tid).willWB();
+}
+
+void
+LSQ::dumpInsts() const
+{
+    std::list<ThreadID>::const_iterator threads = activeThreads->begin();
+    std::list<ThreadID>::const_iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        thread[tid].dumpInsts();
+    }
+}
+
+void
+LSQ::dumpInsts(ThreadID tid) const
+{
+    thread.at(tid).dumpInsts();
+}
+
+Fault
+LSQ::pushRequest(const O3DynInstPtr& inst, bool isLoad, uint8_t *data,
+        unsigned int size, Addr addr, Request::Flags flags, uint64_t *res,
+        AtomicOpFunctorPtr amo_op, const std::vector<bool>& byte_enable)
+{
+    // This comming request can be either load, store or atomic.
+    // Atomic request has a corresponding pointer to its atomic memory
+    // operation
+    GEM5_VAR_USED bool isAtomic = !isLoad && amo_op;
+
+    ThreadID tid = cpu->contextToThread(inst->contextId());
+    auto cacheLineSize = cpu->cacheLineSize();
+    bool needs_burst = transferNeedsBurst(addr, size, cacheLineSize);
+    LSQRequest* req = nullptr;
+
+    // Atomic requests that access data across cache line boundary are
+    // currently not allowed since the cache does not guarantee corresponding
+    // atomic memory operations to be executed atomically across a cache line.
+    // For ISAs such as x86 that supports cross-cache-line atomic instructions,
+    // the cache needs to be modified to perform atomic update to both cache
+    // lines. For now, such cross-line update is not supported.
+    assert(!isAtomic || (isAtomic && !needs_burst));
+
+    const bool htm_cmd = isLoad && (flags & Request::HTM_CMD);
+
+    if (inst->translationStarted()) {
+        req = inst->savedReq;
+        assert(req);
+    } else {
+        if (htm_cmd) {
+            assert(addr == 0x0lu);
+            assert(size == 8);
+            req = new HtmCmdRequest(&thread[tid], inst, flags);
+        } else if (needs_burst) {
+            req = new SplitDataRequest(&thread[tid], inst, isLoad, addr,
+                    size, flags, data, res);
+        } else {
+            req = new SingleDataRequest(&thread[tid], inst, isLoad, addr,
+                    size, flags, data, res, std::move(amo_op));
+        }
+        assert(req);
+        req->_byteEnable = byte_enable;
+        inst->setRequest();
+        req->taskId(cpu->taskId());
+
+        // There might be fault from a previous execution attempt if this is
+        // a strictly ordered load
+        inst->getFault() = NoFault;
+
+        req->initiateTranslation();
+    }
+
+    /* This is the place were instructions get the effAddr. */
+    if (req->isTranslationComplete()) {
+        if (req->isMemAccessRequired()) {
+            inst->effAddr = req->getVaddr();
+            inst->effSize = size;
+            inst->effAddrValid(true);
+
+            if (cpu->checker) {
+                inst->reqToVerify = std::make_shared<Request>(*req->request());
+            }
+            Fault fault;
+            if (isLoad)
+                fault = cpu->read(req, inst->lqIdx);
+            else
+                fault = cpu->write(req, data, inst->sqIdx);
+            // inst->getFault() may have the first-fault of a
+            // multi-access split request at this point.
+            // Overwrite that only if we got another type of fault
+            // (e.g. re-exec).
+            if (fault != NoFault)
+                inst->getFault() = fault;
+        } else if (isLoad) {
+            inst->setMemAccPredicate(false);
+            // Commit will have to clean up whatever happened.  Set this
+            // instruction as executed.
+            inst->setExecuted();
+        }
+    }
+
+    if (inst->traceData)
+        inst->traceData->setMem(addr, size, flags);
+
+    return inst->getFault();
+}
+
+void
+LSQ::SingleDataRequest::finish(const Fault &fault, const RequestPtr &req,
+        ThreadContext* tc, BaseTLB::Mode mode)
+{
+    _fault.push_back(fault);
+    numInTranslationFragments = 0;
+    numTranslatedFragments = 1;
+    /* If the instruction has been squahsed, let the request know
+     * as it may have to self-destruct. */
+    if (_inst->isSquashed()) {
+        squashTranslation();
+    } else {
+        _inst->strictlyOrdered(req->isStrictlyOrdered());
+
+        flags.set(Flag::TranslationFinished);
+        if (fault == NoFault) {
+            _inst->physEffAddr = req->getPaddr();
+            _inst->memReqFlags = req->getFlags();
+            if (req->isCondSwap()) {
+                assert(_res);
+                req->setExtraData(*_res);
+            }
+            setState(State::Request);
+        } else {
+            setState(State::Fault);
+        }
+
+        LSQRequest::_inst->fault = fault;
+        LSQRequest::_inst->translationCompleted(true);
+    }
+}
+
+void
+LSQ::SplitDataRequest::finish(const Fault &fault, const RequestPtr &req,
+        ThreadContext* tc, BaseTLB::Mode mode)
+{
+    int i;
+    for (i = 0; i < _requests.size() && _requests[i] != req; i++);
+    assert(i < _requests.size());
+    _fault[i] = fault;
+
+    numInTranslationFragments--;
+    numTranslatedFragments++;
+
+    if (fault == NoFault)
+        mainReq->setFlags(req->getFlags());
+
+    if (numTranslatedFragments == _requests.size()) {
+        if (_inst->isSquashed()) {
+            squashTranslation();
+        } else {
+            _inst->strictlyOrdered(mainReq->isStrictlyOrdered());
+            flags.set(Flag::TranslationFinished);
+            _inst->translationCompleted(true);
+
+            for (i = 0; i < _fault.size() && _fault[i] == NoFault; i++);
+            if (i > 0) {
+                _inst->physEffAddr = request(0)->getPaddr();
+                _inst->memReqFlags = mainReq->getFlags();
+                if (mainReq->isCondSwap()) {
+                    assert (i == _fault.size());
+                    assert(_res);
+                    mainReq->setExtraData(*_res);
+                }
+                if (i == _fault.size()) {
+                    _inst->fault = NoFault;
+                    setState(State::Request);
+                } else {
+                  _inst->fault = _fault[i];
+                  setState(State::PartialFault);
+                }
+            } else {
+                _inst->fault = _fault[0];
+                setState(State::Fault);
+            }
+        }
+
+    }
+}
+
+void
+LSQ::SingleDataRequest::initiateTranslation()
+{
+    assert(_requests.size() == 0);
+
+    addRequest(_addr, _size, _byteEnable);
+
+    if (_requests.size() > 0) {
+        _requests.back()->setReqInstSeqNum(_inst->seqNum);
+        _requests.back()->taskId(_taskId);
+        _inst->translationStarted(true);
+        setState(State::Translation);
+        flags.set(Flag::TranslationStarted);
+
+        _inst->savedReq = this;
+        sendFragmentToTranslation(0);
+    } else {
+        _inst->setMemAccPredicate(false);
+    }
+}
+
+PacketPtr
+LSQ::SplitDataRequest::mainPacket()
+{
+    return _mainPacket;
+}
+
+RequestPtr
+LSQ::SplitDataRequest::mainRequest()
+{
+    return mainReq;
+}
+
+void
+LSQ::SplitDataRequest::initiateTranslation()
+{
+    auto cacheLineSize = _port.cacheLineSize();
+    Addr base_addr = _addr;
+    Addr next_addr = addrBlockAlign(_addr + cacheLineSize, cacheLineSize);
+    Addr final_addr = addrBlockAlign(_addr + _size, cacheLineSize);
+    uint32_t size_so_far = 0;
+
+    mainReq = std::make_shared<Request>(base_addr,
+                _size, _flags, _inst->requestorId(),
+                _inst->instAddr(), _inst->contextId());
+    mainReq->setByteEnable(_byteEnable);
+
+    // Paddr is not used in mainReq. However, we will accumulate the flags
+    // from the sub requests into mainReq by calling setFlags() in finish().
+    // setFlags() assumes that paddr is set so flip the paddr valid bit here to
+    // avoid a potential assert in setFlags() when we call it from  finish().
+    mainReq->setPaddr(0);
+
+    /* Get the pre-fix, possibly unaligned. */
+    auto it_start = _byteEnable.begin();
+    auto it_end = _byteEnable.begin() + (next_addr - base_addr);
+    addRequest(base_addr, next_addr - base_addr,
+                     std::vector<bool>(it_start, it_end));
+    size_so_far = next_addr - base_addr;
+
+    /* We are block aligned now, reading whole blocks. */
+    base_addr = next_addr;
+    while (base_addr != final_addr) {
+        auto it_start = _byteEnable.begin() + size_so_far;
+        auto it_end = _byteEnable.begin() + size_so_far + cacheLineSize;
+        addRequest(base_addr, cacheLineSize,
+                         std::vector<bool>(it_start, it_end));
+        size_so_far += cacheLineSize;
+        base_addr += cacheLineSize;
+    }
+
+    /* Deal with the tail. */
+    if (size_so_far < _size) {
+        auto it_start = _byteEnable.begin() + size_so_far;
+        auto it_end = _byteEnable.end();
+        addRequest(base_addr, _size - size_so_far,
+                         std::vector<bool>(it_start, it_end));
+    }
+
+    if (_requests.size() > 0) {
+        /* Setup the requests and send them to translation. */
+        for (auto& r: _requests) {
+            r->setReqInstSeqNum(_inst->seqNum);
+            r->taskId(_taskId);
+        }
+
+        _inst->translationStarted(true);
+        setState(State::Translation);
+        flags.set(Flag::TranslationStarted);
+        _inst->savedReq = this;
+        numInTranslationFragments = 0;
+        numTranslatedFragments = 0;
+        _fault.resize(_requests.size());
+
+        for (uint32_t i = 0; i < _requests.size(); i++) {
+            sendFragmentToTranslation(i);
+        }
+    } else {
+        _inst->setMemAccPredicate(false);
+    }
+}
+
+LSQ::LSQRequest::LSQRequest(
+        LSQUnit *port, const O3DynInstPtr& inst, bool isLoad) :
+    _state(State::NotIssued), _senderState(nullptr),
+    _port(*port), _inst(inst), _data(nullptr),
+    _res(nullptr), _addr(0), _size(0), _flags(0),
+    _numOutstandingPackets(0), _amo_op(nullptr)
+{
+    flags.set(Flag::IsLoad, isLoad);
+    flags.set(Flag::WbStore,
+              _inst->isStoreConditional() || _inst->isAtomic());
+    flags.set(Flag::IsAtomic, _inst->isAtomic());
+    install();
+}
+
+LSQ::LSQRequest::LSQRequest(
+        LSQUnit *port, const O3DynInstPtr& inst, bool isLoad,
+        const Addr& addr, const uint32_t& size, const Request::Flags& flags_,
+           PacketDataPtr data, uint64_t* res, AtomicOpFunctorPtr amo_op)
+    : _state(State::NotIssued), _senderState(nullptr),
+    numTranslatedFragments(0),
+    numInTranslationFragments(0),
+    _port(*port), _inst(inst), _data(data),
+    _res(res), _addr(addr), _size(size),
+    _flags(flags_),
+    _numOutstandingPackets(0),
+    _amo_op(std::move(amo_op))
+{
+    flags.set(Flag::IsLoad, isLoad);
+    flags.set(Flag::WbStore,
+              _inst->isStoreConditional() || _inst->isAtomic());
+    flags.set(Flag::IsAtomic, _inst->isAtomic());
+    install();
+}
+
+void
+LSQ::LSQRequest::install()
+{
+    if (isLoad()) {
+        _port.loadQueue[_inst->lqIdx].setRequest(this);
+    } else {
+        // Store, StoreConditional, and Atomic requests are pushed
+        // to this storeQueue
+        _port.storeQueue[_inst->sqIdx].setRequest(this);
+    }
+}
+
+bool LSQ::LSQRequest::squashed() const { return _inst->isSquashed(); }
+
+void
+LSQ::LSQRequest::addRequest(Addr addr, unsigned size,
+           const std::vector<bool>& byte_enable)
+{
+    if (isAnyActiveElement(byte_enable.begin(), byte_enable.end())) {
+        auto request = std::make_shared<Request>(
+                addr, size, _flags, _inst->requestorId(),
+                _inst->instAddr(), _inst->contextId(),
+                std::move(_amo_op));
+        request->setByteEnable(byte_enable);
+        _requests.push_back(request);
+    }
+}
+
+LSQ::LSQRequest::~LSQRequest()
+{
+    assert(!isAnyOutstandingRequest());
+    _inst->savedReq = nullptr;
+    if (_senderState)
+        delete _senderState;
+
+    for (auto r: _packets)
+        delete r;
+};
+
+void
+LSQ::LSQRequest::sendFragmentToTranslation(int i)
+{
+    numInTranslationFragments++;
+    _port.getMMUPtr()->translateTiming(request(i), _inst->thread->getTC(),
+            this, isLoad() ? BaseTLB::Read : BaseTLB::Write);
+}
+
+bool
+LSQ::SingleDataRequest::recvTimingResp(PacketPtr pkt)
+{
+    assert(_numOutstandingPackets == 1);
+    auto state = dynamic_cast<LSQSenderState*>(pkt->senderState);
+    flags.set(Flag::Complete);
+    state->outstanding--;
+    assert(pkt == _packets.front());
+    _port.completeDataAccess(pkt);
+    return true;
+}
+
+bool
+LSQ::SplitDataRequest::recvTimingResp(PacketPtr pkt)
+{
+    auto state = dynamic_cast<LSQSenderState*>(pkt->senderState);
+    uint32_t pktIdx = 0;
+    while (pktIdx < _packets.size() && pkt != _packets[pktIdx])
+        pktIdx++;
+    assert(pktIdx < _packets.size());
+    numReceivedPackets++;
+    state->outstanding--;
+    if (numReceivedPackets == _packets.size()) {
+        flags.set(Flag::Complete);
+        /* Assemble packets. */
+        PacketPtr resp = isLoad()
+            ? Packet::createRead(mainReq)
+            : Packet::createWrite(mainReq);
+        if (isLoad())
+            resp->dataStatic(_inst->memData);
+        else
+            resp->dataStatic(_data);
+        resp->senderState = _senderState;
+        _port.completeDataAccess(resp);
+        delete resp;
+    }
+    return true;
+}
+
+void
+LSQ::SingleDataRequest::buildPackets()
+{
+    assert(_senderState);
+    /* Retries do not create new packets. */
+    if (_packets.size() == 0) {
+        _packets.push_back(
+                isLoad()
+                    ?  Packet::createRead(request())
+                    :  Packet::createWrite(request()));
+        _packets.back()->dataStatic(_inst->memData);
+        _packets.back()->senderState = _senderState;
+
+        // hardware transactional memory
+        // If request originates in a transaction (not necessarily a HtmCmd),
+        // then the packet should be marked as such.
+        if (_inst->inHtmTransactionalState()) {
+            _packets.back()->setHtmTransactional(
+                _inst->getHtmTransactionUid());
+
+            DPRINTF(HtmCpu,
+              "HTM %s pc=0x%lx - vaddr=0x%lx - paddr=0x%lx - htmUid=%u\n",
+              isLoad() ? "LD" : "ST",
+              _inst->instAddr(),
+              _packets.back()->req->hasVaddr() ?
+                  _packets.back()->req->getVaddr() : 0lu,
+              _packets.back()->getAddr(),
+              _inst->getHtmTransactionUid());
+        }
+    }
+    assert(_packets.size() == 1);
+}
+
+void
+LSQ::SplitDataRequest::buildPackets()
+{
+    /* Extra data?? */
+    Addr base_address = _addr;
+
+    if (_packets.size() == 0) {
+        /* New stuff */
+        if (isLoad()) {
+            _mainPacket = Packet::createRead(mainReq);
+            _mainPacket->dataStatic(_inst->memData);
+
+            // hardware transactional memory
+            // If request originates in a transaction,
+            // packet should be marked as such
+            if (_inst->inHtmTransactionalState()) {
+                _mainPacket->setHtmTransactional(
+                    _inst->getHtmTransactionUid());
+                DPRINTF(HtmCpu,
+                  "HTM LD.0 pc=0x%lx-vaddr=0x%lx-paddr=0x%lx-htmUid=%u\n",
+                  _inst->instAddr(),
+                  _mainPacket->req->hasVaddr() ?
+                      _mainPacket->req->getVaddr() : 0lu,
+                  _mainPacket->getAddr(),
+                  _inst->getHtmTransactionUid());
+            }
+        }
+        for (int i = 0; i < _requests.size() && _fault[i] == NoFault; i++) {
+            RequestPtr r = _requests[i];
+            PacketPtr pkt = isLoad() ? Packet::createRead(r)
+                                     : Packet::createWrite(r);
+            ptrdiff_t offset = r->getVaddr() - base_address;
+            if (isLoad()) {
+                pkt->dataStatic(_inst->memData + offset);
+            } else {
+                uint8_t* req_data = new uint8_t[r->getSize()];
+                std::memcpy(req_data,
+                        _inst->memData + offset,
+                        r->getSize());
+                pkt->dataDynamic(req_data);
+            }
+            pkt->senderState = _senderState;
+            _packets.push_back(pkt);
+
+            // hardware transactional memory
+            // If request originates in a transaction,
+            // packet should be marked as such
+            if (_inst->inHtmTransactionalState()) {
+                _packets.back()->setHtmTransactional(
+                    _inst->getHtmTransactionUid());
+                DPRINTF(HtmCpu,
+                  "HTM %s.%d pc=0x%lx-vaddr=0x%lx-paddr=0x%lx-htmUid=%u\n",
+                  isLoad() ? "LD" : "ST",
+                  i+1,
+                  _inst->instAddr(),
+                  _packets.back()->req->hasVaddr() ?
+                      _packets.back()->req->getVaddr() : 0lu,
+                  _packets.back()->getAddr(),
+                  _inst->getHtmTransactionUid());
+            }
+        }
+    }
+    assert(_packets.size() > 0);
+}
+
+void
+LSQ::SingleDataRequest::sendPacketToCache()
+{
+    assert(_numOutstandingPackets == 0);
+    if (lsqUnit()->trySendPacket(isLoad(), _packets.at(0)))
+        _numOutstandingPackets = 1;
+}
+
+void
+LSQ::SplitDataRequest::sendPacketToCache()
+{
+    /* Try to send the packets. */
+    while (numReceivedPackets + _numOutstandingPackets < _packets.size() &&
+            lsqUnit()->trySendPacket(isLoad(),
+                _packets.at(numReceivedPackets + _numOutstandingPackets))) {
+        _numOutstandingPackets++;
+    }
+}
+
+Cycles
+LSQ::SingleDataRequest::handleLocalAccess(ThreadContext *thread, PacketPtr pkt)
+{
+    return pkt->req->localAccessor(thread, pkt);
+}
+
+Cycles
+LSQ::SplitDataRequest::handleLocalAccess(
+        ThreadContext *thread, PacketPtr mainPkt)
+{
+    Cycles delay(0);
+    unsigned offset = 0;
+
+    for (auto r: _requests) {
+        PacketPtr pkt =
+            new Packet(r, isLoad() ? MemCmd::ReadReq : MemCmd::WriteReq);
+        pkt->dataStatic(mainPkt->getPtr<uint8_t>() + offset);
+        Cycles d = r->localAccessor(thread, pkt);
+        if (d > delay)
+            delay = d;
+        offset += r->getSize();
+        delete pkt;
+    }
+    return delay;
+}
+
+bool
+LSQ::SingleDataRequest::isCacheBlockHit(Addr blockAddr, Addr blockMask)
+{
+    return ( (LSQRequest::_requests[0]->getPaddr() & blockMask) == blockAddr);
+}
+
+/**
+ * Caches may probe into the load-store queue to enforce memory ordering
+ * guarantees. This method supports probes by providing a mechanism to compare
+ * snoop messages with requests tracked by the load-store queue.
+ *
+ * Consistency models must enforce ordering constraints. TSO, for instance,
+ * must prevent memory reorderings except stores which are reordered after
+ * loads. The reordering restrictions negatively impact performance by
+ * cutting down on memory level parallelism. However, the core can regain
+ * performance by generating speculative loads. Speculative loads may issue
+ * without affecting correctness if precautions are taken to handle invalid
+ * memory orders. The load queue must squash under memory model violations.
+ * Memory model violations may occur when block ownership is granted to
+ * another core or the block cannot be accurately monitored by the load queue.
+ */
+bool
+LSQ::SplitDataRequest::isCacheBlockHit(Addr blockAddr, Addr blockMask)
+{
+    bool is_hit = false;
+    for (auto &r: _requests) {
+       /**
+        * The load-store queue handles partial faults which complicates this
+        * method. Physical addresses must be compared between requests and
+        * snoops. Some requests will not have a valid physical address, since
+        * partial faults may have outstanding translations. Therefore, the
+        * existence of a valid request address must be checked before
+        * comparing block hits. We assume no pipeline squash is needed if a
+        * valid request address does not exist.
+        */
+        if (r->hasPaddr() && (r->getPaddr() & blockMask) == blockAddr) {
+            is_hit = true;
+            break;
+        }
+    }
+    return is_hit;
+}
+
+bool
+LSQ::DcachePort::recvTimingResp(PacketPtr pkt)
+{
+    return lsq->recvTimingResp(pkt);
+}
+
+void
+LSQ::DcachePort::recvTimingSnoopReq(PacketPtr pkt)
+{
+    for (ThreadID tid = 0; tid < cpu->numThreads; tid++) {
+        if (cpu->getCpuAddrMonitor(tid)->doMonitor(pkt)) {
+            cpu->wakeup(tid);
+        }
+    }
+    lsq->recvTimingSnoopReq(pkt);
+}
+
+void
+LSQ::DcachePort::recvReqRetry()
+{
+    lsq->recvReqRetry();
+}
+
+LSQ::HtmCmdRequest::HtmCmdRequest(LSQUnit* port, const O3DynInstPtr& inst,
+        const Request::Flags& flags_) :
+    SingleDataRequest(port, inst, true, 0x0lu, 8, flags_,
+        nullptr, nullptr, nullptr)
+{
+    assert(_requests.size() == 0);
+
+    addRequest(_addr, _size, _byteEnable);
+
+    if (_requests.size() > 0) {
+        _requests.back()->setReqInstSeqNum(_inst->seqNum);
+        _requests.back()->taskId(_taskId);
+        _requests.back()->setPaddr(_addr);
+        _requests.back()->setInstCount(_inst->getCpuPtr()->totalInsts());
+
+        _inst->strictlyOrdered(_requests.back()->isStrictlyOrdered());
+        _inst->fault = NoFault;
+        _inst->physEffAddr = _requests.back()->getPaddr();
+        _inst->memReqFlags = _requests.back()->getFlags();
+        _inst->savedReq = this;
+
+        setState(State::Translation);
+    } else {
+        panic("unexpected behaviour");
+    }
+}
+
+void
+LSQ::HtmCmdRequest::initiateTranslation()
+{
+    // Transaction commands are implemented as loads to avoid significant
+    // changes to the cpu and memory interfaces
+    // The virtual and physical address uses a dummy value of 0x00
+    // Address translation does not really occur thus the code below
+
+    flags.set(Flag::TranslationStarted);
+    flags.set(Flag::TranslationFinished);
+
+    _inst->translationStarted(true);
+    _inst->translationCompleted(true);
+
+    setState(State::Request);
+}
+
+void
+LSQ::HtmCmdRequest::finish(const Fault &fault, const RequestPtr &req,
+        ThreadContext* tc, BaseTLB::Mode mode)
+{
+    panic("unexpected behaviour");
+}
+
+Fault
+LSQ::read(LSQRequest* req, int load_idx)
+{
+    ThreadID tid = cpu->contextToThread(req->request()->contextId());
+
+    return thread.at(tid).read(req, load_idx);
+}
+
+Fault
+LSQ::write(LSQRequest* req, uint8_t *data, int store_idx)
+{
+    ThreadID tid = cpu->contextToThread(req->request()->contextId());
+
+    return thread.at(tid).write(req, data, store_idx);
+}
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 7bc15f6ccf..b18ea88c74 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -70,7 +70,6 @@ class DefaultIEW;
 
 class LSQUnit;
 
-template <class Impl>
 class LSQ
 {
   public:
@@ -83,40 +82,36 @@ class LSQ
         LSQRequest* _request;
 
         /** Default constructor. */
-        LSQSenderState(LSQRequest* request, bool isLoad_)
-            : _request(request), mainPkt(nullptr), pendingPacket(nullptr),
-              outstanding(0), isLoad(isLoad_), needWB(isLoad_), isSplit(false),
-              pktToSend(false), deleted(false)
-          { }
-      public:
+        LSQSenderState(LSQRequest* request, bool is_load);
 
+      public:
         /** Instruction which initiated the access to memory. */
         O3DynInstPtr inst;
         /** The main packet from a split load, used during writeback. */
-        PacketPtr mainPkt;
+        PacketPtr mainPkt = nullptr;
         /** A second packet from a split store that needs sending. */
-        PacketPtr pendingPacket;
+        PacketPtr pendingPacket = nullptr;
         /** Number of outstanding packets to complete. */
-        uint8_t outstanding;
+        uint8_t outstanding = 0;
         /** Whether or not it is a load. */
-        bool isLoad;
+        bool isLoad = false;
         /** Whether or not the instruction will need to writeback. */
-        bool needWB;
+        bool needWB = false;
         /** Whether or not this access is split in two. */
-        bool isSplit;
+        bool isSplit = false;
         /** Whether or not there is a packet that needs sending. */
-        bool pktToSend;
+        bool pktToSend = false;
         /** Has the request been deleted?
          * LSQ entries can be squashed before the response comes back. in that
          * case the SenderState knows.
          */
-        bool deleted;
+        bool deleted = false;
         ContextID contextId();
 
         /** Completes a packet and returns whether the access is finished. */
-        inline bool isComplete() { return outstanding == 0; }
-        inline void deleteRequest() { deleted = true; }
-        inline bool alive() { return !deleted; }
+        bool isComplete() { return outstanding == 0; }
+        void deleteRequest() { deleted = true; }
+        bool alive() { return !deleted; }
         LSQRequest* request() { return _request; }
         virtual void complete() = 0;
         void writebackDone() { _request->writebackDone(); }
@@ -130,15 +125,12 @@ class LSQ
       protected:
 
         /** Pointer to LSQ. */
-        LSQ<Impl> *lsq;
-        FullO3CPU<Impl> *cpu;
+        LSQ *lsq;
+        FullO3CPU<O3CPUImpl> *cpu;
 
       public:
         /** Default constructor. */
-        DcachePort(LSQ<Impl> *_lsq, FullO3CPU<Impl>* _cpu)
-            : RequestPort(_cpu->name() + ".dcache_port", _cpu), lsq(_lsq),
-              cpu(_cpu)
-        { }
+        DcachePort(LSQ *_lsq, FullO3CPU<O3CPUImpl> *_cpu);
 
       protected:
 
@@ -148,7 +140,8 @@ class LSQ
         virtual bool recvTimingResp(PacketPtr pkt);
         virtual void recvTimingSnoopReq(PacketPtr pkt);
 
-        virtual void recvFunctionalSnoop(PacketPtr pkt)
+        virtual void
+        recvFunctionalSnoop(PacketPtr pkt)
         {
             // @todo: Is there a need for potential invalidation here?
         }
@@ -640,8 +633,8 @@ class LSQ
       protected:
         /* Given that we are inside templates, children need explicit
          * declaration of the names in the parent class. */
-        using Flag = typename LSQRequest::Flag;
-        using State = typename LSQRequest::State;
+        using Flag = LSQRequest::Flag;
+        using State = LSQRequest::State;
         using LSQRequest::_addr;
         using LSQRequest::_fault;
         using LSQRequest::_flags;
@@ -674,7 +667,7 @@ class LSQ
             LSQRequest(port, inst, isLoad, addr, size, flags_, data, res,
                        std::move(amo_op)) {}
 
-        inline virtual ~SingleDataRequest() {}
+        virtual ~SingleDataRequest() {}
         virtual void initiateTranslation();
         virtual void finish(const Fault &fault, const RequestPtr &req,
                 ThreadContext* tc, BaseTLB::Mode mode);
@@ -691,27 +684,27 @@ class LSQ
     // of encapsulating hardware transactional memory command requests
     class HtmCmdRequest : public SingleDataRequest
     {
-    protected:
-      /* Given that we are inside templates, children need explicit
-       * declaration of the names in the parent class. */
-      using Flag = typename LSQRequest::Flag;
-      using State = typename LSQRequest::State;
-      using LSQRequest::_addr;
-      using LSQRequest::_size;
-      using LSQRequest::_byteEnable;
-      using LSQRequest::_requests;
-      using LSQRequest::_inst;
-      using LSQRequest::_taskId;
-      using LSQRequest::flags;
-      using LSQRequest::setState;
-    public:
-      HtmCmdRequest(LSQUnit* port, const O3DynInstPtr& inst,
-              const Request::Flags& flags_);
-      inline virtual ~HtmCmdRequest() {}
-      virtual void initiateTranslation();
-      virtual void finish(const Fault &fault, const RequestPtr &req,
-              ThreadContext* tc, BaseTLB::Mode mode);
-      virtual std::string name() const { return "HtmCmdRequest"; }
+      protected:
+        /* Given that we are inside templates, children need explicit
+         * declaration of the names in the parent class. */
+        using Flag = LSQRequest::Flag;
+        using State = LSQRequest::State;
+        using LSQRequest::_addr;
+        using LSQRequest::_size;
+        using LSQRequest::_byteEnable;
+        using LSQRequest::_requests;
+        using LSQRequest::_inst;
+        using LSQRequest::_taskId;
+        using LSQRequest::flags;
+        using LSQRequest::setState;
+      public:
+        HtmCmdRequest(LSQUnit* port, const O3DynInstPtr& inst,
+                const Request::Flags& flags_);
+        virtual ~HtmCmdRequest() {}
+        virtual void initiateTranslation();
+        virtual void finish(const Fault &fault, const RequestPtr &req,
+                ThreadContext* tc, BaseTLB::Mode mode);
+        virtual std::string name() const { return "HtmCmdRequest"; }
     };
 
     class SplitDataRequest : public LSQRequest
@@ -719,8 +712,8 @@ class LSQ
       protected:
         /* Given that we are inside templates, children need explicit
          * declaration of the names in the parent class. */
-        using Flag = typename LSQRequest::Flag;
-        using State = typename LSQRequest::State;
+        using Flag = LSQRequest::Flag;
+        using State = LSQRequest::State;
         using LSQRequest::_addr;
         using LSQRequest::_data;
         using LSQRequest::_fault;
@@ -791,9 +784,8 @@ class LSQ
     };
 
     /** Constructs an LSQ with the given parameters. */
-    LSQ(FullO3CPU<Impl> *cpu_ptr, DefaultIEW<Impl> *iew_ptr,
+    LSQ(FullO3CPU<O3CPUImpl> *cpu_ptr, DefaultIEW<O3CPUImpl> *iew_ptr,
             const DerivO3CPUParams &params);
-    ~LSQ() { }
 
     /** Returns the name of the LSQ. */
     std::string name() const;
@@ -1002,10 +994,10 @@ class LSQ
                       const std::vector<bool>& byte_enable);
 
     /** The CPU pointer. */
-    FullO3CPU<Impl> *cpu;
+    FullO3CPU<O3CPUImpl> *cpu;
 
     /** The IEW stage pointer. */
-    DefaultIEW<Impl> *iewStage;
+    DefaultIEW<O3CPUImpl> *iewStage;
 
     /** Is D-cache blocked? */
     bool cacheBlocked() const;
diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh
deleted file mode 100644
index 8ef440ecfc..0000000000
--- a/src/cpu/o3/lsq_impl.hh
+++ /dev/null
@@ -1,1528 +0,0 @@
-/*
- * Copyright (c) 2011-2012, 2014, 2017-2019 ARM Limited
- * Copyright (c) 2013 Advanced Micro Devices, Inc.
- * All rights reserved
- *
- * The license below extends only to copyright in the software and shall
- * not be construed as granting a license to any other intellectual
- * property including but not limited to intellectual property relating
- * to a hardware implementation of the functionality of the software
- * licensed hereunder.  You may use the software subject to the license
- * terms below provided that you ensure that this notice is replicated
- * unmodified and in its entirety in all distributions of the software,
- * modified or unmodified, in source code or in binary form.
- *
- * Copyright (c) 2005-2006 The Regents of The University of Michigan
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met: redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer;
- * redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution;
- * neither the name of the copyright holders nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __CPU_O3_LSQ_IMPL_HH__
-#define __CPU_O3_LSQ_IMPL_HH__
-
-#include <algorithm>
-#include <list>
-#include <string>
-
-#include "base/compiler.hh"
-#include "base/logging.hh"
-#include "cpu/o3/cpu.hh"
-#include "cpu/o3/dyn_inst.hh"
-#include "cpu/o3/iew.hh"
-#include "cpu/o3/limits.hh"
-#include "cpu/o3/lsq.hh"
-#include "debug/Drain.hh"
-#include "debug/Fetch.hh"
-#include "debug/HtmCpu.hh"
-#include "debug/LSQ.hh"
-#include "debug/Writeback.hh"
-#include "params/DerivO3CPU.hh"
-
-template <class Impl>
-ContextID
-LSQ<Impl>::LSQSenderState::contextId()
-{
-    return inst->contextId();
-}
-
-template <class Impl>
-LSQ<Impl>::LSQ(FullO3CPU<Impl> *cpu_ptr, DefaultIEW<Impl> *iew_ptr,
-        const DerivO3CPUParams &params)
-    : cpu(cpu_ptr), iewStage(iew_ptr),
-      _cacheBlocked(false),
-      cacheStorePorts(params.cacheStorePorts), usedStorePorts(0),
-      cacheLoadPorts(params.cacheLoadPorts), usedLoadPorts(0),
-      lsqPolicy(params.smtLSQPolicy),
-      LQEntries(params.LQEntries),
-      SQEntries(params.SQEntries),
-      maxLQEntries(maxLSQAllocation(lsqPolicy, LQEntries, params.numThreads,
-                  params.smtLSQThreshold)),
-      maxSQEntries(maxLSQAllocation(lsqPolicy, SQEntries, params.numThreads,
-                  params.smtLSQThreshold)),
-      dcachePort(this, cpu_ptr),
-      numThreads(params.numThreads)
-{
-    assert(numThreads > 0 && numThreads <= O3MaxThreads);
-
-    //**********************************************
-    //************ Handle SMT Parameters ***********
-    //**********************************************
-
-    /* Run SMT olicy checks. */
-        if (lsqPolicy == SMTQueuePolicy::Dynamic) {
-        DPRINTF(LSQ, "LSQ sharing policy set to Dynamic\n");
-    } else if (lsqPolicy == SMTQueuePolicy::Partitioned) {
-        DPRINTF(Fetch, "LSQ sharing policy set to Partitioned: "
-                "%i entries per LQ | %i entries per SQ\n",
-                maxLQEntries,maxSQEntries);
-    } else if (lsqPolicy == SMTQueuePolicy::Threshold) {
-
-        assert(params.smtLSQThreshold > params.LQEntries);
-        assert(params.smtLSQThreshold > params.SQEntries);
-
-        DPRINTF(LSQ, "LSQ sharing policy set to Threshold: "
-                "%i entries per LQ | %i entries per SQ\n",
-                maxLQEntries,maxSQEntries);
-    } else {
-        panic("Invalid LSQ sharing policy. Options are: Dynamic, "
-                    "Partitioned, Threshold");
-    }
-
-    thread.reserve(numThreads);
-    for (ThreadID tid = 0; tid < numThreads; tid++) {
-        thread.emplace_back(maxLQEntries, maxSQEntries);
-        thread[tid].init(cpu, iew_ptr, params, this, tid);
-        thread[tid].setDcachePort(&dcachePort);
-    }
-}
-
-
-template<class Impl>
-std::string
-LSQ<Impl>::name() const
-{
-    return iewStage->name() + ".lsq";
-}
-
-template<class Impl>
-void
-LSQ<Impl>::setActiveThreads(std::list<ThreadID> *at_ptr)
-{
-    activeThreads = at_ptr;
-    assert(activeThreads != 0);
-}
-
-template <class Impl>
-void
-LSQ<Impl>::drainSanityCheck() const
-{
-    assert(isDrained());
-
-    for (ThreadID tid = 0; tid < numThreads; tid++)
-        thread[tid].drainSanityCheck();
-}
-
-template <class Impl>
-bool
-LSQ<Impl>::isDrained() const
-{
-    bool drained(true);
-
-    if (!lqEmpty()) {
-        DPRINTF(Drain, "Not drained, LQ not empty.\n");
-        drained = false;
-    }
-
-    if (!sqEmpty()) {
-        DPRINTF(Drain, "Not drained, SQ not empty.\n");
-        drained = false;
-    }
-
-    return drained;
-}
-
-template <class Impl>
-void
-LSQ<Impl>::takeOverFrom()
-{
-    usedStorePorts = 0;
-    _cacheBlocked = false;
-
-    for (ThreadID tid = 0; tid < numThreads; tid++) {
-        thread[tid].takeOverFrom();
-    }
-}
-
-template <class Impl>
-void
-LSQ<Impl>::tick()
-{
-    // Re-issue loads which got blocked on the per-cycle load ports limit.
-    if (usedLoadPorts == cacheLoadPorts && !_cacheBlocked)
-        iewStage->cacheUnblocked();
-
-    usedLoadPorts = 0;
-    usedStorePorts = 0;
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::cacheBlocked() const
-{
-    return _cacheBlocked;
-}
-
-template<class Impl>
-void
-LSQ<Impl>::cacheBlocked(bool v)
-{
-    _cacheBlocked = v;
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::cachePortAvailable(bool is_load) const
-{
-    bool ret;
-    if (is_load) {
-        ret  = usedLoadPorts < cacheLoadPorts;
-    } else {
-        ret  = usedStorePorts < cacheStorePorts;
-    }
-    return ret;
-}
-
-template<class Impl>
-void
-LSQ<Impl>::cachePortBusy(bool is_load)
-{
-    assert(cachePortAvailable(is_load));
-    if (is_load) {
-        usedLoadPorts++;
-    } else {
-        usedStorePorts++;
-    }
-}
-
-template<class Impl>
-void
-LSQ<Impl>::insertLoad(const O3DynInstPtr &load_inst)
-{
-    ThreadID tid = load_inst->threadNumber;
-
-    thread[tid].insertLoad(load_inst);
-}
-
-template<class Impl>
-void
-LSQ<Impl>::insertStore(const O3DynInstPtr &store_inst)
-{
-    ThreadID tid = store_inst->threadNumber;
-
-    thread[tid].insertStore(store_inst);
-}
-
-template<class Impl>
-Fault
-LSQ<Impl>::executeLoad(const O3DynInstPtr &inst)
-{
-    ThreadID tid = inst->threadNumber;
-
-    return thread[tid].executeLoad(inst);
-}
-
-template<class Impl>
-Fault
-LSQ<Impl>::executeStore(const O3DynInstPtr &inst)
-{
-    ThreadID tid = inst->threadNumber;
-
-    return thread[tid].executeStore(inst);
-}
-
-template<class Impl>
-void
-LSQ<Impl>::commitLoads(InstSeqNum &youngest_inst, ThreadID tid)
-{
-    thread.at(tid).commitLoads(youngest_inst);
-}
-
-template<class Impl>
-void
-LSQ<Impl>::commitStores(InstSeqNum &youngest_inst, ThreadID tid)
-{
-    thread.at(tid).commitStores(youngest_inst);
-}
-
-template<class Impl>
-void
-LSQ<Impl>::writebackStores()
-{
-    std::list<ThreadID>::iterator threads = activeThreads->begin();
-    std::list<ThreadID>::iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        if (numStoresToWB(tid) > 0) {
-            DPRINTF(Writeback,"[tid:%i] Writing back stores. %i stores "
-                "available for Writeback.\n", tid, numStoresToWB(tid));
-        }
-
-        thread[tid].writebackStores();
-    }
-}
-
-template<class Impl>
-void
-LSQ<Impl>::squash(const InstSeqNum &squashed_num, ThreadID tid)
-{
-    thread.at(tid).squash(squashed_num);
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::violation()
-{
-    /* Answers: Does Anybody Have a Violation?*/
-    std::list<ThreadID>::iterator threads = activeThreads->begin();
-    std::list<ThreadID>::iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        if (thread[tid].violation())
-            return true;
-    }
-
-    return false;
-}
-
-template<class Impl>
-bool LSQ<Impl>::violation(ThreadID tid) { return thread.at(tid).violation(); }
-
-template<class Impl>
-O3DynInstPtr
-LSQ<Impl>::getMemDepViolator(ThreadID tid)
-{
-    return thread.at(tid).getMemDepViolator();
-}
-
-template<class Impl>
-int
-LSQ<Impl>::getLoadHead(ThreadID tid)
-{
-    return thread.at(tid).getLoadHead();
-}
-
-template<class Impl>
-InstSeqNum
-LSQ<Impl>::getLoadHeadSeqNum(ThreadID tid)
-{
-    return thread.at(tid).getLoadHeadSeqNum();
-}
-
-template<class Impl>
-int
-LSQ<Impl>::getStoreHead(ThreadID tid)
-{
-    return thread.at(tid).getStoreHead();
-}
-
-template<class Impl>
-InstSeqNum
-LSQ<Impl>::getStoreHeadSeqNum(ThreadID tid)
-{
-    return thread.at(tid).getStoreHeadSeqNum();
-}
-
-template<class Impl>
-int LSQ<Impl>::getCount(ThreadID tid) { return thread.at(tid).getCount(); }
-
-template<class Impl>
-int LSQ<Impl>::numLoads(ThreadID tid) { return thread.at(tid).numLoads(); }
-
-template<class Impl>
-int LSQ<Impl>::numStores(ThreadID tid) { return thread.at(tid).numStores(); }
-
-template<class Impl>
-int
-LSQ<Impl>::numHtmStarts(ThreadID tid) const
-{
-    if (tid == InvalidThreadID)
-        return 0;
-    else
-        return thread[tid].numHtmStarts();
-}
-template<class Impl>
-int
-LSQ<Impl>::numHtmStops(ThreadID tid) const
-{
-    if (tid == InvalidThreadID)
-        return 0;
-    else
-        return thread[tid].numHtmStops();
-}
-
-template<class Impl>
-void
-LSQ<Impl>::resetHtmStartsStops(ThreadID tid)
-{
-    if (tid != InvalidThreadID)
-        thread[tid].resetHtmStartsStops();
-}
-
-template<class Impl>
-uint64_t
-LSQ<Impl>::getLatestHtmUid(ThreadID tid) const
-{
-    if (tid == InvalidThreadID)
-        return 0;
-    else
-        return thread[tid].getLatestHtmUid();
-}
-
-template<class Impl>
-void
-LSQ<Impl>::setLastRetiredHtmUid(ThreadID tid, uint64_t htmUid)
-{
-    if (tid != InvalidThreadID)
-        thread[tid].setLastRetiredHtmUid(htmUid);
-}
-
-template <class Impl>
-void
-LSQ<Impl>::recvReqRetry()
-{
-    iewStage->cacheUnblocked();
-    cacheBlocked(false);
-
-    for (ThreadID tid : *activeThreads) {
-        thread[tid].recvRetry();
-    }
-}
-
-template <class Impl>
-void
-LSQ<Impl>::completeDataAccess(PacketPtr pkt)
-{
-    auto senderState = dynamic_cast<LSQSenderState*>(pkt->senderState);
-    thread[cpu->contextToThread(senderState->contextId())]
-        .completeDataAccess(pkt);
-}
-
-template <class Impl>
-bool
-LSQ<Impl>::recvTimingResp(PacketPtr pkt)
-{
-    if (pkt->isError())
-        DPRINTF(LSQ, "Got error packet back for address: %#X\n",
-                pkt->getAddr());
-
-    auto senderState = dynamic_cast<LSQSenderState*>(pkt->senderState);
-    panic_if(!senderState, "Got packet back with unknown sender state\n");
-
-    thread[cpu->contextToThread(senderState->contextId())].recvTimingResp(pkt);
-
-    if (pkt->isInvalidate()) {
-        // This response also contains an invalidate; e.g. this can be the case
-        // if cmd is ReadRespWithInvalidate.
-        //
-        // The calling order between completeDataAccess and checkSnoop matters.
-        // By calling checkSnoop after completeDataAccess, we ensure that the
-        // fault set by checkSnoop is not lost. Calling writeback (more
-        // specifically inst->completeAcc) in completeDataAccess overwrites
-        // fault, and in case this instruction requires squashing (as
-        // determined by checkSnoop), the ReExec fault set by checkSnoop would
-        // be lost otherwise.
-
-        DPRINTF(LSQ, "received invalidation with response for addr:%#x\n",
-                pkt->getAddr());
-
-        for (ThreadID tid = 0; tid < numThreads; tid++) {
-            thread[tid].checkSnoop(pkt);
-        }
-    }
-    // Update the LSQRequest state (this may delete the request)
-    senderState->request()->packetReplied();
-
-    return true;
-}
-
-template <class Impl>
-void
-LSQ<Impl>::recvTimingSnoopReq(PacketPtr pkt)
-{
-    DPRINTF(LSQ, "received pkt for addr:%#x %s\n", pkt->getAddr(),
-            pkt->cmdString());
-
-    // must be a snoop
-    if (pkt->isInvalidate()) {
-        DPRINTF(LSQ, "received invalidation for addr:%#x\n",
-                pkt->getAddr());
-        for (ThreadID tid = 0; tid < numThreads; tid++) {
-            thread[tid].checkSnoop(pkt);
-        }
-    }
-}
-
-template<class Impl>
-int
-LSQ<Impl>::getCount()
-{
-    unsigned total = 0;
-
-    std::list<ThreadID>::iterator threads = activeThreads->begin();
-    std::list<ThreadID>::iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        total += getCount(tid);
-    }
-
-    return total;
-}
-
-template<class Impl>
-int
-LSQ<Impl>::numLoads()
-{
-    unsigned total = 0;
-
-    std::list<ThreadID>::iterator threads = activeThreads->begin();
-    std::list<ThreadID>::iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        total += numLoads(tid);
-    }
-
-    return total;
-}
-
-template<class Impl>
-int
-LSQ<Impl>::numStores()
-{
-    unsigned total = 0;
-
-    std::list<ThreadID>::iterator threads = activeThreads->begin();
-    std::list<ThreadID>::iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        total += thread[tid].numStores();
-    }
-
-    return total;
-}
-
-template<class Impl>
-unsigned
-LSQ<Impl>::numFreeLoadEntries()
-{
-    unsigned total = 0;
-
-    std::list<ThreadID>::iterator threads = activeThreads->begin();
-    std::list<ThreadID>::iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        total += thread[tid].numFreeLoadEntries();
-    }
-
-    return total;
-}
-
-template<class Impl>
-unsigned
-LSQ<Impl>::numFreeStoreEntries()
-{
-    unsigned total = 0;
-
-    std::list<ThreadID>::iterator threads = activeThreads->begin();
-    std::list<ThreadID>::iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        total += thread[tid].numFreeStoreEntries();
-    }
-
-    return total;
-}
-
-template<class Impl>
-unsigned
-LSQ<Impl>::numFreeLoadEntries(ThreadID tid)
-{
-        return thread[tid].numFreeLoadEntries();
-}
-
-template<class Impl>
-unsigned
-LSQ<Impl>::numFreeStoreEntries(ThreadID tid)
-{
-        return thread[tid].numFreeStoreEntries();
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::isFull()
-{
-    std::list<ThreadID>::iterator threads = activeThreads->begin();
-    std::list<ThreadID>::iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        if (!(thread[tid].lqFull() || thread[tid].sqFull()))
-            return false;
-    }
-
-    return true;
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::isFull(ThreadID tid)
-{
-    //@todo: Change to Calculate All Entries for
-    //Dynamic Policy
-    if (lsqPolicy == SMTQueuePolicy::Dynamic)
-        return isFull();
-    else
-        return thread[tid].lqFull() || thread[tid].sqFull();
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::isEmpty() const
-{
-    return lqEmpty() && sqEmpty();
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::lqEmpty() const
-{
-    std::list<ThreadID>::const_iterator threads = activeThreads->begin();
-    std::list<ThreadID>::const_iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        if (!thread[tid].lqEmpty())
-            return false;
-    }
-
-    return true;
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::sqEmpty() const
-{
-    std::list<ThreadID>::const_iterator threads = activeThreads->begin();
-    std::list<ThreadID>::const_iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        if (!thread[tid].sqEmpty())
-            return false;
-    }
-
-    return true;
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::lqFull()
-{
-    std::list<ThreadID>::iterator threads = activeThreads->begin();
-    std::list<ThreadID>::iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        if (!thread[tid].lqFull())
-            return false;
-    }
-
-    return true;
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::lqFull(ThreadID tid)
-{
-    //@todo: Change to Calculate All Entries for
-    //Dynamic Policy
-    if (lsqPolicy == SMTQueuePolicy::Dynamic)
-        return lqFull();
-    else
-        return thread[tid].lqFull();
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::sqFull()
-{
-    std::list<ThreadID>::iterator threads = activeThreads->begin();
-    std::list<ThreadID>::iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        if (!sqFull(tid))
-            return false;
-    }
-
-    return true;
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::sqFull(ThreadID tid)
-{
-     //@todo: Change to Calculate All Entries for
-    //Dynamic Policy
-    if (lsqPolicy == SMTQueuePolicy::Dynamic)
-        return sqFull();
-    else
-        return thread[tid].sqFull();
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::isStalled()
-{
-    std::list<ThreadID>::iterator threads = activeThreads->begin();
-    std::list<ThreadID>::iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        if (!thread[tid].isStalled())
-            return false;
-    }
-
-    return true;
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::isStalled(ThreadID tid)
-{
-    if (lsqPolicy == SMTQueuePolicy::Dynamic)
-        return isStalled();
-    else
-        return thread[tid].isStalled();
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::hasStoresToWB()
-{
-    std::list<ThreadID>::iterator threads = activeThreads->begin();
-    std::list<ThreadID>::iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        if (hasStoresToWB(tid))
-            return true;
-    }
-
-    return false;
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::hasStoresToWB(ThreadID tid)
-{
-    return thread.at(tid).hasStoresToWB();
-}
-
-template<class Impl>
-int
-LSQ<Impl>::numStoresToWB(ThreadID tid)
-{
-    return thread.at(tid).numStoresToWB();
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::willWB()
-{
-    std::list<ThreadID>::iterator threads = activeThreads->begin();
-    std::list<ThreadID>::iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        if (willWB(tid))
-            return true;
-    }
-
-    return false;
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::willWB(ThreadID tid)
-{
-    return thread.at(tid).willWB();
-}
-
-template<class Impl>
-void
-LSQ<Impl>::dumpInsts() const
-{
-    std::list<ThreadID>::const_iterator threads = activeThreads->begin();
-    std::list<ThreadID>::const_iterator end = activeThreads->end();
-
-    while (threads != end) {
-        ThreadID tid = *threads++;
-
-        thread[tid].dumpInsts();
-    }
-}
-
-template<class Impl>
-void
-LSQ<Impl>::dumpInsts(ThreadID tid) const
-{
-    thread.at(tid).dumpInsts();
-}
-
-template<class Impl>
-Fault
-LSQ<Impl>::pushRequest(const O3DynInstPtr& inst, bool isLoad, uint8_t *data,
-                       unsigned int size, Addr addr, Request::Flags flags,
-                       uint64_t *res, AtomicOpFunctorPtr amo_op,
-                       const std::vector<bool>& byte_enable)
-{
-    // This comming request can be either load, store or atomic.
-    // Atomic request has a corresponding pointer to its atomic memory
-    // operation
-    GEM5_VAR_USED bool isAtomic = !isLoad && amo_op;
-
-    ThreadID tid = cpu->contextToThread(inst->contextId());
-    auto cacheLineSize = cpu->cacheLineSize();
-    bool needs_burst = transferNeedsBurst(addr, size, cacheLineSize);
-    LSQRequest* req = nullptr;
-
-    // Atomic requests that access data across cache line boundary are
-    // currently not allowed since the cache does not guarantee corresponding
-    // atomic memory operations to be executed atomically across a cache line.
-    // For ISAs such as x86 that supports cross-cache-line atomic instructions,
-    // the cache needs to be modified to perform atomic update to both cache
-    // lines. For now, such cross-line update is not supported.
-    assert(!isAtomic || (isAtomic && !needs_burst));
-
-    const bool htm_cmd = isLoad && (flags & Request::HTM_CMD);
-
-    if (inst->translationStarted()) {
-        req = inst->savedReq;
-        assert(req);
-    } else {
-        if (htm_cmd) {
-            assert(addr == 0x0lu);
-            assert(size == 8);
-            req = new HtmCmdRequest(&thread[tid], inst, flags);
-        } else if (needs_burst) {
-            req = new SplitDataRequest(&thread[tid], inst, isLoad, addr,
-                    size, flags, data, res);
-        } else {
-            req = new SingleDataRequest(&thread[tid], inst, isLoad, addr,
-                    size, flags, data, res, std::move(amo_op));
-        }
-        assert(req);
-        req->_byteEnable = byte_enable;
-        inst->setRequest();
-        req->taskId(cpu->taskId());
-
-        // There might be fault from a previous execution attempt if this is
-        // a strictly ordered load
-        inst->getFault() = NoFault;
-
-        req->initiateTranslation();
-    }
-
-    /* This is the place were instructions get the effAddr. */
-    if (req->isTranslationComplete()) {
-        if (req->isMemAccessRequired()) {
-            inst->effAddr = req->getVaddr();
-            inst->effSize = size;
-            inst->effAddrValid(true);
-
-            if (cpu->checker) {
-                inst->reqToVerify = std::make_shared<Request>(*req->request());
-            }
-            Fault fault;
-            if (isLoad)
-                fault = cpu->read(req, inst->lqIdx);
-            else
-                fault = cpu->write(req, data, inst->sqIdx);
-            // inst->getFault() may have the first-fault of a
-            // multi-access split request at this point.
-            // Overwrite that only if we got another type of fault
-            // (e.g. re-exec).
-            if (fault != NoFault)
-                inst->getFault() = fault;
-        } else if (isLoad) {
-            inst->setMemAccPredicate(false);
-            // Commit will have to clean up whatever happened.  Set this
-            // instruction as executed.
-            inst->setExecuted();
-        }
-    }
-
-    if (inst->traceData)
-        inst->traceData->setMem(addr, size, flags);
-
-    return inst->getFault();
-}
-
-template<class Impl>
-void
-LSQ<Impl>::SingleDataRequest::finish(const Fault &fault, const RequestPtr &req,
-        ThreadContext* tc, BaseTLB::Mode mode)
-{
-    _fault.push_back(fault);
-    numInTranslationFragments = 0;
-    numTranslatedFragments = 1;
-    /* If the instruction has been squahsed, let the request know
-     * as it may have to self-destruct. */
-    if (_inst->isSquashed()) {
-        this->squashTranslation();
-    } else {
-        _inst->strictlyOrdered(req->isStrictlyOrdered());
-
-        flags.set(Flag::TranslationFinished);
-        if (fault == NoFault) {
-            _inst->physEffAddr = req->getPaddr();
-            _inst->memReqFlags = req->getFlags();
-            if (req->isCondSwap()) {
-                assert(_res);
-                req->setExtraData(*_res);
-            }
-            setState(State::Request);
-        } else {
-            setState(State::Fault);
-        }
-
-        LSQRequest::_inst->fault = fault;
-        LSQRequest::_inst->translationCompleted(true);
-    }
-}
-
-template<class Impl>
-void
-LSQ<Impl>::SplitDataRequest::finish(const Fault &fault, const RequestPtr &req,
-        ThreadContext* tc, BaseTLB::Mode mode)
-{
-    int i;
-    for (i = 0; i < _requests.size() && _requests[i] != req; i++);
-    assert(i < _requests.size());
-    _fault[i] = fault;
-
-    numInTranslationFragments--;
-    numTranslatedFragments++;
-
-    if (fault == NoFault)
-        mainReq->setFlags(req->getFlags());
-
-    if (numTranslatedFragments == _requests.size()) {
-        if (_inst->isSquashed()) {
-            this->squashTranslation();
-        } else {
-            _inst->strictlyOrdered(mainReq->isStrictlyOrdered());
-            flags.set(Flag::TranslationFinished);
-            _inst->translationCompleted(true);
-
-            for (i = 0; i < _fault.size() && _fault[i] == NoFault; i++);
-            if (i > 0) {
-                _inst->physEffAddr = request(0)->getPaddr();
-                _inst->memReqFlags = mainReq->getFlags();
-                if (mainReq->isCondSwap()) {
-                    assert (i == _fault.size());
-                    assert(_res);
-                    mainReq->setExtraData(*_res);
-                }
-                if (i == _fault.size()) {
-                    _inst->fault = NoFault;
-                    setState(State::Request);
-                } else {
-                  _inst->fault = _fault[i];
-                  setState(State::PartialFault);
-                }
-            } else {
-                _inst->fault = _fault[0];
-                setState(State::Fault);
-            }
-        }
-
-    }
-}
-
-template<class Impl>
-void
-LSQ<Impl>::SingleDataRequest::initiateTranslation()
-{
-    assert(_requests.size() == 0);
-
-    this->addRequest(_addr, _size, _byteEnable);
-
-    if (_requests.size() > 0) {
-        _requests.back()->setReqInstSeqNum(_inst->seqNum);
-        _requests.back()->taskId(_taskId);
-        _inst->translationStarted(true);
-        setState(State::Translation);
-        flags.set(Flag::TranslationStarted);
-
-        _inst->savedReq = this;
-        sendFragmentToTranslation(0);
-    } else {
-        _inst->setMemAccPredicate(false);
-    }
-}
-
-template<class Impl>
-PacketPtr
-LSQ<Impl>::SplitDataRequest::mainPacket()
-{
-    return _mainPacket;
-}
-
-template<class Impl>
-RequestPtr
-LSQ<Impl>::SplitDataRequest::mainRequest()
-{
-    return mainReq;
-}
-
-template<class Impl>
-void
-LSQ<Impl>::SplitDataRequest::initiateTranslation()
-{
-    auto cacheLineSize = _port.cacheLineSize();
-    Addr base_addr = _addr;
-    Addr next_addr = addrBlockAlign(_addr + cacheLineSize, cacheLineSize);
-    Addr final_addr = addrBlockAlign(_addr + _size, cacheLineSize);
-    uint32_t size_so_far = 0;
-
-    mainReq = std::make_shared<Request>(base_addr,
-                _size, _flags, _inst->requestorId(),
-                _inst->instAddr(), _inst->contextId());
-    mainReq->setByteEnable(_byteEnable);
-
-    // Paddr is not used in mainReq. However, we will accumulate the flags
-    // from the sub requests into mainReq by calling setFlags() in finish().
-    // setFlags() assumes that paddr is set so flip the paddr valid bit here to
-    // avoid a potential assert in setFlags() when we call it from  finish().
-    mainReq->setPaddr(0);
-
-    /* Get the pre-fix, possibly unaligned. */
-    auto it_start = _byteEnable.begin();
-    auto it_end = _byteEnable.begin() + (next_addr - base_addr);
-    this->addRequest(base_addr, next_addr - base_addr,
-                     std::vector<bool>(it_start, it_end));
-    size_so_far = next_addr - base_addr;
-
-    /* We are block aligned now, reading whole blocks. */
-    base_addr = next_addr;
-    while (base_addr != final_addr) {
-        auto it_start = _byteEnable.begin() + size_so_far;
-        auto it_end = _byteEnable.begin() + size_so_far + cacheLineSize;
-        this->addRequest(base_addr, cacheLineSize,
-                         std::vector<bool>(it_start, it_end));
-        size_so_far += cacheLineSize;
-        base_addr += cacheLineSize;
-    }
-
-    /* Deal with the tail. */
-    if (size_so_far < _size) {
-        auto it_start = _byteEnable.begin() + size_so_far;
-        auto it_end = _byteEnable.end();
-        this->addRequest(base_addr, _size - size_so_far,
-                         std::vector<bool>(it_start, it_end));
-    }
-
-    if (_requests.size() > 0) {
-        /* Setup the requests and send them to translation. */
-        for (auto& r: _requests) {
-            r->setReqInstSeqNum(_inst->seqNum);
-            r->taskId(_taskId);
-        }
-
-        _inst->translationStarted(true);
-        setState(State::Translation);
-        flags.set(Flag::TranslationStarted);
-        this->_inst->savedReq = this;
-        numInTranslationFragments = 0;
-        numTranslatedFragments = 0;
-        _fault.resize(_requests.size());
-
-        for (uint32_t i = 0; i < _requests.size(); i++) {
-            sendFragmentToTranslation(i);
-        }
-    } else {
-        _inst->setMemAccPredicate(false);
-    }
-}
-
-template<class Impl>
-LSQ<Impl>::LSQRequest::LSQRequest(
-        LSQUnit *port, const O3DynInstPtr& inst, bool isLoad) :
-    _state(State::NotIssued), _senderState(nullptr),
-    _port(*port), _inst(inst), _data(nullptr),
-    _res(nullptr), _addr(0), _size(0), _flags(0),
-    _numOutstandingPackets(0), _amo_op(nullptr)
-{
-    flags.set(Flag::IsLoad, isLoad);
-    flags.set(Flag::WbStore,
-              _inst->isStoreConditional() || _inst->isAtomic());
-    flags.set(Flag::IsAtomic, _inst->isAtomic());
-    install();
-}
-
-template<class Impl>
-LSQ<Impl>::LSQRequest::LSQRequest(
-        LSQUnit *port, const O3DynInstPtr& inst, bool isLoad,
-        const Addr& addr, const uint32_t& size, const Request::Flags& flags_,
-           PacketDataPtr data, uint64_t* res, AtomicOpFunctorPtr amo_op)
-    : _state(State::NotIssued), _senderState(nullptr),
-    numTranslatedFragments(0),
-    numInTranslationFragments(0),
-    _port(*port), _inst(inst), _data(data),
-    _res(res), _addr(addr), _size(size),
-    _flags(flags_),
-    _numOutstandingPackets(0),
-    _amo_op(std::move(amo_op))
-{
-    flags.set(Flag::IsLoad, isLoad);
-    flags.set(Flag::WbStore,
-              _inst->isStoreConditional() || _inst->isAtomic());
-    flags.set(Flag::IsAtomic, _inst->isAtomic());
-    install();
-}
-
-template<class Impl>
-void
-LSQ<Impl>::LSQRequest::install()
-{
-    if (isLoad()) {
-        _port.loadQueue[_inst->lqIdx].setRequest(this);
-    } else {
-        // Store, StoreConditional, and Atomic requests are pushed
-        // to this storeQueue
-        _port.storeQueue[_inst->sqIdx].setRequest(this);
-    }
-}
-
-template<class Impl>
-bool LSQ<Impl>::LSQRequest::squashed() const { return _inst->isSquashed(); }
-
-template<class Impl>
-void
-LSQ<Impl>::LSQRequest::addRequest(Addr addr, unsigned size,
-           const std::vector<bool>& byte_enable)
-{
-    if (isAnyActiveElement(byte_enable.begin(), byte_enable.end())) {
-        auto request = std::make_shared<Request>(
-                addr, size, _flags, _inst->requestorId(),
-                _inst->instAddr(), _inst->contextId(),
-                std::move(_amo_op));
-        request->setByteEnable(byte_enable);
-        _requests.push_back(request);
-    }
-}
-
-template<class Impl>
-LSQ<Impl>::LSQRequest::~LSQRequest()
-{
-    assert(!isAnyOutstandingRequest());
-    _inst->savedReq = nullptr;
-    if (_senderState)
-        delete _senderState;
-
-    for (auto r: _packets)
-        delete r;
-};
-
-template<class Impl>
-void
-LSQ<Impl>::LSQRequest::sendFragmentToTranslation(int i)
-{
-    numInTranslationFragments++;
-    _port.getMMUPtr()->translateTiming(
-            this->request(i),
-            this->_inst->thread->getTC(), this,
-            this->isLoad() ? BaseTLB::Read : BaseTLB::Write);
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::SingleDataRequest::recvTimingResp(PacketPtr pkt)
-{
-    assert(_numOutstandingPackets == 1);
-    auto state = dynamic_cast<LSQSenderState*>(pkt->senderState);
-    flags.set(Flag::Complete);
-    state->outstanding--;
-    assert(pkt == _packets.front());
-    _port.completeDataAccess(pkt);
-    return true;
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::SplitDataRequest::recvTimingResp(PacketPtr pkt)
-{
-    auto state = dynamic_cast<LSQSenderState*>(pkt->senderState);
-    uint32_t pktIdx = 0;
-    while (pktIdx < _packets.size() && pkt != _packets[pktIdx])
-        pktIdx++;
-    assert(pktIdx < _packets.size());
-    numReceivedPackets++;
-    state->outstanding--;
-    if (numReceivedPackets == _packets.size()) {
-        flags.set(Flag::Complete);
-        /* Assemble packets. */
-        PacketPtr resp = isLoad()
-            ? Packet::createRead(mainReq)
-            : Packet::createWrite(mainReq);
-        if (isLoad())
-            resp->dataStatic(_inst->memData);
-        else
-            resp->dataStatic(_data);
-        resp->senderState = _senderState;
-        _port.completeDataAccess(resp);
-        delete resp;
-    }
-    return true;
-}
-
-template<class Impl>
-void
-LSQ<Impl>::SingleDataRequest::buildPackets()
-{
-    assert(_senderState);
-    /* Retries do not create new packets. */
-    if (_packets.size() == 0) {
-        _packets.push_back(
-                isLoad()
-                    ?  Packet::createRead(request())
-                    :  Packet::createWrite(request()));
-        _packets.back()->dataStatic(_inst->memData);
-        _packets.back()->senderState = _senderState;
-
-        // hardware transactional memory
-        // If request originates in a transaction (not necessarily a HtmCmd),
-        // then the packet should be marked as such.
-        if (_inst->inHtmTransactionalState()) {
-            _packets.back()->setHtmTransactional(
-                _inst->getHtmTransactionUid());
-
-            DPRINTF(HtmCpu,
-              "HTM %s pc=0x%lx - vaddr=0x%lx - paddr=0x%lx - htmUid=%u\n",
-              isLoad() ? "LD" : "ST",
-              _inst->instAddr(),
-              _packets.back()->req->hasVaddr() ?
-                  _packets.back()->req->getVaddr() : 0lu,
-              _packets.back()->getAddr(),
-              _inst->getHtmTransactionUid());
-        }
-    }
-    assert(_packets.size() == 1);
-}
-
-template<class Impl>
-void
-LSQ<Impl>::SplitDataRequest::buildPackets()
-{
-    /* Extra data?? */
-    Addr base_address = _addr;
-
-    if (_packets.size() == 0) {
-        /* New stuff */
-        if (isLoad()) {
-            _mainPacket = Packet::createRead(mainReq);
-            _mainPacket->dataStatic(_inst->memData);
-
-            // hardware transactional memory
-            // If request originates in a transaction,
-            // packet should be marked as such
-            if (_inst->inHtmTransactionalState()) {
-                _mainPacket->setHtmTransactional(
-                    _inst->getHtmTransactionUid());
-                DPRINTF(HtmCpu,
-                  "HTM LD.0 pc=0x%lx-vaddr=0x%lx-paddr=0x%lx-htmUid=%u\n",
-                  _inst->instAddr(),
-                  _mainPacket->req->hasVaddr() ?
-                      _mainPacket->req->getVaddr() : 0lu,
-                  _mainPacket->getAddr(),
-                  _inst->getHtmTransactionUid());
-            }
-        }
-        for (int i = 0; i < _requests.size() && _fault[i] == NoFault; i++) {
-            RequestPtr r = _requests[i];
-            PacketPtr pkt = isLoad() ? Packet::createRead(r)
-                                     : Packet::createWrite(r);
-            ptrdiff_t offset = r->getVaddr() - base_address;
-            if (isLoad()) {
-                pkt->dataStatic(_inst->memData + offset);
-            } else {
-                uint8_t* req_data = new uint8_t[r->getSize()];
-                std::memcpy(req_data,
-                        _inst->memData + offset,
-                        r->getSize());
-                pkt->dataDynamic(req_data);
-            }
-            pkt->senderState = _senderState;
-            _packets.push_back(pkt);
-
-            // hardware transactional memory
-            // If request originates in a transaction,
-            // packet should be marked as such
-            if (_inst->inHtmTransactionalState()) {
-                _packets.back()->setHtmTransactional(
-                    _inst->getHtmTransactionUid());
-                DPRINTF(HtmCpu,
-                  "HTM %s.%d pc=0x%lx-vaddr=0x%lx-paddr=0x%lx-htmUid=%u\n",
-                  isLoad() ? "LD" : "ST",
-                  i+1,
-                  _inst->instAddr(),
-                  _packets.back()->req->hasVaddr() ?
-                      _packets.back()->req->getVaddr() : 0lu,
-                  _packets.back()->getAddr(),
-                  _inst->getHtmTransactionUid());
-            }
-        }
-    }
-    assert(_packets.size() > 0);
-}
-
-template<class Impl>
-void
-LSQ<Impl>::SingleDataRequest::sendPacketToCache()
-{
-    assert(_numOutstandingPackets == 0);
-    if (lsqUnit()->trySendPacket(isLoad(), _packets.at(0)))
-        _numOutstandingPackets = 1;
-}
-
-template<class Impl>
-void
-LSQ<Impl>::SplitDataRequest::sendPacketToCache()
-{
-    /* Try to send the packets. */
-    while (numReceivedPackets + _numOutstandingPackets < _packets.size() &&
-            lsqUnit()->trySendPacket(isLoad(),
-                _packets.at(numReceivedPackets + _numOutstandingPackets))) {
-        _numOutstandingPackets++;
-    }
-}
-
-template<class Impl>
-Cycles
-LSQ<Impl>::SingleDataRequest::handleLocalAccess(
-        ThreadContext *thread, PacketPtr pkt)
-{
-    return pkt->req->localAccessor(thread, pkt);
-}
-
-template<class Impl>
-Cycles
-LSQ<Impl>::SplitDataRequest::handleLocalAccess(
-        ThreadContext *thread, PacketPtr mainPkt)
-{
-    Cycles delay(0);
-    unsigned offset = 0;
-
-    for (auto r: _requests) {
-        PacketPtr pkt =
-            new Packet(r, isLoad() ? MemCmd::ReadReq : MemCmd::WriteReq);
-        pkt->dataStatic(mainPkt->getPtr<uint8_t>() + offset);
-        Cycles d = r->localAccessor(thread, pkt);
-        if (d > delay)
-            delay = d;
-        offset += r->getSize();
-        delete pkt;
-    }
-    return delay;
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::SingleDataRequest::isCacheBlockHit(Addr blockAddr, Addr blockMask)
-{
-    return ( (LSQRequest::_requests[0]->getPaddr() & blockMask) == blockAddr);
-}
-
-/**
- * Caches may probe into the load-store queue to enforce memory ordering
- * guarantees. This method supports probes by providing a mechanism to compare
- * snoop messages with requests tracked by the load-store queue.
- *
- * Consistency models must enforce ordering constraints. TSO, for instance,
- * must prevent memory reorderings except stores which are reordered after
- * loads. The reordering restrictions negatively impact performance by
- * cutting down on memory level parallelism. However, the core can regain
- * performance by generating speculative loads. Speculative loads may issue
- * without affecting correctness if precautions are taken to handle invalid
- * memory orders. The load queue must squash under memory model violations.
- * Memory model violations may occur when block ownership is granted to
- * another core or the block cannot be accurately monitored by the load queue.
- */
-template<class Impl>
-bool
-LSQ<Impl>::SplitDataRequest::isCacheBlockHit(Addr blockAddr, Addr blockMask)
-{
-    bool is_hit = false;
-    for (auto &r: _requests) {
-       /**
-        * The load-store queue handles partial faults which complicates this
-        * method. Physical addresses must be compared between requests and
-        * snoops. Some requests will not have a valid physical address, since
-        * partial faults may have outstanding translations. Therefore, the
-        * existence of a valid request address must be checked before
-        * comparing block hits. We assume no pipeline squash is needed if a
-        * valid request address does not exist.
-        */
-        if (r->hasPaddr() && (r->getPaddr() & blockMask) == blockAddr) {
-            is_hit = true;
-            break;
-        }
-    }
-    return is_hit;
-}
-
-template <class Impl>
-bool
-LSQ<Impl>::DcachePort::recvTimingResp(PacketPtr pkt)
-{
-    return lsq->recvTimingResp(pkt);
-}
-
-template <class Impl>
-void
-LSQ<Impl>::DcachePort::recvTimingSnoopReq(PacketPtr pkt)
-{
-    for (ThreadID tid = 0; tid < cpu->numThreads; tid++) {
-        if (cpu->getCpuAddrMonitor(tid)->doMonitor(pkt)) {
-            cpu->wakeup(tid);
-        }
-    }
-    lsq->recvTimingSnoopReq(pkt);
-}
-
-template <class Impl>
-void
-LSQ<Impl>::DcachePort::recvReqRetry()
-{
-    lsq->recvReqRetry();
-}
-
-template<class Impl>
-LSQ<Impl>::HtmCmdRequest::HtmCmdRequest(LSQUnit* port,
-                  const O3DynInstPtr& inst,
-                  const Request::Flags& flags_) :
-    SingleDataRequest(port, inst, true, 0x0lu, 8, flags_,
-        nullptr, nullptr, nullptr)
-{
-    assert(_requests.size() == 0);
-
-    this->addRequest(_addr, _size, _byteEnable);
-
-    if (_requests.size() > 0) {
-        _requests.back()->setReqInstSeqNum(_inst->seqNum);
-        _requests.back()->taskId(_taskId);
-        _requests.back()->setPaddr(_addr);
-        _requests.back()->setInstCount(_inst->getCpuPtr()->totalInsts());
-
-        _inst->strictlyOrdered(_requests.back()->isStrictlyOrdered());
-        _inst->fault = NoFault;
-        _inst->physEffAddr = _requests.back()->getPaddr();
-        _inst->memReqFlags = _requests.back()->getFlags();
-        _inst->savedReq = this;
-
-        setState(State::Translation);
-    } else {
-        panic("unexpected behaviour");
-    }
-}
-
-template<class Impl>
-void
-LSQ<Impl>::HtmCmdRequest::initiateTranslation()
-{
-    // Transaction commands are implemented as loads to avoid significant
-    // changes to the cpu and memory interfaces
-    // The virtual and physical address uses a dummy value of 0x00
-    // Address translation does not really occur thus the code below
-
-    flags.set(Flag::TranslationStarted);
-    flags.set(Flag::TranslationFinished);
-
-    _inst->translationStarted(true);
-    _inst->translationCompleted(true);
-
-    setState(State::Request);
-}
-
-template<class Impl>
-void
-LSQ<Impl>::HtmCmdRequest::finish(const Fault &fault, const RequestPtr &req,
-        ThreadContext* tc, BaseTLB::Mode mode)
-{
-    panic("unexpected behaviour");
-}
-
-template <class Impl>
-Fault
-LSQ<Impl>::read(LSQRequest* req, int load_idx)
-{
-    ThreadID tid = cpu->contextToThread(req->request()->contextId());
-
-    return thread.at(tid).read(req, load_idx);
-}
-
-template <class Impl>
-Fault
-LSQ<Impl>::write(LSQRequest* req, uint8_t *data, int store_idx)
-{
-    ThreadID tid = cpu->contextToThread(req->request()->contextId());
-
-    return thread.at(tid).write(req, data, store_idx);
-}
-
-#endif//__CPU_O3_LSQ_IMPL_HH__
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index 9e3205ec60..142d3b9aaf 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -205,7 +205,7 @@ LSQUnit::LSQUnit(uint32_t lqEntries, uint32_t sqEntries)
 
 void
 LSQUnit::init(FullO3CPU<O3CPUImpl> *cpu_ptr, DefaultIEW<O3CPUImpl> *iew_ptr,
-        const DerivO3CPUParams &params, LSQ<O3CPUImpl> *lsq_ptr, unsigned id)
+        const DerivO3CPUParams &params, LSQ *lsq_ptr, unsigned id)
 {
     lsqID = id;
 
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index ffd3955652..9594344358 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -88,8 +88,8 @@ class LSQUnit
   public:
     static constexpr auto MaxDataBytes = MaxVecRegLenInBytes;
 
-    using LSQSenderState = typename LSQ<O3CPUImpl>::LSQSenderState;
-    using LSQRequest = typename LSQ<O3CPUImpl>::LSQRequest;
+    using LSQSenderState = LSQ::LSQSenderState;
+    using LSQRequest = LSQ::LSQRequest;
   private:
     class LSQEntry
     {
@@ -226,8 +226,7 @@ class LSQUnit
 
     /** Initializes the LSQ unit with the specified number of entries. */
     void init(FullO3CPU<O3CPUImpl> *cpu_ptr, DefaultIEW<O3CPUImpl> *iew_ptr,
-            const DerivO3CPUParams &params, LSQ<O3CPUImpl> *lsq_ptr,
-            unsigned id);
+            const DerivO3CPUParams &params, LSQ *lsq_ptr, unsigned id);
 
     /** Returns the name of the LSQ unit. */
     std::string name() const;
@@ -402,7 +401,7 @@ class LSQUnit
     DefaultIEW<O3CPUImpl> *iewStage;
 
     /** Pointer to the LSQ. */
-    LSQ<O3CPUImpl> *lsq;
+    LSQ *lsq;
 
     /** Pointer to the dcache port.  Used only for sending. */
     RequestPort *dcachePort;