cpu: Drop the DynInstPtr types from O3CPUImpl.

Aside from basic code editing, this also moves some methods from the
.hh files to the _impl.hh files. It also changes the Checker CPU
template to take the DynInstPtr type directly instead of through Impl
since that was the only type it used anyway. Finally it sets up a header
file which predeclares the O3DynInstPtr and O3DynInstConstPtr types so
they can be used without having to also include the BaseO3DynInst class
definition to break circular dependencies.

Change-Id: I5ca6af38ec13e6e820abcdb3748412e4f7fc1c78
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/42101
Reviewed-by: Nathanael Premillieu <nathanael.premillieu@huawei.com>
Maintainer: Gabe Black <gabe.black@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Gabe Black
2021-03-01 21:37:33 -08:00
parent 2caf2509a2
commit 2db8b308e0
34 changed files with 912 additions and 847 deletions

View File

@@ -51,6 +51,7 @@
#include "cpu/base.hh"
#include "cpu/exec_context.hh"
#include "cpu/inst_res.hh"
#include "cpu/o3/dyn_inst.hh"
#include "cpu/pc_event.hh"
#include "cpu/simple_thread.hh"
#include "cpu/static_inst.hh"
@@ -559,12 +560,9 @@ class CheckerCPU : public BaseCPU, public ExecContext
* template instantiations of the Checker must be placed at the bottom
* of checker/cpu.cc.
*/
template <class Impl>
template <class DynInstPtr>
class Checker : public CheckerCPU
{
private:
typedef typename Impl::DynInstPtr DynInstPtr;
public:
Checker(const Params &p)
: CheckerCPU(p), updateThisCycle(false), unverifiedInst(NULL)

View File

@@ -59,9 +59,9 @@
#include "sim/sim_object.hh"
#include "sim/stats.hh"
template <class Impl>
template <class DynInstPtr>
void
Checker<Impl>::advancePC(const Fault &fault)
Checker<DynInstPtr>::advancePC(const Fault &fault)
{
if (fault != NoFault) {
curMacroStaticInst = nullStaticInstPtr;
@@ -80,9 +80,9 @@ Checker<Impl>::advancePC(const Fault &fault)
}
//////////////////////////////////////////////////
template <class Impl>
template <class DynInstPtr>
void
Checker<Impl>::handlePendingInt()
Checker<DynInstPtr>::handlePendingInt()
{
DPRINTF(Checker, "IRQ detected at PC: %s with %d insts in buffer\n",
thread->pcState(), instList.size());
@@ -114,9 +114,9 @@ Checker<Impl>::handlePendingInt()
curMacroStaticInst = nullStaticInstPtr;
}
template <class Impl>
template <class DynInstPtr>
void
Checker<Impl>::verify(const DynInstPtr &completed_inst)
Checker<DynInstPtr>::verify(const DynInstPtr &completed_inst)
{
DynInstPtr inst;
@@ -428,22 +428,19 @@ Checker<Impl>::verify(const DynInstPtr &completed_inst)
unverifiedInst = NULL;
}
template <class Impl>
template <class DynInstPtr>
void
Checker<Impl>::switchOut()
Checker<DynInstPtr>::switchOut()
{
instList.clear();
}
template <class Impl>
void
Checker<Impl>::takeOverFrom(BaseCPU *oldCPU)
{
}
template <class DynInstPtr>
void Checker<DynInstPtr>::takeOverFrom(BaseCPU *oldCPU) {}
template <class Impl>
template <class DynInstPtr>
void
Checker<Impl>::validateInst(const DynInstPtr &inst)
Checker<DynInstPtr>::validateInst(const DynInstPtr &inst)
{
if (inst->instAddr() != thread->instAddr()) {
warn("%lli: PCs do not match! Inst: %s, checker: %s",
@@ -462,9 +459,9 @@ Checker<Impl>::validateInst(const DynInstPtr &inst)
}
}
template <class Impl>
template <class DynInstPtr>
void
Checker<Impl>::validateExecution(const DynInstPtr &inst)
Checker<DynInstPtr>::validateExecution(const DynInstPtr &inst)
{
InstResult checker_val;
InstResult inst_val;
@@ -555,9 +552,9 @@ Checker<Impl>::validateExecution(const DynInstPtr &inst)
// This function is weird, if it is called it means the Checker and
// O3 have diverged, so panic is called for now. It may be useful
// to resynch states and continue if the divergence is a false positive
template <class Impl>
template <class DynInstPtr>
void
Checker<Impl>::validateState()
Checker<DynInstPtr>::validateState()
{
if (updateThisCycle) {
// Change this back to warn if divergences end up being false positives
@@ -580,10 +577,10 @@ Checker<Impl>::validateState()
}
}
template <class Impl>
template <class DynInstPtr>
void
Checker<Impl>::copyResult(const DynInstPtr &inst,
const InstResult& mismatch_val, int start_idx)
Checker<DynInstPtr>::copyResult(
const DynInstPtr &inst, const InstResult& mismatch_val, int start_idx)
{
// We've already popped one dest off the queue,
// so do the fix-up then start with the next dest reg;
@@ -657,9 +654,9 @@ Checker<Impl>::copyResult(const DynInstPtr &inst,
}
}
template <class Impl>
template <class DynInstPtr>
void
Checker<Impl>::dumpAndExit(const DynInstPtr &inst)
Checker<DynInstPtr>::dumpAndExit(const DynInstPtr &inst)
{
cprintf("Error detected, instruction information:\n");
cprintf("PC:%s, nextPC:%#x\n[sn:%lli]\n[tid:%i]\n"
@@ -673,9 +670,9 @@ Checker<Impl>::dumpAndExit(const DynInstPtr &inst)
CheckerCPU::dumpAndExit();
}
template <class Impl>
template <class DynInstPtr>
void
Checker<Impl>::dumpInsts()
Checker<DynInstPtr>::dumpInsts()
{
int num = 0;

View File

@@ -43,4 +43,4 @@
#include "cpu/checker/cpu_impl.hh"
template
class Checker<O3CPUImpl>;
class Checker<O3DynInstPtr>;

View File

@@ -48,10 +48,10 @@
/**
* Specific non-templated derived class used for SimObject configuration.
*/
class O3Checker : public Checker<O3CPUImpl>
class O3Checker : public Checker<O3DynInstPtr>
{
public:
O3Checker(const Params &p) : Checker<O3CPUImpl>(p)
O3Checker(const Params &p) : Checker<O3DynInstPtr>(p)
{
// The checker should check all instructions executed by the main
// cpu and therefore any parameters for early exit don't make much

View File

@@ -47,6 +47,7 @@
#include "arch/types.hh"
#include "base/types.hh"
#include "cpu/inst_seq.hh"
#include "cpu/o3/dyn_inst_ptr.hh"
#include "cpu/o3/limits.hh"
#include "sim/faults.hh"
@@ -54,11 +55,9 @@
template<class Impl>
struct DefaultFetchDefaultDecode
{
typedef typename Impl::DynInstPtr DynInstPtr;
int size;
DynInstPtr insts[O3MaxWidth];
O3DynInstPtr insts[O3MaxWidth];
Fault fetchFault;
InstSeqNum fetchFaultSN;
bool clearFetchFault;
@@ -68,34 +67,28 @@ struct DefaultFetchDefaultDecode
template<class Impl>
struct DefaultDecodeDefaultRename
{
typedef typename Impl::DynInstPtr DynInstPtr;
int size;
DynInstPtr insts[O3MaxWidth];
O3DynInstPtr insts[O3MaxWidth];
};
/** Struct that defines the information passed from rename to IEW. */
template<class Impl>
struct DefaultRenameDefaultIEW
{
typedef typename Impl::DynInstPtr DynInstPtr;
int size;
DynInstPtr insts[O3MaxWidth];
O3DynInstPtr insts[O3MaxWidth];
};
/** Struct that defines the information passed from IEW to commit. */
template<class Impl>
struct DefaultIEWDefaultCommit
{
typedef typename Impl::DynInstPtr DynInstPtr;
int size;
DynInstPtr insts[O3MaxWidth];
DynInstPtr mispredictInst[O3MaxThreads];
O3DynInstPtr insts[O3MaxWidth];
O3DynInstPtr mispredictInst[O3MaxThreads];
Addr mispredPC[O3MaxThreads];
InstSeqNum squashedSeqNum[O3MaxThreads];
TheISA::PCState pc[O3MaxThreads];
@@ -109,23 +102,20 @@ struct DefaultIEWDefaultCommit
template<class Impl>
struct IssueStruct
{
typedef typename Impl::DynInstPtr DynInstPtr;
int size;
DynInstPtr insts[O3MaxWidth];
O3DynInstPtr insts[O3MaxWidth];
};
/** Struct that defines all backwards communication. */
template<class Impl>
struct TimeBufStruct
{
typedef typename Impl::DynInstPtr DynInstPtr;
struct decodeComm
struct DecodeComm
{
TheISA::PCState nextPC;
DynInstPtr mispredictInst;
DynInstPtr squashInst;
O3DynInstPtr mispredictInst;
O3DynInstPtr squashInst;
InstSeqNum doneSeqNum;
Addr mispredPC;
uint64_t branchAddr;
@@ -136,15 +126,13 @@ struct TimeBufStruct
bool branchTaken;
};
decodeComm decodeInfo[O3MaxThreads];
DecodeComm decodeInfo[O3MaxThreads];
struct renameComm
{
};
struct RenameComm {};
renameComm renameInfo[O3MaxThreads];
RenameComm renameInfo[O3MaxThreads];
struct iewComm
struct IewComm
{
// Also eventually include skid buffer space.
unsigned freeIQEntries;
@@ -161,9 +149,9 @@ struct TimeBufStruct
bool usedLSQ;
};
iewComm iewInfo[O3MaxThreads];
IewComm iewInfo[O3MaxThreads];
struct commitComm
struct CommitComm
{
/////////////////////////////////////////////////////////////////////
// This code has been re-structured for better packing of variables
@@ -184,14 +172,14 @@ struct TimeBufStruct
/// Provide fetch the instruction that mispredicted, if this
/// pointer is non-null a misprediction occurred
DynInstPtr mispredictInst; // *F
O3DynInstPtr mispredictInst; // *F
/// Instruction that caused a non-mispredict squash
DynInstPtr squashInst; // *F
O3DynInstPtr squashInst; // *F
/// Hack for now to send back a strictly ordered access to the
/// IEW stage.
DynInstPtr strictlyOrderedLoad; // *I
O3DynInstPtr strictlyOrderedLoad; // *I
/// Communication specifically to the IQ to tell the IQ that it can
/// schedule a non-speculative instruction.
@@ -227,7 +215,7 @@ struct TimeBufStruct
};
commitComm commitInfo[O3MaxThreads];
CommitComm commitInfo[O3MaxThreads];
bool decodeBlock[O3MaxThreads];
bool decodeUnblock[O3MaxThreads];

View File

@@ -46,6 +46,7 @@
#include "base/statistics.hh"
#include "cpu/exetrace.hh"
#include "cpu/inst_seq.hh"
#include "cpu/o3/dyn_inst_ptr.hh"
#include "cpu/o3/iew.hh"
#include "cpu/o3/limits.hh"
#include "cpu/o3/rename_map.hh"
@@ -87,7 +88,6 @@ class DefaultCommit
public:
// Typedefs from the Impl.
typedef typename Impl::O3CPU O3CPU;
typedef typename Impl::DynInstPtr DynInstPtr;
typedef typename Impl::TimeStruct TimeStruct;
typedef typename Impl::FetchStruct FetchStruct;
typedef typename Impl::IEWStruct IEWStruct;
@@ -126,10 +126,10 @@ class DefaultCommit
CommitPolicy commitPolicy;
/** Probe Points. */
ProbePointArg<DynInstPtr> *ppCommit;
ProbePointArg<DynInstPtr> *ppCommitStall;
ProbePointArg<O3DynInstPtr> *ppCommit;
ProbePointArg<O3DynInstPtr> *ppCommitStall;
/** To probe when an instruction is squashed */
ProbePointArg<DynInstPtr> *ppSquash;
ProbePointArg<O3DynInstPtr> *ppSquash;
/** Mark the thread as processing a trap. */
void processTrapEvent(ThreadID tid);
@@ -277,7 +277,7 @@ class DefaultCommit
* @param tid ID of the thread to squash.
* @param head_inst Instruction that requested the squash.
*/
void squashAfter(ThreadID tid, const DynInstPtr &head_inst);
void squashAfter(ThreadID tid, const O3DynInstPtr &head_inst);
/** Handles processing an interrupt. */
void handleInterrupt();
@@ -291,7 +291,7 @@ class DefaultCommit
/** Tries to commit the head ROB instruction passed in.
* @param head_inst The instruction to be committed.
*/
bool commitHead(const DynInstPtr &head_inst, unsigned inst_num);
bool commitHead(const O3DynInstPtr &head_inst, unsigned inst_num);
/** Gets instructions from rename and inserts them into the ROB. */
void getInsts();
@@ -385,7 +385,7 @@ class DefaultCommit
* that caused a squash since this needs to be passed to the fetch
* stage once squashing starts.
*/
DynInstPtr squashAfterInst[O3MaxThreads];
O3DynInstPtr squashAfterInst[O3MaxThreads];
/** Priority List used for Commit Policy */
std::list<ThreadID> priority_list;
@@ -472,7 +472,7 @@ class DefaultCommit
bool avoidQuiesceLiveLock;
/** Updates commit stats based on this instruction. */
void updateComInstStats(const DynInstPtr &inst);
void updateComInstStats(const O3DynInstPtr &inst);
// HTM
int htmStarts[O3MaxThreads];

View File

@@ -54,6 +54,7 @@
#include "cpu/exetrace.hh"
#include "cpu/null_static_inst.hh"
#include "cpu/o3/commit.hh"
#include "cpu/o3/dyn_inst.hh"
#include "cpu/o3/limits.hh"
#include "cpu/o3/thread_state.hh"
#include "cpu/timebuf.hh"
@@ -140,9 +141,12 @@ template <class Impl>
void
DefaultCommit<Impl>::regProbePoints()
{
ppCommit = new ProbePointArg<DynInstPtr>(cpu->getProbeManager(), "Commit");
ppCommitStall = new ProbePointArg<DynInstPtr>(cpu->getProbeManager(), "CommitStall");
ppSquash = new ProbePointArg<DynInstPtr>(cpu->getProbeManager(), "Squash");
ppCommit = new ProbePointArg<O3DynInstPtr>(
cpu->getProbeManager(), "Commit");
ppCommitStall = new ProbePointArg<O3DynInstPtr>(
cpu->getProbeManager(), "CommitStall");
ppSquash = new ProbePointArg<O3DynInstPtr>(
cpu->getProbeManager(), "Squash");
}
template <class Impl>
@@ -653,7 +657,7 @@ DefaultCommit<Impl>::squashFromSquashAfter(ThreadID tid)
template <class Impl>
void
DefaultCommit<Impl>::squashAfter(ThreadID tid, const DynInstPtr &head_inst)
DefaultCommit<Impl>::squashAfter(ThreadID tid, const O3DynInstPtr &head_inst)
{
DPRINTF(Commit, "Executing squash after for [tid:%i] inst [sn:%llu]\n",
tid, head_inst->seqNum);
@@ -713,14 +717,14 @@ DefaultCommit<Impl>::tick()
// will be active.
_nextStatus = Active;
GEM5_VAR_USED const DynInstPtr &inst = rob->readHeadInst(tid);
GEM5_VAR_USED const O3DynInstPtr &inst = rob->readHeadInst(tid);
DPRINTF(Commit,"[tid:%i] Instruction [sn:%llu] PC %s is head of"
" ROB and ready to commit\n",
tid, inst->seqNum, inst->pcState());
} else if (!rob->isEmpty(tid)) {
const DynInstPtr &inst = rob->readHeadInst(tid);
const O3DynInstPtr &inst = rob->readHeadInst(tid);
ppCommitStall->notify(inst);
@@ -1001,7 +1005,7 @@ DefaultCommit<Impl>::commitInsts()
unsigned num_committed = 0;
DynInstPtr head_inst;
O3DynInstPtr head_inst;
// Commit as many instructions as possible until the commit bandwidth
// limit is reached, or it becomes impossible to commit any more.
@@ -1192,7 +1196,8 @@ DefaultCommit<Impl>::commitInsts()
template <class Impl>
bool
DefaultCommit<Impl>::commitHead(const DynInstPtr &head_inst, unsigned inst_num)
DefaultCommit<Impl>::commitHead(
const O3DynInstPtr &head_inst, unsigned inst_num)
{
assert(head_inst);
@@ -1391,7 +1396,7 @@ DefaultCommit<Impl>::getInsts()
int insts_to_process = std::min((int)renameWidth, fromRename->size);
for (int inst_num = 0; inst_num < insts_to_process; ++inst_num) {
const DynInstPtr &inst = fromRename->insts[inst_num];
const O3DynInstPtr &inst = fromRename->insts[inst_num];
ThreadID tid = inst->threadNumber;
if (!inst->isSquashed() &&
@@ -1438,7 +1443,7 @@ DefaultCommit<Impl>::markCompletedInsts()
template <class Impl>
void
DefaultCommit<Impl>::updateComInstStats(const DynInstPtr &inst)
DefaultCommit<Impl>::updateComInstStats(const O3DynInstPtr &inst)
{
ThreadID tid = inst->threadNumber;
@@ -1583,7 +1588,7 @@ DefaultCommit<Impl>::oldestReady()
if (rob->isHeadReady(tid)) {
const DynInstPtr &head_inst = rob->readHeadInst(tid);
const O3DynInstPtr &head_inst = rob->readHeadInst(tid);
if (first) {
oldest = tid;

View File

@@ -136,7 +136,7 @@ FullO3CPU<Impl>::FullO3CPU(const DerivO3CPUParams &params)
if (params.checker) {
BaseCPU *temp_checker = params.checker;
checker = dynamic_cast<Checker<Impl> *>(temp_checker);
checker = dynamic_cast<Checker<O3DynInstPtr> *>(temp_checker);
checker->setIcachePort(&this->fetch.getInstPort());
checker->setSystem(params.system);
} else {
@@ -378,8 +378,11 @@ FullO3CPU<Impl>::regProbePoints()
{
BaseCPU::regProbePoints();
ppInstAccessComplete = new ProbePointArg<PacketPtr>(getProbeManager(), "InstAccessComplete");
ppDataAccessComplete = new ProbePointArg<std::pair<DynInstPtr, PacketPtr> >(getProbeManager(), "DataAccessComplete");
ppInstAccessComplete = new ProbePointArg<PacketPtr>(
getProbeManager(), "InstAccessComplete");
ppDataAccessComplete = new ProbePointArg<
std::pair<O3DynInstPtr, PacketPtr>>(
getProbeManager(), "DataAccessComplete");
fetch.regProbePoints();
rename.regProbePoints();
@@ -1501,7 +1504,7 @@ FullO3CPU<Impl>::squashFromTC(ThreadID tid)
template <class Impl>
typename FullO3CPU<Impl>::ListIt
FullO3CPU<Impl>::addInst(const DynInstPtr &inst)
FullO3CPU<Impl>::addInst(const O3DynInstPtr &inst)
{
instList.push_back(inst);
@@ -1510,7 +1513,7 @@ FullO3CPU<Impl>::addInst(const DynInstPtr &inst)
template <class Impl>
void
FullO3CPU<Impl>::instDone(ThreadID tid, const DynInstPtr &inst)
FullO3CPU<Impl>::instDone(ThreadID tid, const O3DynInstPtr &inst)
{
// Keep an instruction count.
if (!inst->isMicroop() || inst->isLastMicroop()) {
@@ -1530,7 +1533,7 @@ FullO3CPU<Impl>::instDone(ThreadID tid, const DynInstPtr &inst)
template <class Impl>
void
FullO3CPU<Impl>::removeFrontInst(const DynInstPtr &inst)
FullO3CPU<Impl>::removeFrontInst(const O3DynInstPtr &inst)
{
DPRINTF(O3CPU, "Removing committed instruction [tid:%i] PC %s "
"[sn:%lli]\n",
@@ -1686,7 +1689,7 @@ FullO3CPU<Impl>::dumpInsts()
/*
template <class Impl>
void
FullO3CPU<Impl>::wakeDependents(const DynInstPtr &inst)
FullO3CPU<Impl>::wakeDependents(const O3DynInstPtr &inst)
{
iew.wakeDependents(inst);
}

View File

@@ -56,6 +56,7 @@
#include "cpu/o3/comm.hh"
#include "cpu/o3/commit.hh"
#include "cpu/o3/decode.hh"
#include "cpu/o3/dyn_inst_ptr.hh"
#include "cpu/o3/fetch.hh"
#include "cpu/o3/free_list.hh"
#include "cpu/o3/iew.hh"
@@ -100,13 +101,12 @@ class FullO3CPU : public BaseO3CPU
{
public:
// Typedefs from the Impl here.
typedef typename Impl::DynInstPtr DynInstPtr;
typedef typename Impl::O3CPU O3CPU;
typedef O3ThreadState<Impl> ImplState;
typedef O3ThreadState<Impl> Thread;
typedef typename std::list<DynInstPtr>::iterator ListIt;
typedef typename std::list<O3DynInstPtr>::iterator ListIt;
friend class O3ThreadContext<Impl>;
@@ -184,7 +184,7 @@ class FullO3CPU : public BaseO3CPU
~FullO3CPU();
ProbePointArg<PacketPtr> *ppInstAccessComplete;
ProbePointArg<std::pair<DynInstPtr, PacketPtr> > *ppDataAccessComplete;
ProbePointArg<std::pair<O3DynInstPtr, PacketPtr> > *ppDataAccessComplete;
/** Register probe points. */
void regProbePoints() override;
@@ -439,15 +439,15 @@ class FullO3CPU : public BaseO3CPU
/** Function to add instruction onto the head of the list of the
* instructions. Used when new instructions are fetched.
*/
ListIt addInst(const DynInstPtr &inst);
ListIt addInst(const O3DynInstPtr &inst);
/** Function to tell the CPU that an instruction has completed. */
void instDone(ThreadID tid, const DynInstPtr &inst);
void instDone(ThreadID tid, const O3DynInstPtr &inst);
/** Remove an instruction from the front end of the list. There's
* no restriction on location of the instruction.
*/
void removeFrontInst(const DynInstPtr &inst);
void removeFrontInst(const O3DynInstPtr &inst);
/** Remove all instructions that are not currently in the ROB.
* There's also an option to not squash delay slot instructions.*/
@@ -472,7 +472,7 @@ class FullO3CPU : public BaseO3CPU
#endif
/** List of all the instructions in flight. */
std::list<DynInstPtr> instList;
std::list<O3DynInstPtr> instList;
/** List of all the instructions that will be removed at the end of this
* cycle.
@@ -624,7 +624,7 @@ class FullO3CPU : public BaseO3CPU
* instruction results at run time. This can be set to NULL if it
* is not being used.
*/
Checker<Impl> *checker;
Checker<O3DynInstPtr> *checker;
/** Pointer to the system. */
System *system;
@@ -648,7 +648,7 @@ class FullO3CPU : public BaseO3CPU
std::vector<ThreadID> tids;
/** CPU pushRequest function, forwards request to LSQ. */
Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
Fault pushRequest(const O3DynInstPtr& inst, bool isLoad, uint8_t *data,
unsigned int size, Addr addr, Request::Flags flags,
uint64_t *res, AtomicOpFunctorPtr amo_op = nullptr,
const std::vector<bool>& byte_enable =

View File

@@ -44,6 +44,7 @@
#include <queue>
#include "base/statistics.hh"
#include "cpu/o3/dyn_inst_ptr.hh"
#include "cpu/o3/limits.hh"
#include "cpu/timebuf.hh"
@@ -62,7 +63,6 @@ class DefaultDecode
private:
// Typedefs from the Impl.
typedef typename Impl::O3CPU O3CPU;
typedef typename Impl::DynInstPtr DynInstPtr;
typedef typename Impl::FetchStruct FetchStruct;
typedef typename Impl::DecodeStruct DecodeStruct;
typedef typename Impl::TimeStruct TimeStruct;
@@ -193,7 +193,7 @@ class DefaultDecode
/** Squashes if there is a PC-relative branch that was predicted
* incorrectly. Sends squash information back to fetch.
*/
void squash(const DynInstPtr &inst, ThreadID tid);
void squash(const O3DynInstPtr &inst, ThreadID tid);
public:
/** Squashes due to commit signalling a squash. Changes status to
@@ -235,10 +235,10 @@ class DefaultDecode
typename TimeBuffer<FetchStruct>::wire fromFetch;
/** Queue of all instructions coming from fetch this cycle. */
std::queue<DynInstPtr> insts[O3MaxThreads];
std::queue<O3DynInstPtr> insts[O3MaxThreads];
/** Skid buffer between fetch and decode. */
std::queue<DynInstPtr> skidBuffer[O3MaxThreads];
std::queue<O3DynInstPtr> skidBuffer[O3MaxThreads];
/** Variable that tracks if decode has written to the time buffer this
* cycle. Used to tell CPU if there is activity this cycle.
@@ -285,7 +285,7 @@ class DefaultDecode
Addr bdelayDoneSeqNum[O3MaxThreads];
/** Instruction used for squashing branch (used for MIPS)*/
DynInstPtr squashInst[O3MaxThreads];
O3DynInstPtr squashInst[O3MaxThreads];
/** Tells when there is a pending delay slot inst. to send
* to rename. If there is, then wait squash after the next

View File

@@ -46,6 +46,7 @@
#include "config/the_isa.hh"
#include "cpu/inst_seq.hh"
#include "cpu/o3/decode.hh"
#include "cpu/o3/dyn_inst.hh"
#include "cpu/o3/limits.hh"
#include "debug/Activity.hh"
#include "debug/Decode.hh"
@@ -293,7 +294,7 @@ DefaultDecode<Impl>::unblock(ThreadID tid)
template<class Impl>
void
DefaultDecode<Impl>::squash(const DynInstPtr &inst, ThreadID tid)
DefaultDecode<Impl>::squash(const O3DynInstPtr &inst, ThreadID tid)
{
DPRINTF(Decode, "[tid:%i] [sn:%llu] Squashing due to incorrect branch "
"prediction detected at decode.\n", tid, inst->seqNum);
@@ -395,7 +396,7 @@ template<class Impl>
void
DefaultDecode<Impl>::skidInsert(ThreadID tid)
{
DynInstPtr inst = NULL;
O3DynInstPtr inst = NULL;
while (!insts[tid].empty()) {
inst = insts[tid].front();
@@ -655,7 +656,7 @@ DefaultDecode<Impl>::decodeInsts(ThreadID tid)
++stats.runCycles;
}
std::queue<DynInstPtr>
std::queue<O3DynInstPtr>
&insts_to_decode = decodeStatus[tid] == Unblocking ?
skidBuffer[tid] : insts[tid];
@@ -664,7 +665,7 @@ DefaultDecode<Impl>::decodeInsts(ThreadID tid)
while (insts_available > 0 && toRenameIndex < decodeWidth) {
assert(!insts_to_decode.empty());
DynInstPtr inst = std::move(insts_to_decode.front());
O3DynInstPtr inst = std::move(insts_to_decode.front());
insts_to_decode.pop();

View File

@@ -57,6 +57,7 @@
#include "cpu/inst_res.hh"
#include "cpu/inst_seq.hh"
#include "cpu/o3/cpu.hh"
#include "cpu/o3/dyn_inst_ptr.hh"
#include "cpu/o3/isa_specific.hh"
#include "cpu/o3/lsq_unit.hh"
#include "cpu/op_class.hh"
@@ -67,10 +68,6 @@
class Packet;
class BaseO3DynInst;
using O3DynInstPtr = RefCountingPtr<BaseO3DynInst>;
class BaseO3DynInst : public ExecContext, public RefCounted
{
public:

View File

@@ -0,0 +1,52 @@
/*
* Copyright (c) 2010, 2016 ARM Limited
* Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved
*
* The license below extends only to copyright in the software and shall
* not be construed as granting a license to any other intellectual
* property including but not limited to intellectual property relating
* to a hardware implementation of the functionality of the software
* licensed hereunder. You may use the software subject to the license
* terms below provided that you ensure that this notice is replicated
* unmodified and in its entirety in all distributions of the software,
* modified or unmodified, in source code or in binary form.
*
* Copyright (c) 2004-2006 The Regents of The University of Michigan
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __CPU_O3_DYN_INST_PTR_HH__
#define __CPU_O3_DYN_INST_PTR_HH__
#include "base/refcnt.hh"
class BaseO3DynInst;
using O3DynInstPtr = RefCountingPtr<BaseO3DynInst>;
using O3DynInstConstPtr = RefCountingPtr<const BaseO3DynInst>;
#endif // __CPU_O3_DYN_INST_PTR_HH__

View File

@@ -44,6 +44,7 @@
#include "arch/decoder.hh"
#include "base/statistics.hh"
#include "config/the_isa.hh"
#include "cpu/o3/dyn_inst_ptr.hh"
#include "cpu/o3/limits.hh"
#include "cpu/pc_event.hh"
#include "cpu/pred/bpred_unit.hh"
@@ -72,8 +73,6 @@ class DefaultFetch
{
public:
/** Typedefs from Impl. */
typedef typename Impl::DynInst DynInst;
typedef typename Impl::DynInstPtr DynInstPtr;
typedef typename Impl::O3CPU O3CPU;
typedef typename Impl::FetchStruct FetchStruct;
typedef typename Impl::TimeStruct TimeStruct;
@@ -207,7 +206,7 @@ class DefaultFetch
std::list<ThreadID> priorityList;
/** Probe points. */
ProbePointArg<DynInstPtr> *ppFetch;
ProbePointArg<O3DynInstPtr> *ppFetch;
/** To probe when a fetch request is successfully sent. */
ProbePointArg<RequestPtr> *ppFetchRequestSent;
@@ -294,7 +293,7 @@ class DefaultFetch
* @param next_NPC Used for ISAs which use delay slots.
* @return Whether or not a branch was predicted as taken.
*/
bool lookupAndUpdateNextPC(const DynInstPtr &inst, TheISA::PCState &pc);
bool lookupAndUpdateNextPC(const O3DynInstPtr &inst, TheISA::PCState &pc);
/**
* Fetches the cache line that contains the fetch PC. Returns any
@@ -321,14 +320,14 @@ class DefaultFetch
/** Squashes a specific thread and resets the PC. */
inline void doSquash(const TheISA::PCState &newPC,
const DynInstPtr squashInst, ThreadID tid);
const O3DynInstPtr squashInst, ThreadID tid);
/** Squashes a specific thread and resets the PC. Also tells the CPU to
* remove any instructions between fetch and decode
* that should be squashed.
*/
void squashFromDecode(const TheISA::PCState &newPC,
const DynInstPtr squashInst,
const O3DynInstPtr squashInst,
const InstSeqNum seq_num, ThreadID tid);
/** Checks if a thread is stalled. */
@@ -344,7 +343,7 @@ class DefaultFetch
* squash should be the commit stage.
*/
void squash(const TheISA::PCState &newPC, const InstSeqNum seq_num,
DynInstPtr squashInst, ThreadID tid);
O3DynInstPtr squashInst, ThreadID tid);
/** Ticks the fetch stage, processing all inputs signals and fetching
* as many instructions as possible.
@@ -375,9 +374,9 @@ class DefaultFetch
RequestPort &getInstPort() { return icachePort; }
private:
DynInstPtr buildInst(ThreadID tid, StaticInstPtr staticInst,
StaticInstPtr curMacroop, TheISA::PCState thisPC,
TheISA::PCState nextPC, bool trace);
O3DynInstPtr buildInst(ThreadID tid, StaticInstPtr staticInst,
StaticInstPtr curMacroop, TheISA::PCState thisPC,
TheISA::PCState nextPC, bool trace);
/** Returns the appropriate thread to fetch, given the fetch policy. */
ThreadID getFetchingThread();
@@ -505,7 +504,7 @@ class DefaultFetch
unsigned fetchQueueSize;
/** Queue of fetched instructions. Per-thread to prevent HoL blocking. */
std::deque<DynInstPtr> fetchQueue[O3MaxThreads];
std::deque<O3DynInstPtr> fetchQueue[O3MaxThreads];
/** Whether or not the fetch buffer data is valid. */
bool fetchBufferValid[O3MaxThreads];

View File

@@ -150,7 +150,7 @@ template <class Impl>
void
DefaultFetch<Impl>::regProbePoints()
{
ppFetch = new ProbePointArg<DynInstPtr>(cpu->getProbeManager(), "Fetch");
ppFetch = new ProbePointArg<O3DynInstPtr>(cpu->getProbeManager(), "Fetch");
ppFetchRequestSent = new ProbePointArg<RequestPtr>(cpu->getProbeManager(),
"FetchRequest");
@@ -526,7 +526,7 @@ DefaultFetch<Impl>::deactivateThread(ThreadID tid)
template <class Impl>
bool
DefaultFetch<Impl>::lookupAndUpdateNextPC(
const DynInstPtr &inst, TheISA::PCState &nextPC)
const O3DynInstPtr &inst, TheISA::PCState &nextPC)
{
// Do branch prediction check here.
// A bit of a misnomer...next_PC is actually the current PC until
@@ -706,7 +706,7 @@ DefaultFetch<Impl>::finishTranslation(const Fault &fault,
DPRINTF(Fetch, "[tid:%i] Translation faulted, building noop.\n", tid);
// We will use a nop in order to carry the fault.
DynInstPtr instruction = buildInst(tid, nopStaticInstPtr, nullptr,
O3DynInstPtr instruction = buildInst(tid, nopStaticInstPtr, nullptr,
fetchPC, fetchPC, false);
instruction->setNotAnInst();
@@ -729,7 +729,7 @@ DefaultFetch<Impl>::finishTranslation(const Fault &fault,
template <class Impl>
inline void
DefaultFetch<Impl>::doSquash(const TheISA::PCState &newPC,
const DynInstPtr squashInst, ThreadID tid)
const O3DynInstPtr squashInst, ThreadID tid)
{
DPRINTF(Fetch, "[tid:%i] Squashing, setting PC to: %s.\n",
tid, newPC);
@@ -781,7 +781,7 @@ DefaultFetch<Impl>::doSquash(const TheISA::PCState &newPC,
template<class Impl>
void
DefaultFetch<Impl>::squashFromDecode(const TheISA::PCState &newPC,
const DynInstPtr squashInst,
const O3DynInstPtr squashInst,
const InstSeqNum seq_num, ThreadID tid)
{
DPRINTF(Fetch, "[tid:%i] Squashing from decode.\n", tid);
@@ -851,7 +851,7 @@ DefaultFetch<Impl>::updateFetchStatus()
template <class Impl>
void
DefaultFetch<Impl>::squash(const TheISA::PCState &newPC,
const InstSeqNum seq_num, DynInstPtr squashInst,
const InstSeqNum seq_num, O3DynInstPtr squashInst,
ThreadID tid)
{
DPRINTF(Fetch, "[tid:%i] Squash from commit.\n", tid);
@@ -1070,7 +1070,7 @@ DefaultFetch<Impl>::checkSignalsAndUpdate(ThreadID tid)
}
template<class Impl>
typename Impl::DynInstPtr
O3DynInstPtr
DefaultFetch<Impl>::buildInst(ThreadID tid, StaticInstPtr staticInst,
StaticInstPtr curMacroop, TheISA::PCState thisPC,
TheISA::PCState nextPC, bool trace)
@@ -1079,8 +1079,8 @@ DefaultFetch<Impl>::buildInst(ThreadID tid, StaticInstPtr staticInst,
InstSeqNum seq = cpu->getAndIncrementInstSeq();
// Create a new DynInst from the instruction fetched.
DynInstPtr instruction =
new DynInst(staticInst, curMacroop, thisPC, nextPC, seq, cpu);
O3DynInstPtr instruction =
new BaseO3DynInst(staticInst, curMacroop, thisPC, nextPC, seq, cpu);
instruction->setTid(tid);
instruction->setThreadState(cpu->thread[tid]);
@@ -1297,7 +1297,7 @@ DefaultFetch<Impl>::fetch(bool &status_change)
newMacro |= staticInst->isLastMicroop();
}
DynInstPtr instruction =
O3DynInstPtr instruction =
buildInst(tid, staticInst, curMacroop,
thisPC, nextPC, true);

View File

@@ -46,6 +46,7 @@
#include "base/statistics.hh"
#include "cpu/o3/comm.hh"
#include "cpu/o3/dyn_inst_ptr.hh"
#include "cpu/o3/inst_queue.hh"
#include "cpu/o3/limits.hh"
#include "cpu/o3/lsq.hh"
@@ -81,7 +82,6 @@ class DefaultIEW
{
private:
//Typedefs from Impl
typedef typename Impl::DynInstPtr DynInstPtr;
typedef typename Impl::O3CPU O3CPU;
typedef typename Impl::TimeStruct TimeStruct;
typedef typename Impl::IEWStruct IEWStruct;
@@ -120,12 +120,12 @@ class DefaultIEW
StageStatus wbStatus;
/** Probe points. */
ProbePointArg<DynInstPtr> *ppMispredict;
ProbePointArg<DynInstPtr> *ppDispatch;
ProbePointArg<O3DynInstPtr> *ppMispredict;
ProbePointArg<O3DynInstPtr> *ppDispatch;
/** To probe when instruction execution begins. */
ProbePointArg<DynInstPtr> *ppExecute;
ProbePointArg<O3DynInstPtr> *ppExecute;
/** To probe when instruction execution is complete. */
ProbePointArg<DynInstPtr> *ppToCommit;
ProbePointArg<O3DynInstPtr> *ppToCommit;
public:
/** Constructs a DefaultIEW with the given parameters. */
@@ -171,24 +171,24 @@ class DefaultIEW
void squash(ThreadID tid);
/** Wakes all dependents of a completed instruction. */
void wakeDependents(const DynInstPtr &inst);
void wakeDependents(const O3DynInstPtr &inst);
/** Tells memory dependence unit that a memory instruction needs to be
* rescheduled. It will re-execute once replayMemInst() is called.
*/
void rescheduleMemInst(const DynInstPtr &inst);
void rescheduleMemInst(const O3DynInstPtr &inst);
/** Re-executes all rescheduled memory instructions. */
void replayMemInst(const DynInstPtr &inst);
void replayMemInst(const O3DynInstPtr &inst);
/** Moves memory instruction onto the list of cache blocked instructions */
void blockMemInst(const DynInstPtr &inst);
void blockMemInst(const O3DynInstPtr &inst);
/** Notifies that the cache has become unblocked */
void cacheUnblocked();
/** Sends an instruction to commit through the time buffer. */
void instToCommit(const DynInstPtr &inst);
void instToCommit(const O3DynInstPtr &inst);
/** Inserts unused instructions of a thread into the skid buffer. */
void skidInsert(ThreadID tid);
@@ -226,7 +226,7 @@ class DefaultIEW
bool hasStoresToWB(ThreadID tid) { return ldstQueue.hasStoresToWB(tid); }
/** Check misprediction */
void checkMisprediction(const DynInstPtr &inst);
void checkMisprediction(const O3DynInstPtr &inst);
// hardware transactional memory
// For debugging purposes, it is useful to keep track of the most recent
@@ -242,12 +242,12 @@ class DefaultIEW
/** Sends commit proper information for a squash due to a branch
* mispredict.
*/
void squashDueToBranch(const DynInstPtr &inst, ThreadID tid);
void squashDueToBranch(const O3DynInstPtr &inst, ThreadID tid);
/** Sends commit proper information for a squash due to a memory order
* violation.
*/
void squashDueToMemOrder(const DynInstPtr &inst, ThreadID tid);
void squashDueToMemOrder(const O3DynInstPtr &inst, ThreadID tid);
/** Sets Dispatch to blocked, and signals back to other stages to block. */
void block(ThreadID tid);
@@ -301,7 +301,7 @@ class DefaultIEW
private:
/** Updates execution stats based on the instruction. */
void updateExeInstStats(const DynInstPtr &inst);
void updateExeInstStats(const O3DynInstPtr &inst);
/** Pointer to main time buffer used for backwards communication. */
TimeBuffer<TimeStruct> *timeBuffer;
@@ -337,10 +337,10 @@ class DefaultIEW
typename TimeBuffer<IEWStruct>::wire toCommit;
/** Queue of all instructions coming from rename this cycle. */
std::queue<DynInstPtr> insts[O3MaxThreads];
std::queue<O3DynInstPtr> insts[O3MaxThreads];
/** Skid buffer between rename and IEW. */
std::queue<DynInstPtr> skidBuffer[O3MaxThreads];
std::queue<O3DynInstPtr> skidBuffer[O3MaxThreads];
/** Scoreboard pointer. */
Scoreboard* scoreboard;

View File

@@ -50,6 +50,7 @@
#include "config/the_isa.hh"
#include "cpu/checker/cpu.hh"
#include "cpu/o3/dyn_inst.hh"
#include "cpu/o3/fu_pool.hh"
#include "cpu/o3/iew.hh"
#include "cpu/o3/limits.hh"
@@ -122,20 +123,22 @@ template <class Impl>
void
DefaultIEW<Impl>::regProbePoints()
{
ppDispatch = new ProbePointArg<DynInstPtr>(cpu->getProbeManager(), "Dispatch");
ppMispredict = new ProbePointArg<DynInstPtr>(cpu->getProbeManager(), "Mispredict");
ppDispatch = new ProbePointArg<O3DynInstPtr>(
cpu->getProbeManager(), "Dispatch");
ppMispredict = new ProbePointArg<O3DynInstPtr>(
cpu->getProbeManager(), "Mispredict");
/**
* Probe point with dynamic instruction as the argument used to probe when
* an instruction starts to execute.
*/
ppExecute = new ProbePointArg<DynInstPtr>(cpu->getProbeManager(),
"Execute");
ppExecute = new ProbePointArg<O3DynInstPtr>(
cpu->getProbeManager(), "Execute");
/**
* Probe point with dynamic instruction as the argument used to probe when
* an instruction execution completes and it is marked ready to commit.
*/
ppToCommit = new ProbePointArg<DynInstPtr>(cpu->getProbeManager(),
"ToCommit");
ppToCommit = new ProbePointArg<O3DynInstPtr>(
cpu->getProbeManager(), "ToCommit");
}
template <class Impl>
@@ -461,7 +464,7 @@ DefaultIEW<Impl>::squash(ThreadID tid)
template<class Impl>
void
DefaultIEW<Impl>::squashDueToBranch(const DynInstPtr& inst, ThreadID tid)
DefaultIEW<Impl>::squashDueToBranch(const O3DynInstPtr& inst, ThreadID tid)
{
DPRINTF(IEW, "[tid:%i] [sn:%llu] Squashing from a specific instruction,"
" PC: %s "
@@ -487,7 +490,7 @@ DefaultIEW<Impl>::squashDueToBranch(const DynInstPtr& inst, ThreadID tid)
template<class Impl>
void
DefaultIEW<Impl>::squashDueToMemOrder(const DynInstPtr& inst, ThreadID tid)
DefaultIEW<Impl>::squashDueToMemOrder(const O3DynInstPtr& inst, ThreadID tid)
{
DPRINTF(IEW, "[tid:%i] Memory violation, squashing violator and younger "
"insts, PC: %s [sn:%llu].\n", tid, inst->pcState(), inst->seqNum);
@@ -550,28 +553,28 @@ DefaultIEW<Impl>::unblock(ThreadID tid)
template<class Impl>
void
DefaultIEW<Impl>::wakeDependents(const DynInstPtr& inst)
DefaultIEW<Impl>::wakeDependents(const O3DynInstPtr& inst)
{
instQueue.wakeDependents(inst);
}
template<class Impl>
void
DefaultIEW<Impl>::rescheduleMemInst(const DynInstPtr& inst)
DefaultIEW<Impl>::rescheduleMemInst(const O3DynInstPtr& inst)
{
instQueue.rescheduleMemInst(inst);
}
template<class Impl>
void
DefaultIEW<Impl>::replayMemInst(const DynInstPtr& inst)
DefaultIEW<Impl>::replayMemInst(const O3DynInstPtr& inst)
{
instQueue.replayMemInst(inst);
}
template<class Impl>
void
DefaultIEW<Impl>::blockMemInst(const DynInstPtr& inst)
DefaultIEW<Impl>::blockMemInst(const O3DynInstPtr& inst)
{
instQueue.blockMemInst(inst);
}
@@ -585,7 +588,7 @@ DefaultIEW<Impl>::cacheUnblocked()
template<class Impl>
void
DefaultIEW<Impl>::instToCommit(const DynInstPtr& inst)
DefaultIEW<Impl>::instToCommit(const O3DynInstPtr& inst)
{
// This function should not be called after writebackInsts in a
// single cycle. That will cause problems with an instruction
@@ -630,7 +633,7 @@ template<class Impl>
void
DefaultIEW<Impl>::skidInsert(ThreadID tid)
{
DynInstPtr inst = NULL;
O3DynInstPtr inst = NULL;
while (!insts[tid].empty()) {
inst = insts[tid].front();
@@ -927,13 +930,13 @@ DefaultIEW<Impl>::dispatchInsts(ThreadID tid)
{
// Obtain instructions from skid buffer if unblocking, or queue from rename
// otherwise.
std::queue<DynInstPtr> &insts_to_dispatch =
std::queue<O3DynInstPtr> &insts_to_dispatch =
dispatchStatus[tid] == Unblocking ?
skidBuffer[tid] : insts[tid];
int insts_to_add = insts_to_dispatch.size();
DynInstPtr inst;
O3DynInstPtr inst;
bool add_to_iq = false;
int dis_num_inst = 0;
@@ -1208,7 +1211,7 @@ DefaultIEW<Impl>::executeInsts()
DPRINTF(IEW, "Execute: Executing instructions from IQ.\n");
DynInstPtr inst = instQueue.getInstToExecute();
O3DynInstPtr inst = instQueue.getInstToExecute();
DPRINTF(IEW, "Execute: Processing PC %s, [tid:%i] [sn:%llu].\n",
inst->pcState(), inst->threadNumber,inst->seqNum);
@@ -1372,7 +1375,7 @@ DefaultIEW<Impl>::executeInsts()
// If there was an ordering violation, then get the
// DynInst that caused the violation. Note that this
// clears the violation signal.
DynInstPtr violator;
O3DynInstPtr violator;
violator = ldstQueue.getMemDepViolator(tid);
DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: %s "
@@ -1396,7 +1399,7 @@ DefaultIEW<Impl>::executeInsts()
if (ldstQueue.violation(tid)) {
assert(inst->isMemRef());
DynInstPtr violator = ldstQueue.getMemDepViolator(tid);
O3DynInstPtr violator = ldstQueue.getMemDepViolator(tid);
DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: "
"%s, inst PC: %s. Addr is: %#x.\n",
@@ -1439,7 +1442,7 @@ DefaultIEW<Impl>::writebackInsts()
// as part of backwards communication.
for (int inst_num = 0; inst_num < wbWidth &&
toCommit->insts[inst_num]; inst_num++) {
DynInstPtr inst = toCommit->insts[inst_num];
O3DynInstPtr inst = toCommit->insts[inst_num];
ThreadID tid = inst->threadNumber;
DPRINTF(IEW, "Sending instructions to commit, [sn:%lli] PC %s.\n",
@@ -1610,7 +1613,7 @@ DefaultIEW<Impl>::tick()
template <class Impl>
void
DefaultIEW<Impl>::updateExeInstStats(const DynInstPtr& inst)
DefaultIEW<Impl>::updateExeInstStats(const O3DynInstPtr& inst)
{
ThreadID tid = inst->threadNumber;
@@ -1642,7 +1645,7 @@ DefaultIEW<Impl>::updateExeInstStats(const DynInstPtr& inst)
template <class Impl>
void
DefaultIEW<Impl>::checkMisprediction(const DynInstPtr& inst)
DefaultIEW<Impl>::checkMisprediction(const O3DynInstPtr& inst)
{
ThreadID tid = inst->threadNumber;

View File

@@ -32,8 +32,6 @@
#include "cpu/o3/comm.hh"
// Forward declarations.
class BaseO3DynInst;
template <class Impl>
class FullO3CPU;
@@ -66,15 +64,6 @@ struct O3CPUImpl
typedef TimeBufStruct<O3CPUImpl> TimeStruct;
/** The DynInst type to be used. */
typedef BaseO3DynInst DynInst;
/** The refcounted DynInst pointer to be used. In most cases this is
* what should be used, and not DynInst *.
*/
typedef RefCountingPtr<DynInst> DynInstPtr;
typedef RefCountingPtr<const DynInst> DynInstConstPtr;
/** The O3CPU type to be used. */
typedef FullO3CPU<O3CPUImpl> O3CPU;

View File

@@ -51,6 +51,7 @@
#include "base/types.hh"
#include "cpu/inst_seq.hh"
#include "cpu/o3/dep_graph.hh"
#include "cpu/o3/dyn_inst_ptr.hh"
#include "cpu/o3/limits.hh"
#include "cpu/o3/mem_dep_unit.hh"
#include "cpu/o3/store_set.hh"
@@ -89,19 +90,18 @@ class InstructionQueue
public:
//Typedefs from the Impl.
typedef typename Impl::O3CPU O3CPU;
typedef typename Impl::DynInstPtr DynInstPtr;
typedef typename Impl::IssueStruct IssueStruct;
typedef typename Impl::TimeStruct TimeStruct;
// Typedef of iterator through the list of instructions.
typedef typename std::list<DynInstPtr>::iterator ListIt;
typedef typename std::list<O3DynInstPtr>::iterator ListIt;
/** FU completion event class. */
class FUCompletion : public Event
{
private:
/** Executing instruction. */
DynInstPtr inst;
O3DynInstPtr inst;
/** Index of the FU used for executing. */
int fuIdx;
@@ -116,7 +116,7 @@ class InstructionQueue
public:
/** Construct a FU completion event. */
FUCompletion(const DynInstPtr &_inst, int fu_idx,
FUCompletion(const O3DynInstPtr &_inst, int fu_idx,
InstructionQueue<Impl> *iq_ptr);
virtual void process();
@@ -177,40 +177,43 @@ class InstructionQueue
bool hasReadyInsts();
/** Inserts a new instruction into the IQ. */
void insert(const DynInstPtr &new_inst);
void insert(const O3DynInstPtr &new_inst);
/** Inserts a new, non-speculative instruction into the IQ. */
void insertNonSpec(const DynInstPtr &new_inst);
void insertNonSpec(const O3DynInstPtr &new_inst);
/** Inserts a memory or write barrier into the IQ to make sure
* loads and stores are ordered properly.
*/
void insertBarrier(const DynInstPtr &barr_inst);
void insertBarrier(const O3DynInstPtr &barr_inst);
/** Returns the oldest scheduled instruction, and removes it from
* the list of instructions waiting to execute.
*/
DynInstPtr getInstToExecute();
O3DynInstPtr getInstToExecute();
/** Gets a memory instruction that was referred due to a delayed DTB
* translation if it is now ready to execute. NULL if none available.
*/
DynInstPtr getDeferredMemInstToExecute();
O3DynInstPtr getDeferredMemInstToExecute();
/** Gets a memory instruction that was blocked on the cache. NULL if none
* available.
*/
DynInstPtr getBlockedMemInstToExecute();
O3DynInstPtr getBlockedMemInstToExecute();
/**
* Records the instruction as the producer of a register without
* adding it to the rest of the IQ.
*/
void recordProducer(const DynInstPtr &inst)
{ addToProducers(inst); }
void
recordProducer(const O3DynInstPtr &inst)
{
addToProducers(inst);
}
/** Process FU completion event. */
void processFUCompletion(const DynInstPtr &inst, int fu_idx);
void processFUCompletion(const O3DynInstPtr &inst, int fu_idx);
/**
* Schedules ready instructions, adding the ready ones (oldest first) to
@@ -228,34 +231,35 @@ class InstructionQueue
void commit(const InstSeqNum &inst, ThreadID tid = 0);
/** Wakes all dependents of a completed instruction. */
int wakeDependents(const DynInstPtr &completed_inst);
int wakeDependents(const O3DynInstPtr &completed_inst);
/** Adds a ready memory instruction to the ready list. */
void addReadyMemInst(const DynInstPtr &ready_inst);
void addReadyMemInst(const O3DynInstPtr &ready_inst);
/**
* Reschedules a memory instruction. It will be ready to issue once
* replayMemInst() is called.
*/
void rescheduleMemInst(const DynInstPtr &resched_inst);
void rescheduleMemInst(const O3DynInstPtr &resched_inst);
/** Replays a memory instruction. It must be rescheduled first. */
void replayMemInst(const DynInstPtr &replay_inst);
void replayMemInst(const O3DynInstPtr &replay_inst);
/**
* Defers a memory instruction when its DTB translation incurs a hw
* page table walk.
*/
void deferMemInst(const DynInstPtr &deferred_inst);
void deferMemInst(const O3DynInstPtr &deferred_inst);
/** Defers a memory instruction when it is cache blocked. */
void blockMemInst(const DynInstPtr &blocked_inst);
void blockMemInst(const O3DynInstPtr &blocked_inst);
/** Notify instruction queue that a previous blockage has resolved */
void cacheUnblocked();
/** Indicates an ordering violation between a store and a load. */
void violation(const DynInstPtr &store, const DynInstPtr &faulting_load);
void violation(const O3DynInstPtr &store,
const O3DynInstPtr &faulting_load);
/**
* Squashes instructions for a thread. Squashing information is obtained
@@ -310,23 +314,23 @@ class InstructionQueue
//////////////////////////////////////
/** List of all the instructions in the IQ (some of which may be issued). */
std::list<DynInstPtr> instList[O3MaxThreads];
std::list<O3DynInstPtr> instList[O3MaxThreads];
/** List of instructions that are ready to be executed. */
std::list<DynInstPtr> instsToExecute;
std::list<O3DynInstPtr> instsToExecute;
/** List of instructions waiting for their DTB translation to
* complete (hw page table walk in progress).
*/
std::list<DynInstPtr> deferredMemInsts;
std::list<O3DynInstPtr> deferredMemInsts;
/** List of instructions that have been cache blocked. */
std::list<DynInstPtr> blockedMemInsts;
std::list<O3DynInstPtr> blockedMemInsts;
/** List of instructions that were cache blocked, but a retry has been seen
* since, so they can now be retried. May fail again go on the blocked list.
*/
std::list<DynInstPtr> retryMemInsts;
std::list<O3DynInstPtr> retryMemInsts;
/**
* Struct for comparing entries to be added to the priority queue.
@@ -335,16 +339,14 @@ class InstructionQueue
* numbers (and hence are older) will be at the top of the
* priority queue.
*/
struct pqCompare
struct PqCompare
{
bool operator() (const DynInstPtr &lhs, const DynInstPtr &rhs) const
{
return lhs->seqNum > rhs->seqNum;
}
bool operator()(const O3DynInstPtr &lhs,
const O3DynInstPtr &rhs) const;
};
typedef std::priority_queue<DynInstPtr, std::vector<DynInstPtr>, pqCompare>
ReadyInstQueue;
typedef std::priority_queue<
O3DynInstPtr, std::vector<O3DynInstPtr>, PqCompare> ReadyInstQueue;
/** List of ready instructions, per op class. They are separated by op
* class to allow for easy mapping to FUs.
@@ -358,9 +360,9 @@ class InstructionQueue
* the sequence number will be available. Thus it is most efficient to be
* able to search by the sequence number alone.
*/
std::map<InstSeqNum, DynInstPtr> nonSpecInsts;
std::map<InstSeqNum, O3DynInstPtr> nonSpecInsts;
typedef typename std::map<InstSeqNum, DynInstPtr>::iterator NonSpecMapIt;
typedef typename std::map<InstSeqNum, O3DynInstPtr>::iterator NonSpecMapIt;
/** Entry for the list age ordering by op class. */
struct ListOrderEntry
@@ -397,7 +399,7 @@ class InstructionQueue
*/
void moveToYoungerInst(ListOrderIt age_order_it);
DependencyGraph<DynInstPtr> dependGraph;
DependencyGraph<O3DynInstPtr> dependGraph;
//////////////////////////////////////
// Various parameters
@@ -450,13 +452,13 @@ class InstructionQueue
std::vector<bool> regScoreboard;
/** Adds an instruction to the dependency graph, as a consumer. */
bool addToDependents(const DynInstPtr &new_inst);
bool addToDependents(const O3DynInstPtr &new_inst);
/** Adds an instruction to the dependency graph, as a producer. */
void addToProducers(const DynInstPtr &new_inst);
void addToProducers(const O3DynInstPtr &new_inst);
/** Moves an instruction to the ready queue if it is ready. */
void addIfReady(const DynInstPtr &inst);
void addIfReady(const O3DynInstPtr &inst);
/** Debugging function to count how many entries are in the IQ. It does
* a linear walk through the instructions, so do not call this function

View File

@@ -46,6 +46,7 @@
#include <vector>
#include "base/logging.hh"
#include "cpu/o3/dyn_inst.hh"
#include "cpu/o3/fu_pool.hh"
#include "cpu/o3/inst_queue.hh"
#include "cpu/o3/limits.hh"
@@ -59,7 +60,7 @@
using std::list;
template <class Impl>
InstructionQueue<Impl>::FUCompletion::FUCompletion(const DynInstPtr &_inst,
InstructionQueue<Impl>::FUCompletion::FUCompletion(const O3DynInstPtr &_inst,
int fu_idx, InstructionQueue<Impl> *iq_ptr)
: Event(Stat_Event_Pri, AutoDelete),
inst(_inst), fuIdx(fu_idx), iqPtr(iq_ptr), freeFU(false)
@@ -576,7 +577,7 @@ InstructionQueue<Impl>::hasReadyInsts()
template <class Impl>
void
InstructionQueue<Impl>::insert(const DynInstPtr &new_inst)
InstructionQueue<Impl>::insert(const O3DynInstPtr &new_inst)
{
if (new_inst->isFloating()) {
iqIOStats.fpInstQueueWrites++;
@@ -622,7 +623,7 @@ InstructionQueue<Impl>::insert(const DynInstPtr &new_inst)
template <class Impl>
void
InstructionQueue<Impl>::insertNonSpec(const DynInstPtr &new_inst)
InstructionQueue<Impl>::insertNonSpec(const O3DynInstPtr &new_inst)
{
// @todo: Clean up this code; can do it by setting inst as unable
// to issue, then calling normal insert on the inst.
@@ -669,7 +670,7 @@ InstructionQueue<Impl>::insertNonSpec(const DynInstPtr &new_inst)
template <class Impl>
void
InstructionQueue<Impl>::insertBarrier(const DynInstPtr &barr_inst)
InstructionQueue<Impl>::insertBarrier(const O3DynInstPtr &barr_inst)
{
memDepUnit[barr_inst->threadNumber].insertBarrier(barr_inst);
@@ -677,11 +678,11 @@ InstructionQueue<Impl>::insertBarrier(const DynInstPtr &barr_inst)
}
template <class Impl>
typename Impl::DynInstPtr
O3DynInstPtr
InstructionQueue<Impl>::getInstToExecute()
{
assert(!instsToExecute.empty());
DynInstPtr inst = std::move(instsToExecute.front());
O3DynInstPtr inst = std::move(instsToExecute.front());
instsToExecute.pop_front();
if (inst->isFloating()) {
iqIOStats.fpInstQueueReads++;
@@ -748,7 +749,8 @@ InstructionQueue<Impl>::moveToYoungerInst(ListOrderIt list_order_it)
template <class Impl>
void
InstructionQueue<Impl>::processFUCompletion(const DynInstPtr &inst, int fu_idx)
InstructionQueue<Impl>::processFUCompletion(
const O3DynInstPtr &inst, int fu_idx)
{
DPRINTF(IQ, "Processing FU completion [sn:%llu]\n", inst->seqNum);
assert(!cpu->switchedOut());
@@ -779,7 +781,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
IssueStruct *i2e_info = issueToExecuteQueue->access(0);
DynInstPtr mem_inst;
O3DynInstPtr mem_inst;
while ((mem_inst = std::move(getDeferredMemInstToExecute()))) {
addReadyMemInst(mem_inst);
}
@@ -806,7 +808,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
assert(!readyInsts[op_class].empty());
DynInstPtr issuing_inst = readyInsts[op_class].top();
O3DynInstPtr issuing_inst = readyInsts[op_class].top();
if (issuing_inst->isFloating()) {
iqIOStats.fpInstQueueReads++;
@@ -986,7 +988,7 @@ InstructionQueue<Impl>::commit(const InstSeqNum &inst, ThreadID tid)
template <class Impl>
int
InstructionQueue<Impl>::wakeDependents(const DynInstPtr &completed_inst)
InstructionQueue<Impl>::wakeDependents(const O3DynInstPtr &completed_inst)
{
int dependents = 0;
@@ -1054,7 +1056,7 @@ InstructionQueue<Impl>::wakeDependents(const DynInstPtr &completed_inst)
//Go through the dependency chain, marking the registers as
//ready within the waiting instructions.
DynInstPtr dep_inst = dependGraph.pop(dest_reg->flatIndex());
O3DynInstPtr dep_inst = dependGraph.pop(dest_reg->flatIndex());
while (dep_inst) {
DPRINTF(IQ, "Waking up a dependent instruction, [sn:%llu] "
@@ -1086,7 +1088,7 @@ InstructionQueue<Impl>::wakeDependents(const DynInstPtr &completed_inst)
template <class Impl>
void
InstructionQueue<Impl>::addReadyMemInst(const DynInstPtr &ready_inst)
InstructionQueue<Impl>::addReadyMemInst(const O3DynInstPtr &ready_inst)
{
OpClass op_class = ready_inst->opClass();
@@ -1109,7 +1111,7 @@ InstructionQueue<Impl>::addReadyMemInst(const DynInstPtr &ready_inst)
template <class Impl>
void
InstructionQueue<Impl>::rescheduleMemInst(const DynInstPtr &resched_inst)
InstructionQueue<Impl>::rescheduleMemInst(const O3DynInstPtr &resched_inst)
{
DPRINTF(IQ, "Rescheduling mem inst [sn:%llu]\n", resched_inst->seqNum);
@@ -1123,21 +1125,21 @@ InstructionQueue<Impl>::rescheduleMemInst(const DynInstPtr &resched_inst)
template <class Impl>
void
InstructionQueue<Impl>::replayMemInst(const DynInstPtr &replay_inst)
InstructionQueue<Impl>::replayMemInst(const O3DynInstPtr &replay_inst)
{
memDepUnit[replay_inst->threadNumber].replay();
}
template <class Impl>
void
InstructionQueue<Impl>::deferMemInst(const DynInstPtr &deferred_inst)
InstructionQueue<Impl>::deferMemInst(const O3DynInstPtr &deferred_inst)
{
deferredMemInsts.push_back(deferred_inst);
}
template <class Impl>
void
InstructionQueue<Impl>::blockMemInst(const DynInstPtr &blocked_inst)
InstructionQueue<Impl>::blockMemInst(const O3DynInstPtr &blocked_inst)
{
blocked_inst->clearIssued();
blocked_inst->clearCanIssue();
@@ -1154,13 +1156,13 @@ InstructionQueue<Impl>::cacheUnblocked()
}
template <class Impl>
typename Impl::DynInstPtr
O3DynInstPtr
InstructionQueue<Impl>::getDeferredMemInstToExecute()
{
for (ListIt it = deferredMemInsts.begin(); it != deferredMemInsts.end();
++it) {
if ((*it)->translationCompleted() || (*it)->isSquashed()) {
DynInstPtr mem_inst = std::move(*it);
O3DynInstPtr mem_inst = std::move(*it);
deferredMemInsts.erase(it);
return mem_inst;
}
@@ -1169,13 +1171,13 @@ InstructionQueue<Impl>::getDeferredMemInstToExecute()
}
template <class Impl>
typename Impl::DynInstPtr
O3DynInstPtr
InstructionQueue<Impl>::getBlockedMemInstToExecute()
{
if (retryMemInsts.empty()) {
return nullptr;
} else {
DynInstPtr mem_inst = std::move(retryMemInsts.front());
O3DynInstPtr mem_inst = std::move(retryMemInsts.front());
retryMemInsts.pop_front();
return mem_inst;
}
@@ -1183,8 +1185,8 @@ InstructionQueue<Impl>::getBlockedMemInstToExecute()
template <class Impl>
void
InstructionQueue<Impl>::violation(const DynInstPtr &store,
const DynInstPtr &faulting_load)
InstructionQueue<Impl>::violation(const O3DynInstPtr &store,
const O3DynInstPtr &faulting_load)
{
iqIOStats.intInstQueueWrites++;
memDepUnit[store->threadNumber].violation(store, faulting_load);
@@ -1223,7 +1225,7 @@ InstructionQueue<Impl>::doSquash(ThreadID tid)
while (squash_it != instList[tid].end() &&
(*squash_it)->seqNum > squashedSeqNum[tid]) {
DynInstPtr squashed_inst = (*squash_it);
O3DynInstPtr squashed_inst = (*squash_it);
if (squashed_inst->isFloating()) {
iqIOStats.fpInstQueueWrites++;
} else if (squashed_inst->isVector()) {
@@ -1329,7 +1331,7 @@ InstructionQueue<Impl>::doSquash(ThreadID tid)
// IQ clears out the heads of the dependency graph only when
// instructions reach writeback stage. If an instruction is squashed
// before writeback stage, its head of dependency graph would not be
// cleared out; it holds the instruction's DynInstPtr. This prevents
// cleared out; it holds the instruction's O3DynInstPtr. This prevents
// freeing the squashed instruction's DynInst.
// Thus, we need to manually clear out the squashed instructions' heads
// of dependency graph.
@@ -1352,7 +1354,15 @@ InstructionQueue<Impl>::doSquash(ThreadID tid)
template <class Impl>
bool
InstructionQueue<Impl>::addToDependents(const DynInstPtr &new_inst)
InstructionQueue<Impl>::PqCompare::operator()(
const O3DynInstPtr &lhs, const O3DynInstPtr &rhs) const
{
return lhs->seqNum > rhs->seqNum;
}
template <class Impl>
bool
InstructionQueue<Impl>::addToDependents(const O3DynInstPtr &new_inst)
{
// Loop through the instruction's source registers, adding
// them to the dependency list if they are not ready.
@@ -1400,7 +1410,7 @@ InstructionQueue<Impl>::addToDependents(const DynInstPtr &new_inst)
template <class Impl>
void
InstructionQueue<Impl>::addToProducers(const DynInstPtr &new_inst)
InstructionQueue<Impl>::addToProducers(const O3DynInstPtr &new_inst)
{
// Nothing really needs to be marked when an instruction becomes
// the producer of a register's value, but for convenience a ptr
@@ -1436,7 +1446,7 @@ InstructionQueue<Impl>::addToProducers(const DynInstPtr &new_inst)
template <class Impl>
void
InstructionQueue<Impl>::addIfReady(const DynInstPtr &inst)
InstructionQueue<Impl>::addIfReady(const O3DynInstPtr &inst)
{
// If the instruction now has all of its source registers
// available, then add it to the list of ready instructions.

View File

@@ -53,6 +53,8 @@
#include "base/flags.hh"
#include "base/types.hh"
#include "cpu/inst_seq.hh"
#include "cpu/o3/dyn_inst_ptr.hh"
#include "cpu/o3/impl.hh"
#include "cpu/utils.hh"
#include "enums/SMTQueuePolicy.hh"
#include "mem/port.hh"
@@ -74,7 +76,6 @@ class LSQ
{
public:
typedef typename Impl::O3CPU O3CPU;
typedef typename Impl::DynInstPtr DynInstPtr;
class LSQRequest;
/** Derived class to hold any sender state the LSQ needs. */
@@ -93,7 +94,7 @@ class LSQ
public:
/** Instruction which initiated the access to memory. */
DynInstPtr inst;
O3DynInstPtr inst;
/** The main packet from a split load, used during writeback. */
PacketPtr mainPkt;
/** A second packet from a split store that needs sending. */
@@ -113,7 +114,7 @@ class LSQ
* case the SenderState knows.
*/
bool deleted;
ContextID contextId() { return inst->contextId(); }
ContextID contextId();
/** Completes a packet and returns whether the access is finished. */
inline bool isComplete() { return outstanding == 0; }
@@ -293,7 +294,7 @@ class LSQ
public:
LSQUnit<Impl>& _port;
const DynInstPtr _inst;
const O3DynInstPtr _inst;
uint32_t _taskId;
PacketDataPtr _data;
std::vector<PacketPtr> _packets;
@@ -308,38 +309,11 @@ class LSQ
AtomicOpFunctorPtr _amo_op;
protected:
LSQUnit<Impl>* lsqUnit() { return &_port; }
LSQRequest(LSQUnit<Impl> *port, const DynInstPtr& inst, bool isLoad) :
_state(State::NotIssued), _senderState(nullptr),
_port(*port), _inst(inst), _data(nullptr),
_res(nullptr), _addr(0), _size(0), _flags(0),
_numOutstandingPackets(0), _amo_op(nullptr)
{
flags.set(Flag::IsLoad, isLoad);
flags.set(Flag::WbStore,
_inst->isStoreConditional() || _inst->isAtomic());
flags.set(Flag::IsAtomic, _inst->isAtomic());
install();
}
LSQRequest(LSQUnit<Impl>* port, const DynInstPtr& inst, bool isLoad,
const Addr& addr, const uint32_t& size,
const Request::Flags& flags_,
PacketDataPtr data = nullptr, uint64_t* res = nullptr,
AtomicOpFunctorPtr amo_op = nullptr)
: _state(State::NotIssued), _senderState(nullptr),
numTranslatedFragments(0),
numInTranslationFragments(0),
_port(*port), _inst(inst), _data(data),
_res(res), _addr(addr), _size(size),
_flags(flags_),
_numOutstandingPackets(0),
_amo_op(std::move(amo_op))
{
flags.set(Flag::IsLoad, isLoad);
flags.set(Flag::WbStore,
_inst->isStoreConditional() || _inst->isAtomic());
flags.set(Flag::IsAtomic, _inst->isAtomic());
install();
}
LSQRequest(LSQUnit<Impl>* port, const O3DynInstPtr& inst, bool isLoad);
LSQRequest(LSQUnit<Impl>* port, const O3DynInstPtr& inst, bool isLoad,
const Addr& addr, const uint32_t& size,
const Request::Flags& flags_, PacketDataPtr data=nullptr,
uint64_t* res=nullptr, AtomicOpFunctorPtr amo_op=nullptr);
bool
isLoad() const
@@ -354,21 +328,9 @@ class LSQ
}
/** Install the request in the LQ/SQ. */
void install()
{
if (isLoad()) {
_port.loadQueue[_inst->lqIdx].setRequest(this);
} else {
// Store, StoreConditional, and Atomic requests are pushed
// to this storeQueue
_port.storeQueue[_inst->sqIdx].setRequest(this);
}
}
virtual bool
squashed() const override
{
return _inst->isSquashed();
}
void install();
bool squashed() const override;
/**
* Test if the LSQRequest has been released, i.e. self-owned.
@@ -391,7 +353,8 @@ class LSQ
* but there is any in-flight translation request to the TLB or access
* request to the memory.
*/
void release(Flag reason)
void
release(Flag reason)
{
assert(reason == Flag::LSQEntryFreed || reason == Flag::Discarded);
if (!isAnyOutstandingRequest()) {
@@ -410,35 +373,14 @@ class LSQ
* The request is only added if the mask is empty or if there is at
* least an active element in it.
*/
void
addRequest(Addr addr, unsigned size,
const std::vector<bool>& byte_enable)
{
if (isAnyActiveElement(byte_enable.begin(), byte_enable.end())) {
auto request = std::make_shared<Request>(
addr, size, _flags, _inst->requestorId(),
_inst->instAddr(), _inst->contextId(),
std::move(_amo_op));
request->setByteEnable(byte_enable);
_requests.push_back(request);
}
}
void addRequest(Addr addr, unsigned size,
const std::vector<bool>& byte_enable);
/** Destructor.
* The LSQRequest owns the request. If the packet has already been
* sent, the sender state will be deleted upon receiving the reply.
*/
virtual ~LSQRequest()
{
assert(!isAnyOutstandingRequest());
_inst->savedReq = nullptr;
if (_senderState)
delete _senderState;
for (auto r: _packets)
delete r;
};
virtual ~LSQRequest();
public:
/** Convenience getters/setters. */
@@ -450,7 +392,7 @@ class LSQ
request()->setContext(context_id);
}
const DynInstPtr&
const O3DynInstPtr&
instruction()
{
return _inst;
@@ -728,7 +670,7 @@ class LSQ
using LSQRequest::_numOutstandingPackets;
using LSQRequest::_amo_op;
public:
SingleDataRequest(LSQUnit<Impl>* port, const DynInstPtr& inst,
SingleDataRequest(LSQUnit<Impl>* port, const O3DynInstPtr& inst,
bool isLoad, const Addr& addr, const uint32_t& size,
const Request::Flags& flags_, PacketDataPtr data=nullptr,
uint64_t* res=nullptr, AtomicOpFunctorPtr amo_op=nullptr) :
@@ -766,7 +708,7 @@ class LSQ
using LSQRequest::flags;
using LSQRequest::setState;
public:
HtmCmdRequest(LSQUnit<Impl>* port, const DynInstPtr& inst,
HtmCmdRequest(LSQUnit<Impl>* port, const O3DynInstPtr& inst,
const Request::Flags& flags_);
inline virtual ~HtmCmdRequest() {}
virtual void initiateTranslation();
@@ -813,7 +755,7 @@ class LSQ
PacketPtr _mainPacket;
public:
SplitDataRequest(LSQUnit<Impl>* port, const DynInstPtr& inst,
SplitDataRequest(LSQUnit<Impl>* port, const O3DynInstPtr& inst,
bool isLoad, const Addr& addr, const uint32_t& size,
const Request::Flags & flags_, PacketDataPtr data=nullptr,
uint64_t* res=nullptr) :
@@ -876,15 +818,15 @@ class LSQ
void tick();
/** Inserts a load into the LSQ. */
void insertLoad(const DynInstPtr &load_inst);
void insertLoad(const O3DynInstPtr &load_inst);
/** Inserts a store into the LSQ. */
void insertStore(const DynInstPtr &store_inst);
void insertStore(const O3DynInstPtr &store_inst);
/** Executes a load. */
Fault executeLoad(const DynInstPtr &inst);
Fault executeLoad(const O3DynInstPtr &inst);
/** Executes a store. */
Fault executeStore(const DynInstPtr &inst);
Fault executeStore(const O3DynInstPtr &inst);
/**
* Commits loads up until the given sequence number for a specific thread.
@@ -924,7 +866,7 @@ class LSQ
bool violation(ThreadID tid) { return thread.at(tid).violation(); }
/** Gets the instruction that caused the memory ordering violation. */
DynInstPtr
O3DynInstPtr
getMemDepViolator(ThreadID tid)
{
return thread.at(tid).getMemDepViolator();
@@ -1103,7 +1045,7 @@ class LSQ
void recvTimingSnoopReq(PacketPtr pkt);
Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
Fault pushRequest(const O3DynInstPtr& inst, bool isLoad, uint8_t *data,
unsigned int size, Addr addr, Request::Flags flags,
uint64_t *res, AtomicOpFunctorPtr amo_op,
const std::vector<bool>& byte_enable);

View File

@@ -49,6 +49,7 @@
#include "base/compiler.hh"
#include "base/logging.hh"
#include "cpu/o3/cpu.hh"
#include "cpu/o3/dyn_inst.hh"
#include "cpu/o3/iew.hh"
#include "cpu/o3/limits.hh"
#include "cpu/o3/lsq.hh"
@@ -59,6 +60,13 @@
#include "debug/Writeback.hh"
#include "params/DerivO3CPU.hh"
template <class Impl>
ContextID
LSQ<Impl>::LSQSenderState::contextId()
{
return inst->contextId();
}
template <class Impl>
LSQ<Impl>::LSQ(O3CPU *cpu_ptr, DefaultIEW<Impl> *iew_ptr,
const DerivO3CPUParams &params)
@@ -220,7 +228,7 @@ LSQ<Impl>::cachePortBusy(bool is_load)
template<class Impl>
void
LSQ<Impl>::insertLoad(const DynInstPtr &load_inst)
LSQ<Impl>::insertLoad(const O3DynInstPtr &load_inst)
{
ThreadID tid = load_inst->threadNumber;
@@ -229,7 +237,7 @@ LSQ<Impl>::insertLoad(const DynInstPtr &load_inst)
template<class Impl>
void
LSQ<Impl>::insertStore(const DynInstPtr &store_inst)
LSQ<Impl>::insertStore(const O3DynInstPtr &store_inst)
{
ThreadID tid = store_inst->threadNumber;
@@ -238,7 +246,7 @@ LSQ<Impl>::insertStore(const DynInstPtr &store_inst)
template<class Impl>
Fault
LSQ<Impl>::executeLoad(const DynInstPtr &inst)
LSQ<Impl>::executeLoad(const O3DynInstPtr &inst)
{
ThreadID tid = inst->threadNumber;
@@ -247,7 +255,7 @@ LSQ<Impl>::executeLoad(const DynInstPtr &inst)
template<class Impl>
Fault
LSQ<Impl>::executeStore(const DynInstPtr &inst)
LSQ<Impl>::executeStore(const O3DynInstPtr &inst)
{
ThreadID tid = inst->threadNumber;
@@ -676,7 +684,7 @@ LSQ<Impl>::dumpInsts() const
template<class Impl>
Fault
LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data,
LSQ<Impl>::pushRequest(const O3DynInstPtr& inst, bool isLoad, uint8_t *data,
unsigned int size, Addr addr, Request::Flags flags,
uint64_t *res, AtomicOpFunctorPtr amo_op,
const std::vector<bool>& byte_enable)
@@ -951,6 +959,85 @@ LSQ<Impl>::SplitDataRequest::initiateTranslation()
}
}
template<class Impl>
LSQ<Impl>::LSQRequest::LSQRequest(
LSQUnit<Impl> *port, const O3DynInstPtr& inst, bool isLoad) :
_state(State::NotIssued), _senderState(nullptr),
_port(*port), _inst(inst), _data(nullptr),
_res(nullptr), _addr(0), _size(0), _flags(0),
_numOutstandingPackets(0), _amo_op(nullptr)
{
flags.set(Flag::IsLoad, isLoad);
flags.set(Flag::WbStore,
_inst->isStoreConditional() || _inst->isAtomic());
flags.set(Flag::IsAtomic, _inst->isAtomic());
install();
}
template<class Impl>
LSQ<Impl>::LSQRequest::LSQRequest(
LSQUnit<Impl>* port, const O3DynInstPtr& inst, bool isLoad,
const Addr& addr, const uint32_t& size, const Request::Flags& flags_,
PacketDataPtr data, uint64_t* res, AtomicOpFunctorPtr amo_op)
: _state(State::NotIssued), _senderState(nullptr),
numTranslatedFragments(0),
numInTranslationFragments(0),
_port(*port), _inst(inst), _data(data),
_res(res), _addr(addr), _size(size),
_flags(flags_),
_numOutstandingPackets(0),
_amo_op(std::move(amo_op))
{
flags.set(Flag::IsLoad, isLoad);
flags.set(Flag::WbStore,
_inst->isStoreConditional() || _inst->isAtomic());
flags.set(Flag::IsAtomic, _inst->isAtomic());
install();
}
template<class Impl>
void
LSQ<Impl>::LSQRequest::install()
{
if (isLoad()) {
_port.loadQueue[_inst->lqIdx].setRequest(this);
} else {
// Store, StoreConditional, and Atomic requests are pushed
// to this storeQueue
_port.storeQueue[_inst->sqIdx].setRequest(this);
}
}
template<class Impl>
bool LSQ<Impl>::LSQRequest::squashed() const { return _inst->isSquashed(); }
template<class Impl>
void
LSQ<Impl>::LSQRequest::addRequest(Addr addr, unsigned size,
const std::vector<bool>& byte_enable)
{
if (isAnyActiveElement(byte_enable.begin(), byte_enable.end())) {
auto request = std::make_shared<Request>(
addr, size, _flags, _inst->requestorId(),
_inst->instAddr(), _inst->contextId(),
std::move(_amo_op));
request->setByteEnable(byte_enable);
_requests.push_back(request);
}
}
template<class Impl>
LSQ<Impl>::LSQRequest::~LSQRequest()
{
assert(!isAnyOutstandingRequest());
_inst->savedReq = nullptr;
if (_senderState)
delete _senderState;
for (auto r: _packets)
delete r;
};
template<class Impl>
void
LSQ<Impl>::LSQRequest::sendFragmentToTranslation(int i)
@@ -1226,7 +1313,7 @@ LSQ<Impl>::DcachePort::recvReqRetry()
template<class Impl>
LSQ<Impl>::HtmCmdRequest::HtmCmdRequest(LSQUnit<Impl>* port,
const DynInstPtr& inst,
const O3DynInstPtr& inst,
const Request::Flags& flags_) :
SingleDataRequest(port, inst, true, 0x0lu, 8, flags_,
nullptr, nullptr, nullptr)

View File

@@ -53,6 +53,7 @@
#include "arch/locked_mem.hh"
#include "config/the_isa.hh"
#include "cpu/inst_seq.hh"
#include "cpu/o3/dyn_inst_ptr.hh"
#include "cpu/o3/lsq.hh"
#include "cpu/timebuf.hh"
#include "debug/HtmCpu.hh"
@@ -85,7 +86,6 @@ class LSQUnit
static constexpr auto MaxDataBytes = MaxVecRegLenInBytes;
typedef typename Impl::O3CPU O3CPU;
typedef typename Impl::DynInstPtr DynInstPtr;
typedef typename Impl::IssueStruct IssueStruct;
using LSQSenderState = typename LSQ<Impl>::LSQSenderState;
@@ -95,23 +95,17 @@ class LSQUnit
{
private:
/** The instruction. */
DynInstPtr inst;
O3DynInstPtr inst;
/** The request. */
LSQRequest* req;
LSQRequest* req = nullptr;
/** The size of the operation. */
uint32_t _size;
uint32_t _size = 0;
/** Valid entry. */
bool _valid;
public:
/** Constructs an empty store queue entry. */
LSQEntry()
: inst(nullptr), req(nullptr), _size(0), _valid(false)
{
}
bool _valid = false;
public:
~LSQEntry()
{
inst = nullptr;
if (req != nullptr) {
req->freeLSQEntry();
req = nullptr;
@@ -131,13 +125,14 @@ class LSQUnit
}
void
set(const DynInstPtr& inst)
set(const O3DynInstPtr& inst)
{
assert(!_valid);
this->inst = inst;
_valid = true;
_size = 0;
}
LSQRequest* request() { return req; }
void setRequest(LSQRequest* r) { req = r; }
bool hasRequest() { return req != nullptr; }
@@ -146,7 +141,7 @@ class LSQUnit
bool valid() const { return _valid; }
uint32_t& size() { return _size; }
const uint32_t& size() const { return _size; }
const DynInstPtr& instruction() const { return inst; }
const O3DynInstPtr& instruction() const { return inst; }
/** @} */
};
@@ -156,32 +151,27 @@ class LSQUnit
/** The store data. */
char _data[MaxDataBytes];
/** Whether or not the store can writeback. */
bool _canWB;
bool _canWB = false;
/** Whether or not the store is committed. */
bool _committed;
bool _committed = false;
/** Whether or not the store is completed. */
bool _completed;
bool _completed = false;
/** Does this request write all zeros and thus doesn't
* have any data attached to it. Used for cache block zero
* style instructs (ARM DC ZVA; ALPHA WH64)
*/
bool _isAllZeros;
bool _isAllZeros = false;
public:
static constexpr size_t DataSize = sizeof(_data);
/** Constructs an empty store queue entry. */
SQEntry()
: _canWB(false), _committed(false), _completed(false),
_isAllZeros(false)
{
std::memset(_data, 0, DataSize);
}
~SQEntry()
{
}
void
set(const DynInstPtr& inst)
set(const O3DynInstPtr& inst)
{
LSQEntry::set(inst);
}
@@ -192,6 +182,7 @@ class LSQUnit
LSQEntry::clear();
_canWB = _completed = _committed = _isAllZeros = false;
}
/** Member accessors. */
/** @{ */
bool& canWB() { return _canWB; }
@@ -250,11 +241,11 @@ class LSQUnit
void takeOverFrom();
/** Inserts an instruction. */
void insert(const DynInstPtr &inst);
void insert(const O3DynInstPtr &inst);
/** Inserts a load instruction. */
void insertLoad(const DynInstPtr &load_inst);
void insertLoad(const O3DynInstPtr &load_inst);
/** Inserts a store instruction. */
void insertStore(const DynInstPtr &store_inst);
void insertStore(const O3DynInstPtr &store_inst);
/** Check for ordering violations in the LSQ. For a store squash if we
* ever find a conflicting load. For a load, only squash if we
@@ -263,7 +254,7 @@ class LSQUnit
* @param inst the instruction to check
*/
Fault checkViolations(typename LoadQueue::iterator& loadIt,
const DynInstPtr& inst);
const O3DynInstPtr& inst);
/** Check if an incoming invalidate hits in the lsq on a load
* that might have issued out of order wrt another load beacuse
@@ -272,11 +263,11 @@ class LSQUnit
void checkSnoop(PacketPtr pkt);
/** Executes a load instruction. */
Fault executeLoad(const DynInstPtr &inst);
Fault executeLoad(const O3DynInstPtr &inst);
Fault executeLoad(int lq_idx) { panic("Not implemented"); return NoFault; }
/** Executes a store instruction. */
Fault executeStore(const DynInstPtr &inst);
Fault executeStore(const O3DynInstPtr &inst);
/** Commits the head load. */
void commitLoad();
@@ -302,7 +293,7 @@ class LSQUnit
bool violation() { return memDepViolator; }
/** Returns the memory ordering violator. */
DynInstPtr getMemDepViolator();
O3DynInstPtr getMemDepViolator();
/** Returns the number of free LQ entries. */
unsigned numFreeLoadEntries();
@@ -378,7 +369,7 @@ class LSQUnit
void resetState();
/** Writes back the instruction, sending it to IEW. */
void writeback(const DynInstPtr &inst, PacketPtr pkt);
void writeback(const O3DynInstPtr &inst, PacketPtr pkt);
/** Try to finish a previously blocked write back attempt */
void writebackBlockedStore();
@@ -460,7 +451,7 @@ class LSQUnit
{
public:
/** Constructs a writeback event. */
WritebackEvent(const DynInstPtr &_inst, PacketPtr pkt,
WritebackEvent(const O3DynInstPtr &_inst, PacketPtr pkt,
LSQUnit *lsq_ptr);
/** Processes the writeback event. */
@@ -471,7 +462,7 @@ class LSQUnit
private:
/** Instruction whose results are being written back. */
DynInstPtr inst;
O3DynInstPtr inst;
/** The packet that would have been sent to memory. */
PacketPtr pkt;
@@ -552,7 +543,7 @@ class LSQUnit
bool storeInFlight;
/** The oldest load that caused a memory ordering violation. */
DynInstPtr memDepViolator;
O3DynInstPtr memDepViolator;
/** Whether or not there is a packet that couldn't be sent because of
* a lack of cache ports. */
@@ -634,357 +625,4 @@ class LSQUnit
typedef CircularQueue<SQEntry> SQueue;
};
template <class Impl>
Fault
LSQUnit<Impl>::read(LSQRequest *req, int load_idx)
{
LQEntry& load_req = loadQueue[load_idx];
const DynInstPtr& load_inst = load_req.instruction();
load_req.setRequest(req);
assert(load_inst);
assert(!load_inst->isExecuted());
// Make sure this isn't a strictly ordered load
// A bit of a hackish way to get strictly ordered accesses to work
// only if they're at the head of the LSQ and are ready to commit
// (at the head of the ROB too).
if (req->mainRequest()->isStrictlyOrdered() &&
(load_idx != loadQueue.head() || !load_inst->isAtCommit())) {
// Tell IQ/mem dep unit that this instruction will need to be
// rescheduled eventually
iewStage->rescheduleMemInst(load_inst);
load_inst->clearIssued();
load_inst->effAddrValid(false);
++stats.rescheduledLoads;
DPRINTF(LSQUnit, "Strictly ordered load [sn:%lli] PC %s\n",
load_inst->seqNum, load_inst->pcState());
// Must delete request now that it wasn't handed off to
// memory. This is quite ugly. @todo: Figure out the proper
// place to really handle request deletes.
load_req.setRequest(nullptr);
req->discard();
return std::make_shared<GenericISA::M5PanicFault>(
"Strictly ordered load [sn:%llx] PC %s\n",
load_inst->seqNum, load_inst->pcState());
}
DPRINTF(LSQUnit, "Read called, load idx: %i, store idx: %i, "
"storeHead: %i addr: %#x%s\n",
load_idx - 1, load_inst->sqIt._idx, storeQueue.head() - 1,
req->mainRequest()->getPaddr(), req->isSplit() ? " split" : "");
if (req->mainRequest()->isLLSC()) {
// Disable recording the result temporarily. Writing to misc
// regs normally updates the result, but this is not the
// desired behavior when handling store conditionals.
load_inst->recordResult(false);
TheISA::handleLockedRead(load_inst.get(), req->mainRequest());
load_inst->recordResult(true);
}
if (req->mainRequest()->isLocalAccess()) {
assert(!load_inst->memData);
assert(!load_inst->inHtmTransactionalState());
load_inst->memData = new uint8_t[MaxDataBytes];
ThreadContext *thread = cpu->tcBase(lsqID);
PacketPtr main_pkt = new Packet(req->mainRequest(), MemCmd::ReadReq);
main_pkt->dataStatic(load_inst->memData);
Cycles delay = req->mainRequest()->localAccessor(thread, main_pkt);
WritebackEvent *wb = new WritebackEvent(load_inst, main_pkt, this);
cpu->schedule(wb, cpu->clockEdge(delay));
return NoFault;
}
// hardware transactional memory
if (req->mainRequest()->isHTMStart() || req->mainRequest()->isHTMCommit())
{
// don't want to send nested transactionStarts and
// transactionStops outside of core, e.g. to Ruby
if (req->mainRequest()->getFlags().isSet(Request::NO_ACCESS)) {
Cycles delay(0);
PacketPtr data_pkt =
new Packet(req->mainRequest(), MemCmd::ReadReq);
// Allocate memory if this is the first time a load is issued.
if (!load_inst->memData) {
load_inst->memData =
new uint8_t[req->mainRequest()->getSize()];
// sanity checks espect zero in request's data
memset(load_inst->memData, 0, req->mainRequest()->getSize());
}
data_pkt->dataStatic(load_inst->memData);
if (load_inst->inHtmTransactionalState()) {
data_pkt->setHtmTransactional(
load_inst->getHtmTransactionUid());
}
data_pkt->makeResponse();
WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this);
cpu->schedule(wb, cpu->clockEdge(delay));
return NoFault;
}
}
// Check the SQ for any previous stores that might lead to forwarding
auto store_it = load_inst->sqIt;
assert (store_it >= storeWBIt);
// End once we've reached the top of the LSQ
while (store_it != storeWBIt) {
// Move the index to one younger
store_it--;
assert(store_it->valid());
assert(store_it->instruction()->seqNum < load_inst->seqNum);
int store_size = store_it->size();
// Cache maintenance instructions go down via the store
// path but they carry no data and they shouldn't be
// considered for forwarding
if (store_size != 0 && !store_it->instruction()->strictlyOrdered() &&
!(store_it->request()->mainRequest() &&
store_it->request()->mainRequest()->isCacheMaintenance())) {
assert(store_it->instruction()->effAddrValid());
// Check if the store data is within the lower and upper bounds of
// addresses that the request needs.
auto req_s = req->mainRequest()->getVaddr();
auto req_e = req_s + req->mainRequest()->getSize();
auto st_s = store_it->instruction()->effAddr;
auto st_e = st_s + store_size;
bool store_has_lower_limit = req_s >= st_s;
bool store_has_upper_limit = req_e <= st_e;
bool lower_load_has_store_part = req_s < st_e;
bool upper_load_has_store_part = req_e > st_s;
auto coverage = AddrRangeCoverage::NoAddrRangeCoverage;
// If the store entry is not atomic (atomic does not have valid
// data), the store has all of the data needed, and
// the load is not LLSC, then
// we can forward data from the store to the load
if (!store_it->instruction()->isAtomic() &&
store_has_lower_limit && store_has_upper_limit &&
!req->mainRequest()->isLLSC()) {
const auto& store_req = store_it->request()->mainRequest();
coverage = store_req->isMasked() ?
AddrRangeCoverage::PartialAddrRangeCoverage :
AddrRangeCoverage::FullAddrRangeCoverage;
} else if (
// This is the partial store-load forwarding case where a store
// has only part of the load's data and the load isn't LLSC
(!req->mainRequest()->isLLSC() &&
((store_has_lower_limit && lower_load_has_store_part) ||
(store_has_upper_limit && upper_load_has_store_part) ||
(lower_load_has_store_part && upper_load_has_store_part))) ||
// The load is LLSC, and the store has all or part of the
// load's data
(req->mainRequest()->isLLSC() &&
((store_has_lower_limit || upper_load_has_store_part) &&
(store_has_upper_limit || lower_load_has_store_part))) ||
// The store entry is atomic and has all or part of the load's
// data
(store_it->instruction()->isAtomic() &&
((store_has_lower_limit || upper_load_has_store_part) &&
(store_has_upper_limit || lower_load_has_store_part)))) {
coverage = AddrRangeCoverage::PartialAddrRangeCoverage;
}
if (coverage == AddrRangeCoverage::FullAddrRangeCoverage) {
// Get shift amount for offset into the store's data.
int shift_amt = req->mainRequest()->getVaddr() -
store_it->instruction()->effAddr;
// Allocate memory if this is the first time a load is issued.
if (!load_inst->memData) {
load_inst->memData =
new uint8_t[req->mainRequest()->getSize()];
}
if (store_it->isAllZeros())
memset(load_inst->memData, 0,
req->mainRequest()->getSize());
else
memcpy(load_inst->memData,
store_it->data() + shift_amt,
req->mainRequest()->getSize());
DPRINTF(LSQUnit, "Forwarding from store idx %i to load to "
"addr %#x\n", store_it._idx,
req->mainRequest()->getVaddr());
PacketPtr data_pkt = new Packet(req->mainRequest(),
MemCmd::ReadReq);
data_pkt->dataStatic(load_inst->memData);
// hardware transactional memory
// Store to load forwarding within a transaction
// This should be okay because the store will be sent to
// the memory subsystem and subsequently get added to the
// write set of the transaction. The write set has a stronger
// property than the read set, so the load doesn't necessarily
// have to be there.
assert(!req->mainRequest()->isHTMCmd());
if (load_inst->inHtmTransactionalState()) {
assert (!storeQueue[store_it._idx].completed());
assert (
storeQueue[store_it._idx].instruction()->
inHtmTransactionalState());
assert (
load_inst->getHtmTransactionUid() ==
storeQueue[store_it._idx].instruction()->
getHtmTransactionUid());
data_pkt->setHtmTransactional(
load_inst->getHtmTransactionUid());
DPRINTF(HtmCpu, "HTM LD (ST2LDF) "
"pc=0x%lx - vaddr=0x%lx - "
"paddr=0x%lx - htmUid=%u\n",
load_inst->instAddr(),
data_pkt->req->hasVaddr() ?
data_pkt->req->getVaddr() : 0lu,
data_pkt->getAddr(),
load_inst->getHtmTransactionUid());
}
if (req->isAnyOutstandingRequest()) {
assert(req->_numOutstandingPackets > 0);
// There are memory requests packets in flight already.
// This may happen if the store was not complete the
// first time this load got executed. Signal the senderSate
// that response packets should be discarded.
req->discardSenderState();
}
WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt,
this);
// We'll say this has a 1 cycle load-store forwarding latency
// for now.
// @todo: Need to make this a parameter.
cpu->schedule(wb, curTick());
// Don't need to do anything special for split loads.
++stats.forwLoads;
return NoFault;
} else if (coverage == AddrRangeCoverage::PartialAddrRangeCoverage) {
// If it's already been written back, then don't worry about
// stalling on it.
if (store_it->completed()) {
panic("Should not check one of these");
continue;
}
// Must stall load and force it to retry, so long as it's the
// oldest load that needs to do so.
if (!stalled ||
(stalled &&
load_inst->seqNum <
loadQueue[stallingLoadIdx].instruction()->seqNum)) {
stalled = true;
stallingStoreIsn = store_it->instruction()->seqNum;
stallingLoadIdx = load_idx;
}
// Tell IQ/mem dep unit that this instruction will need to be
// rescheduled eventually
iewStage->rescheduleMemInst(load_inst);
load_inst->clearIssued();
load_inst->effAddrValid(false);
++stats.rescheduledLoads;
// Do not generate a writeback event as this instruction is not
// complete.
DPRINTF(LSQUnit, "Load-store forwarding mis-match. "
"Store idx %i to load addr %#x\n",
store_it._idx, req->mainRequest()->getVaddr());
// Must discard the request.
req->discard();
load_req.setRequest(nullptr);
return NoFault;
}
}
}
// If there's no forwarding case, then go access memory
DPRINTF(LSQUnit, "Doing memory access for inst [sn:%lli] PC %s\n",
load_inst->seqNum, load_inst->pcState());
// Allocate memory if this is the first time a load is issued.
if (!load_inst->memData) {
load_inst->memData = new uint8_t[req->mainRequest()->getSize()];
}
// hardware transactional memory
if (req->mainRequest()->isHTMCmd()) {
// this is a simple sanity check
// the Ruby cache controller will set
// memData to 0x0ul if successful.
*load_inst->memData = (uint64_t) 0x1ull;
}
// For now, load throughput is constrained by the number of
// load FUs only, and loads do not consume a cache port (only
// stores do).
// @todo We should account for cache port contention
// and arbitrate between loads and stores.
// if we the cache is not blocked, do cache access
if (req->senderState() == nullptr) {
LQSenderState *state = new LQSenderState(
loadQueue.getIterator(load_idx));
state->isLoad = true;
state->inst = load_inst;
state->isSplit = req->isSplit();
req->senderState(state);
}
req->buildPackets();
req->sendPacketToCache();
if (!req->isSent())
iewStage->blockMemInst(load_inst);
return NoFault;
}
template <class Impl>
Fault
LSQUnit<Impl>::write(LSQRequest *req, uint8_t *data, int store_idx)
{
assert(storeQueue[store_idx].valid());
DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x | storeHead:%i "
"[sn:%llu]\n",
store_idx - 1, req->request()->getPaddr(), storeQueue.head() - 1,
storeQueue[store_idx].instruction()->seqNum);
storeQueue[store_idx].setRequest(req);
unsigned size = req->_size;
storeQueue[store_idx].size() = size;
bool store_no_data =
req->mainRequest()->getFlags() & Request::STORE_NO_DATA;
storeQueue[store_idx].isAllZeros() = store_no_data;
assert(size <= SQEntry::DataSize || store_no_data);
// copy data into the storeQueue only if the store request has valid data
if (!(req->request()->getFlags() & Request::CACHE_BLOCK_ZERO) &&
!req->request()->isCacheMaintenance() &&
!req->request()->isAtomic())
memcpy(storeQueue[store_idx].data(), data, size);
// This function only writes the data to the store queue, so no fault
// can happen here.
return NoFault;
}
#endif // __CPU_O3_LSQ_UNIT_HH__

View File

@@ -60,7 +60,7 @@
#include "mem/request.hh"
template<class Impl>
LSQUnit<Impl>::WritebackEvent::WritebackEvent(const DynInstPtr &_inst,
LSQUnit<Impl>::WritebackEvent::WritebackEvent(const O3DynInstPtr &_inst,
PacketPtr _pkt, LSQUnit *lsq_ptr)
: Event(Default_Pri, AutoDelete),
inst(_inst), pkt(_pkt), lsqPtr(lsq_ptr)
@@ -112,7 +112,7 @@ void
LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
{
LSQSenderState *state = dynamic_cast<LSQSenderState *>(pkt->senderState);
DynInstPtr inst = state->inst;
O3DynInstPtr inst = state->inst;
// hardware transactional memory
// sanity check
@@ -317,7 +317,7 @@ LSQUnit<Impl>::takeOverFrom()
template <class Impl>
void
LSQUnit<Impl>::insert(const DynInstPtr &inst)
LSQUnit<Impl>::insert(const O3DynInstPtr &inst)
{
assert(inst->isMemRef());
@@ -334,7 +334,7 @@ LSQUnit<Impl>::insert(const DynInstPtr &inst)
template <class Impl>
void
LSQUnit<Impl>::insertLoad(const DynInstPtr &load_inst)
LSQUnit<Impl>::insertLoad(const O3DynInstPtr &load_inst)
{
assert(!loadQueue.full());
assert(loads < loadQueue.capacity());
@@ -397,7 +397,7 @@ LSQUnit<Impl>::insertLoad(const DynInstPtr &load_inst)
template <class Impl>
void
LSQUnit<Impl>::insertStore(const DynInstPtr& store_inst)
LSQUnit<Impl>::insertStore(const O3DynInstPtr& store_inst)
{
// Make sure it is not full before inserting an instruction.
assert(!storeQueue.full());
@@ -418,10 +418,10 @@ LSQUnit<Impl>::insertStore(const DynInstPtr& store_inst)
}
template <class Impl>
typename Impl::DynInstPtr
O3DynInstPtr
LSQUnit<Impl>::getMemDepViolator()
{
DynInstPtr temp = memDepViolator;
O3DynInstPtr temp = memDepViolator;
memDepViolator = NULL;
@@ -475,7 +475,7 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt)
Addr invalidate_addr = pkt->getAddr() & cacheBlockMask;
DynInstPtr ld_inst = iter->instruction();
O3DynInstPtr ld_inst = iter->instruction();
assert(ld_inst);
LSQRequest *req = iter->request();
@@ -535,7 +535,7 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt)
template <class Impl>
Fault
LSQUnit<Impl>::checkViolations(typename LoadQueue::iterator& loadIt,
const DynInstPtr& inst)
const O3DynInstPtr& inst)
{
Addr inst_eff_addr1 = inst->effAddr >> depCheckShift;
Addr inst_eff_addr2 = (inst->effAddr + inst->effSize - 1) >> depCheckShift;
@@ -546,7 +546,7 @@ LSQUnit<Impl>::checkViolations(typename LoadQueue::iterator& loadIt,
* like the implementation that came before it, we're overly conservative.
*/
while (loadIt != loadQueue.end()) {
DynInstPtr ld_inst = loadIt->instruction();
O3DynInstPtr ld_inst = loadIt->instruction();
if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) {
++loadIt;
continue;
@@ -615,7 +615,7 @@ LSQUnit<Impl>::checkViolations(typename LoadQueue::iterator& loadIt,
template <class Impl>
Fault
LSQUnit<Impl>::executeLoad(const DynInstPtr &inst)
LSQUnit<Impl>::executeLoad(const O3DynInstPtr &inst)
{
// Execute a specific load.
Fault load_fault = NoFault;
@@ -682,7 +682,7 @@ LSQUnit<Impl>::executeLoad(const DynInstPtr &inst)
template <class Impl>
Fault
LSQUnit<Impl>::executeStore(const DynInstPtr &store_inst)
LSQUnit<Impl>::executeStore(const O3DynInstPtr &store_inst)
{
// Make sure that a store exists.
assert(stores != 0);
@@ -837,7 +837,7 @@ LSQUnit<Impl>::writebackStores()
assert(storeWBIt->hasRequest());
assert(!storeWBIt->committed());
DynInstPtr inst = storeWBIt->instruction();
O3DynInstPtr inst = storeWBIt->instruction();
LSQRequest* req = storeWBIt->request();
// Process store conditionals or store release after all previous
@@ -1095,7 +1095,7 @@ LSQUnit<Impl>::storePostSend()
template <class Impl>
void
LSQUnit<Impl>::writeback(const DynInstPtr &inst, PacketPtr pkt)
LSQUnit<Impl>::writeback(const O3DynInstPtr &inst, PacketPtr pkt)
{
iewStage->wakeCPU();
@@ -1170,7 +1170,7 @@ LSQUnit<Impl>::completeStore(typename StoreQueue::iterator store_idx)
/* We 'need' a copy here because we may clear the entry from the
* store queue. */
DynInstPtr store_inst = store_idx->instruction();
O3DynInstPtr store_inst = store_idx->instruction();
if (store_idx == storeQueue.begin()) {
do {
storeQueue.front().clear();
@@ -1279,7 +1279,7 @@ LSQUnit<Impl>::dumpInsts() const
cprintf("Load queue: ");
for (const auto& e: loadQueue) {
const DynInstPtr &inst(e.instruction());
const O3DynInstPtr &inst(e.instruction());
cprintf("%s.[sn:%llu] ", inst->pcState(), inst->seqNum);
}
cprintf("\n");
@@ -1288,7 +1288,7 @@ LSQUnit<Impl>::dumpInsts() const
cprintf("Store queue: ");
for (const auto& e: storeQueue) {
const DynInstPtr &inst(e.instruction());
const O3DynInstPtr &inst(e.instruction());
cprintf("%s.[sn:%llu] ", inst->pcState(), inst->seqNum);
}
@@ -1302,4 +1302,358 @@ LSQUnit<Impl>::cacheLineSize()
return cpu->cacheLineSize();
}
template <class Impl>
Fault
LSQUnit<Impl>::read(LSQRequest *req, int load_idx)
{
LQEntry& load_req = loadQueue[load_idx];
const O3DynInstPtr& load_inst = load_req.instruction();
load_req.setRequest(req);
assert(load_inst);
assert(!load_inst->isExecuted());
// Make sure this isn't a strictly ordered load
// A bit of a hackish way to get strictly ordered accesses to work
// only if they're at the head of the LSQ and are ready to commit
// (at the head of the ROB too).
if (req->mainRequest()->isStrictlyOrdered() &&
(load_idx != loadQueue.head() || !load_inst->isAtCommit())) {
// Tell IQ/mem dep unit that this instruction will need to be
// rescheduled eventually
iewStage->rescheduleMemInst(load_inst);
load_inst->clearIssued();
load_inst->effAddrValid(false);
++stats.rescheduledLoads;
DPRINTF(LSQUnit, "Strictly ordered load [sn:%lli] PC %s\n",
load_inst->seqNum, load_inst->pcState());
// Must delete request now that it wasn't handed off to
// memory. This is quite ugly. @todo: Figure out the proper
// place to really handle request deletes.
load_req.setRequest(nullptr);
req->discard();
return std::make_shared<GenericISA::M5PanicFault>(
"Strictly ordered load [sn:%llx] PC %s\n",
load_inst->seqNum, load_inst->pcState());
}
DPRINTF(LSQUnit, "Read called, load idx: %i, store idx: %i, "
"storeHead: %i addr: %#x%s\n",
load_idx - 1, load_inst->sqIt._idx, storeQueue.head() - 1,
req->mainRequest()->getPaddr(), req->isSplit() ? " split" : "");
if (req->mainRequest()->isLLSC()) {
// Disable recording the result temporarily. Writing to misc
// regs normally updates the result, but this is not the
// desired behavior when handling store conditionals.
load_inst->recordResult(false);
TheISA::handleLockedRead(load_inst.get(), req->mainRequest());
load_inst->recordResult(true);
}
if (req->mainRequest()->isLocalAccess()) {
assert(!load_inst->memData);
assert(!load_inst->inHtmTransactionalState());
load_inst->memData = new uint8_t[MaxDataBytes];
ThreadContext *thread = cpu->tcBase(lsqID);
PacketPtr main_pkt = new Packet(req->mainRequest(), MemCmd::ReadReq);
main_pkt->dataStatic(load_inst->memData);
Cycles delay = req->mainRequest()->localAccessor(thread, main_pkt);
WritebackEvent *wb = new WritebackEvent(load_inst, main_pkt, this);
cpu->schedule(wb, cpu->clockEdge(delay));
return NoFault;
}
// hardware transactional memory
if (req->mainRequest()->isHTMStart() || req->mainRequest()->isHTMCommit())
{
// don't want to send nested transactionStarts and
// transactionStops outside of core, e.g. to Ruby
if (req->mainRequest()->getFlags().isSet(Request::NO_ACCESS)) {
Cycles delay(0);
PacketPtr data_pkt =
new Packet(req->mainRequest(), MemCmd::ReadReq);
// Allocate memory if this is the first time a load is issued.
if (!load_inst->memData) {
load_inst->memData =
new uint8_t[req->mainRequest()->getSize()];
// sanity checks espect zero in request's data
memset(load_inst->memData, 0, req->mainRequest()->getSize());
}
data_pkt->dataStatic(load_inst->memData);
if (load_inst->inHtmTransactionalState()) {
data_pkt->setHtmTransactional(
load_inst->getHtmTransactionUid());
}
data_pkt->makeResponse();
WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this);
cpu->schedule(wb, cpu->clockEdge(delay));
return NoFault;
}
}
// Check the SQ for any previous stores that might lead to forwarding
auto store_it = load_inst->sqIt;
assert (store_it >= storeWBIt);
// End once we've reached the top of the LSQ
while (store_it != storeWBIt) {
// Move the index to one younger
store_it--;
assert(store_it->valid());
assert(store_it->instruction()->seqNum < load_inst->seqNum);
int store_size = store_it->size();
// Cache maintenance instructions go down via the store
// path but they carry no data and they shouldn't be
// considered for forwarding
if (store_size != 0 && !store_it->instruction()->strictlyOrdered() &&
!(store_it->request()->mainRequest() &&
store_it->request()->mainRequest()->isCacheMaintenance())) {
assert(store_it->instruction()->effAddrValid());
// Check if the store data is within the lower and upper bounds of
// addresses that the request needs.
auto req_s = req->mainRequest()->getVaddr();
auto req_e = req_s + req->mainRequest()->getSize();
auto st_s = store_it->instruction()->effAddr;
auto st_e = st_s + store_size;
bool store_has_lower_limit = req_s >= st_s;
bool store_has_upper_limit = req_e <= st_e;
bool lower_load_has_store_part = req_s < st_e;
bool upper_load_has_store_part = req_e > st_s;
auto coverage = AddrRangeCoverage::NoAddrRangeCoverage;
// If the store entry is not atomic (atomic does not have valid
// data), the store has all of the data needed, and
// the load is not LLSC, then
// we can forward data from the store to the load
if (!store_it->instruction()->isAtomic() &&
store_has_lower_limit && store_has_upper_limit &&
!req->mainRequest()->isLLSC()) {
const auto& store_req = store_it->request()->mainRequest();
coverage = store_req->isMasked() ?
AddrRangeCoverage::PartialAddrRangeCoverage :
AddrRangeCoverage::FullAddrRangeCoverage;
} else if (
// This is the partial store-load forwarding case where a store
// has only part of the load's data and the load isn't LLSC
(!req->mainRequest()->isLLSC() &&
((store_has_lower_limit && lower_load_has_store_part) ||
(store_has_upper_limit && upper_load_has_store_part) ||
(lower_load_has_store_part && upper_load_has_store_part))) ||
// The load is LLSC, and the store has all or part of the
// load's data
(req->mainRequest()->isLLSC() &&
((store_has_lower_limit || upper_load_has_store_part) &&
(store_has_upper_limit || lower_load_has_store_part))) ||
// The store entry is atomic and has all or part of the load's
// data
(store_it->instruction()->isAtomic() &&
((store_has_lower_limit || upper_load_has_store_part) &&
(store_has_upper_limit || lower_load_has_store_part)))) {
coverage = AddrRangeCoverage::PartialAddrRangeCoverage;
}
if (coverage == AddrRangeCoverage::FullAddrRangeCoverage) {
// Get shift amount for offset into the store's data.
int shift_amt = req->mainRequest()->getVaddr() -
store_it->instruction()->effAddr;
// Allocate memory if this is the first time a load is issued.
if (!load_inst->memData) {
load_inst->memData =
new uint8_t[req->mainRequest()->getSize()];
}
if (store_it->isAllZeros())
memset(load_inst->memData, 0,
req->mainRequest()->getSize());
else
memcpy(load_inst->memData,
store_it->data() + shift_amt,
req->mainRequest()->getSize());
DPRINTF(LSQUnit, "Forwarding from store idx %i to load to "
"addr %#x\n", store_it._idx,
req->mainRequest()->getVaddr());
PacketPtr data_pkt = new Packet(req->mainRequest(),
MemCmd::ReadReq);
data_pkt->dataStatic(load_inst->memData);
// hardware transactional memory
// Store to load forwarding within a transaction
// This should be okay because the store will be sent to
// the memory subsystem and subsequently get added to the
// write set of the transaction. The write set has a stronger
// property than the read set, so the load doesn't necessarily
// have to be there.
assert(!req->mainRequest()->isHTMCmd());
if (load_inst->inHtmTransactionalState()) {
assert (!storeQueue[store_it._idx].completed());
assert (
storeQueue[store_it._idx].instruction()->
inHtmTransactionalState());
assert (
load_inst->getHtmTransactionUid() ==
storeQueue[store_it._idx].instruction()->
getHtmTransactionUid());
data_pkt->setHtmTransactional(
load_inst->getHtmTransactionUid());
DPRINTF(HtmCpu, "HTM LD (ST2LDF) "
"pc=0x%lx - vaddr=0x%lx - "
"paddr=0x%lx - htmUid=%u\n",
load_inst->instAddr(),
data_pkt->req->hasVaddr() ?
data_pkt->req->getVaddr() : 0lu,
data_pkt->getAddr(),
load_inst->getHtmTransactionUid());
}
if (req->isAnyOutstandingRequest()) {
assert(req->_numOutstandingPackets > 0);
// There are memory requests packets in flight already.
// This may happen if the store was not complete the
// first time this load got executed. Signal the senderSate
// that response packets should be discarded.
req->discardSenderState();
}
WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt,
this);
// We'll say this has a 1 cycle load-store forwarding latency
// for now.
// @todo: Need to make this a parameter.
cpu->schedule(wb, curTick());
// Don't need to do anything special for split loads.
++stats.forwLoads;
return NoFault;
} else if (
coverage == AddrRangeCoverage::PartialAddrRangeCoverage) {
// If it's already been written back, then don't worry about
// stalling on it.
if (store_it->completed()) {
panic("Should not check one of these");
continue;
}
// Must stall load and force it to retry, so long as it's the
// oldest load that needs to do so.
if (!stalled ||
(stalled &&
load_inst->seqNum <
loadQueue[stallingLoadIdx].instruction()->seqNum)) {
stalled = true;
stallingStoreIsn = store_it->instruction()->seqNum;
stallingLoadIdx = load_idx;
}
// Tell IQ/mem dep unit that this instruction will need to be
// rescheduled eventually
iewStage->rescheduleMemInst(load_inst);
load_inst->clearIssued();
load_inst->effAddrValid(false);
++stats.rescheduledLoads;
// Do not generate a writeback event as this instruction is not
// complete.
DPRINTF(LSQUnit, "Load-store forwarding mis-match. "
"Store idx %i to load addr %#x\n",
store_it._idx, req->mainRequest()->getVaddr());
// Must discard the request.
req->discard();
load_req.setRequest(nullptr);
return NoFault;
}
}
}
// If there's no forwarding case, then go access memory
DPRINTF(LSQUnit, "Doing memory access for inst [sn:%lli] PC %s\n",
load_inst->seqNum, load_inst->pcState());
// Allocate memory if this is the first time a load is issued.
if (!load_inst->memData) {
load_inst->memData = new uint8_t[req->mainRequest()->getSize()];
}
// hardware transactional memory
if (req->mainRequest()->isHTMCmd()) {
// this is a simple sanity check
// the Ruby cache controller will set
// memData to 0x0ul if successful.
*load_inst->memData = (uint64_t) 0x1ull;
}
// For now, load throughput is constrained by the number of
// load FUs only, and loads do not consume a cache port (only
// stores do).
// @todo We should account for cache port contention
// and arbitrate between loads and stores.
// if we the cache is not blocked, do cache access
if (req->senderState() == nullptr) {
LQSenderState *state = new LQSenderState(
loadQueue.getIterator(load_idx));
state->isLoad = true;
state->inst = load_inst;
state->isSplit = req->isSplit();
req->senderState(state);
}
req->buildPackets();
req->sendPacketToCache();
if (!req->isSent())
iewStage->blockMemInst(load_inst);
return NoFault;
}
template <class Impl>
Fault
LSQUnit<Impl>::write(LSQRequest *req, uint8_t *data, int store_idx)
{
    // The entry at store_idx must have been allocated by an earlier insert.
    auto &sq_entry = storeQueue[store_idx];
    assert(sq_entry.valid());

    DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x | storeHead:%i "
            "[sn:%llu]\n",
            store_idx - 1, req->request()->getPaddr(), storeQueue.head() - 1,
            sq_entry.instruction()->seqNum);

    // Attach the memory request to the store queue entry and record its size.
    sq_entry.setRequest(req);
    const unsigned write_size = req->_size;
    sq_entry.size() = write_size;

    // STORE_NO_DATA requests carry no payload; the entry is marked as
    // all-zeros instead of holding copied bytes.
    const bool no_data =
        req->mainRequest()->getFlags() & Request::STORE_NO_DATA;
    sq_entry.isAllZeros() = no_data;
    assert(write_size <= SQEntry::DataSize || no_data);

    // Copy the payload into the store queue only when the request actually
    // provides one: cache-block-zero, cache maintenance and atomic requests
    // have no data to stage here.
    const bool has_payload =
        !(req->request()->getFlags() & Request::CACHE_BLOCK_ZERO) &&
        !req->request()->isCacheMaintenance() &&
        !req->request()->isAtomic();
    if (has_payload)
        memcpy(sq_entry.data(), data, write_size);

    // Staging data into the store queue itself can never fault.
    return NoFault;
}
#endif//__CPU_O3_LSQ_UNIT_IMPL_HH__

View File

@@ -49,6 +49,7 @@
#include "base/statistics.hh"
#include "cpu/inst_seq.hh"
#include "cpu/o3/dyn_inst_ptr.hh"
#include "cpu/o3/limits.hh"
#include "debug/MemDepUnit.hh"
@@ -85,8 +86,6 @@ class MemDepUnit
std::string _name;
public:
typedef typename Impl::DynInstPtr DynInstPtr;
typedef typename Impl::DynInstConstPtr DynInstConstPtr;
typedef typename Impl::O3CPU O3CPU;
/** Empty constructor. Must call init() prior to using in this case. */
@@ -117,22 +116,22 @@ class MemDepUnit
void setIQ(InstructionQueue<Impl> *iq_ptr);
/** Inserts a memory instruction. */
void insert(const DynInstPtr &inst);
void insert(const O3DynInstPtr &inst);
/** Inserts a non-speculative memory instruction. */
void insertNonSpec(const DynInstPtr &inst);
void insertNonSpec(const O3DynInstPtr &inst);
/** Inserts a barrier instruction. */
void insertBarrier(const DynInstPtr &barr_inst);
void insertBarrier(const O3DynInstPtr &barr_inst);
/** Indicate that an instruction has its registers ready. */
void regsReady(const DynInstPtr &inst);
void regsReady(const O3DynInstPtr &inst);
/** Indicate that a non-speculative instruction is ready. */
void nonSpecInstReady(const DynInstPtr &inst);
void nonSpecInstReady(const O3DynInstPtr &inst);
/** Reschedules an instruction to be re-executed. */
void reschedule(const DynInstPtr &inst);
void reschedule(const O3DynInstPtr &inst);
/** Replays all instructions that have been rescheduled by moving them to
* the ready list.
@@ -140,7 +139,7 @@ class MemDepUnit
void replay();
/** Notifies completion of an instruction. */
void completeInst(const DynInstPtr &inst);
void completeInst(const O3DynInstPtr &inst);
/** Squashes all instructions up until a given sequence number for a
* specific thread.
@@ -148,11 +147,11 @@ class MemDepUnit
void squash(const InstSeqNum &squashed_num, ThreadID tid);
/** Indicates an ordering violation between a store and a younger load. */
void violation(const DynInstPtr &store_inst,
const DynInstPtr &violating_load);
void violation(const O3DynInstPtr &store_inst,
const O3DynInstPtr &violating_load);
/** Issues the given instruction */
void issue(const DynInstPtr &inst);
void issue(const O3DynInstPtr &inst);
/** Debugging function to dump the lists of instructions. */
void dumpLists();
@@ -160,12 +159,12 @@ class MemDepUnit
private:
/** Completes a memory instruction. */
void completed(const DynInstPtr &inst);
void completed(const O3DynInstPtr &inst);
/** Wakes any dependents of a memory instruction. */
void wakeDependents(const DynInstPtr &inst);
void wakeDependents(const O3DynInstPtr &inst);
typedef typename std::list<DynInstPtr>::iterator ListIt;
typedef typename std::list<O3DynInstPtr>::iterator ListIt;
class MemDepEntry;
@@ -179,7 +178,7 @@ class MemDepUnit
{
public:
/** Constructs a memory dependence entry. */
MemDepEntry(const DynInstPtr &new_inst)
MemDepEntry(const O3DynInstPtr &new_inst)
: inst(new_inst), regsReady(false), memDeps(0),
completed(false), squashed(false)
{
@@ -209,7 +208,7 @@ class MemDepUnit
std::string name() const { return "memdepentry"; }
/** The instruction being tracked. */
DynInstPtr inst;
O3DynInstPtr inst;
/** The iterator to the instruction's location inside the list. */
ListIt listIt;
@@ -235,10 +234,10 @@ class MemDepUnit
};
/** Finds the memory dependence entry in the hash map. */
inline MemDepEntryPtr &findInHash(const DynInstConstPtr& inst);
MemDepEntryPtr &findInHash(const O3DynInstConstPtr& inst);
/** Moves an entry to the ready list. */
inline void moveToReady(MemDepEntryPtr &ready_inst_entry);
void moveToReady(MemDepEntryPtr &ready_inst_entry);
typedef std::unordered_map<InstSeqNum, MemDepEntryPtr, SNHash> MemDepHash;
@@ -248,10 +247,10 @@ class MemDepUnit
MemDepHash memDepHash;
/** A list of all instructions in the memory dependence unit. */
std::list<DynInstPtr> instList[O3MaxThreads];
std::list<O3DynInstPtr> instList[O3MaxThreads];
/** A list of all instructions that are going to be replayed. */
std::list<DynInstPtr> instsToReplay;
std::list<O3DynInstPtr> instsToReplay;
/** The memory dependence predictor. It is accessed upon new
* instructions being added to the IQ, and responds by telling
@@ -273,7 +272,7 @@ class MemDepUnit
bool hasStoreBarrier() const { return !storeBarrierSNs.empty(); }
/** Inserts the SN of a barrier inst. to the list of tracked barriers */
void insertBarrierSN(const DynInstPtr &barr_inst);
void insertBarrierSN(const O3DynInstPtr &barr_inst);
/** Pointer to the IQ. */
InstructionQueue<Impl> *iqPtr;

View File

@@ -172,7 +172,7 @@ MemDepUnit<MemDepPred, Impl>::setIQ(InstructionQueue<Impl> *iq_ptr)
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::insertBarrierSN(const DynInstPtr &barr_inst)
MemDepUnit<MemDepPred, Impl>::insertBarrierSN(const O3DynInstPtr &barr_inst)
{
InstSeqNum barr_sn = barr_inst->seqNum;
@@ -205,7 +205,7 @@ MemDepUnit<MemDepPred, Impl>::insertBarrierSN(const DynInstPtr &barr_inst)
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::insert(const DynInstPtr &inst)
MemDepUnit<MemDepPred, Impl>::insert(const O3DynInstPtr &inst)
{
ThreadID tid = inst->threadNumber;
@@ -316,7 +316,7 @@ MemDepUnit<MemDepPred, Impl>::insert(const DynInstPtr &inst)
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::insertNonSpec(const DynInstPtr &inst)
MemDepUnit<MemDepPred, Impl>::insertNonSpec(const O3DynInstPtr &inst)
{
insertBarrier(inst);
@@ -338,7 +338,7 @@ MemDepUnit<MemDepPred, Impl>::insertNonSpec(const DynInstPtr &inst)
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::insertBarrier(const DynInstPtr &barr_inst)
MemDepUnit<MemDepPred, Impl>::insertBarrier(const O3DynInstPtr &barr_inst)
{
ThreadID tid = barr_inst->threadNumber;
@@ -361,7 +361,7 @@ MemDepUnit<MemDepPred, Impl>::insertBarrier(const DynInstPtr &barr_inst)
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::regsReady(const DynInstPtr &inst)
MemDepUnit<MemDepPred, Impl>::regsReady(const O3DynInstPtr &inst)
{
DPRINTF(MemDepUnit, "Marking registers as ready for "
"instruction PC %s [sn:%lli].\n",
@@ -384,7 +384,7 @@ MemDepUnit<MemDepPred, Impl>::regsReady(const DynInstPtr &inst)
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::nonSpecInstReady(const DynInstPtr &inst)
MemDepUnit<MemDepPred, Impl>::nonSpecInstReady(const O3DynInstPtr &inst)
{
DPRINTF(MemDepUnit, "Marking non speculative "
"instruction PC %s as ready [sn:%lli].\n",
@@ -397,7 +397,7 @@ MemDepUnit<MemDepPred, Impl>::nonSpecInstReady(const DynInstPtr &inst)
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::reschedule(const DynInstPtr &inst)
MemDepUnit<MemDepPred, Impl>::reschedule(const O3DynInstPtr &inst)
{
instsToReplay.push_back(inst);
}
@@ -406,7 +406,7 @@ template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::replay()
{
DynInstPtr temp_inst;
O3DynInstPtr temp_inst;
// For now this replay function replays all waiting memory ops.
while (!instsToReplay.empty()) {
@@ -425,7 +425,7 @@ MemDepUnit<MemDepPred, Impl>::replay()
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::completed(const DynInstPtr &inst)
MemDepUnit<MemDepPred, Impl>::completed(const O3DynInstPtr &inst)
{
DPRINTF(MemDepUnit, "Completed mem instruction PC %s [sn:%lli].\n",
inst->pcState(), inst->seqNum);
@@ -449,7 +449,7 @@ MemDepUnit<MemDepPred, Impl>::completed(const DynInstPtr &inst)
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::completeInst(const DynInstPtr &inst)
MemDepUnit<MemDepPred, Impl>::completeInst(const O3DynInstPtr &inst)
{
wakeDependents(inst);
completed(inst);
@@ -481,7 +481,7 @@ MemDepUnit<MemDepPred, Impl>::completeInst(const DynInstPtr &inst)
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::wakeDependents(const DynInstPtr &inst)
MemDepUnit<MemDepPred, Impl>::wakeDependents(const O3DynInstPtr &inst)
{
// Only stores, atomics and barriers have dependents.
if (!inst->isStore() && !inst->isAtomic() && !inst->isReadBarrier() &&
@@ -570,8 +570,8 @@ MemDepUnit<MemDepPred, Impl>::squash(const InstSeqNum &squashed_num,
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::violation(const DynInstPtr &store_inst,
const DynInstPtr &violating_load)
MemDepUnit<MemDepPred, Impl>::violation(const O3DynInstPtr &store_inst,
const O3DynInstPtr &violating_load)
{
DPRINTF(MemDepUnit, "Passing violating PCs to store sets,"
" load: %#x, store: %#x\n", violating_load->instAddr(),
@@ -582,7 +582,7 @@ MemDepUnit<MemDepPred, Impl>::violation(const DynInstPtr &store_inst,
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::issue(const DynInstPtr &inst)
MemDepUnit<MemDepPred, Impl>::issue(const O3DynInstPtr &inst)
{
DPRINTF(MemDepUnit, "Issuing instruction PC %#x [sn:%lli].\n",
inst->instAddr(), inst->seqNum);
@@ -592,7 +592,7 @@ MemDepUnit<MemDepPred, Impl>::issue(const DynInstPtr &inst)
template <class MemDepPred, class Impl>
inline typename MemDepUnit<MemDepPred,Impl>::MemDepEntryPtr &
MemDepUnit<MemDepPred, Impl>::findInHash(const DynInstConstPtr &inst)
MemDepUnit<MemDepPred, Impl>::findInHash(const O3DynInstConstPtr &inst)
{
MemDepHashIt hash_it = memDepHash.find(inst->seqNum);

View File

@@ -40,6 +40,7 @@
#include "base/callback.hh"
#include "base/output.hh"
#include "base/trace.hh"
#include "cpu/o3/dyn_inst.hh"
#include "cpu/reg_class.hh"
#include "debug/ElasticTrace.hh"
#include "mem/packet.hh"
@@ -124,21 +125,21 @@ ElasticTrace::regEtraceListeners()
listeners.push_back(new ProbeListenerArg<ElasticTrace, RequestPtr>(this,
"FetchRequest", &ElasticTrace::fetchReqTrace));
listeners.push_back(new ProbeListenerArg<ElasticTrace,
DynInstConstPtr>(this, "Execute",
O3DynInstConstPtr>(this, "Execute",
&ElasticTrace::recordExecTick));
listeners.push_back(new ProbeListenerArg<ElasticTrace,
DynInstConstPtr>(this, "ToCommit",
O3DynInstConstPtr>(this, "ToCommit",
&ElasticTrace::recordToCommTick));
listeners.push_back(new ProbeListenerArg<ElasticTrace,
DynInstConstPtr>(this, "Rename",
O3DynInstConstPtr>(this, "Rename",
&ElasticTrace::updateRegDep));
listeners.push_back(new ProbeListenerArg<ElasticTrace, SeqNumRegPair>(this,
"SquashInRename", &ElasticTrace::removeRegDepMapEntry));
listeners.push_back(new ProbeListenerArg<ElasticTrace,
DynInstConstPtr>(this, "Squash",
O3DynInstConstPtr>(this, "Squash",
&ElasticTrace::addSquashedInst));
listeners.push_back(new ProbeListenerArg<ElasticTrace,
DynInstConstPtr>(this, "Commit",
O3DynInstConstPtr>(this, "Commit",
&ElasticTrace::addCommittedInst));
allProbesReg = true;
}
@@ -166,7 +167,7 @@ ElasticTrace::fetchReqTrace(const RequestPtr &req)
}
void
ElasticTrace::recordExecTick(const DynInstConstPtr& dyn_inst)
ElasticTrace::recordExecTick(const O3DynInstConstPtr& dyn_inst)
{
// In a corner case, a retired instruction is propagated backward to the
@@ -203,7 +204,7 @@ ElasticTrace::recordExecTick(const DynInstConstPtr& dyn_inst)
}
void
ElasticTrace::recordToCommTick(const DynInstConstPtr& dyn_inst)
ElasticTrace::recordToCommTick(const O3DynInstConstPtr& dyn_inst)
{
// If tracing has just been enabled then the instruction at this stage of
// execution is far enough that we cannot gather info about its past like
@@ -224,7 +225,7 @@ ElasticTrace::recordToCommTick(const DynInstConstPtr& dyn_inst)
}
void
ElasticTrace::updateRegDep(const DynInstConstPtr& dyn_inst)
ElasticTrace::updateRegDep(const O3DynInstConstPtr& dyn_inst)
{
// Get the sequence number of the instruction
InstSeqNum seq_num = dyn_inst->seqNum;
@@ -303,7 +304,7 @@ ElasticTrace::removeRegDepMapEntry(const SeqNumRegPair &inst_reg_pair)
}
void
ElasticTrace::addSquashedInst(const DynInstConstPtr& head_inst)
ElasticTrace::addSquashedInst(const O3DynInstConstPtr& head_inst)
{
// If the squashed instruction was squashed before being processed by
// execute stage then it will not be in the temporary store. In this case
@@ -331,7 +332,7 @@ ElasticTrace::addSquashedInst(const DynInstConstPtr& head_inst)
}
void
ElasticTrace::addCommittedInst(const DynInstConstPtr& head_inst)
ElasticTrace::addCommittedInst(const O3DynInstConstPtr& head_inst)
{
DPRINTFR(ElasticTrace, "Attempt to add committed inst [sn:%lli]\n",
head_inst->seqNum);
@@ -390,7 +391,7 @@ ElasticTrace::addCommittedInst(const DynInstConstPtr& head_inst)
}
void
ElasticTrace::addDepTraceRecord(const DynInstConstPtr& head_inst,
ElasticTrace::addDepTraceRecord(const O3DynInstConstPtr& head_inst,
InstExecInfo* exec_info_ptr, bool commit)
{
// Create a record to assign dynamic intruction related fields.
@@ -652,7 +653,7 @@ ElasticTrace::hasCompCompleted(TraceInfo* past_record,
}
void
ElasticTrace::clearTempStoreUntil(const DynInstConstPtr& head_inst)
ElasticTrace::clearTempStoreUntil(const O3DynInstConstPtr& head_inst)
{
// Clear from temp store starting with the execution info object
// corresponding the head_inst and continue clearing by decrementing the

View File

@@ -50,7 +50,7 @@
#include <unordered_map>
#include <utility>
#include "cpu/o3/dyn_inst.hh"
#include "cpu/o3/dyn_inst_ptr.hh"
#include "cpu/o3/impl.hh"
#include "mem/request.hh"
#include "params/ElasticTrace.hh"
@@ -85,8 +85,6 @@ class ElasticTrace : public ProbeListenerObject
{
public:
typedef typename O3CPUImpl::DynInstPtr DynInstPtr;
typedef typename O3CPUImpl::DynInstConstPtr DynInstConstPtr;
typedef typename std::pair<InstSeqNum, RegIndex> SeqNumRegPair;
/** Trace record types corresponding to instruction node types */
@@ -129,7 +127,7 @@ class ElasticTrace : public ProbeListenerObject
*
* @param dyn_inst pointer to dynamic instruction in flight
*/
void recordExecTick(const DynInstConstPtr& dyn_inst);
void recordExecTick(const O3DynInstConstPtr& dyn_inst);
/**
* Populate the timestamp field in an InstExecInfo object for an
@@ -138,7 +136,7 @@ class ElasticTrace : public ProbeListenerObject
*
* @param dyn_inst pointer to dynamic instruction in flight
*/
void recordToCommTick(const DynInstConstPtr& dyn_inst);
void recordToCommTick(const O3DynInstConstPtr& dyn_inst);
/**
* Record a Read After Write physical register dependency if there has
@@ -149,7 +147,7 @@ class ElasticTrace : public ProbeListenerObject
*
* @param dyn_inst pointer to dynamic instruction in flight
*/
void updateRegDep(const DynInstConstPtr& dyn_inst);
void updateRegDep(const O3DynInstConstPtr& dyn_inst);
/**
* When an instruction gets squashed the destination register mapped to it
@@ -166,14 +164,14 @@ class ElasticTrace : public ProbeListenerObject
*
* @param head_inst pointer to dynamic instruction to be squashed
*/
void addSquashedInst(const DynInstConstPtr& head_inst);
void addSquashedInst(const O3DynInstConstPtr& head_inst);
/**
* Add an instruction that is at the head of the ROB and is committed.
*
* @param head_inst pointer to dynamic instruction to be committed
*/
void addCommittedInst(const DynInstConstPtr& head_inst);
void addCommittedInst(const O3DynInstConstPtr& head_inst);
/** Event to trigger registering this listener for all probe points. */
EventFunctionWrapper regEtraceListenersEvent;
@@ -379,7 +377,7 @@ class ElasticTrace : public ProbeListenerObject
* @param exec_info_ptr Pointer to InstExecInfo for that instruction
* @param commit True if instruction is committed, false if squashed
*/
void addDepTraceRecord(const DynInstConstPtr& head_inst,
void addDepTraceRecord(const O3DynInstConstPtr& head_inst,
InstExecInfo* exec_info_ptr, bool commit);
/**
@@ -388,7 +386,7 @@ class ElasticTrace : public ProbeListenerObject
*
* @param head_inst pointer to dynamic instruction
*/
void clearTempStoreUntil(const DynInstConstPtr& head_inst);
void clearTempStoreUntil(const O3DynInstConstPtr& head_inst);
/**
* Calculate the computational delay between an instruction and a

View File

@@ -38,16 +38,17 @@
#include "cpu/o3/probe/simple_trace.hh"
#include "base/trace.hh"
#include "cpu/o3/dyn_inst.hh"
#include "debug/SimpleTrace.hh"
void SimpleTrace::traceCommit(const O3CPUImpl::DynInstConstPtr& dynInst)
void SimpleTrace::traceCommit(const O3DynInstConstPtr& dynInst)
{
DPRINTFR(SimpleTrace, "[%s]: Commit 0x%08x %s.\n", name(),
dynInst->instAddr(),
dynInst->staticInst->disassemble(dynInst->instAddr()));
}
void SimpleTrace::traceFetch(const O3CPUImpl::DynInstConstPtr& dynInst)
void SimpleTrace::traceFetch(const O3DynInstConstPtr& dynInst)
{
DPRINTFR(SimpleTrace, "[%s]: Fetch 0x%08x %s.\n", name(),
dynInst->instAddr(),
@@ -57,7 +58,7 @@ void SimpleTrace::traceFetch(const O3CPUImpl::DynInstConstPtr& dynInst)
void SimpleTrace::regProbeListeners()
{
typedef ProbeListenerArg<SimpleTrace,
O3CPUImpl::DynInstConstPtr> DynInstListener;
O3DynInstConstPtr> DynInstListener;
listeners.push_back(new DynInstListener(this, "Commit",
&SimpleTrace::traceCommit));
listeners.push_back(new DynInstListener(this, "Fetch",

View File

@@ -44,7 +44,7 @@
#ifndef __CPU_O3_PROBE_SIMPLE_TRACE_HH__
#define __CPU_O3_PROBE_SIMPLE_TRACE_HH__
#include "cpu/o3/dyn_inst.hh"
#include "cpu/o3/dyn_inst_ptr.hh"
#include "cpu/o3/impl.hh"
#include "params/SimpleTrace.hh"
#include "sim/probe/probe.hh"
@@ -69,8 +69,8 @@ class SimpleTrace : public ProbeListenerObject
}
private:
void traceFetch(const O3CPUImpl::DynInstConstPtr& dynInst);
void traceCommit(const O3CPUImpl::DynInstConstPtr& dynInst);
void traceFetch(const O3DynInstConstPtr& dynInst);
void traceCommit(const O3DynInstConstPtr& dynInst);
};
#endif//__CPU_O3_PROBE_SIMPLE_TRACE_HH__

View File

@@ -48,6 +48,7 @@
#include "base/statistics.hh"
#include "config/the_isa.hh"
#include "cpu/o3/commit.hh"
#include "cpu/o3/dyn_inst_ptr.hh"
#include "cpu/o3/free_list.hh"
#include "cpu/o3/iew.hh"
#include "cpu/o3/limits.hh"
@@ -73,7 +74,6 @@ class DefaultRename
{
public:
// Typedefs from the Impl.
typedef typename Impl::DynInstPtr DynInstPtr;
typedef typename Impl::O3CPU O3CPU;
typedef typename Impl::DecodeStruct DecodeStruct;
typedef typename Impl::RenameStruct RenameStruct;
@@ -83,7 +83,7 @@ class DefaultRename
// be added to the front of the queue, which is the only reason for
// using a deque instead of a queue. (Most other stages use a
// queue)
typedef std::deque<DynInstPtr> InstQueue;
typedef std::deque<O3DynInstPtr> InstQueue;
public:
/** Overall rename status. Used to determine if the CPU can
@@ -117,7 +117,7 @@ class DefaultRename
/** Probe points. */
typedef typename std::pair<InstSeqNum, PhysRegIdPtr> SeqNumRegPair;
/** To probe when register renaming for an instruction is complete */
ProbePointArg<DynInstPtr> *ppRename;
ProbePointArg<O3DynInstPtr> *ppRename;
/**
* To probe when an instruction is squashed and the register mapping
* for it needs to be undone
@@ -248,22 +248,22 @@ class DefaultRename
void removeFromHistory(InstSeqNum inst_seq_num, ThreadID tid);
/** Renames the source registers of an instruction. */
inline void renameSrcRegs(const DynInstPtr &inst, ThreadID tid);
void renameSrcRegs(const O3DynInstPtr &inst, ThreadID tid);
/** Renames the destination registers of an instruction. */
inline void renameDestRegs(const DynInstPtr &inst, ThreadID tid);
void renameDestRegs(const O3DynInstPtr &inst, ThreadID tid);
/** Calculates the number of free ROB entries for a specific thread. */
inline int calcFreeROBEntries(ThreadID tid);
int calcFreeROBEntries(ThreadID tid);
/** Calculates the number of free IQ entries for a specific thread. */
inline int calcFreeIQEntries(ThreadID tid);
int calcFreeIQEntries(ThreadID tid);
/** Calculates the number of free LQ entries for a specific thread. */
inline int calcFreeLQEntries(ThreadID tid);
int calcFreeLQEntries(ThreadID tid);
/** Calculates the number of free SQ entries for a specific thread. */
inline int calcFreeSQEntries(ThreadID tid);
int calcFreeSQEntries(ThreadID tid);
/** Returns the number of valid instructions coming from decode. */
unsigned validInsts();
@@ -417,7 +417,7 @@ class DefaultRename
Stalls stalls[O3MaxThreads];
/** The serialize instruction that rename has stalled on. */
DynInstPtr serializeInst[O3MaxThreads];
O3DynInstPtr serializeInst[O3MaxThreads];
/** Records if rename needs to serialize on the next instruction for any
* thread.

View File

@@ -177,7 +177,8 @@ template <class Impl>
void
DefaultRename<Impl>::regProbePoints()
{
ppRename = new ProbePointArg<DynInstPtr>(cpu->getProbeManager(), "Rename");
ppRename = new ProbePointArg<O3DynInstPtr>(
cpu->getProbeManager(), "Rename");
ppSquashInRename = new ProbePointArg<SeqNumRegPair>(cpu->getProbeManager(),
"SquashInRename");
}
@@ -612,11 +613,12 @@ DefaultRename<Impl>::renameInsts(ThreadID tid)
assert(!insts_to_rename.empty());
DynInstPtr inst = insts_to_rename.front();
O3DynInstPtr inst = insts_to_rename.front();
//For all kind of instructions, check ROB and IQ first
//For load instruction, check LQ size and take into account the inflight loads
//For store instruction, check SQ size and take into account the inflight stores
//For all kind of instructions, check ROB and IQ first For load
//instruction, check LQ size and take into account the inflight loads
//For store instruction, check SQ size and take into account the
//inflight stores
if (inst->isLoad()) {
if (calcFreeLQEntries(tid) <= 0) {
@@ -774,7 +776,7 @@ template<class Impl>
void
DefaultRename<Impl>::skidInsert(ThreadID tid)
{
DynInstPtr inst = NULL;
O3DynInstPtr inst = NULL;
while (!insts[tid].empty()) {
inst = insts[tid].front();
@@ -811,7 +813,7 @@ DefaultRename<Impl>::sortInsts()
{
int insts_from_decode = fromDecode->size;
for (int i = 0; i < insts_from_decode; ++i) {
const DynInstPtr &inst = fromDecode->insts[i];
const O3DynInstPtr &inst = fromDecode->insts[i];
insts[inst->threadNumber].push_back(inst);
#if TRACING_ON
if (Debug::O3PipeView) {
@@ -1035,7 +1037,7 @@ DefaultRename<Impl>::removeFromHistory(InstSeqNum inst_seq_num, ThreadID tid)
template <class Impl>
inline void
DefaultRename<Impl>::renameSrcRegs(const DynInstPtr &inst, ThreadID tid)
DefaultRename<Impl>::renameSrcRegs(const O3DynInstPtr &inst, ThreadID tid)
{
ThreadContext *tc = inst->tcBase();
UnifiedRenameMap *map = renameMap[tid];
@@ -1102,7 +1104,7 @@ DefaultRename<Impl>::renameSrcRegs(const DynInstPtr &inst, ThreadID tid)
template <class Impl>
inline void
DefaultRename<Impl>::renameDestRegs(const DynInstPtr &inst, ThreadID tid)
DefaultRename<Impl>::renameDestRegs(const O3DynInstPtr &inst, ThreadID tid)
{
ThreadContext *tc = inst->tcBase();
UnifiedRenameMap *map = renameMap[tid];
@@ -1369,7 +1371,7 @@ DefaultRename<Impl>::checkSignalsAndUpdate(ThreadID tid)
DPRINTF(Rename, "[tid:%i] Done with serialize stall, switching to "
"unblocking.\n", tid);
DynInstPtr serial_inst = serializeInst[tid];
O3DynInstPtr serial_inst = serializeInst[tid];
renameStatus[tid] = Unblocking;

View File

@@ -60,10 +60,9 @@ class ROB
public:
//Typedefs from the Impl.
typedef typename Impl::O3CPU O3CPU;
typedef typename Impl::DynInstPtr DynInstPtr;
typedef std::pair<RegIndex, RegIndex> UnmapInfo;
typedef typename std::list<DynInstPtr>::iterator InstIt;
typedef typename std::list<O3DynInstPtr>::iterator InstIt;
/** Possible ROB statuses. */
enum Status
@@ -105,36 +104,36 @@ class ROB
* ROB for the new instruction.
* @param inst The instruction being inserted into the ROB.
*/
void insertInst(const DynInstPtr &inst);
void insertInst(const O3DynInstPtr &inst);
/** Returns pointer to the head instruction within the ROB. There is
* no guarantee as to the return value if the ROB is empty.
* @retval Pointer to the DynInst that is at the head of the ROB.
*/
// DynInstPtr readHeadInst();
// O3DynInstPtr readHeadInst();
/** Returns a pointer to the head instruction of a specific thread within
* the ROB.
* @return Pointer to the DynInst that is at the head of the ROB.
*/
const DynInstPtr &readHeadInst(ThreadID tid);
const O3DynInstPtr &readHeadInst(ThreadID tid);
/** Returns a pointer to the instruction with the given sequence if it is
* in the ROB.
*/
DynInstPtr findInst(ThreadID tid, InstSeqNum squash_inst);
O3DynInstPtr findInst(ThreadID tid, InstSeqNum squash_inst);
/** Returns pointer to the tail instruction within the ROB. There is
* no guarantee as to the return value if the ROB is empty.
* @retval Pointer to the DynInst that is at the tail of the ROB.
*/
// DynInstPtr readTailInst();
// O3DynInstPtr readTailInst();
/** Returns a pointer to the tail instruction of a specific thread within
* the ROB.
* @return Pointer to the DynInst that is at the tail of the ROB.
*/
DynInstPtr readTailInst(ThreadID tid);
O3DynInstPtr readTailInst(ThreadID tid);
/** Retires the head instruction, removing it from the ROB. */
// void retireHead();
@@ -277,7 +276,7 @@ class ROB
unsigned maxEntries[O3MaxThreads];
/** ROB List of Instructions */
std::list<DynInstPtr> instList[O3MaxThreads];
std::list<O3DynInstPtr> instList[O3MaxThreads];
/** Number of instructions that can be squashed in a single cycle. */
unsigned squashWidth;
@@ -308,7 +307,7 @@ class ROB
int numInstsInROB;
/** Dummy instruction returned if there are no insts left. */
DynInstPtr dummyInst;
O3DynInstPtr dummyInst;
private:
/** The sequence number of the squashed instruction. */

View File

@@ -200,7 +200,7 @@ ROB<Impl>::countInsts(ThreadID tid)
template <class Impl>
void
ROB<Impl>::insertInst(const DynInstPtr &inst)
ROB<Impl>::insertInst(const O3DynInstPtr &inst)
{
assert(inst);
@@ -246,7 +246,7 @@ ROB<Impl>::retireHead(ThreadID tid)
// Get the head ROB instruction by copying it and remove it from the list
InstIt head_it = instList[tid].begin();
DynInstPtr head_inst = std::move(*head_it);
O3DynInstPtr head_inst = std::move(*head_it);
instList[tid].erase(head_it);
assert(head_inst->readyToCommit());
@@ -428,7 +428,7 @@ ROB<Impl>::updateHead()
InstIt head_thread = instList[tid].begin();
DynInstPtr head_inst = (*head_thread);
O3DynInstPtr head_inst = (*head_thread);
assert(head_inst != 0);
@@ -513,7 +513,7 @@ ROB<Impl>::squash(InstSeqNum squash_num, ThreadID tid)
}
template <class Impl>
const typename Impl::DynInstPtr&
const O3DynInstPtr&
ROB<Impl>::readHeadInst(ThreadID tid)
{
if (threadEntries[tid] != 0) {
@@ -528,7 +528,7 @@ ROB<Impl>::readHeadInst(ThreadID tid)
}
template <class Impl>
typename Impl::DynInstPtr
O3DynInstPtr
ROB<Impl>::readTailInst(ThreadID tid)
{
InstIt tail_thread = instList[tid].end();
@@ -546,7 +546,7 @@ ROB<Impl>::ROBStats::ROBStats(Stats::Group *parent)
}
template <class Impl>
typename Impl::DynInstPtr
O3DynInstPtr
ROB<Impl>::findInst(ThreadID tid, InstSeqNum squash_inst)
{
for (InstIt it = instList[tid].begin(); it != instList[tid].end(); it++) {