diff --git a/src/cpu/checker/cpu.hh b/src/cpu/checker/cpu.hh index a13eec7c98..4a7dad86e3 100644 --- a/src/cpu/checker/cpu.hh +++ b/src/cpu/checker/cpu.hh @@ -51,6 +51,7 @@ #include "cpu/base.hh" #include "cpu/exec_context.hh" #include "cpu/inst_res.hh" +#include "cpu/o3/dyn_inst.hh" #include "cpu/pc_event.hh" #include "cpu/simple_thread.hh" #include "cpu/static_inst.hh" @@ -559,12 +560,9 @@ class CheckerCPU : public BaseCPU, public ExecContext * template instantiations of the Checker must be placed at the bottom * of checker/cpu.cc. */ -template +template class Checker : public CheckerCPU { - private: - typedef typename Impl::DynInstPtr DynInstPtr; - public: Checker(const Params &p) : CheckerCPU(p), updateThisCycle(false), unverifiedInst(NULL) diff --git a/src/cpu/checker/cpu_impl.hh b/src/cpu/checker/cpu_impl.hh index b18bb8046a..de123d447f 100644 --- a/src/cpu/checker/cpu_impl.hh +++ b/src/cpu/checker/cpu_impl.hh @@ -59,9 +59,9 @@ #include "sim/sim_object.hh" #include "sim/stats.hh" -template +template void -Checker::advancePC(const Fault &fault) +Checker::advancePC(const Fault &fault) { if (fault != NoFault) { curMacroStaticInst = nullStaticInstPtr; @@ -80,9 +80,9 @@ Checker::advancePC(const Fault &fault) } ////////////////////////////////////////////////// -template +template void -Checker::handlePendingInt() +Checker::handlePendingInt() { DPRINTF(Checker, "IRQ detected at PC: %s with %d insts in buffer\n", thread->pcState(), instList.size()); @@ -114,9 +114,9 @@ Checker::handlePendingInt() curMacroStaticInst = nullStaticInstPtr; } -template +template void -Checker::verify(const DynInstPtr &completed_inst) +Checker::verify(const DynInstPtr &completed_inst) { DynInstPtr inst; @@ -428,22 +428,19 @@ Checker::verify(const DynInstPtr &completed_inst) unverifiedInst = NULL; } -template +template void -Checker::switchOut() +Checker::switchOut() { instList.clear(); } -template -void -Checker::takeOverFrom(BaseCPU *oldCPU) -{ -} +template +void 
Checker::takeOverFrom(BaseCPU *oldCPU) {} -template +template void -Checker::validateInst(const DynInstPtr &inst) +Checker::validateInst(const DynInstPtr &inst) { if (inst->instAddr() != thread->instAddr()) { warn("%lli: PCs do not match! Inst: %s, checker: %s", @@ -462,9 +459,9 @@ Checker::validateInst(const DynInstPtr &inst) } } -template +template void -Checker::validateExecution(const DynInstPtr &inst) +Checker::validateExecution(const DynInstPtr &inst) { InstResult checker_val; InstResult inst_val; @@ -555,9 +552,9 @@ Checker::validateExecution(const DynInstPtr &inst) // This function is weird, if it is called it means the Checker and // O3 have diverged, so panic is called for now. It may be useful // to resynch states and continue if the divergence is a false positive -template +template void -Checker::validateState() +Checker::validateState() { if (updateThisCycle) { // Change this back to warn if divergences end up being false positives @@ -580,10 +577,10 @@ Checker::validateState() } } -template +template void -Checker::copyResult(const DynInstPtr &inst, - const InstResult& mismatch_val, int start_idx) +Checker::copyResult( + const DynInstPtr &inst, const InstResult& mismatch_val, int start_idx) { // We've already popped one dest off the queue, // so do the fix-up then start with the next dest reg; @@ -657,9 +654,9 @@ Checker::copyResult(const DynInstPtr &inst, } } -template +template void -Checker::dumpAndExit(const DynInstPtr &inst) +Checker::dumpAndExit(const DynInstPtr &inst) { cprintf("Error detected, instruction information:\n"); cprintf("PC:%s, nextPC:%#x\n[sn:%lli]\n[tid:%i]\n" @@ -673,9 +670,9 @@ Checker::dumpAndExit(const DynInstPtr &inst) CheckerCPU::dumpAndExit(); } -template +template void -Checker::dumpInsts() +Checker::dumpInsts() { int num = 0; diff --git a/src/cpu/o3/checker.cc b/src/cpu/o3/checker.cc index ff498edabf..7461a29d3d 100644 --- a/src/cpu/o3/checker.cc +++ b/src/cpu/o3/checker.cc @@ -43,4 +43,4 @@ #include 
"cpu/checker/cpu_impl.hh" template -class Checker; +class Checker; diff --git a/src/cpu/o3/checker.hh b/src/cpu/o3/checker.hh index 0c7d6294ae..4a2fbbc851 100644 --- a/src/cpu/o3/checker.hh +++ b/src/cpu/o3/checker.hh @@ -48,10 +48,10 @@ /** * Specific non-templated derived class used for SimObject configuration. */ -class O3Checker : public Checker +class O3Checker : public Checker { public: - O3Checker(const Params &p) : Checker(p) + O3Checker(const Params &p) : Checker(p) { // The checker should check all instructions executed by the main // cpu and therefore any parameters for early exit don't make much diff --git a/src/cpu/o3/comm.hh b/src/cpu/o3/comm.hh index 39bf20ba5a..eb85e5e1e3 100644 --- a/src/cpu/o3/comm.hh +++ b/src/cpu/o3/comm.hh @@ -47,6 +47,7 @@ #include "arch/types.hh" #include "base/types.hh" #include "cpu/inst_seq.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/limits.hh" #include "sim/faults.hh" @@ -54,11 +55,9 @@ template struct DefaultFetchDefaultDecode { - typedef typename Impl::DynInstPtr DynInstPtr; - int size; - DynInstPtr insts[O3MaxWidth]; + O3DynInstPtr insts[O3MaxWidth]; Fault fetchFault; InstSeqNum fetchFaultSN; bool clearFetchFault; @@ -68,34 +67,28 @@ struct DefaultFetchDefaultDecode template struct DefaultDecodeDefaultRename { - typedef typename Impl::DynInstPtr DynInstPtr; - int size; - DynInstPtr insts[O3MaxWidth]; + O3DynInstPtr insts[O3MaxWidth]; }; /** Struct that defines the information passed from rename to IEW. */ template struct DefaultRenameDefaultIEW { - typedef typename Impl::DynInstPtr DynInstPtr; - int size; - DynInstPtr insts[O3MaxWidth]; + O3DynInstPtr insts[O3MaxWidth]; }; /** Struct that defines the information passed from IEW to commit. 
*/ template struct DefaultIEWDefaultCommit { - typedef typename Impl::DynInstPtr DynInstPtr; - int size; - DynInstPtr insts[O3MaxWidth]; - DynInstPtr mispredictInst[O3MaxThreads]; + O3DynInstPtr insts[O3MaxWidth]; + O3DynInstPtr mispredictInst[O3MaxThreads]; Addr mispredPC[O3MaxThreads]; InstSeqNum squashedSeqNum[O3MaxThreads]; TheISA::PCState pc[O3MaxThreads]; @@ -109,23 +102,20 @@ struct DefaultIEWDefaultCommit template struct IssueStruct { - typedef typename Impl::DynInstPtr DynInstPtr; - int size; - DynInstPtr insts[O3MaxWidth]; + O3DynInstPtr insts[O3MaxWidth]; }; /** Struct that defines all backwards communication. */ template struct TimeBufStruct { - typedef typename Impl::DynInstPtr DynInstPtr; - struct decodeComm + struct DecodeComm { TheISA::PCState nextPC; - DynInstPtr mispredictInst; - DynInstPtr squashInst; + O3DynInstPtr mispredictInst; + O3DynInstPtr squashInst; InstSeqNum doneSeqNum; Addr mispredPC; uint64_t branchAddr; @@ -136,15 +126,13 @@ struct TimeBufStruct bool branchTaken; }; - decodeComm decodeInfo[O3MaxThreads]; + DecodeComm decodeInfo[O3MaxThreads]; - struct renameComm - { - }; + struct RenameComm {}; - renameComm renameInfo[O3MaxThreads]; + RenameComm renameInfo[O3MaxThreads]; - struct iewComm + struct IewComm { // Also eventually include skid buffer space. 
unsigned freeIQEntries; @@ -161,9 +149,9 @@ struct TimeBufStruct bool usedLSQ; }; - iewComm iewInfo[O3MaxThreads]; + IewComm iewInfo[O3MaxThreads]; - struct commitComm + struct CommitComm { ///////////////////////////////////////////////////////////////////// // This code has been re-structured for better packing of variables @@ -184,14 +172,14 @@ struct TimeBufStruct /// Provide fetch the instruction that mispredicted, if this /// pointer is not-null a misprediction occured - DynInstPtr mispredictInst; // *F + O3DynInstPtr mispredictInst; // *F /// Instruction that caused the a non-mispredict squash - DynInstPtr squashInst; // *F + O3DynInstPtr squashInst; // *F /// Hack for now to send back a strictly ordered access to the /// IEW stage. - DynInstPtr strictlyOrderedLoad; // *I + O3DynInstPtr strictlyOrderedLoad; // *I /// Communication specifically to the IQ to tell the IQ that it can /// schedule a non-speculative instruction. @@ -227,7 +215,7 @@ struct TimeBufStruct }; - commitComm commitInfo[O3MaxThreads]; + CommitComm commitInfo[O3MaxThreads]; bool decodeBlock[O3MaxThreads]; bool decodeUnblock[O3MaxThreads]; diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index 6b01359a0f..bf0b07ca29 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -46,6 +46,7 @@ #include "base/statistics.hh" #include "cpu/exetrace.hh" #include "cpu/inst_seq.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/iew.hh" #include "cpu/o3/limits.hh" #include "cpu/o3/rename_map.hh" @@ -87,7 +88,6 @@ class DefaultCommit public: // Typedefs from the Impl. typedef typename Impl::O3CPU O3CPU; - typedef typename Impl::DynInstPtr DynInstPtr; typedef typename Impl::TimeStruct TimeStruct; typedef typename Impl::FetchStruct FetchStruct; typedef typename Impl::IEWStruct IEWStruct; @@ -126,10 +126,10 @@ class DefaultCommit CommitPolicy commitPolicy; /** Probe Points. 
*/ - ProbePointArg *ppCommit; - ProbePointArg *ppCommitStall; + ProbePointArg *ppCommit; + ProbePointArg *ppCommitStall; /** To probe when an instruction is squashed */ - ProbePointArg *ppSquash; + ProbePointArg *ppSquash; /** Mark the thread as processing a trap. */ void processTrapEvent(ThreadID tid); @@ -277,7 +277,7 @@ class DefaultCommit * @param tid ID of the thread to squash. * @param head_inst Instruction that requested the squash. */ - void squashAfter(ThreadID tid, const DynInstPtr &head_inst); + void squashAfter(ThreadID tid, const O3DynInstPtr &head_inst); /** Handles processing an interrupt. */ void handleInterrupt(); @@ -291,7 +291,7 @@ class DefaultCommit /** Tries to commit the head ROB instruction passed in. * @param head_inst The instruction to be committed. */ - bool commitHead(const DynInstPtr &head_inst, unsigned inst_num); + bool commitHead(const O3DynInstPtr &head_inst, unsigned inst_num); /** Gets instructions from rename and inserts them into the ROB. */ void getInsts(); @@ -385,7 +385,7 @@ class DefaultCommit * that caused a squash since this needs to be passed to the fetch * stage once squashing starts. */ - DynInstPtr squashAfterInst[O3MaxThreads]; + O3DynInstPtr squashAfterInst[O3MaxThreads]; /** Priority List used for Commit Policy */ std::list priority_list; @@ -472,7 +472,7 @@ class DefaultCommit bool avoidQuiesceLiveLock; /** Updates commit stats based on this instruction. 
*/ - void updateComInstStats(const DynInstPtr &inst); + void updateComInstStats(const O3DynInstPtr &inst); // HTM int htmStarts[O3MaxThreads]; diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh index 2c692ea165..06694bf257 100644 --- a/src/cpu/o3/commit_impl.hh +++ b/src/cpu/o3/commit_impl.hh @@ -54,6 +54,7 @@ #include "cpu/exetrace.hh" #include "cpu/null_static_inst.hh" #include "cpu/o3/commit.hh" +#include "cpu/o3/dyn_inst.hh" #include "cpu/o3/limits.hh" #include "cpu/o3/thread_state.hh" #include "cpu/timebuf.hh" @@ -140,9 +141,12 @@ template void DefaultCommit::regProbePoints() { - ppCommit = new ProbePointArg(cpu->getProbeManager(), "Commit"); - ppCommitStall = new ProbePointArg(cpu->getProbeManager(), "CommitStall"); - ppSquash = new ProbePointArg(cpu->getProbeManager(), "Squash"); + ppCommit = new ProbePointArg( + cpu->getProbeManager(), "Commit"); + ppCommitStall = new ProbePointArg( + cpu->getProbeManager(), "CommitStall"); + ppSquash = new ProbePointArg( + cpu->getProbeManager(), "Squash"); } template @@ -653,7 +657,7 @@ DefaultCommit::squashFromSquashAfter(ThreadID tid) template void -DefaultCommit::squashAfter(ThreadID tid, const DynInstPtr &head_inst) +DefaultCommit::squashAfter(ThreadID tid, const O3DynInstPtr &head_inst) { DPRINTF(Commit, "Executing squash after for [tid:%i] inst [sn:%llu]\n", tid, head_inst->seqNum); @@ -713,14 +717,14 @@ DefaultCommit::tick() // will be active. 
_nextStatus = Active; - GEM5_VAR_USED const DynInstPtr &inst = rob->readHeadInst(tid); + GEM5_VAR_USED const O3DynInstPtr &inst = rob->readHeadInst(tid); DPRINTF(Commit,"[tid:%i] Instruction [sn:%llu] PC %s is head of" " ROB and ready to commit\n", tid, inst->seqNum, inst->pcState()); } else if (!rob->isEmpty(tid)) { - const DynInstPtr &inst = rob->readHeadInst(tid); + const O3DynInstPtr &inst = rob->readHeadInst(tid); ppCommitStall->notify(inst); @@ -1001,7 +1005,7 @@ DefaultCommit::commitInsts() unsigned num_committed = 0; - DynInstPtr head_inst; + O3DynInstPtr head_inst; // Commit as many instructions as possible until the commit bandwidth // limit is reached, or it becomes impossible to commit any more. @@ -1192,7 +1196,8 @@ DefaultCommit::commitInsts() template bool -DefaultCommit::commitHead(const DynInstPtr &head_inst, unsigned inst_num) +DefaultCommit::commitHead( + const O3DynInstPtr &head_inst, unsigned inst_num) { assert(head_inst); @@ -1391,7 +1396,7 @@ DefaultCommit::getInsts() int insts_to_process = std::min((int)renameWidth, fromRename->size); for (int inst_num = 0; inst_num < insts_to_process; ++inst_num) { - const DynInstPtr &inst = fromRename->insts[inst_num]; + const O3DynInstPtr &inst = fromRename->insts[inst_num]; ThreadID tid = inst->threadNumber; if (!inst->isSquashed() && @@ -1438,7 +1443,7 @@ DefaultCommit::markCompletedInsts() template void -DefaultCommit::updateComInstStats(const DynInstPtr &inst) +DefaultCommit::updateComInstStats(const O3DynInstPtr &inst) { ThreadID tid = inst->threadNumber; @@ -1583,7 +1588,7 @@ DefaultCommit::oldestReady() if (rob->isHeadReady(tid)) { - const DynInstPtr &head_inst = rob->readHeadInst(tid); + const O3DynInstPtr &head_inst = rob->readHeadInst(tid); if (first) { oldest = tid; diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index ed3d5f1d95..1ed725b8ee 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -136,7 +136,7 @@ FullO3CPU::FullO3CPU(const DerivO3CPUParams ¶ms) if (params.checker) { 
BaseCPU *temp_checker = params.checker; - checker = dynamic_cast *>(temp_checker); + checker = dynamic_cast *>(temp_checker); checker->setIcachePort(&this->fetch.getInstPort()); checker->setSystem(params.system); } else { @@ -378,8 +378,11 @@ FullO3CPU::regProbePoints() { BaseCPU::regProbePoints(); - ppInstAccessComplete = new ProbePointArg(getProbeManager(), "InstAccessComplete"); - ppDataAccessComplete = new ProbePointArg >(getProbeManager(), "DataAccessComplete"); + ppInstAccessComplete = new ProbePointArg( + getProbeManager(), "InstAccessComplete"); + ppDataAccessComplete = new ProbePointArg< + std::pair>( + getProbeManager(), "DataAccessComplete"); fetch.regProbePoints(); rename.regProbePoints(); @@ -1501,7 +1504,7 @@ FullO3CPU::squashFromTC(ThreadID tid) template typename FullO3CPU::ListIt -FullO3CPU::addInst(const DynInstPtr &inst) +FullO3CPU::addInst(const O3DynInstPtr &inst) { instList.push_back(inst); @@ -1510,7 +1513,7 @@ FullO3CPU::addInst(const DynInstPtr &inst) template void -FullO3CPU::instDone(ThreadID tid, const DynInstPtr &inst) +FullO3CPU::instDone(ThreadID tid, const O3DynInstPtr &inst) { // Keep an instruction count. 
if (!inst->isMicroop() || inst->isLastMicroop()) { @@ -1530,7 +1533,7 @@ FullO3CPU::instDone(ThreadID tid, const DynInstPtr &inst) template void -FullO3CPU::removeFrontInst(const DynInstPtr &inst) +FullO3CPU::removeFrontInst(const O3DynInstPtr &inst) { DPRINTF(O3CPU, "Removing committed instruction [tid:%i] PC %s " "[sn:%lli]\n", @@ -1686,7 +1689,7 @@ FullO3CPU::dumpInsts() /* template void -FullO3CPU::wakeDependents(const DynInstPtr &inst) +FullO3CPU::wakeDependents(const O3DynInstPtr &inst) { iew.wakeDependents(inst); } diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index 196f57d124..fbf402e9d9 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -56,6 +56,7 @@ #include "cpu/o3/comm.hh" #include "cpu/o3/commit.hh" #include "cpu/o3/decode.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/fetch.hh" #include "cpu/o3/free_list.hh" #include "cpu/o3/iew.hh" @@ -100,13 +101,12 @@ class FullO3CPU : public BaseO3CPU { public: // Typedefs from the Impl here. - typedef typename Impl::DynInstPtr DynInstPtr; typedef typename Impl::O3CPU O3CPU; typedef O3ThreadState ImplState; typedef O3ThreadState Thread; - typedef typename std::list::iterator ListIt; + typedef typename std::list::iterator ListIt; friend class O3ThreadContext; @@ -184,7 +184,7 @@ class FullO3CPU : public BaseO3CPU ~FullO3CPU(); ProbePointArg *ppInstAccessComplete; - ProbePointArg > *ppDataAccessComplete; + ProbePointArg > *ppDataAccessComplete; /** Register probe points. */ void regProbePoints() override; @@ -439,15 +439,15 @@ class FullO3CPU : public BaseO3CPU /** Function to add instruction onto the head of the list of the * instructions. Used when new instructions are fetched. */ - ListIt addInst(const DynInstPtr &inst); + ListIt addInst(const O3DynInstPtr &inst); /** Function to tell the CPU that an instruction has completed. 
*/ - void instDone(ThreadID tid, const DynInstPtr &inst); + void instDone(ThreadID tid, const O3DynInstPtr &inst); /** Remove an instruction from the front end of the list. There's * no restriction on location of the instruction. */ - void removeFrontInst(const DynInstPtr &inst); + void removeFrontInst(const O3DynInstPtr &inst); /** Remove all instructions that are not currently in the ROB. * There's also an option to not squash delay slot instructions.*/ @@ -472,7 +472,7 @@ class FullO3CPU : public BaseO3CPU #endif /** List of all the instructions in flight. */ - std::list instList; + std::list instList; /** List of all the instructions that will be removed at the end of this * cycle. @@ -624,7 +624,7 @@ class FullO3CPU : public BaseO3CPU * instruction results at run time. This can be set to NULL if it * is not being used. */ - Checker *checker; + Checker *checker; /** Pointer to the system. */ System *system; @@ -648,7 +648,7 @@ class FullO3CPU : public BaseO3CPU std::vector tids; /** CPU pushRequest function, forwards request to LSQ. */ - Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, + Fault pushRequest(const O3DynInstPtr& inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, uint64_t *res, AtomicOpFunctorPtr amo_op = nullptr, const std::vector& byte_enable = diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh index c694e3c380..38ba0a6cd3 100644 --- a/src/cpu/o3/decode.hh +++ b/src/cpu/o3/decode.hh @@ -44,6 +44,7 @@ #include #include "base/statistics.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/limits.hh" #include "cpu/timebuf.hh" @@ -62,7 +63,6 @@ class DefaultDecode private: // Typedefs from the Impl. 
typedef typename Impl::O3CPU O3CPU; - typedef typename Impl::DynInstPtr DynInstPtr; typedef typename Impl::FetchStruct FetchStruct; typedef typename Impl::DecodeStruct DecodeStruct; typedef typename Impl::TimeStruct TimeStruct; @@ -193,7 +193,7 @@ class DefaultDecode /** Squashes if there is a PC-relative branch that was predicted * incorrectly. Sends squash information back to fetch. */ - void squash(const DynInstPtr &inst, ThreadID tid); + void squash(const O3DynInstPtr &inst, ThreadID tid); public: /** Squashes due to commit signalling a squash. Changes status to @@ -235,10 +235,10 @@ class DefaultDecode typename TimeBuffer::wire fromFetch; /** Queue of all instructions coming from fetch this cycle. */ - std::queue insts[O3MaxThreads]; + std::queue insts[O3MaxThreads]; /** Skid buffer between fetch and decode. */ - std::queue skidBuffer[O3MaxThreads]; + std::queue skidBuffer[O3MaxThreads]; /** Variable that tracks if decode has written to the time buffer this * cycle. Used to tell CPU if there is activity this cycle. @@ -285,7 +285,7 @@ class DefaultDecode Addr bdelayDoneSeqNum[O3MaxThreads]; /** Instruction used for squashing branch (used for MIPS)*/ - DynInstPtr squashInst[O3MaxThreads]; + O3DynInstPtr squashInst[O3MaxThreads]; /** Tells when their is a pending delay slot inst. to send * to rename. 
If there is, then wait squash after the next diff --git a/src/cpu/o3/decode_impl.hh b/src/cpu/o3/decode_impl.hh index 5a78bac328..bfbf98f67b 100644 --- a/src/cpu/o3/decode_impl.hh +++ b/src/cpu/o3/decode_impl.hh @@ -46,6 +46,7 @@ #include "config/the_isa.hh" #include "cpu/inst_seq.hh" #include "cpu/o3/decode.hh" +#include "cpu/o3/dyn_inst.hh" #include "cpu/o3/limits.hh" #include "debug/Activity.hh" #include "debug/Decode.hh" @@ -293,7 +294,7 @@ DefaultDecode::unblock(ThreadID tid) template void -DefaultDecode::squash(const DynInstPtr &inst, ThreadID tid) +DefaultDecode::squash(const O3DynInstPtr &inst, ThreadID tid) { DPRINTF(Decode, "[tid:%i] [sn:%llu] Squashing due to incorrect branch " "prediction detected at decode.\n", tid, inst->seqNum); @@ -395,7 +396,7 @@ template void DefaultDecode::skidInsert(ThreadID tid) { - DynInstPtr inst = NULL; + O3DynInstPtr inst = NULL; while (!insts[tid].empty()) { inst = insts[tid].front(); @@ -655,7 +656,7 @@ DefaultDecode::decodeInsts(ThreadID tid) ++stats.runCycles; } - std::queue + std::queue &insts_to_decode = decodeStatus[tid] == Unblocking ? 
skidBuffer[tid] : insts[tid]; @@ -664,7 +665,7 @@ DefaultDecode::decodeInsts(ThreadID tid) while (insts_available > 0 && toRenameIndex < decodeWidth) { assert(!insts_to_decode.empty()); - DynInstPtr inst = std::move(insts_to_decode.front()); + O3DynInstPtr inst = std::move(insts_to_decode.front()); insts_to_decode.pop(); diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh index 0e9ad5daa8..0904800ed3 100644 --- a/src/cpu/o3/dyn_inst.hh +++ b/src/cpu/o3/dyn_inst.hh @@ -57,6 +57,7 @@ #include "cpu/inst_res.hh" #include "cpu/inst_seq.hh" #include "cpu/o3/cpu.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/isa_specific.hh" #include "cpu/o3/lsq_unit.hh" #include "cpu/op_class.hh" @@ -67,10 +68,6 @@ class Packet; -class BaseO3DynInst; - -using O3DynInstPtr = RefCountingPtr; - class BaseO3DynInst : public ExecContext, public RefCounted { public: diff --git a/src/cpu/o3/dyn_inst_ptr.hh b/src/cpu/o3/dyn_inst_ptr.hh new file mode 100644 index 0000000000..479d175c82 --- /dev/null +++ b/src/cpu/o3/dyn_inst_ptr.hh @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2010, 2016 ARM Limited + * Copyright (c) 2013 Advanced Micro Devices, Inc. + * All rights reserved + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * + * Copyright (c) 2004-2006 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __CPU_O3_DYN_INST_PTR_HH__ +#define __CPU_O3_DYN_INST_PTR_HH__ + +#include "base/refcnt.hh" + +class BaseO3DynInst; + +using O3DynInstPtr = RefCountingPtr; +using O3DynInstConstPtr = RefCountingPtr; + +#endif // __CPU_O3_DYN_INST_PTR_HH__ diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index dd7b5a0af9..dee344b414 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -44,6 +44,7 @@ #include "arch/decoder.hh" #include "base/statistics.hh" #include "config/the_isa.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/limits.hh" #include "cpu/pc_event.hh" #include "cpu/pred/bpred_unit.hh" @@ -72,8 +73,6 @@ class DefaultFetch { public: /** Typedefs from Impl. */ - typedef typename Impl::DynInst DynInst; - typedef typename Impl::DynInstPtr DynInstPtr; typedef typename Impl::O3CPU O3CPU; typedef typename Impl::FetchStruct FetchStruct; typedef typename Impl::TimeStruct TimeStruct; @@ -207,7 +206,7 @@ class DefaultFetch std::list priorityList; /** Probe points. */ - ProbePointArg *ppFetch; + ProbePointArg *ppFetch; /** To probe when a fetch request is successfully sent. */ ProbePointArg *ppFetchRequestSent; @@ -294,7 +293,7 @@ class DefaultFetch * @param next_NPC Used for ISAs which use delay slots. * @return Whether or not a branch was predicted as taken. */ - bool lookupAndUpdateNextPC(const DynInstPtr &inst, TheISA::PCState &pc); + bool lookupAndUpdateNextPC(const O3DynInstPtr &inst, TheISA::PCState &pc); /** * Fetches the cache line that contains the fetch PC. Returns any @@ -321,14 +320,14 @@ class DefaultFetch /** Squashes a specific thread and resets the PC. */ inline void doSquash(const TheISA::PCState &newPC, - const DynInstPtr squashInst, ThreadID tid); + const O3DynInstPtr squashInst, ThreadID tid); /** Squashes a specific thread and resets the PC. Also tells the CPU to * remove any instructions between fetch and decode * that should be sqaushed. 
*/ void squashFromDecode(const TheISA::PCState &newPC, - const DynInstPtr squashInst, + const O3DynInstPtr squashInst, const InstSeqNum seq_num, ThreadID tid); /** Checks if a thread is stalled. */ @@ -344,7 +343,7 @@ class DefaultFetch * squash should be the commit stage. */ void squash(const TheISA::PCState &newPC, const InstSeqNum seq_num, - DynInstPtr squashInst, ThreadID tid); + O3DynInstPtr squashInst, ThreadID tid); /** Ticks the fetch stage, processing all inputs signals and fetching * as many instructions as possible. @@ -375,9 +374,9 @@ class DefaultFetch RequestPort &getInstPort() { return icachePort; } private: - DynInstPtr buildInst(ThreadID tid, StaticInstPtr staticInst, - StaticInstPtr curMacroop, TheISA::PCState thisPC, - TheISA::PCState nextPC, bool trace); + O3DynInstPtr buildInst(ThreadID tid, StaticInstPtr staticInst, + StaticInstPtr curMacroop, TheISA::PCState thisPC, + TheISA::PCState nextPC, bool trace); /** Returns the appropriate thread to fetch, given the fetch policy. */ ThreadID getFetchingThread(); @@ -505,7 +504,7 @@ class DefaultFetch unsigned fetchQueueSize; /** Queue of fetched instructions. Per-thread to prevent HoL blocking. */ - std::deque fetchQueue[O3MaxThreads]; + std::deque fetchQueue[O3MaxThreads]; /** Whether or not the fetch buffer data is valid. 
*/ bool fetchBufferValid[O3MaxThreads]; diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh index b1ae2e5b4b..587ae1ae18 100644 --- a/src/cpu/o3/fetch_impl.hh +++ b/src/cpu/o3/fetch_impl.hh @@ -150,7 +150,7 @@ template void DefaultFetch::regProbePoints() { - ppFetch = new ProbePointArg(cpu->getProbeManager(), "Fetch"); + ppFetch = new ProbePointArg(cpu->getProbeManager(), "Fetch"); ppFetchRequestSent = new ProbePointArg(cpu->getProbeManager(), "FetchRequest"); @@ -526,7 +526,7 @@ DefaultFetch::deactivateThread(ThreadID tid) template bool DefaultFetch::lookupAndUpdateNextPC( - const DynInstPtr &inst, TheISA::PCState &nextPC) + const O3DynInstPtr &inst, TheISA::PCState &nextPC) { // Do branch prediction check here. // A bit of a misnomer...next_PC is actually the current PC until @@ -706,7 +706,7 @@ DefaultFetch::finishTranslation(const Fault &fault, DPRINTF(Fetch, "[tid:%i] Translation faulted, building noop.\n", tid); // We will use a nop in ordier to carry the fault. - DynInstPtr instruction = buildInst(tid, nopStaticInstPtr, nullptr, + O3DynInstPtr instruction = buildInst(tid, nopStaticInstPtr, nullptr, fetchPC, fetchPC, false); instruction->setNotAnInst(); @@ -729,7 +729,7 @@ DefaultFetch::finishTranslation(const Fault &fault, template inline void DefaultFetch::doSquash(const TheISA::PCState &newPC, - const DynInstPtr squashInst, ThreadID tid) + const O3DynInstPtr squashInst, ThreadID tid) { DPRINTF(Fetch, "[tid:%i] Squashing, setting PC to: %s.\n", tid, newPC); @@ -781,7 +781,7 @@ DefaultFetch::doSquash(const TheISA::PCState &newPC, template void DefaultFetch::squashFromDecode(const TheISA::PCState &newPC, - const DynInstPtr squashInst, + const O3DynInstPtr squashInst, const InstSeqNum seq_num, ThreadID tid) { DPRINTF(Fetch, "[tid:%i] Squashing from decode.\n", tid); @@ -851,7 +851,7 @@ DefaultFetch::updateFetchStatus() template void DefaultFetch::squash(const TheISA::PCState &newPC, - const InstSeqNum seq_num, DynInstPtr squashInst, + const 
InstSeqNum seq_num, O3DynInstPtr squashInst, ThreadID tid) { DPRINTF(Fetch, "[tid:%i] Squash from commit.\n", tid); @@ -1070,7 +1070,7 @@ DefaultFetch::checkSignalsAndUpdate(ThreadID tid) } template -typename Impl::DynInstPtr +O3DynInstPtr DefaultFetch::buildInst(ThreadID tid, StaticInstPtr staticInst, StaticInstPtr curMacroop, TheISA::PCState thisPC, TheISA::PCState nextPC, bool trace) @@ -1079,8 +1079,8 @@ DefaultFetch::buildInst(ThreadID tid, StaticInstPtr staticInst, InstSeqNum seq = cpu->getAndIncrementInstSeq(); // Create a new DynInst from the instruction fetched. - DynInstPtr instruction = - new DynInst(staticInst, curMacroop, thisPC, nextPC, seq, cpu); + O3DynInstPtr instruction = + new BaseO3DynInst(staticInst, curMacroop, thisPC, nextPC, seq, cpu); instruction->setTid(tid); instruction->setThreadState(cpu->thread[tid]); @@ -1297,7 +1297,7 @@ DefaultFetch::fetch(bool &status_change) newMacro |= staticInst->isLastMicroop(); } - DynInstPtr instruction = + O3DynInstPtr instruction = buildInst(tid, staticInst, curMacroop, thisPC, nextPC, true); diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index 687f745d14..4afee5bf76 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -46,6 +46,7 @@ #include "base/statistics.hh" #include "cpu/o3/comm.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/inst_queue.hh" #include "cpu/o3/limits.hh" #include "cpu/o3/lsq.hh" @@ -81,7 +82,6 @@ class DefaultIEW { private: //Typedefs from Impl - typedef typename Impl::DynInstPtr DynInstPtr; typedef typename Impl::O3CPU O3CPU; typedef typename Impl::TimeStruct TimeStruct; typedef typename Impl::IEWStruct IEWStruct; @@ -120,12 +120,12 @@ class DefaultIEW StageStatus wbStatus; /** Probe points. */ - ProbePointArg *ppMispredict; - ProbePointArg *ppDispatch; + ProbePointArg *ppMispredict; + ProbePointArg *ppDispatch; /** To probe when instruction execution begins. */ - ProbePointArg *ppExecute; + ProbePointArg *ppExecute; /** To probe when instruction execution is complete. 
*/ - ProbePointArg *ppToCommit; + ProbePointArg *ppToCommit; public: /** Constructs a DefaultIEW with the given parameters. */ @@ -171,24 +171,24 @@ class DefaultIEW void squash(ThreadID tid); /** Wakes all dependents of a completed instruction. */ - void wakeDependents(const DynInstPtr &inst); + void wakeDependents(const O3DynInstPtr &inst); /** Tells memory dependence unit that a memory instruction needs to be * rescheduled. It will re-execute once replayMemInst() is called. */ - void rescheduleMemInst(const DynInstPtr &inst); + void rescheduleMemInst(const O3DynInstPtr &inst); /** Re-executes all rescheduled memory instructions. */ - void replayMemInst(const DynInstPtr &inst); + void replayMemInst(const O3DynInstPtr &inst); /** Moves memory instruction onto the list of cache blocked instructions */ - void blockMemInst(const DynInstPtr &inst); + void blockMemInst(const O3DynInstPtr &inst); /** Notifies that the cache has become unblocked */ void cacheUnblocked(); /** Sends an instruction to commit through the time buffer. */ - void instToCommit(const DynInstPtr &inst); + void instToCommit(const O3DynInstPtr &inst); /** Inserts unused instructions of a thread into the skid buffer. */ void skidInsert(ThreadID tid); @@ -226,7 +226,7 @@ class DefaultIEW bool hasStoresToWB(ThreadID tid) { return ldstQueue.hasStoresToWB(tid); } /** Check misprediction */ - void checkMisprediction(const DynInstPtr &inst); + void checkMisprediction(const O3DynInstPtr &inst); // hardware transactional memory // For debugging purposes, it is useful to keep track of the most recent @@ -242,12 +242,12 @@ class DefaultIEW /** Sends commit proper information for a squash due to a branch * mispredict. */ - void squashDueToBranch(const DynInstPtr &inst, ThreadID tid); + void squashDueToBranch(const O3DynInstPtr &inst, ThreadID tid); /** Sends commit proper information for a squash due to a memory order * violation. 
*/ - void squashDueToMemOrder(const DynInstPtr &inst, ThreadID tid); + void squashDueToMemOrder(const O3DynInstPtr &inst, ThreadID tid); /** Sets Dispatch to blocked, and signals back to other stages to block. */ void block(ThreadID tid); @@ -301,7 +301,7 @@ class DefaultIEW private: /** Updates execution stats based on the instruction. */ - void updateExeInstStats(const DynInstPtr &inst); + void updateExeInstStats(const O3DynInstPtr &inst); /** Pointer to main time buffer used for backwards communication. */ TimeBuffer *timeBuffer; @@ -337,10 +337,10 @@ class DefaultIEW typename TimeBuffer::wire toCommit; /** Queue of all instructions coming from rename this cycle. */ - std::queue insts[O3MaxThreads]; + std::queue insts[O3MaxThreads]; /** Skid buffer between rename and IEW. */ - std::queue skidBuffer[O3MaxThreads]; + std::queue skidBuffer[O3MaxThreads]; /** Scoreboard pointer. */ Scoreboard* scoreboard; diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh index d8a539847b..7c6fe5a6a9 100644 --- a/src/cpu/o3/iew_impl.hh +++ b/src/cpu/o3/iew_impl.hh @@ -50,6 +50,7 @@ #include "config/the_isa.hh" #include "cpu/checker/cpu.hh" +#include "cpu/o3/dyn_inst.hh" #include "cpu/o3/fu_pool.hh" #include "cpu/o3/iew.hh" #include "cpu/o3/limits.hh" @@ -122,20 +123,22 @@ template void DefaultIEW::regProbePoints() { - ppDispatch = new ProbePointArg(cpu->getProbeManager(), "Dispatch"); - ppMispredict = new ProbePointArg(cpu->getProbeManager(), "Mispredict"); + ppDispatch = new ProbePointArg( + cpu->getProbeManager(), "Dispatch"); + ppMispredict = new ProbePointArg( + cpu->getProbeManager(), "Mispredict"); /** * Probe point with dynamic instruction as the argument used to probe when * an instruction starts to execute. 
*/ - ppExecute = new ProbePointArg(cpu->getProbeManager(), - "Execute"); + ppExecute = new ProbePointArg( + cpu->getProbeManager(), "Execute"); /** * Probe point with dynamic instruction as the argument used to probe when * an instruction execution completes and it is marked ready to commit. */ - ppToCommit = new ProbePointArg(cpu->getProbeManager(), - "ToCommit"); + ppToCommit = new ProbePointArg( + cpu->getProbeManager(), "ToCommit"); } template @@ -461,7 +464,7 @@ DefaultIEW::squash(ThreadID tid) template void -DefaultIEW::squashDueToBranch(const DynInstPtr& inst, ThreadID tid) +DefaultIEW::squashDueToBranch(const O3DynInstPtr& inst, ThreadID tid) { DPRINTF(IEW, "[tid:%i] [sn:%llu] Squashing from a specific instruction," " PC: %s " @@ -487,7 +490,7 @@ DefaultIEW::squashDueToBranch(const DynInstPtr& inst, ThreadID tid) template void -DefaultIEW::squashDueToMemOrder(const DynInstPtr& inst, ThreadID tid) +DefaultIEW::squashDueToMemOrder(const O3DynInstPtr& inst, ThreadID tid) { DPRINTF(IEW, "[tid:%i] Memory violation, squashing violator and younger " "insts, PC: %s [sn:%llu].\n", tid, inst->pcState(), inst->seqNum); @@ -550,28 +553,28 @@ DefaultIEW::unblock(ThreadID tid) template void -DefaultIEW::wakeDependents(const DynInstPtr& inst) +DefaultIEW::wakeDependents(const O3DynInstPtr& inst) { instQueue.wakeDependents(inst); } template void -DefaultIEW::rescheduleMemInst(const DynInstPtr& inst) +DefaultIEW::rescheduleMemInst(const O3DynInstPtr& inst) { instQueue.rescheduleMemInst(inst); } template void -DefaultIEW::replayMemInst(const DynInstPtr& inst) +DefaultIEW::replayMemInst(const O3DynInstPtr& inst) { instQueue.replayMemInst(inst); } template void -DefaultIEW::blockMemInst(const DynInstPtr& inst) +DefaultIEW::blockMemInst(const O3DynInstPtr& inst) { instQueue.blockMemInst(inst); } @@ -585,7 +588,7 @@ DefaultIEW::cacheUnblocked() template void -DefaultIEW::instToCommit(const DynInstPtr& inst) +DefaultIEW::instToCommit(const O3DynInstPtr& inst) { // This function 
should not be called after writebackInsts in a // single cycle. That will cause problems with an instruction @@ -630,7 +633,7 @@ template void DefaultIEW::skidInsert(ThreadID tid) { - DynInstPtr inst = NULL; + O3DynInstPtr inst = NULL; while (!insts[tid].empty()) { inst = insts[tid].front(); @@ -927,13 +930,13 @@ DefaultIEW::dispatchInsts(ThreadID tid) { // Obtain instructions from skid buffer if unblocking, or queue from rename // otherwise. - std::queue &insts_to_dispatch = + std::queue &insts_to_dispatch = dispatchStatus[tid] == Unblocking ? skidBuffer[tid] : insts[tid]; int insts_to_add = insts_to_dispatch.size(); - DynInstPtr inst; + O3DynInstPtr inst; bool add_to_iq = false; int dis_num_inst = 0; @@ -1208,7 +1211,7 @@ DefaultIEW::executeInsts() DPRINTF(IEW, "Execute: Executing instructions from IQ.\n"); - DynInstPtr inst = instQueue.getInstToExecute(); + O3DynInstPtr inst = instQueue.getInstToExecute(); DPRINTF(IEW, "Execute: Processing PC %s, [tid:%i] [sn:%llu].\n", inst->pcState(), inst->threadNumber,inst->seqNum); @@ -1372,7 +1375,7 @@ DefaultIEW::executeInsts() // If there was an ordering violation, then get the // DynInst that caused the violation. Note that this // clears the violation signal. - DynInstPtr violator; + O3DynInstPtr violator; violator = ldstQueue.getMemDepViolator(tid); DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: %s " @@ -1396,7 +1399,7 @@ DefaultIEW::executeInsts() if (ldstQueue.violation(tid)) { assert(inst->isMemRef()); - DynInstPtr violator = ldstQueue.getMemDepViolator(tid); + O3DynInstPtr violator = ldstQueue.getMemDepViolator(tid); DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: " "%s, inst PC: %s. Addr is: %#x.\n", @@ -1439,7 +1442,7 @@ DefaultIEW::writebackInsts() // as part of backwards communication. 
for (int inst_num = 0; inst_num < wbWidth && toCommit->insts[inst_num]; inst_num++) { - DynInstPtr inst = toCommit->insts[inst_num]; + O3DynInstPtr inst = toCommit->insts[inst_num]; ThreadID tid = inst->threadNumber; DPRINTF(IEW, "Sending instructions to commit, [sn:%lli] PC %s.\n", @@ -1610,7 +1613,7 @@ DefaultIEW::tick() template void -DefaultIEW::updateExeInstStats(const DynInstPtr& inst) +DefaultIEW::updateExeInstStats(const O3DynInstPtr& inst) { ThreadID tid = inst->threadNumber; @@ -1642,7 +1645,7 @@ DefaultIEW::updateExeInstStats(const DynInstPtr& inst) template void -DefaultIEW::checkMisprediction(const DynInstPtr& inst) +DefaultIEW::checkMisprediction(const O3DynInstPtr& inst) { ThreadID tid = inst->threadNumber; diff --git a/src/cpu/o3/impl.hh b/src/cpu/o3/impl.hh index c61367f46e..2c7242ea60 100644 --- a/src/cpu/o3/impl.hh +++ b/src/cpu/o3/impl.hh @@ -32,8 +32,6 @@ #include "cpu/o3/comm.hh" // Forward declarations. -class BaseO3DynInst; - template class FullO3CPU; @@ -66,15 +64,6 @@ struct O3CPUImpl typedef TimeBufStruct TimeStruct; - /** The DynInst type to be used. */ - typedef BaseO3DynInst DynInst; - - /** The refcounted DynInst pointer to be used. In most cases this is - * what should be used, and not DynInst *. - */ - typedef RefCountingPtr DynInstPtr; - typedef RefCountingPtr DynInstConstPtr; - /** The O3CPU type to be used. */ typedef FullO3CPU O3CPU; diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh index 2b79e9cb77..6c85ffd87e 100644 --- a/src/cpu/o3/inst_queue.hh +++ b/src/cpu/o3/inst_queue.hh @@ -51,6 +51,7 @@ #include "base/types.hh" #include "cpu/inst_seq.hh" #include "cpu/o3/dep_graph.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/limits.hh" #include "cpu/o3/mem_dep_unit.hh" #include "cpu/o3/store_set.hh" @@ -89,19 +90,18 @@ class InstructionQueue public: //Typedefs from the Impl. 
typedef typename Impl::O3CPU O3CPU; - typedef typename Impl::DynInstPtr DynInstPtr; typedef typename Impl::IssueStruct IssueStruct; typedef typename Impl::TimeStruct TimeStruct; // Typedef of iterator through the list of instructions. - typedef typename std::list::iterator ListIt; + typedef typename std::list::iterator ListIt; /** FU completion event class. */ class FUCompletion : public Event { private: /** Executing instruction. */ - DynInstPtr inst; + O3DynInstPtr inst; /** Index of the FU used for executing. */ int fuIdx; @@ -116,7 +116,7 @@ class InstructionQueue public: /** Construct a FU completion event. */ - FUCompletion(const DynInstPtr &_inst, int fu_idx, + FUCompletion(const O3DynInstPtr &_inst, int fu_idx, InstructionQueue *iq_ptr); virtual void process(); @@ -177,40 +177,43 @@ class InstructionQueue bool hasReadyInsts(); /** Inserts a new instruction into the IQ. */ - void insert(const DynInstPtr &new_inst); + void insert(const O3DynInstPtr &new_inst); /** Inserts a new, non-speculative instruction into the IQ. */ - void insertNonSpec(const DynInstPtr &new_inst); + void insertNonSpec(const O3DynInstPtr &new_inst); /** Inserts a memory or write barrier into the IQ to make sure * loads and stores are ordered properly. */ - void insertBarrier(const DynInstPtr &barr_inst); + void insertBarrier(const O3DynInstPtr &barr_inst); /** Returns the oldest scheduled instruction, and removes it from * the list of instructions waiting to execute. */ - DynInstPtr getInstToExecute(); + O3DynInstPtr getInstToExecute(); /** Gets a memory instruction that was referred due to a delayed DTB * translation if it is now ready to execute. NULL if none available. */ - DynInstPtr getDeferredMemInstToExecute(); + O3DynInstPtr getDeferredMemInstToExecute(); /** Gets a memory instruction that was blocked on the cache. NULL if none * available. 
*/ - DynInstPtr getBlockedMemInstToExecute(); + O3DynInstPtr getBlockedMemInstToExecute(); /** * Records the instruction as the producer of a register without * adding it to the rest of the IQ. */ - void recordProducer(const DynInstPtr &inst) - { addToProducers(inst); } + void + recordProducer(const O3DynInstPtr &inst) + { + addToProducers(inst); + } /** Process FU completion event. */ - void processFUCompletion(const DynInstPtr &inst, int fu_idx); + void processFUCompletion(const O3DynInstPtr &inst, int fu_idx); /** * Schedules ready instructions, adding the ready ones (oldest first) to @@ -228,34 +231,35 @@ class InstructionQueue void commit(const InstSeqNum &inst, ThreadID tid = 0); /** Wakes all dependents of a completed instruction. */ - int wakeDependents(const DynInstPtr &completed_inst); + int wakeDependents(const O3DynInstPtr &completed_inst); /** Adds a ready memory instruction to the ready list. */ - void addReadyMemInst(const DynInstPtr &ready_inst); + void addReadyMemInst(const O3DynInstPtr &ready_inst); /** * Reschedules a memory instruction. It will be ready to issue once * replayMemInst() is called. */ - void rescheduleMemInst(const DynInstPtr &resched_inst); + void rescheduleMemInst(const O3DynInstPtr &resched_inst); /** Replays a memory instruction. It must be rescheduled first. */ - void replayMemInst(const DynInstPtr &replay_inst); + void replayMemInst(const O3DynInstPtr &replay_inst); /** * Defers a memory instruction when its DTB translation incurs a hw * page table walk. */ - void deferMemInst(const DynInstPtr &deferred_inst); + void deferMemInst(const O3DynInstPtr &deferred_inst); /** Defers a memory instruction when it is cache blocked. */ - void blockMemInst(const DynInstPtr &blocked_inst); + void blockMemInst(const O3DynInstPtr &blocked_inst); /** Notify instruction queue that a previous blockage has resolved */ void cacheUnblocked(); /** Indicates an ordering violation between a store and a load. 
*/ - void violation(const DynInstPtr &store, const DynInstPtr &faulting_load); + void violation(const O3DynInstPtr &store, + const O3DynInstPtr &faulting_load); /** * Squashes instructions for a thread. Squashing information is obtained @@ -310,23 +314,23 @@ class InstructionQueue ////////////////////////////////////// /** List of all the instructions in the IQ (some of which may be issued). */ - std::list instList[O3MaxThreads]; + std::list instList[O3MaxThreads]; /** List of instructions that are ready to be executed. */ - std::list instsToExecute; + std::list instsToExecute; /** List of instructions waiting for their DTB translation to * complete (hw page table walk in progress). */ - std::list deferredMemInsts; + std::list deferredMemInsts; /** List of instructions that have been cache blocked. */ - std::list blockedMemInsts; + std::list blockedMemInsts; /** List of instructions that were cache blocked, but a retry has been seen * since, so they can now be retried. May fail again go on the blocked list. */ - std::list retryMemInsts; + std::list retryMemInsts; /** * Struct for comparing entries to be added to the priority queue. @@ -335,16 +339,14 @@ class InstructionQueue * numbers (and hence are older) will be at the top of the * priority queue. */ - struct pqCompare + struct PqCompare { - bool operator() (const DynInstPtr &lhs, const DynInstPtr &rhs) const - { - return lhs->seqNum > rhs->seqNum; - } + bool operator()(const O3DynInstPtr &lhs, + const O3DynInstPtr &rhs) const; }; - typedef std::priority_queue, pqCompare> - ReadyInstQueue; + typedef std::priority_queue< + O3DynInstPtr, std::vector, PqCompare> ReadyInstQueue; /** List of ready instructions, per op class. They are separated by op * class to allow for easy mapping to FUs. @@ -358,9 +360,9 @@ class InstructionQueue * the sequence number will be available. Thus it is most efficient to be * able to search by the sequence number alone. 
*/ - std::map nonSpecInsts; + std::map nonSpecInsts; - typedef typename std::map::iterator NonSpecMapIt; + typedef typename std::map::iterator NonSpecMapIt; /** Entry for the list age ordering by op class. */ struct ListOrderEntry @@ -397,7 +399,7 @@ class InstructionQueue */ void moveToYoungerInst(ListOrderIt age_order_it); - DependencyGraph dependGraph; + DependencyGraph dependGraph; ////////////////////////////////////// // Various parameters @@ -450,13 +452,13 @@ class InstructionQueue std::vector regScoreboard; /** Adds an instruction to the dependency graph, as a consumer. */ - bool addToDependents(const DynInstPtr &new_inst); + bool addToDependents(const O3DynInstPtr &new_inst); /** Adds an instruction to the dependency graph, as a producer. */ - void addToProducers(const DynInstPtr &new_inst); + void addToProducers(const O3DynInstPtr &new_inst); /** Moves an instruction to the ready queue if it is ready. */ - void addIfReady(const DynInstPtr &inst); + void addIfReady(const O3DynInstPtr &inst); /** Debugging function to count how many entries are in the IQ. 
It does * a linear walk through the instructions, so do not call this function diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh index d4328148b2..9373f6b879 100644 --- a/src/cpu/o3/inst_queue_impl.hh +++ b/src/cpu/o3/inst_queue_impl.hh @@ -46,6 +46,7 @@ #include #include "base/logging.hh" +#include "cpu/o3/dyn_inst.hh" #include "cpu/o3/fu_pool.hh" #include "cpu/o3/inst_queue.hh" #include "cpu/o3/limits.hh" @@ -59,7 +60,7 @@ using std::list; template -InstructionQueue::FUCompletion::FUCompletion(const DynInstPtr &_inst, +InstructionQueue::FUCompletion::FUCompletion(const O3DynInstPtr &_inst, int fu_idx, InstructionQueue *iq_ptr) : Event(Stat_Event_Pri, AutoDelete), inst(_inst), fuIdx(fu_idx), iqPtr(iq_ptr), freeFU(false) @@ -576,7 +577,7 @@ InstructionQueue::hasReadyInsts() template void -InstructionQueue::insert(const DynInstPtr &new_inst) +InstructionQueue::insert(const O3DynInstPtr &new_inst) { if (new_inst->isFloating()) { iqIOStats.fpInstQueueWrites++; @@ -622,7 +623,7 @@ InstructionQueue::insert(const DynInstPtr &new_inst) template void -InstructionQueue::insertNonSpec(const DynInstPtr &new_inst) +InstructionQueue::insertNonSpec(const O3DynInstPtr &new_inst) { // @todo: Clean up this code; can do it by setting inst as unable // to issue, then calling normal insert on the inst. 
@@ -669,7 +670,7 @@ InstructionQueue::insertNonSpec(const DynInstPtr &new_inst) template void -InstructionQueue::insertBarrier(const DynInstPtr &barr_inst) +InstructionQueue::insertBarrier(const O3DynInstPtr &barr_inst) { memDepUnit[barr_inst->threadNumber].insertBarrier(barr_inst); @@ -677,11 +678,11 @@ InstructionQueue::insertBarrier(const DynInstPtr &barr_inst) } template -typename Impl::DynInstPtr +O3DynInstPtr InstructionQueue::getInstToExecute() { assert(!instsToExecute.empty()); - DynInstPtr inst = std::move(instsToExecute.front()); + O3DynInstPtr inst = std::move(instsToExecute.front()); instsToExecute.pop_front(); if (inst->isFloating()) { iqIOStats.fpInstQueueReads++; @@ -748,7 +749,8 @@ InstructionQueue::moveToYoungerInst(ListOrderIt list_order_it) template void -InstructionQueue::processFUCompletion(const DynInstPtr &inst, int fu_idx) +InstructionQueue::processFUCompletion( + const O3DynInstPtr &inst, int fu_idx) { DPRINTF(IQ, "Processing FU completion [sn:%llu]\n", inst->seqNum); assert(!cpu->switchedOut()); @@ -779,7 +781,7 @@ InstructionQueue::scheduleReadyInsts() IssueStruct *i2e_info = issueToExecuteQueue->access(0); - DynInstPtr mem_inst; + O3DynInstPtr mem_inst; while ((mem_inst = std::move(getDeferredMemInstToExecute()))) { addReadyMemInst(mem_inst); } @@ -806,7 +808,7 @@ InstructionQueue::scheduleReadyInsts() assert(!readyInsts[op_class].empty()); - DynInstPtr issuing_inst = readyInsts[op_class].top(); + O3DynInstPtr issuing_inst = readyInsts[op_class].top(); if (issuing_inst->isFloating()) { iqIOStats.fpInstQueueReads++; @@ -986,7 +988,7 @@ InstructionQueue::commit(const InstSeqNum &inst, ThreadID tid) template int -InstructionQueue::wakeDependents(const DynInstPtr &completed_inst) +InstructionQueue::wakeDependents(const O3DynInstPtr &completed_inst) { int dependents = 0; @@ -1054,7 +1056,7 @@ InstructionQueue::wakeDependents(const DynInstPtr &completed_inst) //Go through the dependency chain, marking the registers as //ready within the 
waiting instructions. - DynInstPtr dep_inst = dependGraph.pop(dest_reg->flatIndex()); + O3DynInstPtr dep_inst = dependGraph.pop(dest_reg->flatIndex()); while (dep_inst) { DPRINTF(IQ, "Waking up a dependent instruction, [sn:%llu] " @@ -1086,7 +1088,7 @@ InstructionQueue::wakeDependents(const DynInstPtr &completed_inst) template void -InstructionQueue::addReadyMemInst(const DynInstPtr &ready_inst) +InstructionQueue::addReadyMemInst(const O3DynInstPtr &ready_inst) { OpClass op_class = ready_inst->opClass(); @@ -1109,7 +1111,7 @@ InstructionQueue::addReadyMemInst(const DynInstPtr &ready_inst) template void -InstructionQueue::rescheduleMemInst(const DynInstPtr &resched_inst) +InstructionQueue::rescheduleMemInst(const O3DynInstPtr &resched_inst) { DPRINTF(IQ, "Rescheduling mem inst [sn:%llu]\n", resched_inst->seqNum); @@ -1123,21 +1125,21 @@ InstructionQueue::rescheduleMemInst(const DynInstPtr &resched_inst) template void -InstructionQueue::replayMemInst(const DynInstPtr &replay_inst) +InstructionQueue::replayMemInst(const O3DynInstPtr &replay_inst) { memDepUnit[replay_inst->threadNumber].replay(); } template void -InstructionQueue::deferMemInst(const DynInstPtr &deferred_inst) +InstructionQueue::deferMemInst(const O3DynInstPtr &deferred_inst) { deferredMemInsts.push_back(deferred_inst); } template void -InstructionQueue::blockMemInst(const DynInstPtr &blocked_inst) +InstructionQueue::blockMemInst(const O3DynInstPtr &blocked_inst) { blocked_inst->clearIssued(); blocked_inst->clearCanIssue(); @@ -1154,13 +1156,13 @@ InstructionQueue::cacheUnblocked() } template -typename Impl::DynInstPtr +O3DynInstPtr InstructionQueue::getDeferredMemInstToExecute() { for (ListIt it = deferredMemInsts.begin(); it != deferredMemInsts.end(); ++it) { if ((*it)->translationCompleted() || (*it)->isSquashed()) { - DynInstPtr mem_inst = std::move(*it); + O3DynInstPtr mem_inst = std::move(*it); deferredMemInsts.erase(it); return mem_inst; } @@ -1169,13 +1171,13 @@ 
InstructionQueue::getDeferredMemInstToExecute() } template -typename Impl::DynInstPtr +O3DynInstPtr InstructionQueue::getBlockedMemInstToExecute() { if (retryMemInsts.empty()) { return nullptr; } else { - DynInstPtr mem_inst = std::move(retryMemInsts.front()); + O3DynInstPtr mem_inst = std::move(retryMemInsts.front()); retryMemInsts.pop_front(); return mem_inst; } @@ -1183,8 +1185,8 @@ InstructionQueue::getBlockedMemInstToExecute() template void -InstructionQueue::violation(const DynInstPtr &store, - const DynInstPtr &faulting_load) +InstructionQueue::violation(const O3DynInstPtr &store, + const O3DynInstPtr &faulting_load) { iqIOStats.intInstQueueWrites++; memDepUnit[store->threadNumber].violation(store, faulting_load); @@ -1223,7 +1225,7 @@ InstructionQueue::doSquash(ThreadID tid) while (squash_it != instList[tid].end() && (*squash_it)->seqNum > squashedSeqNum[tid]) { - DynInstPtr squashed_inst = (*squash_it); + O3DynInstPtr squashed_inst = (*squash_it); if (squashed_inst->isFloating()) { iqIOStats.fpInstQueueWrites++; } else if (squashed_inst->isVector()) { @@ -1329,7 +1331,7 @@ InstructionQueue::doSquash(ThreadID tid) // IQ clears out the heads of the dependency graph only when // instructions reach writeback stage. If an instruction is squashed // before writeback stage, its head of dependency graph would not be - // cleared out; it holds the instruction's DynInstPtr. This prevents + // cleared out; it holds the instruction's O3DynInstPtr. This prevents // freeing the squashed instruction's DynInst. // Thus, we need to manually clear out the squashed instructions' heads // of dependency graph. 
@@ -1352,7 +1354,15 @@ InstructionQueue::doSquash(ThreadID tid) template bool -InstructionQueue::addToDependents(const DynInstPtr &new_inst) +InstructionQueue::PqCompare::operator()( + const O3DynInstPtr &lhs, const O3DynInstPtr &rhs) const +{ + return lhs->seqNum > rhs->seqNum; +} + +template +bool +InstructionQueue::addToDependents(const O3DynInstPtr &new_inst) { // Loop through the instruction's source registers, adding // them to the dependency list if they are not ready. @@ -1400,7 +1410,7 @@ InstructionQueue::addToDependents(const DynInstPtr &new_inst) template void -InstructionQueue::addToProducers(const DynInstPtr &new_inst) +InstructionQueue::addToProducers(const O3DynInstPtr &new_inst) { // Nothing really needs to be marked when an instruction becomes // the producer of a register's value, but for convenience a ptr @@ -1436,7 +1446,7 @@ InstructionQueue::addToProducers(const DynInstPtr &new_inst) template void -InstructionQueue::addIfReady(const DynInstPtr &inst) +InstructionQueue::addIfReady(const O3DynInstPtr &inst) { // If the instruction now has all of its source registers // available, then add it to the list of ready instructions. diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index e7e1f274aa..eb76e655a3 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -53,6 +53,8 @@ #include "base/flags.hh" #include "base/types.hh" #include "cpu/inst_seq.hh" +#include "cpu/o3/dyn_inst_ptr.hh" +#include "cpu/o3/impl.hh" #include "cpu/utils.hh" #include "enums/SMTQueuePolicy.hh" #include "mem/port.hh" @@ -74,7 +76,6 @@ class LSQ { public: typedef typename Impl::O3CPU O3CPU; - typedef typename Impl::DynInstPtr DynInstPtr; class LSQRequest; /** Derived class to hold any sender state the LSQ needs. */ @@ -93,7 +94,7 @@ class LSQ public: /** Instruction which initiated the access to memory. */ - DynInstPtr inst; + O3DynInstPtr inst; /** The main packet from a split load, used during writeback. 
*/ PacketPtr mainPkt; /** A second packet from a split store that needs sending. */ @@ -113,7 +114,7 @@ class LSQ * case the SenderState knows. */ bool deleted; - ContextID contextId() { return inst->contextId(); } + ContextID contextId(); /** Completes a packet and returns whether the access is finished. */ inline bool isComplete() { return outstanding == 0; } @@ -293,7 +294,7 @@ class LSQ public: LSQUnit& _port; - const DynInstPtr _inst; + const O3DynInstPtr _inst; uint32_t _taskId; PacketDataPtr _data; std::vector _packets; @@ -308,38 +309,11 @@ class LSQ AtomicOpFunctorPtr _amo_op; protected: LSQUnit* lsqUnit() { return &_port; } - LSQRequest(LSQUnit *port, const DynInstPtr& inst, bool isLoad) : - _state(State::NotIssued), _senderState(nullptr), - _port(*port), _inst(inst), _data(nullptr), - _res(nullptr), _addr(0), _size(0), _flags(0), - _numOutstandingPackets(0), _amo_op(nullptr) - { - flags.set(Flag::IsLoad, isLoad); - flags.set(Flag::WbStore, - _inst->isStoreConditional() || _inst->isAtomic()); - flags.set(Flag::IsAtomic, _inst->isAtomic()); - install(); - } - LSQRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad, - const Addr& addr, const uint32_t& size, - const Request::Flags& flags_, - PacketDataPtr data = nullptr, uint64_t* res = nullptr, - AtomicOpFunctorPtr amo_op = nullptr) - : _state(State::NotIssued), _senderState(nullptr), - numTranslatedFragments(0), - numInTranslationFragments(0), - _port(*port), _inst(inst), _data(data), - _res(res), _addr(addr), _size(size), - _flags(flags_), - _numOutstandingPackets(0), - _amo_op(std::move(amo_op)) - { - flags.set(Flag::IsLoad, isLoad); - flags.set(Flag::WbStore, - _inst->isStoreConditional() || _inst->isAtomic()); - flags.set(Flag::IsAtomic, _inst->isAtomic()); - install(); - } + LSQRequest(LSQUnit* port, const O3DynInstPtr& inst, bool isLoad); + LSQRequest(LSQUnit* port, const O3DynInstPtr& inst, bool isLoad, + const Addr& addr, const uint32_t& size, + const Request::Flags& flags_, PacketDataPtr 
data=nullptr, + uint64_t* res=nullptr, AtomicOpFunctorPtr amo_op=nullptr); bool isLoad() const @@ -354,21 +328,9 @@ class LSQ } /** Install the request in the LQ/SQ. */ - void install() - { - if (isLoad()) { - _port.loadQueue[_inst->lqIdx].setRequest(this); - } else { - // Store, StoreConditional, and Atomic requests are pushed - // to this storeQueue - _port.storeQueue[_inst->sqIdx].setRequest(this); - } - } - virtual bool - squashed() const override - { - return _inst->isSquashed(); - } + void install(); + + bool squashed() const override; /** * Test if the LSQRequest has been released, i.e. self-owned. @@ -391,7 +353,8 @@ class LSQ * but there is any in-flight translation request to the TLB or access * request to the memory. */ - void release(Flag reason) + void + release(Flag reason) { assert(reason == Flag::LSQEntryFreed || reason == Flag::Discarded); if (!isAnyOutstandingRequest()) { @@ -410,35 +373,14 @@ class LSQ * The request is only added if the mask is empty or if there is at * least an active element in it. */ - void - addRequest(Addr addr, unsigned size, - const std::vector& byte_enable) - { - if (isAnyActiveElement(byte_enable.begin(), byte_enable.end())) { - auto request = std::make_shared( - addr, size, _flags, _inst->requestorId(), - _inst->instAddr(), _inst->contextId(), - std::move(_amo_op)); - request->setByteEnable(byte_enable); - _requests.push_back(request); - } - } + void addRequest(Addr addr, unsigned size, + const std::vector& byte_enable); /** Destructor. * The LSQRequest owns the request. If the packet has already been * sent, the sender state will be deleted upon receiving the reply. */ - virtual ~LSQRequest() - { - assert(!isAnyOutstandingRequest()); - _inst->savedReq = nullptr; - if (_senderState) - delete _senderState; - - for (auto r: _packets) - delete r; - }; - + virtual ~LSQRequest(); public: /** Convenience getters/setters. 
*/ @@ -450,7 +392,7 @@ class LSQ request()->setContext(context_id); } - const DynInstPtr& + const O3DynInstPtr& instruction() { return _inst; @@ -728,7 +670,7 @@ class LSQ using LSQRequest::_numOutstandingPackets; using LSQRequest::_amo_op; public: - SingleDataRequest(LSQUnit* port, const DynInstPtr& inst, + SingleDataRequest(LSQUnit* port, const O3DynInstPtr& inst, bool isLoad, const Addr& addr, const uint32_t& size, const Request::Flags& flags_, PacketDataPtr data=nullptr, uint64_t* res=nullptr, AtomicOpFunctorPtr amo_op=nullptr) : @@ -766,7 +708,7 @@ class LSQ using LSQRequest::flags; using LSQRequest::setState; public: - HtmCmdRequest(LSQUnit* port, const DynInstPtr& inst, + HtmCmdRequest(LSQUnit* port, const O3DynInstPtr& inst, const Request::Flags& flags_); inline virtual ~HtmCmdRequest() {} virtual void initiateTranslation(); @@ -813,7 +755,7 @@ class LSQ PacketPtr _mainPacket; public: - SplitDataRequest(LSQUnit* port, const DynInstPtr& inst, + SplitDataRequest(LSQUnit* port, const O3DynInstPtr& inst, bool isLoad, const Addr& addr, const uint32_t& size, const Request::Flags & flags_, PacketDataPtr data=nullptr, uint64_t* res=nullptr) : @@ -876,15 +818,15 @@ class LSQ void tick(); /** Inserts a load into the LSQ. */ - void insertLoad(const DynInstPtr &load_inst); + void insertLoad(const O3DynInstPtr &load_inst); /** Inserts a store into the LSQ. */ - void insertStore(const DynInstPtr &store_inst); + void insertStore(const O3DynInstPtr &store_inst); /** Executes a load. */ - Fault executeLoad(const DynInstPtr &inst); + Fault executeLoad(const O3DynInstPtr &inst); /** Executes a store. */ - Fault executeStore(const DynInstPtr &inst); + Fault executeStore(const O3DynInstPtr &inst); /** * Commits loads up until the given sequence number for a specific thread. @@ -924,7 +866,7 @@ class LSQ bool violation(ThreadID tid) { return thread.at(tid).violation(); } /** Gets the instruction that caused the memory ordering violation. 
*/ - DynInstPtr + O3DynInstPtr getMemDepViolator(ThreadID tid) { return thread.at(tid).getMemDepViolator(); @@ -1103,7 +1045,7 @@ class LSQ void recvTimingSnoopReq(PacketPtr pkt); - Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, + Fault pushRequest(const O3DynInstPtr& inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, uint64_t *res, AtomicOpFunctorPtr amo_op, const std::vector& byte_enable); diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh index 452a679d8b..5cde78e0f8 100644 --- a/src/cpu/o3/lsq_impl.hh +++ b/src/cpu/o3/lsq_impl.hh @@ -49,6 +49,7 @@ #include "base/compiler.hh" #include "base/logging.hh" #include "cpu/o3/cpu.hh" +#include "cpu/o3/dyn_inst.hh" #include "cpu/o3/iew.hh" #include "cpu/o3/limits.hh" #include "cpu/o3/lsq.hh" @@ -59,6 +60,13 @@ #include "debug/Writeback.hh" #include "params/DerivO3CPU.hh" +template +ContextID +LSQ::LSQSenderState::contextId() +{ + return inst->contextId(); +} + template LSQ::LSQ(O3CPU *cpu_ptr, DefaultIEW *iew_ptr, const DerivO3CPUParams ¶ms) @@ -220,7 +228,7 @@ LSQ::cachePortBusy(bool is_load) template void -LSQ::insertLoad(const DynInstPtr &load_inst) +LSQ::insertLoad(const O3DynInstPtr &load_inst) { ThreadID tid = load_inst->threadNumber; @@ -229,7 +237,7 @@ LSQ::insertLoad(const DynInstPtr &load_inst) template void -LSQ::insertStore(const DynInstPtr &store_inst) +LSQ::insertStore(const O3DynInstPtr &store_inst) { ThreadID tid = store_inst->threadNumber; @@ -238,7 +246,7 @@ LSQ::insertStore(const DynInstPtr &store_inst) template Fault -LSQ::executeLoad(const DynInstPtr &inst) +LSQ::executeLoad(const O3DynInstPtr &inst) { ThreadID tid = inst->threadNumber; @@ -247,7 +255,7 @@ LSQ::executeLoad(const DynInstPtr &inst) template Fault -LSQ::executeStore(const DynInstPtr &inst) +LSQ::executeStore(const O3DynInstPtr &inst) { ThreadID tid = inst->threadNumber; @@ -676,7 +684,7 @@ LSQ::dumpInsts() const template Fault -LSQ::pushRequest(const DynInstPtr& inst, 
bool isLoad, uint8_t *data, +LSQ::pushRequest(const O3DynInstPtr& inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, uint64_t *res, AtomicOpFunctorPtr amo_op, const std::vector& byte_enable) @@ -951,6 +959,85 @@ LSQ::SplitDataRequest::initiateTranslation() } } +template +LSQ::LSQRequest::LSQRequest( + LSQUnit *port, const O3DynInstPtr& inst, bool isLoad) : + _state(State::NotIssued), _senderState(nullptr), + _port(*port), _inst(inst), _data(nullptr), + _res(nullptr), _addr(0), _size(0), _flags(0), + _numOutstandingPackets(0), _amo_op(nullptr) +{ + flags.set(Flag::IsLoad, isLoad); + flags.set(Flag::WbStore, + _inst->isStoreConditional() || _inst->isAtomic()); + flags.set(Flag::IsAtomic, _inst->isAtomic()); + install(); +} + +template +LSQ::LSQRequest::LSQRequest( + LSQUnit* port, const O3DynInstPtr& inst, bool isLoad, + const Addr& addr, const uint32_t& size, const Request::Flags& flags_, + PacketDataPtr data, uint64_t* res, AtomicOpFunctorPtr amo_op) + : _state(State::NotIssued), _senderState(nullptr), + numTranslatedFragments(0), + numInTranslationFragments(0), + _port(*port), _inst(inst), _data(data), + _res(res), _addr(addr), _size(size), + _flags(flags_), + _numOutstandingPackets(0), + _amo_op(std::move(amo_op)) +{ + flags.set(Flag::IsLoad, isLoad); + flags.set(Flag::WbStore, + _inst->isStoreConditional() || _inst->isAtomic()); + flags.set(Flag::IsAtomic, _inst->isAtomic()); + install(); +} + +template +void +LSQ::LSQRequest::install() +{ + if (isLoad()) { + _port.loadQueue[_inst->lqIdx].setRequest(this); + } else { + // Store, StoreConditional, and Atomic requests are pushed + // to this storeQueue + _port.storeQueue[_inst->sqIdx].setRequest(this); + } +} + +template +bool LSQ::LSQRequest::squashed() const { return _inst->isSquashed(); } + +template +void +LSQ::LSQRequest::addRequest(Addr addr, unsigned size, + const std::vector& byte_enable) +{ + if (isAnyActiveElement(byte_enable.begin(), byte_enable.end())) { + auto 
request = std::make_shared( + addr, size, _flags, _inst->requestorId(), + _inst->instAddr(), _inst->contextId(), + std::move(_amo_op)); + request->setByteEnable(byte_enable); + _requests.push_back(request); + } +} + +template +LSQ::LSQRequest::~LSQRequest() +{ + assert(!isAnyOutstandingRequest()); + _inst->savedReq = nullptr; + if (_senderState) + delete _senderState; + + for (auto r: _packets) + delete r; +}; + template void LSQ::LSQRequest::sendFragmentToTranslation(int i) @@ -1226,7 +1313,7 @@ LSQ::DcachePort::recvReqRetry() template LSQ::HtmCmdRequest::HtmCmdRequest(LSQUnit* port, - const DynInstPtr& inst, + const O3DynInstPtr& inst, const Request::Flags& flags_) : SingleDataRequest(port, inst, true, 0x0lu, 8, flags_, nullptr, nullptr, nullptr) diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index eda044d0d6..4d3c41958e 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -53,6 +53,7 @@ #include "arch/locked_mem.hh" #include "config/the_isa.hh" #include "cpu/inst_seq.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/lsq.hh" #include "cpu/timebuf.hh" #include "debug/HtmCpu.hh" @@ -85,7 +86,6 @@ class LSQUnit static constexpr auto MaxDataBytes = MaxVecRegLenInBytes; typedef typename Impl::O3CPU O3CPU; - typedef typename Impl::DynInstPtr DynInstPtr; typedef typename Impl::IssueStruct IssueStruct; using LSQSenderState = typename LSQ::LSQSenderState; @@ -95,23 +95,17 @@ class LSQUnit { private: /** The instruction. */ - DynInstPtr inst; + O3DynInstPtr inst; /** The request. */ - LSQRequest* req; + LSQRequest* req = nullptr; /** The size of the operation. */ - uint32_t _size; + uint32_t _size = 0; /** Valid entry. */ - bool _valid; - public: - /** Constructs an empty store queue entry. 
*/ - LSQEntry() - : inst(nullptr), req(nullptr), _size(0), _valid(false) - { - } + bool _valid = false; + public: ~LSQEntry() { - inst = nullptr; if (req != nullptr) { req->freeLSQEntry(); req = nullptr; @@ -131,13 +125,14 @@ class LSQUnit } void - set(const DynInstPtr& inst) + set(const O3DynInstPtr& inst) { assert(!_valid); this->inst = inst; _valid = true; _size = 0; } + LSQRequest* request() { return req; } void setRequest(LSQRequest* r) { req = r; } bool hasRequest() { return req != nullptr; } @@ -146,7 +141,7 @@ class LSQUnit bool valid() const { return _valid; } uint32_t& size() { return _size; } const uint32_t& size() const { return _size; } - const DynInstPtr& instruction() const { return inst; } + const O3DynInstPtr& instruction() const { return inst; } /** @} */ }; @@ -156,32 +151,27 @@ class LSQUnit /** The store data. */ char _data[MaxDataBytes]; /** Whether or not the store can writeback. */ - bool _canWB; + bool _canWB = false; /** Whether or not the store is committed. */ - bool _committed; + bool _committed = false; /** Whether or not the store is completed. */ - bool _completed; + bool _completed = false; /** Does this request write all zeros and thus doesn't * have any data attached to it. Used for cache block zero * style instructs (ARM DC ZVA; ALPHA WH64) */ - bool _isAllZeros; + bool _isAllZeros = false; + public: static constexpr size_t DataSize = sizeof(_data); /** Constructs an empty store queue entry. */ SQEntry() - : _canWB(false), _committed(false), _completed(false), - _isAllZeros(false) { std::memset(_data, 0, DataSize); } - ~SQEntry() - { - } - void - set(const DynInstPtr& inst) + set(const O3DynInstPtr& inst) { LSQEntry::set(inst); } @@ -192,6 +182,7 @@ class LSQUnit LSQEntry::clear(); _canWB = _completed = _committed = _isAllZeros = false; } + /** Member accessors. */ /** @{ */ bool& canWB() { return _canWB; } @@ -250,11 +241,11 @@ class LSQUnit void takeOverFrom(); /** Inserts an instruction. 
*/ - void insert(const DynInstPtr &inst); + void insert(const O3DynInstPtr &inst); /** Inserts a load instruction. */ - void insertLoad(const DynInstPtr &load_inst); + void insertLoad(const O3DynInstPtr &load_inst); /** Inserts a store instruction. */ - void insertStore(const DynInstPtr &store_inst); + void insertStore(const O3DynInstPtr &store_inst); /** Check for ordering violations in the LSQ. For a store squash if we * ever find a conflicting load. For a load, only squash if we @@ -263,7 +254,7 @@ class LSQUnit * @param inst the instruction to check */ Fault checkViolations(typename LoadQueue::iterator& loadIt, - const DynInstPtr& inst); + const O3DynInstPtr& inst); /** Check if an incoming invalidate hits in the lsq on a load * that might have issued out of order wrt another load beacuse @@ -272,11 +263,11 @@ class LSQUnit void checkSnoop(PacketPtr pkt); /** Executes a load instruction. */ - Fault executeLoad(const DynInstPtr &inst); + Fault executeLoad(const O3DynInstPtr &inst); Fault executeLoad(int lq_idx) { panic("Not implemented"); return NoFault; } /** Executes a store instruction. */ - Fault executeStore(const DynInstPtr &inst); + Fault executeStore(const O3DynInstPtr &inst); /** Commits the head load. */ void commitLoad(); @@ -302,7 +293,7 @@ class LSQUnit bool violation() { return memDepViolator; } /** Returns the memory ordering violator. */ - DynInstPtr getMemDepViolator(); + O3DynInstPtr getMemDepViolator(); /** Returns the number of free LQ entries. */ unsigned numFreeLoadEntries(); @@ -378,7 +369,7 @@ class LSQUnit void resetState(); /** Writes back the instruction, sending it to IEW. */ - void writeback(const DynInstPtr &inst, PacketPtr pkt); + void writeback(const O3DynInstPtr &inst, PacketPtr pkt); /** Try to finish a previously blocked write back attempt */ void writebackBlockedStore(); @@ -460,7 +451,7 @@ class LSQUnit { public: /** Constructs a writeback event. 
*/ - WritebackEvent(const DynInstPtr &_inst, PacketPtr pkt, + WritebackEvent(const O3DynInstPtr &_inst, PacketPtr pkt, LSQUnit *lsq_ptr); /** Processes the writeback event. */ @@ -471,7 +462,7 @@ class LSQUnit private: /** Instruction whose results are being written back. */ - DynInstPtr inst; + O3DynInstPtr inst; /** The packet that would have been sent to memory. */ PacketPtr pkt; @@ -552,7 +543,7 @@ class LSQUnit bool storeInFlight; /** The oldest load that caused a memory ordering violation. */ - DynInstPtr memDepViolator; + O3DynInstPtr memDepViolator; /** Whether or not there is a packet that couldn't be sent because of * a lack of cache ports. */ @@ -634,357 +625,4 @@ class LSQUnit typedef CircularQueue SQueue; }; -template -Fault -LSQUnit::read(LSQRequest *req, int load_idx) -{ - LQEntry& load_req = loadQueue[load_idx]; - const DynInstPtr& load_inst = load_req.instruction(); - - load_req.setRequest(req); - assert(load_inst); - - assert(!load_inst->isExecuted()); - - // Make sure this isn't a strictly ordered load - // A bit of a hackish way to get strictly ordered accesses to work - // only if they're at the head of the LSQ and are ready to commit - // (at the head of the ROB too). - - if (req->mainRequest()->isStrictlyOrdered() && - (load_idx != loadQueue.head() || !load_inst->isAtCommit())) { - // Tell IQ/mem dep unit that this instruction will need to be - // rescheduled eventually - iewStage->rescheduleMemInst(load_inst); - load_inst->clearIssued(); - load_inst->effAddrValid(false); - ++stats.rescheduledLoads; - DPRINTF(LSQUnit, "Strictly ordered load [sn:%lli] PC %s\n", - load_inst->seqNum, load_inst->pcState()); - - // Must delete request now that it wasn't handed off to - // memory. This is quite ugly. @todo: Figure out the proper - // place to really handle request deletes. 
- load_req.setRequest(nullptr); - req->discard(); - return std::make_shared( - "Strictly ordered load [sn:%llx] PC %s\n", - load_inst->seqNum, load_inst->pcState()); - } - - DPRINTF(LSQUnit, "Read called, load idx: %i, store idx: %i, " - "storeHead: %i addr: %#x%s\n", - load_idx - 1, load_inst->sqIt._idx, storeQueue.head() - 1, - req->mainRequest()->getPaddr(), req->isSplit() ? " split" : ""); - - if (req->mainRequest()->isLLSC()) { - // Disable recording the result temporarily. Writing to misc - // regs normally updates the result, but this is not the - // desired behavior when handling store conditionals. - load_inst->recordResult(false); - TheISA::handleLockedRead(load_inst.get(), req->mainRequest()); - load_inst->recordResult(true); - } - - if (req->mainRequest()->isLocalAccess()) { - assert(!load_inst->memData); - assert(!load_inst->inHtmTransactionalState()); - load_inst->memData = new uint8_t[MaxDataBytes]; - - ThreadContext *thread = cpu->tcBase(lsqID); - PacketPtr main_pkt = new Packet(req->mainRequest(), MemCmd::ReadReq); - - main_pkt->dataStatic(load_inst->memData); - - Cycles delay = req->mainRequest()->localAccessor(thread, main_pkt); - - WritebackEvent *wb = new WritebackEvent(load_inst, main_pkt, this); - cpu->schedule(wb, cpu->clockEdge(delay)); - return NoFault; - } - - // hardware transactional memory - if (req->mainRequest()->isHTMStart() || req->mainRequest()->isHTMCommit()) - { - // don't want to send nested transactionStarts and - // transactionStops outside of core, e.g. to Ruby - if (req->mainRequest()->getFlags().isSet(Request::NO_ACCESS)) { - Cycles delay(0); - PacketPtr data_pkt = - new Packet(req->mainRequest(), MemCmd::ReadReq); - - // Allocate memory if this is the first time a load is issued. 
- if (!load_inst->memData) { - load_inst->memData = - new uint8_t[req->mainRequest()->getSize()]; - // sanity checks espect zero in request's data - memset(load_inst->memData, 0, req->mainRequest()->getSize()); - } - - data_pkt->dataStatic(load_inst->memData); - if (load_inst->inHtmTransactionalState()) { - data_pkt->setHtmTransactional( - load_inst->getHtmTransactionUid()); - } - data_pkt->makeResponse(); - - WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this); - cpu->schedule(wb, cpu->clockEdge(delay)); - return NoFault; - } - } - - // Check the SQ for any previous stores that might lead to forwarding - auto store_it = load_inst->sqIt; - assert (store_it >= storeWBIt); - // End once we've reached the top of the LSQ - while (store_it != storeWBIt) { - // Move the index to one younger - store_it--; - assert(store_it->valid()); - assert(store_it->instruction()->seqNum < load_inst->seqNum); - int store_size = store_it->size(); - - // Cache maintenance instructions go down via the store - // path but they carry no data and they shouldn't be - // considered for forwarding - if (store_size != 0 && !store_it->instruction()->strictlyOrdered() && - !(store_it->request()->mainRequest() && - store_it->request()->mainRequest()->isCacheMaintenance())) { - assert(store_it->instruction()->effAddrValid()); - - // Check if the store data is within the lower and upper bounds of - // addresses that the request needs. 
- auto req_s = req->mainRequest()->getVaddr(); - auto req_e = req_s + req->mainRequest()->getSize(); - auto st_s = store_it->instruction()->effAddr; - auto st_e = st_s + store_size; - - bool store_has_lower_limit = req_s >= st_s; - bool store_has_upper_limit = req_e <= st_e; - bool lower_load_has_store_part = req_s < st_e; - bool upper_load_has_store_part = req_e > st_s; - - auto coverage = AddrRangeCoverage::NoAddrRangeCoverage; - - // If the store entry is not atomic (atomic does not have valid - // data), the store has all of the data needed, and - // the load is not LLSC, then - // we can forward data from the store to the load - if (!store_it->instruction()->isAtomic() && - store_has_lower_limit && store_has_upper_limit && - !req->mainRequest()->isLLSC()) { - - const auto& store_req = store_it->request()->mainRequest(); - coverage = store_req->isMasked() ? - AddrRangeCoverage::PartialAddrRangeCoverage : - AddrRangeCoverage::FullAddrRangeCoverage; - } else if ( - // This is the partial store-load forwarding case where a store - // has only part of the load's data and the load isn't LLSC - (!req->mainRequest()->isLLSC() && - ((store_has_lower_limit && lower_load_has_store_part) || - (store_has_upper_limit && upper_load_has_store_part) || - (lower_load_has_store_part && upper_load_has_store_part))) || - // The load is LLSC, and the store has all or part of the - // load's data - (req->mainRequest()->isLLSC() && - ((store_has_lower_limit || upper_load_has_store_part) && - (store_has_upper_limit || lower_load_has_store_part))) || - // The store entry is atomic and has all or part of the load's - // data - (store_it->instruction()->isAtomic() && - ((store_has_lower_limit || upper_load_has_store_part) && - (store_has_upper_limit || lower_load_has_store_part)))) { - - coverage = AddrRangeCoverage::PartialAddrRangeCoverage; - } - - if (coverage == AddrRangeCoverage::FullAddrRangeCoverage) { - // Get shift amount for offset into the store's data. 
- int shift_amt = req->mainRequest()->getVaddr() - - store_it->instruction()->effAddr; - - // Allocate memory if this is the first time a load is issued. - if (!load_inst->memData) { - load_inst->memData = - new uint8_t[req->mainRequest()->getSize()]; - } - if (store_it->isAllZeros()) - memset(load_inst->memData, 0, - req->mainRequest()->getSize()); - else - memcpy(load_inst->memData, - store_it->data() + shift_amt, - req->mainRequest()->getSize()); - - DPRINTF(LSQUnit, "Forwarding from store idx %i to load to " - "addr %#x\n", store_it._idx, - req->mainRequest()->getVaddr()); - - PacketPtr data_pkt = new Packet(req->mainRequest(), - MemCmd::ReadReq); - data_pkt->dataStatic(load_inst->memData); - - // hardware transactional memory - // Store to load forwarding within a transaction - // This should be okay because the store will be sent to - // the memory subsystem and subsequently get added to the - // write set of the transaction. The write set has a stronger - // property than the read set, so the load doesn't necessarily - // have to be there. - assert(!req->mainRequest()->isHTMCmd()); - if (load_inst->inHtmTransactionalState()) { - assert (!storeQueue[store_it._idx].completed()); - assert ( - storeQueue[store_it._idx].instruction()-> - inHtmTransactionalState()); - assert ( - load_inst->getHtmTransactionUid() == - storeQueue[store_it._idx].instruction()-> - getHtmTransactionUid()); - data_pkt->setHtmTransactional( - load_inst->getHtmTransactionUid()); - DPRINTF(HtmCpu, "HTM LD (ST2LDF) " - "pc=0x%lx - vaddr=0x%lx - " - "paddr=0x%lx - htmUid=%u\n", - load_inst->instAddr(), - data_pkt->req->hasVaddr() ? - data_pkt->req->getVaddr() : 0lu, - data_pkt->getAddr(), - load_inst->getHtmTransactionUid()); - } - - if (req->isAnyOutstandingRequest()) { - assert(req->_numOutstandingPackets > 0); - // There are memory requests packets in flight already. - // This may happen if the store was not complete the - // first time this load got executed. 
Signal the senderSate - // that response packets should be discarded. - req->discardSenderState(); - } - - WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, - this); - - // We'll say this has a 1 cycle load-store forwarding latency - // for now. - // @todo: Need to make this a parameter. - cpu->schedule(wb, curTick()); - - // Don't need to do anything special for split loads. - ++stats.forwLoads; - - return NoFault; - } else if (coverage == AddrRangeCoverage::PartialAddrRangeCoverage) { - // If it's already been written back, then don't worry about - // stalling on it. - if (store_it->completed()) { - panic("Should not check one of these"); - continue; - } - - // Must stall load and force it to retry, so long as it's the - // oldest load that needs to do so. - if (!stalled || - (stalled && - load_inst->seqNum < - loadQueue[stallingLoadIdx].instruction()->seqNum)) { - stalled = true; - stallingStoreIsn = store_it->instruction()->seqNum; - stallingLoadIdx = load_idx; - } - - // Tell IQ/mem dep unit that this instruction will need to be - // rescheduled eventually - iewStage->rescheduleMemInst(load_inst); - load_inst->clearIssued(); - load_inst->effAddrValid(false); - ++stats.rescheduledLoads; - - // Do not generate a writeback event as this instruction is not - // complete. - DPRINTF(LSQUnit, "Load-store forwarding mis-match. " - "Store idx %i to load addr %#x\n", - store_it._idx, req->mainRequest()->getVaddr()); - - // Must discard the request. - req->discard(); - load_req.setRequest(nullptr); - return NoFault; - } - } - } - - // If there's no forwarding case, then go access memory - DPRINTF(LSQUnit, "Doing memory access for inst [sn:%lli] PC %s\n", - load_inst->seqNum, load_inst->pcState()); - - // Allocate memory if this is the first time a load is issued. 
- if (!load_inst->memData) { - load_inst->memData = new uint8_t[req->mainRequest()->getSize()]; - } - - - // hardware transactional memory - if (req->mainRequest()->isHTMCmd()) { - // this is a simple sanity check - // the Ruby cache controller will set - // memData to 0x0ul if successful. - *load_inst->memData = (uint64_t) 0x1ull; - } - - // For now, load throughput is constrained by the number of - // load FUs only, and loads do not consume a cache port (only - // stores do). - // @todo We should account for cache port contention - // and arbitrate between loads and stores. - - // if we the cache is not blocked, do cache access - if (req->senderState() == nullptr) { - LQSenderState *state = new LQSenderState( - loadQueue.getIterator(load_idx)); - state->isLoad = true; - state->inst = load_inst; - state->isSplit = req->isSplit(); - req->senderState(state); - } - req->buildPackets(); - req->sendPacketToCache(); - if (!req->isSent()) - iewStage->blockMemInst(load_inst); - - return NoFault; -} - -template -Fault -LSQUnit::write(LSQRequest *req, uint8_t *data, int store_idx) -{ - assert(storeQueue[store_idx].valid()); - - DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x | storeHead:%i " - "[sn:%llu]\n", - store_idx - 1, req->request()->getPaddr(), storeQueue.head() - 1, - storeQueue[store_idx].instruction()->seqNum); - - storeQueue[store_idx].setRequest(req); - unsigned size = req->_size; - storeQueue[store_idx].size() = size; - bool store_no_data = - req->mainRequest()->getFlags() & Request::STORE_NO_DATA; - storeQueue[store_idx].isAllZeros() = store_no_data; - assert(size <= SQEntry::DataSize || store_no_data); - - // copy data into the storeQueue only if the store request has valid data - if (!(req->request()->getFlags() & Request::CACHE_BLOCK_ZERO) && - !req->request()->isCacheMaintenance() && - !req->request()->isAtomic()) - memcpy(storeQueue[store_idx].data(), data, size); - - // This function only writes the data to the store queue, so no fault - // can 
happen here. - return NoFault; -} - #endif // __CPU_O3_LSQ_UNIT_HH__ diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh index bafd88e4b3..174916df99 100644 --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -60,7 +60,7 @@ #include "mem/request.hh" template -LSQUnit::WritebackEvent::WritebackEvent(const DynInstPtr &_inst, +LSQUnit::WritebackEvent::WritebackEvent(const O3DynInstPtr &_inst, PacketPtr _pkt, LSQUnit *lsq_ptr) : Event(Default_Pri, AutoDelete), inst(_inst), pkt(_pkt), lsqPtr(lsq_ptr) @@ -112,7 +112,7 @@ void LSQUnit::completeDataAccess(PacketPtr pkt) { LSQSenderState *state = dynamic_cast(pkt->senderState); - DynInstPtr inst = state->inst; + O3DynInstPtr inst = state->inst; // hardware transactional memory // sanity check @@ -317,7 +317,7 @@ LSQUnit::takeOverFrom() template void -LSQUnit::insert(const DynInstPtr &inst) +LSQUnit::insert(const O3DynInstPtr &inst) { assert(inst->isMemRef()); @@ -334,7 +334,7 @@ LSQUnit::insert(const DynInstPtr &inst) template void -LSQUnit::insertLoad(const DynInstPtr &load_inst) +LSQUnit::insertLoad(const O3DynInstPtr &load_inst) { assert(!loadQueue.full()); assert(loads < loadQueue.capacity()); @@ -397,7 +397,7 @@ LSQUnit::insertLoad(const DynInstPtr &load_inst) template void -LSQUnit::insertStore(const DynInstPtr& store_inst) +LSQUnit::insertStore(const O3DynInstPtr& store_inst) { // Make sure it is not full before inserting an instruction. 
assert(!storeQueue.full()); @@ -418,10 +418,10 @@ LSQUnit::insertStore(const DynInstPtr& store_inst) } template -typename Impl::DynInstPtr +O3DynInstPtr LSQUnit::getMemDepViolator() { - DynInstPtr temp = memDepViolator; + O3DynInstPtr temp = memDepViolator; memDepViolator = NULL; @@ -475,7 +475,7 @@ LSQUnit::checkSnoop(PacketPtr pkt) Addr invalidate_addr = pkt->getAddr() & cacheBlockMask; - DynInstPtr ld_inst = iter->instruction(); + O3DynInstPtr ld_inst = iter->instruction(); assert(ld_inst); LSQRequest *req = iter->request(); @@ -535,7 +535,7 @@ LSQUnit::checkSnoop(PacketPtr pkt) template Fault LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, - const DynInstPtr& inst) + const O3DynInstPtr& inst) { Addr inst_eff_addr1 = inst->effAddr >> depCheckShift; Addr inst_eff_addr2 = (inst->effAddr + inst->effSize - 1) >> depCheckShift; @@ -546,7 +546,7 @@ LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, * like the implementation that came before it, we're overly conservative. */ while (loadIt != loadQueue.end()) { - DynInstPtr ld_inst = loadIt->instruction(); + O3DynInstPtr ld_inst = loadIt->instruction(); if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) { ++loadIt; continue; @@ -615,7 +615,7 @@ LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, template Fault -LSQUnit::executeLoad(const DynInstPtr &inst) +LSQUnit::executeLoad(const O3DynInstPtr &inst) { // Execute a specific load. Fault load_fault = NoFault; @@ -682,7 +682,7 @@ LSQUnit::executeLoad(const DynInstPtr &inst) template Fault -LSQUnit::executeStore(const DynInstPtr &store_inst) +LSQUnit::executeStore(const O3DynInstPtr &store_inst) { // Make sure that a store exists. 
assert(stores != 0); @@ -837,7 +837,7 @@ LSQUnit::writebackStores() assert(storeWBIt->hasRequest()); assert(!storeWBIt->committed()); - DynInstPtr inst = storeWBIt->instruction(); + O3DynInstPtr inst = storeWBIt->instruction(); LSQRequest* req = storeWBIt->request(); // Process store conditionals or store release after all previous @@ -1095,7 +1095,7 @@ LSQUnit::storePostSend() template void -LSQUnit::writeback(const DynInstPtr &inst, PacketPtr pkt) +LSQUnit::writeback(const O3DynInstPtr &inst, PacketPtr pkt) { iewStage->wakeCPU(); @@ -1170,7 +1170,7 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx) /* We 'need' a copy here because we may clear the entry from the * store queue. */ - DynInstPtr store_inst = store_idx->instruction(); + O3DynInstPtr store_inst = store_idx->instruction(); if (store_idx == storeQueue.begin()) { do { storeQueue.front().clear(); @@ -1279,7 +1279,7 @@ LSQUnit::dumpInsts() const cprintf("Load queue: "); for (const auto& e: loadQueue) { - const DynInstPtr &inst(e.instruction()); + const O3DynInstPtr &inst(e.instruction()); cprintf("%s.[sn:%llu] ", inst->pcState(), inst->seqNum); } cprintf("\n"); @@ -1288,7 +1288,7 @@ LSQUnit::dumpInsts() const cprintf("Store queue: "); for (const auto& e: storeQueue) { - const DynInstPtr &inst(e.instruction()); + const O3DynInstPtr &inst(e.instruction()); cprintf("%s.[sn:%llu] ", inst->pcState(), inst->seqNum); } @@ -1302,4 +1302,358 @@ LSQUnit::cacheLineSize() return cpu->cacheLineSize(); } +template +Fault +LSQUnit::read(LSQRequest *req, int load_idx) +{ + LQEntry& load_req = loadQueue[load_idx]; + const O3DynInstPtr& load_inst = load_req.instruction(); + + load_req.setRequest(req); + assert(load_inst); + + assert(!load_inst->isExecuted()); + + // Make sure this isn't a strictly ordered load + // A bit of a hackish way to get strictly ordered accesses to work + // only if they're at the head of the LSQ and are ready to commit + // (at the head of the ROB too). 
+ + if (req->mainRequest()->isStrictlyOrdered() && + (load_idx != loadQueue.head() || !load_inst->isAtCommit())) { + // Tell IQ/mem dep unit that this instruction will need to be + // rescheduled eventually + iewStage->rescheduleMemInst(load_inst); + load_inst->clearIssued(); + load_inst->effAddrValid(false); + ++stats.rescheduledLoads; + DPRINTF(LSQUnit, "Strictly ordered load [sn:%lli] PC %s\n", + load_inst->seqNum, load_inst->pcState()); + + // Must delete request now that it wasn't handed off to + // memory. This is quite ugly. @todo: Figure out the proper + // place to really handle request deletes. + load_req.setRequest(nullptr); + req->discard(); + return std::make_shared( + "Strictly ordered load [sn:%llx] PC %s\n", + load_inst->seqNum, load_inst->pcState()); + } + + DPRINTF(LSQUnit, "Read called, load idx: %i, store idx: %i, " + "storeHead: %i addr: %#x%s\n", + load_idx - 1, load_inst->sqIt._idx, storeQueue.head() - 1, + req->mainRequest()->getPaddr(), req->isSplit() ? " split" : ""); + + if (req->mainRequest()->isLLSC()) { + // Disable recording the result temporarily. Writing to misc + // regs normally updates the result, but this is not the + // desired behavior when handling store conditionals. 
+ load_inst->recordResult(false); + TheISA::handleLockedRead(load_inst.get(), req->mainRequest()); + load_inst->recordResult(true); + } + + if (req->mainRequest()->isLocalAccess()) { + assert(!load_inst->memData); + assert(!load_inst->inHtmTransactionalState()); + load_inst->memData = new uint8_t[MaxDataBytes]; + + ThreadContext *thread = cpu->tcBase(lsqID); + PacketPtr main_pkt = new Packet(req->mainRequest(), MemCmd::ReadReq); + + main_pkt->dataStatic(load_inst->memData); + + Cycles delay = req->mainRequest()->localAccessor(thread, main_pkt); + + WritebackEvent *wb = new WritebackEvent(load_inst, main_pkt, this); + cpu->schedule(wb, cpu->clockEdge(delay)); + return NoFault; + } + + // hardware transactional memory + if (req->mainRequest()->isHTMStart() || req->mainRequest()->isHTMCommit()) + { + // don't want to send nested transactionStarts and + // transactionStops outside of core, e.g. to Ruby + if (req->mainRequest()->getFlags().isSet(Request::NO_ACCESS)) { + Cycles delay(0); + PacketPtr data_pkt = + new Packet(req->mainRequest(), MemCmd::ReadReq); + + // Allocate memory if this is the first time a load is issued. 
+ if (!load_inst->memData) { + load_inst->memData = + new uint8_t[req->mainRequest()->getSize()]; + // sanity checks expect zero in request's data + memset(load_inst->memData, 0, req->mainRequest()->getSize()); + } + + data_pkt->dataStatic(load_inst->memData); + if (load_inst->inHtmTransactionalState()) { + data_pkt->setHtmTransactional( + load_inst->getHtmTransactionUid()); + } + data_pkt->makeResponse(); + + WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this); + cpu->schedule(wb, cpu->clockEdge(delay)); + return NoFault; + } + } + + // Check the SQ for any previous stores that might lead to forwarding + auto store_it = load_inst->sqIt; + assert (store_it >= storeWBIt); + // End once we've reached the top of the LSQ + while (store_it != storeWBIt) { + // Move the index to one younger + store_it--; + assert(store_it->valid()); + assert(store_it->instruction()->seqNum < load_inst->seqNum); + int store_size = store_it->size(); + + // Cache maintenance instructions go down via the store + // path but they carry no data and they shouldn't be + // considered for forwarding + if (store_size != 0 && !store_it->instruction()->strictlyOrdered() && + !(store_it->request()->mainRequest() && + store_it->request()->mainRequest()->isCacheMaintenance())) { + assert(store_it->instruction()->effAddrValid()); + + // Check if the store data is within the lower and upper bounds of + // addresses that the request needs. 
+ auto req_s = req->mainRequest()->getVaddr(); + auto req_e = req_s + req->mainRequest()->getSize(); + auto st_s = store_it->instruction()->effAddr; + auto st_e = st_s + store_size; + + bool store_has_lower_limit = req_s >= st_s; + bool store_has_upper_limit = req_e <= st_e; + bool lower_load_has_store_part = req_s < st_e; + bool upper_load_has_store_part = req_e > st_s; + + auto coverage = AddrRangeCoverage::NoAddrRangeCoverage; + + // If the store entry is not atomic (atomic does not have valid + // data), the store has all of the data needed, and + // the load is not LLSC, then + // we can forward data from the store to the load + if (!store_it->instruction()->isAtomic() && + store_has_lower_limit && store_has_upper_limit && + !req->mainRequest()->isLLSC()) { + + const auto& store_req = store_it->request()->mainRequest(); + coverage = store_req->isMasked() ? + AddrRangeCoverage::PartialAddrRangeCoverage : + AddrRangeCoverage::FullAddrRangeCoverage; + } else if ( + // This is the partial store-load forwarding case where a store + // has only part of the load's data and the load isn't LLSC + (!req->mainRequest()->isLLSC() && + ((store_has_lower_limit && lower_load_has_store_part) || + (store_has_upper_limit && upper_load_has_store_part) || + (lower_load_has_store_part && upper_load_has_store_part))) || + // The load is LLSC, and the store has all or part of the + // load's data + (req->mainRequest()->isLLSC() && + ((store_has_lower_limit || upper_load_has_store_part) && + (store_has_upper_limit || lower_load_has_store_part))) || + // The store entry is atomic and has all or part of the load's + // data + (store_it->instruction()->isAtomic() && + ((store_has_lower_limit || upper_load_has_store_part) && + (store_has_upper_limit || lower_load_has_store_part)))) { + + coverage = AddrRangeCoverage::PartialAddrRangeCoverage; + } + + if (coverage == AddrRangeCoverage::FullAddrRangeCoverage) { + // Get shift amount for offset into the store's data. 
+ int shift_amt = req->mainRequest()->getVaddr() - + store_it->instruction()->effAddr; + + // Allocate memory if this is the first time a load is issued. + if (!load_inst->memData) { + load_inst->memData = + new uint8_t[req->mainRequest()->getSize()]; + } + if (store_it->isAllZeros()) + memset(load_inst->memData, 0, + req->mainRequest()->getSize()); + else + memcpy(load_inst->memData, + store_it->data() + shift_amt, + req->mainRequest()->getSize()); + + DPRINTF(LSQUnit, "Forwarding from store idx %i to load to " + "addr %#x\n", store_it._idx, + req->mainRequest()->getVaddr()); + + PacketPtr data_pkt = new Packet(req->mainRequest(), + MemCmd::ReadReq); + data_pkt->dataStatic(load_inst->memData); + + // hardware transactional memory + // Store to load forwarding within a transaction + // This should be okay because the store will be sent to + // the memory subsystem and subsequently get added to the + // write set of the transaction. The write set has a stronger + // property than the read set, so the load doesn't necessarily + // have to be there. + assert(!req->mainRequest()->isHTMCmd()); + if (load_inst->inHtmTransactionalState()) { + assert (!storeQueue[store_it._idx].completed()); + assert ( + storeQueue[store_it._idx].instruction()-> + inHtmTransactionalState()); + assert ( + load_inst->getHtmTransactionUid() == + storeQueue[store_it._idx].instruction()-> + getHtmTransactionUid()); + data_pkt->setHtmTransactional( + load_inst->getHtmTransactionUid()); + DPRINTF(HtmCpu, "HTM LD (ST2LDF) " + "pc=0x%lx - vaddr=0x%lx - " + "paddr=0x%lx - htmUid=%u\n", + load_inst->instAddr(), + data_pkt->req->hasVaddr() ? + data_pkt->req->getVaddr() : 0lu, + data_pkt->getAddr(), + load_inst->getHtmTransactionUid()); + } + + if (req->isAnyOutstandingRequest()) { + assert(req->_numOutstandingPackets > 0); + // There are memory requests packets in flight already. + // This may happen if the store was not complete the + // first time this load got executed. 
Signal the senderSate + // that response packets should be discarded. + req->discardSenderState(); + } + + WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, + this); + + // We'll say this has a 1 cycle load-store forwarding latency + // for now. + // @todo: Need to make this a parameter. + cpu->schedule(wb, curTick()); + + // Don't need to do anything special for split loads. + ++stats.forwLoads; + + return NoFault; + } else if ( + coverage == AddrRangeCoverage::PartialAddrRangeCoverage) { + // If it's already been written back, then don't worry about + // stalling on it. + if (store_it->completed()) { + panic("Should not check one of these"); + continue; + } + + // Must stall load and force it to retry, so long as it's the + // oldest load that needs to do so. + if (!stalled || + (stalled && + load_inst->seqNum < + loadQueue[stallingLoadIdx].instruction()->seqNum)) { + stalled = true; + stallingStoreIsn = store_it->instruction()->seqNum; + stallingLoadIdx = load_idx; + } + + // Tell IQ/mem dep unit that this instruction will need to be + // rescheduled eventually + iewStage->rescheduleMemInst(load_inst); + load_inst->clearIssued(); + load_inst->effAddrValid(false); + ++stats.rescheduledLoads; + + // Do not generate a writeback event as this instruction is not + // complete. + DPRINTF(LSQUnit, "Load-store forwarding mis-match. " + "Store idx %i to load addr %#x\n", + store_it._idx, req->mainRequest()->getVaddr()); + + // Must discard the request. + req->discard(); + load_req.setRequest(nullptr); + return NoFault; + } + } + } + + // If there's no forwarding case, then go access memory + DPRINTF(LSQUnit, "Doing memory access for inst [sn:%lli] PC %s\n", + load_inst->seqNum, load_inst->pcState()); + + // Allocate memory if this is the first time a load is issued. 
+ if (!load_inst->memData) { + load_inst->memData = new uint8_t[req->mainRequest()->getSize()]; + } + + + // hardware transactional memory + if (req->mainRequest()->isHTMCmd()) { + // this is a simple sanity check + // the Ruby cache controller will set + // memData to 0x0ul if successful. + *load_inst->memData = (uint64_t) 0x1ull; + } + + // For now, load throughput is constrained by the number of + // load FUs only, and loads do not consume a cache port (only + // stores do). + // @todo We should account for cache port contention + // and arbitrate between loads and stores. + + // if we the cache is not blocked, do cache access + if (req->senderState() == nullptr) { + LQSenderState *state = new LQSenderState( + loadQueue.getIterator(load_idx)); + state->isLoad = true; + state->inst = load_inst; + state->isSplit = req->isSplit(); + req->senderState(state); + } + req->buildPackets(); + req->sendPacketToCache(); + if (!req->isSent()) + iewStage->blockMemInst(load_inst); + + return NoFault; +} + +template +Fault +LSQUnit::write(LSQRequest *req, uint8_t *data, int store_idx) +{ + assert(storeQueue[store_idx].valid()); + + DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x | storeHead:%i " + "[sn:%llu]\n", + store_idx - 1, req->request()->getPaddr(), storeQueue.head() - 1, + storeQueue[store_idx].instruction()->seqNum); + + storeQueue[store_idx].setRequest(req); + unsigned size = req->_size; + storeQueue[store_idx].size() = size; + bool store_no_data = + req->mainRequest()->getFlags() & Request::STORE_NO_DATA; + storeQueue[store_idx].isAllZeros() = store_no_data; + assert(size <= SQEntry::DataSize || store_no_data); + + // copy data into the storeQueue only if the store request has valid data + if (!(req->request()->getFlags() & Request::CACHE_BLOCK_ZERO) && + !req->request()->isCacheMaintenance() && + !req->request()->isAtomic()) + memcpy(storeQueue[store_idx].data(), data, size); + + // This function only writes the data to the store queue, so no fault + // can 
happen here. + return NoFault; +} + #endif//__CPU_O3_LSQ_UNIT_IMPL_HH__ diff --git a/src/cpu/o3/mem_dep_unit.hh b/src/cpu/o3/mem_dep_unit.hh index 8178a4913a..b2cf9bf8d2 100644 --- a/src/cpu/o3/mem_dep_unit.hh +++ b/src/cpu/o3/mem_dep_unit.hh @@ -49,6 +49,7 @@ #include "base/statistics.hh" #include "cpu/inst_seq.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/limits.hh" #include "debug/MemDepUnit.hh" @@ -85,8 +86,6 @@ class MemDepUnit std::string _name; public: - typedef typename Impl::DynInstPtr DynInstPtr; - typedef typename Impl::DynInstConstPtr DynInstConstPtr; typedef typename Impl::O3CPU O3CPU; /** Empty constructor. Must call init() prior to using in this case. */ @@ -117,22 +116,22 @@ class MemDepUnit void setIQ(InstructionQueue *iq_ptr); /** Inserts a memory instruction. */ - void insert(const DynInstPtr &inst); + void insert(const O3DynInstPtr &inst); /** Inserts a non-speculative memory instruction. */ - void insertNonSpec(const DynInstPtr &inst); + void insertNonSpec(const O3DynInstPtr &inst); /** Inserts a barrier instruction. */ - void insertBarrier(const DynInstPtr &barr_inst); + void insertBarrier(const O3DynInstPtr &barr_inst); /** Indicate that an instruction has its registers ready. */ - void regsReady(const DynInstPtr &inst); + void regsReady(const O3DynInstPtr &inst); /** Indicate that a non-speculative instruction is ready. */ - void nonSpecInstReady(const DynInstPtr &inst); + void nonSpecInstReady(const O3DynInstPtr &inst); /** Reschedules an instruction to be re-executed. */ - void reschedule(const DynInstPtr &inst); + void reschedule(const O3DynInstPtr &inst); /** Replays all instructions that have been rescheduled by moving them to * the ready list. @@ -140,7 +139,7 @@ class MemDepUnit void replay(); /** Notifies completion of an instruction. */ - void completeInst(const DynInstPtr &inst); + void completeInst(const O3DynInstPtr &inst); /** Squashes all instructions up until a given sequence number for a * specific thread. 
@@ -148,11 +147,11 @@ class MemDepUnit void squash(const InstSeqNum &squashed_num, ThreadID tid); /** Indicates an ordering violation between a store and a younger load. */ - void violation(const DynInstPtr &store_inst, - const DynInstPtr &violating_load); + void violation(const O3DynInstPtr &store_inst, + const O3DynInstPtr &violating_load); /** Issues the given instruction */ - void issue(const DynInstPtr &inst); + void issue(const O3DynInstPtr &inst); /** Debugging function to dump the lists of instructions. */ void dumpLists(); @@ -160,12 +159,12 @@ class MemDepUnit private: /** Completes a memory instruction. */ - void completed(const DynInstPtr &inst); + void completed(const O3DynInstPtr &inst); /** Wakes any dependents of a memory instruction. */ - void wakeDependents(const DynInstPtr &inst); + void wakeDependents(const O3DynInstPtr &inst); - typedef typename std::list::iterator ListIt; + typedef typename std::list::iterator ListIt; class MemDepEntry; @@ -179,7 +178,7 @@ class MemDepUnit { public: /** Constructs a memory dependence entry. */ - MemDepEntry(const DynInstPtr &new_inst) + MemDepEntry(const O3DynInstPtr &new_inst) : inst(new_inst), regsReady(false), memDeps(0), completed(false), squashed(false) { @@ -209,7 +208,7 @@ class MemDepUnit std::string name() const { return "memdepentry"; } /** The instruction being tracked. */ - DynInstPtr inst; + O3DynInstPtr inst; /** The iterator to the instruction's location inside the list. */ ListIt listIt; @@ -235,10 +234,10 @@ class MemDepUnit }; /** Finds the memory dependence entry in the hash map. */ - inline MemDepEntryPtr &findInHash(const DynInstConstPtr& inst); + MemDepEntryPtr &findInHash(const O3DynInstConstPtr& inst); /** Moves an entry to the ready list. 
*/ - inline void moveToReady(MemDepEntryPtr &ready_inst_entry); + void moveToReady(MemDepEntryPtr &ready_inst_entry); typedef std::unordered_map MemDepHash; @@ -248,10 +247,10 @@ class MemDepUnit MemDepHash memDepHash; /** A list of all instructions in the memory dependence unit. */ - std::list instList[O3MaxThreads]; + std::list instList[O3MaxThreads]; /** A list of all instructions that are going to be replayed. */ - std::list instsToReplay; + std::list instsToReplay; /** The memory dependence predictor. It is accessed upon new * instructions being added to the IQ, and responds by telling @@ -273,7 +272,7 @@ class MemDepUnit bool hasStoreBarrier() const { return !storeBarrierSNs.empty(); } /** Inserts the SN of a barrier inst. to the list of tracked barriers */ - void insertBarrierSN(const DynInstPtr &barr_inst); + void insertBarrierSN(const O3DynInstPtr &barr_inst); /** Pointer to the IQ. */ InstructionQueue *iqPtr; diff --git a/src/cpu/o3/mem_dep_unit_impl.hh b/src/cpu/o3/mem_dep_unit_impl.hh index 4f1f725229..34bba53d95 100644 --- a/src/cpu/o3/mem_dep_unit_impl.hh +++ b/src/cpu/o3/mem_dep_unit_impl.hh @@ -172,7 +172,7 @@ MemDepUnit::setIQ(InstructionQueue *iq_ptr) template void -MemDepUnit::insertBarrierSN(const DynInstPtr &barr_inst) +MemDepUnit::insertBarrierSN(const O3DynInstPtr &barr_inst) { InstSeqNum barr_sn = barr_inst->seqNum; @@ -205,7 +205,7 @@ MemDepUnit::insertBarrierSN(const DynInstPtr &barr_inst) template void -MemDepUnit::insert(const DynInstPtr &inst) +MemDepUnit::insert(const O3DynInstPtr &inst) { ThreadID tid = inst->threadNumber; @@ -316,7 +316,7 @@ MemDepUnit::insert(const DynInstPtr &inst) template void -MemDepUnit::insertNonSpec(const DynInstPtr &inst) +MemDepUnit::insertNonSpec(const O3DynInstPtr &inst) { insertBarrier(inst); @@ -338,7 +338,7 @@ MemDepUnit::insertNonSpec(const DynInstPtr &inst) template void -MemDepUnit::insertBarrier(const DynInstPtr &barr_inst) +MemDepUnit::insertBarrier(const O3DynInstPtr &barr_inst) { ThreadID tid = 
barr_inst->threadNumber; @@ -361,7 +361,7 @@ MemDepUnit::insertBarrier(const DynInstPtr &barr_inst) template void -MemDepUnit::regsReady(const DynInstPtr &inst) +MemDepUnit::regsReady(const O3DynInstPtr &inst) { DPRINTF(MemDepUnit, "Marking registers as ready for " "instruction PC %s [sn:%lli].\n", @@ -384,7 +384,7 @@ MemDepUnit::regsReady(const DynInstPtr &inst) template void -MemDepUnit::nonSpecInstReady(const DynInstPtr &inst) +MemDepUnit::nonSpecInstReady(const O3DynInstPtr &inst) { DPRINTF(MemDepUnit, "Marking non speculative " "instruction PC %s as ready [sn:%lli].\n", @@ -397,7 +397,7 @@ MemDepUnit::nonSpecInstReady(const DynInstPtr &inst) template void -MemDepUnit::reschedule(const DynInstPtr &inst) +MemDepUnit::reschedule(const O3DynInstPtr &inst) { instsToReplay.push_back(inst); } @@ -406,7 +406,7 @@ template void MemDepUnit::replay() { - DynInstPtr temp_inst; + O3DynInstPtr temp_inst; // For now this replay function replays all waiting memory ops. while (!instsToReplay.empty()) { @@ -425,7 +425,7 @@ MemDepUnit::replay() template void -MemDepUnit::completed(const DynInstPtr &inst) +MemDepUnit::completed(const O3DynInstPtr &inst) { DPRINTF(MemDepUnit, "Completed mem instruction PC %s [sn:%lli].\n", inst->pcState(), inst->seqNum); @@ -449,7 +449,7 @@ MemDepUnit::completed(const DynInstPtr &inst) template void -MemDepUnit::completeInst(const DynInstPtr &inst) +MemDepUnit::completeInst(const O3DynInstPtr &inst) { wakeDependents(inst); completed(inst); @@ -481,7 +481,7 @@ MemDepUnit::completeInst(const DynInstPtr &inst) template void -MemDepUnit::wakeDependents(const DynInstPtr &inst) +MemDepUnit::wakeDependents(const O3DynInstPtr &inst) { // Only stores, atomics and barriers have dependents. 
if (!inst->isStore() && !inst->isAtomic() && !inst->isReadBarrier() && @@ -570,8 +570,8 @@ MemDepUnit::squash(const InstSeqNum &squashed_num, template void -MemDepUnit::violation(const DynInstPtr &store_inst, - const DynInstPtr &violating_load) +MemDepUnit::violation(const O3DynInstPtr &store_inst, + const O3DynInstPtr &violating_load) { DPRINTF(MemDepUnit, "Passing violating PCs to store sets," " load: %#x, store: %#x\n", violating_load->instAddr(), @@ -582,7 +582,7 @@ MemDepUnit::violation(const DynInstPtr &store_inst, template void -MemDepUnit::issue(const DynInstPtr &inst) +MemDepUnit::issue(const O3DynInstPtr &inst) { DPRINTF(MemDepUnit, "Issuing instruction PC %#x [sn:%lli].\n", inst->instAddr(), inst->seqNum); @@ -592,7 +592,7 @@ MemDepUnit::issue(const DynInstPtr &inst) template inline typename MemDepUnit::MemDepEntryPtr & -MemDepUnit::findInHash(const DynInstConstPtr &inst) +MemDepUnit::findInHash(const O3DynInstConstPtr &inst) { MemDepHashIt hash_it = memDepHash.find(inst->seqNum); diff --git a/src/cpu/o3/probe/elastic_trace.cc b/src/cpu/o3/probe/elastic_trace.cc index 2ae7c97013..afea6135db 100644 --- a/src/cpu/o3/probe/elastic_trace.cc +++ b/src/cpu/o3/probe/elastic_trace.cc @@ -40,6 +40,7 @@ #include "base/callback.hh" #include "base/output.hh" #include "base/trace.hh" +#include "cpu/o3/dyn_inst.hh" #include "cpu/reg_class.hh" #include "debug/ElasticTrace.hh" #include "mem/packet.hh" @@ -124,21 +125,21 @@ ElasticTrace::regEtraceListeners() listeners.push_back(new ProbeListenerArg(this, "FetchRequest", &ElasticTrace::fetchReqTrace)); listeners.push_back(new ProbeListenerArg(this, "Execute", + O3DynInstConstPtr>(this, "Execute", &ElasticTrace::recordExecTick)); listeners.push_back(new ProbeListenerArg(this, "ToCommit", + O3DynInstConstPtr>(this, "ToCommit", &ElasticTrace::recordToCommTick)); listeners.push_back(new ProbeListenerArg(this, "Rename", + O3DynInstConstPtr>(this, "Rename", &ElasticTrace::updateRegDep)); listeners.push_back(new 
ProbeListenerArg(this, "SquashInRename", &ElasticTrace::removeRegDepMapEntry)); listeners.push_back(new ProbeListenerArg(this, "Squash", + O3DynInstConstPtr>(this, "Squash", &ElasticTrace::addSquashedInst)); listeners.push_back(new ProbeListenerArg(this, "Commit", + O3DynInstConstPtr>(this, "Commit", &ElasticTrace::addCommittedInst)); allProbesReg = true; } @@ -166,7 +167,7 @@ ElasticTrace::fetchReqTrace(const RequestPtr &req) } void -ElasticTrace::recordExecTick(const DynInstConstPtr& dyn_inst) +ElasticTrace::recordExecTick(const O3DynInstConstPtr& dyn_inst) { // In a corner case, a retired instruction is propagated backward to the @@ -203,7 +204,7 @@ ElasticTrace::recordExecTick(const DynInstConstPtr& dyn_inst) } void -ElasticTrace::recordToCommTick(const DynInstConstPtr& dyn_inst) +ElasticTrace::recordToCommTick(const O3DynInstConstPtr& dyn_inst) { // If tracing has just been enabled then the instruction at this stage of // execution is far enough that we cannot gather info about its past like @@ -224,7 +225,7 @@ ElasticTrace::recordToCommTick(const DynInstConstPtr& dyn_inst) } void -ElasticTrace::updateRegDep(const DynInstConstPtr& dyn_inst) +ElasticTrace::updateRegDep(const O3DynInstConstPtr& dyn_inst) { // Get the sequence number of the instruction InstSeqNum seq_num = dyn_inst->seqNum; @@ -303,7 +304,7 @@ ElasticTrace::removeRegDepMapEntry(const SeqNumRegPair &inst_reg_pair) } void -ElasticTrace::addSquashedInst(const DynInstConstPtr& head_inst) +ElasticTrace::addSquashedInst(const O3DynInstConstPtr& head_inst) { // If the squashed instruction was squashed before being processed by // execute stage then it will not be in the temporary store. 
In this case @@ -331,7 +332,7 @@ ElasticTrace::addSquashedInst(const DynInstConstPtr& head_inst) } void -ElasticTrace::addCommittedInst(const DynInstConstPtr& head_inst) +ElasticTrace::addCommittedInst(const O3DynInstConstPtr& head_inst) { DPRINTFR(ElasticTrace, "Attempt to add committed inst [sn:%lli]\n", head_inst->seqNum); @@ -390,7 +391,7 @@ ElasticTrace::addCommittedInst(const DynInstConstPtr& head_inst) } void -ElasticTrace::addDepTraceRecord(const DynInstConstPtr& head_inst, +ElasticTrace::addDepTraceRecord(const O3DynInstConstPtr& head_inst, InstExecInfo* exec_info_ptr, bool commit) { // Create a record to assign dynamic intruction related fields. @@ -652,7 +653,7 @@ ElasticTrace::hasCompCompleted(TraceInfo* past_record, } void -ElasticTrace::clearTempStoreUntil(const DynInstConstPtr& head_inst) +ElasticTrace::clearTempStoreUntil(const O3DynInstConstPtr& head_inst) { // Clear from temp store starting with the execution info object // corresponding the head_inst and continue clearing by decrementing the diff --git a/src/cpu/o3/probe/elastic_trace.hh b/src/cpu/o3/probe/elastic_trace.hh index fb802d5fd0..70f2763b8e 100644 --- a/src/cpu/o3/probe/elastic_trace.hh +++ b/src/cpu/o3/probe/elastic_trace.hh @@ -50,7 +50,7 @@ #include #include -#include "cpu/o3/dyn_inst.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/impl.hh" #include "mem/request.hh" #include "params/ElasticTrace.hh" @@ -85,8 +85,6 @@ class ElasticTrace : public ProbeListenerObject { public: - typedef typename O3CPUImpl::DynInstPtr DynInstPtr; - typedef typename O3CPUImpl::DynInstConstPtr DynInstConstPtr; typedef typename std::pair SeqNumRegPair; /** Trace record types corresponding to instruction node types */ @@ -129,7 +127,7 @@ class ElasticTrace : public ProbeListenerObject * * @param dyn_inst pointer to dynamic instruction in flight */ - void recordExecTick(const DynInstConstPtr& dyn_inst); + void recordExecTick(const O3DynInstConstPtr& dyn_inst); /** * Populate the timestamp field in an 
InstExecInfo object for an @@ -138,7 +136,7 @@ class ElasticTrace : public ProbeListenerObject * * @param dyn_inst pointer to dynamic instruction in flight */ - void recordToCommTick(const DynInstConstPtr& dyn_inst); + void recordToCommTick(const O3DynInstConstPtr& dyn_inst); /** * Record a Read After Write physical register dependency if there has @@ -149,7 +147,7 @@ class ElasticTrace : public ProbeListenerObject * * @param dyn_inst pointer to dynamic instruction in flight */ - void updateRegDep(const DynInstConstPtr& dyn_inst); + void updateRegDep(const O3DynInstConstPtr& dyn_inst); /** * When an instruction gets squashed the destination register mapped to it @@ -166,14 +164,14 @@ class ElasticTrace : public ProbeListenerObject * * @param head_inst pointer to dynamic instruction to be squashed */ - void addSquashedInst(const DynInstConstPtr& head_inst); + void addSquashedInst(const O3DynInstConstPtr& head_inst); /** * Add an instruction that is at the head of the ROB and is committed. * * @param head_inst pointer to dynamic instruction to be committed */ - void addCommittedInst(const DynInstConstPtr& head_inst); + void addCommittedInst(const O3DynInstConstPtr& head_inst); /** Event to trigger registering this listener for all probe points. 
*/ EventFunctionWrapper regEtraceListenersEvent; @@ -379,7 +377,7 @@ class ElasticTrace : public ProbeListenerObject * @param exec_info_ptr Pointer to InstExecInfo for that instruction * @param commit True if instruction is committed, false if squashed */ - void addDepTraceRecord(const DynInstConstPtr& head_inst, + void addDepTraceRecord(const O3DynInstConstPtr& head_inst, InstExecInfo* exec_info_ptr, bool commit); /** @@ -388,7 +386,7 @@ class ElasticTrace : public ProbeListenerObject * * @param head_inst pointer to dynamic instruction */ - void clearTempStoreUntil(const DynInstConstPtr& head_inst); + void clearTempStoreUntil(const O3DynInstConstPtr& head_inst); /** * Calculate the computational delay between an instruction and a diff --git a/src/cpu/o3/probe/simple_trace.cc b/src/cpu/o3/probe/simple_trace.cc index cc4cceaa67..fc2282e32d 100644 --- a/src/cpu/o3/probe/simple_trace.cc +++ b/src/cpu/o3/probe/simple_trace.cc @@ -38,16 +38,17 @@ #include "cpu/o3/probe/simple_trace.hh" #include "base/trace.hh" +#include "cpu/o3/dyn_inst.hh" #include "debug/SimpleTrace.hh" -void SimpleTrace::traceCommit(const O3CPUImpl::DynInstConstPtr& dynInst) +void SimpleTrace::traceCommit(const O3DynInstConstPtr& dynInst) { DPRINTFR(SimpleTrace, "[%s]: Commit 0x%08x %s.\n", name(), dynInst->instAddr(), dynInst->staticInst->disassemble(dynInst->instAddr())); } -void SimpleTrace::traceFetch(const O3CPUImpl::DynInstConstPtr& dynInst) +void SimpleTrace::traceFetch(const O3DynInstConstPtr& dynInst) { DPRINTFR(SimpleTrace, "[%s]: Fetch 0x%08x %s.\n", name(), dynInst->instAddr(), @@ -57,7 +58,7 @@ void SimpleTrace::traceFetch(const O3CPUImpl::DynInstConstPtr& dynInst) void SimpleTrace::regProbeListeners() { typedef ProbeListenerArg DynInstListener; + O3DynInstConstPtr> DynInstListener; listeners.push_back(new DynInstListener(this, "Commit", &SimpleTrace::traceCommit)); listeners.push_back(new DynInstListener(this, "Fetch", diff --git a/src/cpu/o3/probe/simple_trace.hh 
b/src/cpu/o3/probe/simple_trace.hh index e73779a981..abcce0f24b 100644 --- a/src/cpu/o3/probe/simple_trace.hh +++ b/src/cpu/o3/probe/simple_trace.hh @@ -44,7 +44,7 @@ #ifndef __CPU_O3_PROBE_SIMPLE_TRACE_HH__ #define __CPU_O3_PROBE_SIMPLE_TRACE_HH__ -#include "cpu/o3/dyn_inst.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/impl.hh" #include "params/SimpleTrace.hh" #include "sim/probe/probe.hh" @@ -69,8 +69,8 @@ class SimpleTrace : public ProbeListenerObject } private: - void traceFetch(const O3CPUImpl::DynInstConstPtr& dynInst); - void traceCommit(const O3CPUImpl::DynInstConstPtr& dynInst); + void traceFetch(const O3DynInstConstPtr& dynInst); + void traceCommit(const O3DynInstConstPtr& dynInst); }; #endif//__CPU_O3_PROBE_SIMPLE_TRACE_HH__ diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh index eac8e30a3b..2c4796a0bd 100644 --- a/src/cpu/o3/rename.hh +++ b/src/cpu/o3/rename.hh @@ -48,6 +48,7 @@ #include "base/statistics.hh" #include "config/the_isa.hh" #include "cpu/o3/commit.hh" +#include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/free_list.hh" #include "cpu/o3/iew.hh" #include "cpu/o3/limits.hh" @@ -73,7 +74,6 @@ class DefaultRename { public: // Typedefs from the Impl. - typedef typename Impl::DynInstPtr DynInstPtr; typedef typename Impl::O3CPU O3CPU; typedef typename Impl::DecodeStruct DecodeStruct; typedef typename Impl::RenameStruct RenameStruct; @@ -83,7 +83,7 @@ class DefaultRename // be added to the front of the queue, which is the only reason for // using a deque instead of a queue. (Most other stages use a // queue) - typedef std::deque InstQueue; + typedef std::deque InstQueue; public: /** Overall rename status. Used to determine if the CPU can @@ -117,7 +117,7 @@ class DefaultRename /** Probe points. 
*/ typedef typename std::pair SeqNumRegPair; /** To probe when register renaming for an instruction is complete */ - ProbePointArg *ppRename; + ProbePointArg *ppRename; /** * To probe when an instruction is squashed and the register mapping * for it needs to be undone @@ -248,22 +248,22 @@ class DefaultRename void removeFromHistory(InstSeqNum inst_seq_num, ThreadID tid); /** Renames the source registers of an instruction. */ - inline void renameSrcRegs(const DynInstPtr &inst, ThreadID tid); + void renameSrcRegs(const O3DynInstPtr &inst, ThreadID tid); /** Renames the destination registers of an instruction. */ - inline void renameDestRegs(const DynInstPtr &inst, ThreadID tid); + void renameDestRegs(const O3DynInstPtr &inst, ThreadID tid); /** Calculates the number of free ROB entries for a specific thread. */ - inline int calcFreeROBEntries(ThreadID tid); + int calcFreeROBEntries(ThreadID tid); /** Calculates the number of free IQ entries for a specific thread. */ - inline int calcFreeIQEntries(ThreadID tid); + int calcFreeIQEntries(ThreadID tid); /** Calculates the number of free LQ entries for a specific thread. */ - inline int calcFreeLQEntries(ThreadID tid); + int calcFreeLQEntries(ThreadID tid); /** Calculates the number of free SQ entries for a specific thread. */ - inline int calcFreeSQEntries(ThreadID tid); + int calcFreeSQEntries(ThreadID tid); /** Returns the number of valid instructions coming from decode. */ unsigned validInsts(); @@ -417,7 +417,7 @@ class DefaultRename Stalls stalls[O3MaxThreads]; /** The serialize instruction that rename has stalled on. */ - DynInstPtr serializeInst[O3MaxThreads]; + O3DynInstPtr serializeInst[O3MaxThreads]; /** Records if rename needs to serialize on the next instruction for any * thread. 
diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh index f48a89257c..bc33c5d83d 100644 --- a/src/cpu/o3/rename_impl.hh +++ b/src/cpu/o3/rename_impl.hh @@ -177,7 +177,8 @@ template void DefaultRename::regProbePoints() { - ppRename = new ProbePointArg(cpu->getProbeManager(), "Rename"); + ppRename = new ProbePointArg( + cpu->getProbeManager(), "Rename"); ppSquashInRename = new ProbePointArg(cpu->getProbeManager(), "SquashInRename"); } @@ -612,11 +613,12 @@ DefaultRename::renameInsts(ThreadID tid) assert(!insts_to_rename.empty()); - DynInstPtr inst = insts_to_rename.front(); + O3DynInstPtr inst = insts_to_rename.front(); - //For all kind of instructions, check ROB and IQ first - //For load instruction, check LQ size and take into account the inflight loads - //For store instruction, check SQ size and take into account the inflight stores + //For all kind of instructions, check ROB and IQ first For load + //instruction, check LQ size and take into account the inflight loads + //For store instruction, check SQ size and take into account the + //inflight stores if (inst->isLoad()) { if (calcFreeLQEntries(tid) <= 0) { @@ -774,7 +776,7 @@ template void DefaultRename::skidInsert(ThreadID tid) { - DynInstPtr inst = NULL; + O3DynInstPtr inst = NULL; while (!insts[tid].empty()) { inst = insts[tid].front(); @@ -811,7 +813,7 @@ DefaultRename::sortInsts() { int insts_from_decode = fromDecode->size; for (int i = 0; i < insts_from_decode; ++i) { - const DynInstPtr &inst = fromDecode->insts[i]; + const O3DynInstPtr &inst = fromDecode->insts[i]; insts[inst->threadNumber].push_back(inst); #if TRACING_ON if (Debug::O3PipeView) { @@ -1035,7 +1037,7 @@ DefaultRename::removeFromHistory(InstSeqNum inst_seq_num, ThreadID tid) template inline void -DefaultRename::renameSrcRegs(const DynInstPtr &inst, ThreadID tid) +DefaultRename::renameSrcRegs(const O3DynInstPtr &inst, ThreadID tid) { ThreadContext *tc = inst->tcBase(); UnifiedRenameMap *map = renameMap[tid]; @@ -1102,7 
+1104,7 @@ DefaultRename::renameSrcRegs(const DynInstPtr &inst, ThreadID tid) template inline void -DefaultRename::renameDestRegs(const DynInstPtr &inst, ThreadID tid) +DefaultRename::renameDestRegs(const O3DynInstPtr &inst, ThreadID tid) { ThreadContext *tc = inst->tcBase(); UnifiedRenameMap *map = renameMap[tid]; @@ -1369,7 +1371,7 @@ DefaultRename::checkSignalsAndUpdate(ThreadID tid) DPRINTF(Rename, "[tid:%i] Done with serialize stall, switching to " "unblocking.\n", tid); - DynInstPtr serial_inst = serializeInst[tid]; + O3DynInstPtr serial_inst = serializeInst[tid]; renameStatus[tid] = Unblocking; diff --git a/src/cpu/o3/rob.hh b/src/cpu/o3/rob.hh index ba5e027176..1259b53f8d 100644 --- a/src/cpu/o3/rob.hh +++ b/src/cpu/o3/rob.hh @@ -60,10 +60,9 @@ class ROB public: //Typedefs from the Impl. typedef typename Impl::O3CPU O3CPU; - typedef typename Impl::DynInstPtr DynInstPtr; typedef std::pair UnmapInfo; - typedef typename std::list::iterator InstIt; + typedef typename std::list::iterator InstIt; /** Possible ROB statuses. */ enum Status @@ -105,36 +104,36 @@ class ROB * ROB for the new instruction. * @param inst The instruction being inserted into the ROB. */ - void insertInst(const DynInstPtr &inst); + void insertInst(const O3DynInstPtr &inst); /** Returns pointer to the head instruction within the ROB. There is * no guarantee as to the return value if the ROB is empty. * @retval Pointer to the DynInst that is at the head of the ROB. */ -// DynInstPtr readHeadInst(); +// O3DynInstPtr readHeadInst(); /** Returns a pointer to the head instruction of a specific thread within * the ROB. * @return Pointer to the DynInst that is at the head of the ROB. */ - const DynInstPtr &readHeadInst(ThreadID tid); + const O3DynInstPtr &readHeadInst(ThreadID tid); /** Returns a pointer to the instruction with the given sequence if it is * in the ROB. 
*/ - DynInstPtr findInst(ThreadID tid, InstSeqNum squash_inst); + O3DynInstPtr findInst(ThreadID tid, InstSeqNum squash_inst); /** Returns pointer to the tail instruction within the ROB. There is * no guarantee as to the return value if the ROB is empty. * @retval Pointer to the DynInst that is at the tail of the ROB. */ -// DynInstPtr readTailInst(); +// O3DynInstPtr readTailInst(); /** Returns a pointer to the tail instruction of a specific thread within * the ROB. * @return Pointer to the DynInst that is at the tail of the ROB. */ - DynInstPtr readTailInst(ThreadID tid); + O3DynInstPtr readTailInst(ThreadID tid); /** Retires the head instruction, removing it from the ROB. */ // void retireHead(); @@ -277,7 +276,7 @@ class ROB unsigned maxEntries[O3MaxThreads]; /** ROB List of Instructions */ - std::list instList[O3MaxThreads]; + std::list instList[O3MaxThreads]; /** Number of instructions that can be squashed in a single cycle. */ unsigned squashWidth; @@ -308,7 +307,7 @@ class ROB int numInstsInROB; /** Dummy instruction returned if there are no insts left. */ - DynInstPtr dummyInst; + O3DynInstPtr dummyInst; private: /** The sequence number of the squashed instruction. 
*/ diff --git a/src/cpu/o3/rob_impl.hh b/src/cpu/o3/rob_impl.hh index 0f192b789d..6bdf23a790 100644 --- a/src/cpu/o3/rob_impl.hh +++ b/src/cpu/o3/rob_impl.hh @@ -200,7 +200,7 @@ ROB::countInsts(ThreadID tid) template void -ROB::insertInst(const DynInstPtr &inst) +ROB::insertInst(const O3DynInstPtr &inst) { assert(inst); @@ -246,7 +246,7 @@ ROB::retireHead(ThreadID tid) // Get the head ROB instruction by copying it and remove it from the list InstIt head_it = instList[tid].begin(); - DynInstPtr head_inst = std::move(*head_it); + O3DynInstPtr head_inst = std::move(*head_it); instList[tid].erase(head_it); assert(head_inst->readyToCommit()); @@ -428,7 +428,7 @@ ROB::updateHead() InstIt head_thread = instList[tid].begin(); - DynInstPtr head_inst = (*head_thread); + O3DynInstPtr head_inst = (*head_thread); assert(head_inst != 0); @@ -513,7 +513,7 @@ ROB::squash(InstSeqNum squash_num, ThreadID tid) } template -const typename Impl::DynInstPtr& +const O3DynInstPtr& ROB::readHeadInst(ThreadID tid) { if (threadEntries[tid] != 0) { @@ -528,7 +528,7 @@ ROB::readHeadInst(ThreadID tid) } template -typename Impl::DynInstPtr +O3DynInstPtr ROB::readTailInst(ThreadID tid) { InstIt tail_thread = instList[tid].end(); @@ -546,7 +546,7 @@ ROB::ROBStats::ROBStats(Stats::Group *parent) } template -typename Impl::DynInstPtr +O3DynInstPtr ROB::findInst(ThreadID tid, InstSeqNum squash_inst) { for (InstIt it = instList[tid].begin(); it != instList[tid].end(); it++) {