Merge zizzer.eecs.umich.edu:/bk/newmem

into zeep.eecs.umich.edu:/home/gblack/m5/newmem --HG-- extra : convert_revision : 30b2475ba034550376455e1bc0e52e19a200fd5a
2006-10-12 10:58:45 -04:00
parent 6a31898a88 78aec04b66
commit 866cfaf9dc
171 changed files with 5677 additions and 5149 deletions
--- a/configs/common/FSConfig.py
+++ b/configs/common/FSConfig.py
@@ -30,7 +30,6 @@ import m5
 from m5 import makeList
 from m5.objects import *
 from Benchmarks import *
-from FullO3Config import *

 class CowIdeDisk(IdeDisk):
    image = CowDiskImage(child=RawDiskImage(read_only=True),
--- a/configs/example/fs.py
+++ b/configs/example/fs.py
@@ -49,10 +49,14 @@ parser.add_option("--dual", action="store_true",
 parser.add_option("-b", "--benchmark", action="store", type="string",
                  dest="benchmark",
                  help="Specify the benchmark to run. Available benchmarks: %s"\
-                          % DefinedBenchmarks)
+                  % DefinedBenchmarks)
 parser.add_option("--etherdump", action="store", type="string", dest="etherdump",
-                  help="Specify the filename to dump a pcap capture of the ethernet"
-                  "traffic")
+                  help="Specify the filename to dump a pcap capture of the " \
+                  "ethernet traffic")
+parser.add_option("--checkpoint_dir", action="store", type="string",
+                  help="Place all checkpoints in this absolute directory")
+parser.add_option("-c", "--checkpoint", action="store", type="int",
+                  help="restore from checkpoint <N>")

 (options, args) = parser.parse_args()

@@ -61,8 +65,8 @@ if args:
    sys.exit(1)

 if options.detailed:
-    cpu = DetailedO3CPU()
-    cpu2 = DetailedO3CPU()
+    cpu = DerivO3CPU()
+    cpu2 = DerivO3CPU()
    mem_mode = 'timing'
 elif options.timing:
    cpu = TimingSimpleCPU()
@@ -75,6 +79,8 @@ else:

 cpu.clock = '2GHz'
 cpu2.clock = '2GHz'
+cpu.cpu_id = 0
+cpu2.cpu_id = 0

 if options.benchmark:
    if options.benchmark not in Benchmarks:
@@ -111,6 +117,31 @@ else:

 m5.instantiate(root)

+if options.checkpoint:
+    from os.path import isdir
+    from os import listdir, getcwd
+    import re
+    if options.checkpoint_dir:
+        cptdir = options.checkpoint_dir
+    else:
+        cptdir = getcwd()
+
+    if not isdir(cptdir):
+        m5.panic("checkpoint dir %s does not exist!" % cptdir)
+
+    # Checkpoints are "cpt.<number>" entries; order them numerically so that
+    # "-c N" picks the N-th one regardless of the order listdir() returns.
+    expr = re.compile(r'cpt\.([0-9]+)$')
+    matches = [expr.match(entry) for entry in listdir(cptdir)]
+    cpts = [match.group(1) for match in matches if match]
+    cpts.sort(key=int)
+
+    # Negative values must be rejected too: they would wrap around as indices.
+    if not 1 <= options.checkpoint <= len(cpts):
+        m5.panic('Checkpoint %d not found' % options.checkpoint)
+
+    m5.restoreCheckpoint(root, "/".join([cptdir, "cpt.%s" % cpts[options.checkpoint - 1]]))
+
 if options.maxtick:
    maxtick = options.maxtick
 elif options.maxtime:
@@ -123,7 +154,14 @@ else:
 exit_event = m5.simulate(maxtick)

 while exit_event.getCause() == "checkpoint":
-    m5.checkpoint(root, "cpt.%d")
-    exit_event = m5.simulate(maxtick - m5.curTick())
+    if options.checkpoint_dir:
+        m5.checkpoint(root, "/".join([options.checkpoint_dir, "cpt.%d"]))
+    else:
+        m5.checkpoint(root, "cpt.%d")
+
+    if maxtick == -1:
+        exit_event = m5.simulate(maxtick)
+    else:
+        exit_event = m5.simulate(maxtick - m5.curTick())

 print 'Exiting @ cycle', m5.curTick(), 'because', exit_event.getCause()
--- a/configs/example/se.py
+++ b/configs/example/se.py
@@ -34,7 +34,6 @@ import m5
 from m5.objects import *
 import os, optparse, sys
 m5.AddToPath('../common')
-from FullO3Config import *

 parser = optparse.OptionParser()

@@ -86,11 +85,12 @@ if options.detailed:
 if options.timing:
    cpu = TimingSimpleCPU()
 elif options.detailed:
-    cpu = DetailedO3CPU()
+    cpu = DerivO3CPU()
 else:
    cpu = AtomicSimpleCPU()

 cpu.workload = process
+cpu.cpu_id = 0

 system = System(cpu = cpu,
                physmem = PhysicalMemory(),
--- a/src/arch/SConscript
+++ b/src/arch/SConscript
@@ -50,6 +50,7 @@ isa_switch_hdrs = Split('''
 	arguments.hh
 	faults.hh
 	isa_traits.hh
+        locked_mem.hh
 	process.hh
 	regfile.hh
 	stacktrace.hh
--- a/src/arch/alpha/isa/decoder.isa
+++ b/src/arch/alpha/isa/decoder.isa
@@ -701,7 +701,7 @@ decode OPCODE default Unknown::unknown() {
    0x00: decode PALFUNC {
        format EmulatedCallPal {
            0x00: halt ({{
-                exitSimLoop(curTick, "halt instruction encountered");
+                exitSimLoop("halt instruction encountered");
            }}, IsNonSpeculative);
            0x83: callsys({{
                xc->syscall(R0);
--- a/src/arch/alpha/locked_mem.hh
+++ b/src/arch/alpha/locked_mem.hh
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2006 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Steve Reinhardt
+ */
+
+#ifndef __ARCH_ALPHA_LOCKED_MEM_HH__
+#define __ARCH_ALPHA_LOCKED_MEM_HH__
+
+/**
+ * @file
+ *
+ * ISA-specific helper functions for locked memory accesses.
+ */
+
+#include "arch/isa_traits.hh"
+#include "base/misc.hh"
+#include "mem/request.hh"
+
+
+namespace AlphaISA
+{
+// Save the paddr (low 4 bits cleared) as the lock address and set the lock flag.
+template <class XC>
+inline void
+handleLockedRead(XC *xc, Request *req)
+{
+    xc->setMiscReg(Lock_Addr_DepTag, req->getPaddr() & ~0xf);
+    xc->setMiscReg(Lock_Flag_DepTag, true);
+}
+
+// Returns false when the store conditional has already failed here and must
+// not be sent to memory (lock flag clear or address mismatch); true otherwise.
+template <class XC>
+inline bool
+handleLockedWrite(XC *xc, Request *req)
+{
+    if (req->isUncacheable()) {
+        // Funky Turbolaser mailbox access...don't update
+        // result register (see stq_c in decoder.isa)
+        req->setScResult(2);
+    } else {
+        // standard store conditional
+        bool lock_flag = xc->readMiscReg(Lock_Flag_DepTag);
+        Addr lock_addr = xc->readMiscReg(Lock_Addr_DepTag);
+        if (!lock_flag || (req->getPaddr() & ~0xf) != lock_addr) {
+            // Lock flag not set or addr mismatch in CPU;
+            // don't even bother sending to memory system
+            req->setScResult(0);
+            xc->setMiscReg(Lock_Flag_DepTag, false);
+            // the rest of this code is not architectural;
+            // it's just a debugging aid to help detect
+            // livelock by warning on long sequences of failed
+            // store conditionals
+            int stCondFailures = xc->readStCondFailures();
+            stCondFailures++;
+            xc->setStCondFailures(stCondFailures);
+            if (stCondFailures % 100000 == 0) {
+                warn("cpu %d: %d consecutive "
+                     "store conditional failures\n",
+                     xc->readCpuId(), stCondFailures);
+            }
+            // store conditional failed already, so don't issue it to mem
+            return false;
+        }
+    }
+
+    return true;
+}
+
+} // namespace AlphaISA
+
+#endif
--- a/src/arch/mips/locked_mem.hh
+++ b/src/arch/mips/locked_mem.hh
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2006 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Steve Reinhardt
+ */
+
+#ifndef __ARCH_MIPS_LOCKED_MEM_HH__
+#define __ARCH_MIPS_LOCKED_MEM_HH__
+
+/**
+ * @file
+ *
+ * ISA-specific helper functions for locked memory accesses.
+ */
+
+#include "mem/request.hh"
+
+
+namespace MipsISA
+{
+// Empty hook: no lock state is recorded for a locked read in this ISA's stub.
+template <class XC>
+inline void
+handleLockedRead(XC *xc, Request *req)
+{
+}
+
+// Stub: always returns true (presumably "go ahead and issue the store" — confirm).
+template <class XC>
+inline bool
+handleLockedWrite(XC *xc, Request *req)
+{
+    return true;
+}
+
+} // namespace MipsISA
+
+#endif
--- a/src/arch/sparc/locked_mem.hh
+++ b/src/arch/sparc/locked_mem.hh
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2006 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Steve Reinhardt
+ */
+
+#ifndef __ARCH_SPARC_LOCKED_MEM_HH__
+#define __ARCH_SPARC_LOCKED_MEM_HH__
+
+/**
+ * @file
+ *
+ * ISA-specific helper functions for locked memory accesses.
+ */
+
+#include "mem/request.hh"
+
+
+namespace SparcISA
+{
+// Empty hook: no lock state is recorded for a locked read in this ISA's stub.
+template <class XC>
+inline void
+handleLockedRead(XC *xc, Request *req)
+{
+}
+
+// Stub: always returns true (presumably "go ahead and issue the store" — confirm).
+template <class XC>
+inline bool
+handleLockedWrite(XC *xc, Request *req)
+{
+    return true;
+}
+
+} // namespace SparcISA
+
+#endif
--- a/src/arch/sparc/system.cc
+++ b/src/arch/sparc/system.cc
@@ -152,10 +152,6 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(SparcSystem)
    Param<std::string> readfile;
    Param<unsigned int> init_param;

-    Param<bool> bin;
-    VectorParam<std::string> binned_fns;
-    Param<bool> bin_int;
-
 END_DECLARE_SIM_OBJECT_PARAMS(SparcSystem)

 BEGIN_INIT_SIM_OBJECT_PARAMS(SparcSystem)
@@ -173,10 +169,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(SparcSystem)
    INIT_PARAM_DFLT(readfile, "file to read startup script from", ""),
    INIT_PARAM_DFLT(init_param, "numerical value to pass into simulator", 0),
    INIT_PARAM_DFLT(system_type, "Type of system we are emulating", 34),
-    INIT_PARAM_DFLT(system_rev, "Revision of system we are emulating", 1<<10),
-    INIT_PARAM_DFLT(bin, "is this system to be binned", false),
-    INIT_PARAM(binned_fns, "functions to be broken down and binned"),
-    INIT_PARAM_DFLT(bin_int, "is interrupt code binned seperately?", true)
+    INIT_PARAM_DFLT(system_rev, "Revision of system we are emulating", 1<<10)

 END_INIT_SIM_OBJECT_PARAMS(SparcSystem)

@@ -196,9 +189,6 @@ CREATE_SIM_OBJECT(SparcSystem)
    p->readfile = readfile;
    p->system_type = system_type;
    p->system_rev = system_rev;
-    p->bin = bin;
-    p->binned_fns = binned_fns;
-    p->bin_int = bin_int;
    return new SparcSystem(p);
 }

--- a/src/base/traceflags.py
+++ b/src/base/traceflags.py
@@ -58,6 +58,7 @@ baseFlags = [
    'BusAddrRanges',
    'BusBridge',
    'Cache',
+    'CachePort',
    'Chains',
    'Checker',
    'Clock',
@@ -112,6 +113,7 @@ baseFlags = [
    'IdeDisk',
    'InstExec',
    'Interrupt',
+    'LLSC',
    'LSQ',
    'LSQUnit',
    'Loader',
--- a/src/cpu/SConscript
+++ b/src/cpu/SConscript
@@ -158,6 +158,7 @@ if 'O3CPU' in env['CPU_MODELS']:
        o3/scoreboard.cc
        o3/store_set.cc
        ''')
+    sources += Split('memtest/memtest.cc')
    if env['USE_CHECKER']:
        sources += Split('o3/checker_builder.cc')
    else:
--- a/src/cpu/base.cc
+++ b/src/cpu/base.cc
@@ -41,6 +41,7 @@
 #include "cpu/cpuevent.hh"
 #include "cpu/thread_context.hh"
 #include "cpu/profile.hh"
+#include "sim/sim_exit.hh"
 #include "sim/param.hh"
 #include "sim/process.hh"
 #include "sim/sim_events.hh"
@@ -125,8 +126,9 @@ BaseCPU::BaseCPU(Params *p)
    //
    if (p->max_insts_any_thread != 0)
        for (int i = 0; i < number_of_threads; ++i)
-            new SimLoopExitEvent(comInstEventQueue[i], p->max_insts_any_thread,
-                                 "a thread reached the max instruction count");
+            schedExitSimLoop("a thread reached the max instruction count",
+                             p->max_insts_any_thread, 0,
+                             comInstEventQueue[i]);

    if (p->max_insts_all_threads != 0) {
        // allocate & initialize shared downcounter: each event will
@@ -150,8 +152,9 @@ BaseCPU::BaseCPU(Params *p)
    //
    if (p->max_loads_any_thread != 0)
        for (int i = 0; i < number_of_threads; ++i)
-            new SimLoopExitEvent(comLoadEventQueue[i], p->max_loads_any_thread,
-                                 "a thread reached the max load count");
+            schedExitSimLoop("a thread reached the max load count",
+                             p->max_loads_any_thread, 0,
+                             comLoadEventQueue[i]);

    if (p->max_loads_all_threads != 0) {
        // allocate & initialize shared downcounter: each event will
--- a/src/cpu/base.hh
+++ b/src/cpu/base.hh
@@ -140,8 +140,8 @@ class BaseCPU : public MemObject
        bool functionTrace;
        Tick functionTraceStart;
        System *system;
-#if FULL_SYSTEM
        int cpu_id;
+#if FULL_SYSTEM
        Tick profile;
 #endif
        Tick progress_interval;
--- a/src/cpu/base_dyn_inst_impl.hh
+++ b/src/cpu/base_dyn_inst_impl.hh
@@ -193,7 +193,7 @@ BaseDynInst<Impl>::prefetch(Addr addr, unsigned flags)
    // note this is a local, not BaseDynInst::fault
    Fault trans_fault = cpu->translateDataReadReq(req);

-    if (trans_fault == NoFault && !(req->flags & UNCACHEABLE)) {
+    if (trans_fault == NoFault && !(req->isUncacheable())) {
        // It's a valid address to cacheable space.  Record key MemReq
        // parameters so we can generate another one just like it for
        // the timing access without calling translate() again (which
--- a/src/cpu/checker/cpu.cc
+++ b/src/cpu/checker/cpu.cc
@@ -175,7 +175,7 @@ CheckerCPU::read(Addr addr, T &data, unsigned flags)

    pkt->dataStatic(&data);

-    if (!(memReq->getFlags() & UNCACHEABLE)) {
+    if (!(memReq->isUncacheable())) {
        // Access memory to see if we have the same data
        dcachePort->sendFunctional(pkt);
    } else {
@@ -251,9 +251,9 @@ CheckerCPU::write(T data, Addr addr, unsigned flags, uint64_t *res)
    // This is because the LSQ would have to be snooped in the CPU to
    // verify this data.
    if (unverifiedReq &&
-        !(unverifiedReq->getFlags() & UNCACHEABLE) &&
-        (!(unverifiedReq->getFlags() & LOCKED) ||
-         ((unverifiedReq->getFlags() & LOCKED) &&
+        !(unverifiedReq->isUncacheable()) &&
+        (!(unverifiedReq->isLocked()) ||
+         ((unverifiedReq->isLocked()) &&
          unverifiedReq->getScResult() == 1))) {
        T inst_data;
 /*
--- a/src/cpu/checker/thread_context.hh
+++ b/src/cpu/checker/thread_context.hh
@@ -133,7 +133,7 @@ class CheckerThreadContext : public ThreadContext
    void takeOverFrom(ThreadContext *oldContext)
    {
        actualTC->takeOverFrom(oldContext);
-        checkerTC->takeOverFrom(oldContext);
+        checkerTC->copyState(oldContext);
    }

    void regStats(const std::string &name) { actualTC->regStats(name); }
--- a/src/cpu/memtest/memtest.cc
+++ b/src/cpu/memtest/memtest.cc
@@ -38,86 +38,158 @@

 #include "base/misc.hh"
 #include "base/statistics.hh"
-#include "cpu/simple_thread.hh"
+//#include "cpu/simple_thread.hh"
 #include "cpu/memtest/memtest.hh"
-#include "mem/cache/base_cache.hh"
+//#include "mem/cache/base_cache.hh"
+//#include "mem/physical.hh"
 #include "sim/builder.hh"
 #include "sim/sim_events.hh"
 #include "sim/stats.hh"
+#include "mem/packet.hh"
+#include "mem/request.hh"
+#include "mem/port.hh"
+#include "mem/mem_object.hh"

 using namespace std;
-using namespace TheISA;

 int TESTER_ALLOCATOR=0;

+bool
+MemTest::CpuPort::recvTiming(Packet *pkt)
+{
+    memtest->completeRequest(pkt); // owning MemTest handles the packet
+    return true; // unconditionally reports true to the sender
+}
+
+Tick
+MemTest::CpuPort::recvAtomic(Packet *pkt)
+{
+    panic("MemTest doesn't expect recvAtomic callback!"); // pkt is never examined
+    return curTick; // gives the function a return value after panic()
+}
+
+void
+MemTest::CpuPort::recvFunctional(Packet *pkt)
+{
+    //Do nothing if we see one come through
+    if (curTick != 0)//Suppress warning during initialization
+        warn("Functional Writes not implemented in MemTester\n");
+    //Need to find any response values that intersect and update
+    return;
+}
+
+void
+MemTest::CpuPort::recvStatusChange(Status status)
+{
+    if (status == RangeChange) // range changes are silently ignored
+        return;
+
+    panic("MemTest doesn't expect recvStatusChange callback!"); // any other status
+}
+
+void
+MemTest::CpuPort::recvRetry()
+{
+    memtest->doRetry(); // delegate to the tester, which resends its retryPkt
+}
+
+void
+MemTest::sendPkt(Packet *pkt) {
+    // Atomic mode finishes the access inline; otherwise try a timing send.
+    if (atomic) {
+        cachePort.sendAtomic(pkt);
+        pkt->makeAtomicResponse();
+        completeRequest(pkt);
+    }
+    else if (!cachePort.sendTiming(pkt)) {
+        accessRetry = true; // tick() returns early while this is set
+        retryPkt = pkt; // kept so doRetry() can send it again
+    }
+}
+
 MemTest::MemTest(const string &name,
-                 MemInterface *_cache_interface,
-                 FunctionalMemory *main_mem,
-                 FunctionalMemory *check_mem,
+//		 MemInterface *_cache_interface,
+//		 PhysicalMemory *main_mem,
+//		 PhysicalMemory *check_mem,
                 unsigned _memorySize,
                 unsigned _percentReads,
-                 unsigned _percentCopies,
+//		 unsigned _percentCopies,
                 unsigned _percentUncacheable,
                 unsigned _progressInterval,
                 unsigned _percentSourceUnaligned,
                 unsigned _percentDestUnaligned,
                 Addr _traceAddr,
-                 Counter _max_loads)
-    : SimObject(name),
+                 Counter _max_loads,
+                 bool _atomic)
+    : MemObject(name),
      tickEvent(this),
-      cacheInterface(_cache_interface),
-      mainMem(main_mem),
-      checkMem(check_mem),
+      cachePort("test", this),
+      funcPort("functional", this),
+      retryPkt(NULL),
+//      mainMem(main_mem),
+//      checkMem(check_mem),
      size(_memorySize),
      percentReads(_percentReads),
-      percentCopies(_percentCopies),
+//      percentCopies(_percentCopies),
      percentUncacheable(_percentUncacheable),
      progressInterval(_progressInterval),
      nextProgressMessage(_progressInterval),
      percentSourceUnaligned(_percentSourceUnaligned),
-      percentDestUnaligned(percentDestUnaligned),
+      percentDestUnaligned(_percentDestUnaligned), // was initialised from itself
-      maxLoads(_max_loads)
+      maxLoads(_max_loads),
+      atomic(_atomic)
 {
    vector<string> cmd;
    cmd.push_back("/bin/ls");
    vector<string> null_vec;
-    thread = new SimpleThread(NULL, 0, mainMem, 0);
-
-    blockSize = cacheInterface->getBlockSize();
-    blockAddrMask = blockSize - 1;
-    traceBlockAddr = blockAddr(_traceAddr);
-
-    //setup data storage with interesting values
-    uint8_t *data1 = new uint8_t[size];
-    uint8_t *data2 = new uint8_t[size];
-    uint8_t *data3 = new uint8_t[size];
-    memset(data1, 1, size);
-    memset(data2, 2, size);
-    memset(data3, 3, size);
+    //  thread = new SimpleThread(NULL, 0, NULL, 0, mainMem);
    curTick = 0;

+    // Needs to be masked off once we know the block size.
+    traceBlockAddr = _traceAddr;
    baseAddr1 = 0x100000;
    baseAddr2 = 0x400000;
    uncacheAddr = 0x800000;

-    // set up intial memory contents here
-    mainMem->prot_write(baseAddr1, data1, size);
-    checkMem->prot_write(baseAddr1, data1, size);
-    mainMem->prot_write(baseAddr2, data2, size);
-    checkMem->prot_write(baseAddr2, data2, size);
-    mainMem->prot_write(uncacheAddr, data3, size);
-    checkMem->prot_write(uncacheAddr, data3, size);
-
-    delete [] data1;
-    delete [] data2;
-    delete [] data3;
-
    // set up counters
    noResponseCycles = 0;
    numReads = 0;
    tickEvent.schedule(0);

    id = TESTER_ALLOCATOR++;
+    if (TESTER_ALLOCATOR > 8)
+        panic("False sharing memtester only allows up to 8 testers");
+
+    accessRetry = false; // no packet is waiting to be resent yet
+}
+
+Port *
+MemTest::getPort(const std::string &if_name, int idx)
+{
+    if (if_name == "functional") // names match those given to the port constructors
+        return &funcPort;
+    else if (if_name == "test")
+        return &cachePort;
+    else
+        panic("No Such Port\n"); // idx is never consulted
+}
+
+void
+MemTest::init()
+{
+    // By the time init() is called, the ports should be hooked up.
+    blockSize = cachePort.peerBlockSize();
+    blockAddrMask = blockSize - 1; // assumes a power-of-two block size — TODO confirm
+    traceBlockAddr = blockAddr(traceBlockAddr);
+
+    // Set up initial memory contents here: each region gets one fill byte
+    // (1, 2 or 3), written through both the cache port and the functional port.
+    cachePort.memsetBlob(baseAddr1, 1, size);
+    funcPort.memsetBlob(baseAddr1, 1, size);
+    cachePort.memsetBlob(baseAddr2, 2, size);
+    funcPort.memsetBlob(baseAddr2, 2, size);
+    cachePort.memsetBlob(uncacheAddr, 3, size);
+    funcPort.memsetBlob(uncacheAddr, 3, size);
 }

 static void
@@ -132,23 +204,31 @@ printData(ostream &os, uint8_t *data, int nbytes)
 }

 void
-MemTest::completeRequest(MemReqPtr &req, uint8_t *data)
+MemTest::completeRequest(Packet *pkt)
 {
+    MemTestSenderState *state =
+        dynamic_cast<MemTestSenderState *>(pkt->senderState);
+
+    uint8_t *data = state->data;
+    uint8_t *pkt_data = pkt->getPtr<uint8_t>();
+    Request *req = pkt->req;
+
    //Remove the address from the list of outstanding
-    std::set<unsigned>::iterator removeAddr = outstandingAddrs.find(req->paddr);
+    std::set<unsigned>::iterator removeAddr = outstandingAddrs.find(req->getPaddr());
    assert(removeAddr != outstandingAddrs.end());
    outstandingAddrs.erase(removeAddr);

-    switch (req->cmd) {
-      case Read:
-        if (memcmp(req->data, data, req->size) != 0) {
-            cerr << name() << ": on read of 0x" << hex << req->paddr
-                 << " (0x" << hex << blockAddr(req->paddr) << ")"
+    switch (pkt->cmd) {
+      case Packet::ReadResp:
+
+        if (memcmp(pkt_data, data, pkt->getSize()) != 0) {
+            cerr << name() << ": on read of 0x" << hex << req->getPaddr()
+                 << " (0x" << hex << blockAddr(req->getPaddr()) << ")"
                 << "@ cycle " << dec << curTick
                 << ", cache returns 0x";
-            printData(cerr, req->data, req->size);
+            printData(cerr, pkt_data, pkt->getSize());
            cerr << ", expected 0x";
-            printData(cerr, data, req->size);
+            printData(cerr, data, pkt->getSize());
            cerr << endl;
            fatal("");
        }
@@ -163,13 +243,13 @@ MemTest::completeRequest(MemReqPtr &req, uint8_t *data)
        }

        if (numReads >= maxLoads)
-            SimExit(curTick, "Maximum number of loads reached!");
+            exitSimLoop("Maximum number of loads reached!");
        break;

-      case Write:
+      case Packet::WriteResp:
        numWritesStat++;
        break;
-
+/*
      case Copy:
        //Also remove dest from outstanding list
        removeAddr = outstandingAddrs.find(req->dest);
@@ -177,36 +257,37 @@ MemTest::completeRequest(MemReqPtr &req, uint8_t *data)
        outstandingAddrs.erase(removeAddr);
        numCopiesStat++;
        break;
-
+*/
      default:
        panic("invalid command");
    }

-    if (blockAddr(req->paddr) == traceBlockAddr) {
+    if (blockAddr(req->getPaddr()) == traceBlockAddr) {
        cerr << name() << ": completed "
-             << (req->cmd.isWrite() ? "write" : "read")
+             << (pkt->isWrite() ? "write" : "read")
             << " access of "
-             << dec << req->size << " bytes at address 0x"
-             << hex << req->paddr
-             << " (0x" << hex << blockAddr(req->paddr) << ")"
+             << dec << pkt->getSize() << " bytes at address 0x"
+             << hex << req->getPaddr()
+             << " (0x" << hex << blockAddr(req->getPaddr()) << ")"
             << ", value = 0x";
-        printData(cerr, req->data, req->size);
+        printData(cerr, pkt_data, pkt->getSize());
        cerr << " @ cycle " << dec << curTick;

        cerr << endl;
    }

    noResponseCycles = 0;
+    delete state;
    delete [] data;
+    delete pkt->req;
+    delete pkt;
 }

-
 void
 MemTest::regStats()
 {
    using namespace Stats;

-
    numReadsStat
        .name(name() + ".num_reads")
        .desc("number of read accesses completed")
@@ -234,7 +315,7 @@ MemTest::tick()
        fatal("");
    }

-    if (cacheInterface->isBlocked()) {
+    if (accessRetry) {
        return;
    }

@@ -248,30 +329,30 @@ MemTest::tick()

    //If we aren't doing copies, use id as offset, and do a false sharing
    //mem tester
-    if (percentCopies == 0) {
-        //We can eliminate the lower bits of the offset, and then use the id
-        //to offset within the blks
-        offset &= ~63; //Not the low order bits
-        offset += id;
-        access_size = 0;
-    }
+    //We can eliminate the lower bits of the offset, and then use the id
+    //to offset within the blks
+    offset &= ~63; //Not the low order bits
+    offset += id;
+    access_size = 0;

-    MemReqPtr req = new MemReq();
+    Request *req = new Request();
+    uint32_t flags = 0;
+    Addr paddr;

    if (cacheable < percentUncacheable) {
-        req->flags |= UNCACHEABLE;
-        req->paddr = uncacheAddr + offset;
+        flags |= UNCACHEABLE;
+        paddr = uncacheAddr + offset;
    } else {
-        req->paddr = ((base) ? baseAddr1 : baseAddr2) + offset;
+        paddr = ((base) ? baseAddr1 : baseAddr2) + offset;
    }
-    // bool probe = (random() % 2 == 1) && !req->isUncacheable();
+    //bool probe = (random() % 2 == 1) && !req->isUncacheable();
    bool probe = false;

-    req->size = 1 << access_size;
-    req->data = new uint8_t[req->size];
-    req->paddr &= ~(req->size - 1);
-    req->time = curTick;
-    req->xc = thread->getProxy();
+    paddr &= ~((1 << access_size) - 1);
+    req->setPhys(paddr, 1 << access_size, flags);
+    req->setThreadContext(id,0);
+
+    uint8_t *result = new uint8_t[8];

    if (cmd < percentReads) {
        // read
@@ -279,60 +360,75 @@ MemTest::tick()
        //For now we only allow one outstanding request per addreess per tester
        //This means we assume CPU does write forwarding to reads that alias something
        //in the cpu store buffer.
-        if (outstandingAddrs.find(req->paddr) != outstandingAddrs.end()) return;
-        else outstandingAddrs.insert(req->paddr);
+        if (outstandingAddrs.find(paddr) != outstandingAddrs.end()) return;
+        else outstandingAddrs.insert(paddr);

-        req->cmd = Read;
-        uint8_t *result = new uint8_t[8];
-        checkMem->access(Read, req->paddr, result, req->size);
-        if (blockAddr(req->paddr) == traceBlockAddr) {
+        // ***** NOTE FOR RON: I'm not sure how to access checkMem. - Kevin
+        funcPort.readBlob(req->getPaddr(), result, req->getSize());
+
+        if (blockAddr(paddr) == traceBlockAddr) {
            cerr << name()
                 << ": initiating read "
                 << ((probe) ? "probe of " : "access of ")
-                 << dec << req->size << " bytes from addr 0x"
-                 << hex << req->paddr
-                 << " (0x" << hex << blockAddr(req->paddr) << ")"
+                 << dec << req->getSize() << " bytes from addr 0x"
+                 << hex << paddr
+                 << " (0x" << hex << blockAddr(paddr) << ")"
                 << " at cycle "
                 << dec << curTick << endl;
        }
+
+        Packet *pkt = new Packet(req, Packet::ReadReq, Packet::Broadcast);
+        pkt->dataDynamicArray(new uint8_t[req->getSize()]);
+        MemTestSenderState *state = new MemTestSenderState(result);
+        pkt->senderState = state;
+
        if (probe) {
-            cacheInterface->probeAndUpdate(req);
-            completeRequest(req, result);
+            cachePort.sendFunctional(pkt);
+            completeRequest(pkt);
        } else {
-            req->completionEvent = new MemCompleteEvent(req, result, this);
-            cacheInterface->access(req);
+//	    req->completionEvent = new MemCompleteEvent(req, result, this);
+            sendPkt(pkt);
        }
-    } else if (cmd < (100 - percentCopies)){
+    } else {
        // write

        //For now we only allow one outstanding request per addreess per tester
        //This means we assume CPU does write forwarding to reads that alias something
        //in the cpu store buffer.
-        if (outstandingAddrs.find(req->paddr) != outstandingAddrs.end()) return;
-        else outstandingAddrs.insert(req->paddr);
+        if (outstandingAddrs.find(paddr) != outstandingAddrs.end()) return;
+        else outstandingAddrs.insert(paddr);

-        req->cmd = Write;
-        memcpy(req->data, &data, req->size);
-        checkMem->access(Write, req->paddr, req->data, req->size);
-        if (blockAddr(req->paddr) == traceBlockAddr) {
+/*
+        if (blockAddr(req->getPaddr()) == traceBlockAddr) {
            cerr << name() << ": initiating write "
                 << ((probe)?"probe of ":"access of ")
-                 << dec << req->size << " bytes (value = 0x";
-            printData(cerr, req->data, req->size);
+                 << dec << req->getSize() << " bytes (value = 0x";
+            printData(cerr, data_pkt->getPtr(), req->getSize());
            cerr << ") to addr 0x"
-                 << hex << req->paddr
-                 << " (0x" << hex << blockAddr(req->paddr) << ")"
+                 << hex << req->getPaddr()
+                 << " (0x" << hex << blockAddr(req->getPaddr()) << ")"
                 << " at cycle "
                 << dec << curTick << endl;
        }
+*/
+        Packet *pkt = new Packet(req, Packet::WriteReq, Packet::Broadcast);
+        uint8_t *pkt_data = new uint8_t[req->getSize()];
+        pkt->dataDynamicArray(pkt_data);
+        memcpy(pkt_data, &data, req->getSize());
+        MemTestSenderState *state = new MemTestSenderState(result);
+        pkt->senderState = state;
+
+        funcPort.writeBlob(req->getPaddr(), pkt_data, req->getSize());
+
        if (probe) {
-            cacheInterface->probeAndUpdate(req);
-            completeRequest(req, NULL);
+            cachePort.sendFunctional(pkt);
+            completeRequest(pkt);
        } else {
-            req->completionEvent = new MemCompleteEvent(req, NULL, this);
-            cacheInterface->access(req);
+//	    req->completionEvent = new MemCompleteEvent(req, NULL, this);
+            sendPkt(pkt);
        }
-    } else {
+    }
+/*    else {
        // copy
        unsigned source_align = random() % 100;
        unsigned dest_align = random() % 100;
@@ -369,56 +465,51 @@ MemTest::tick()
                 << " (0x" << hex << blockAddr(dest) << ")"
                 << " at cycle "
                 << dec << curTick << endl;
-        }
+        }*
        cacheInterface->access(req);
        uint8_t result[blockSize];
        checkMem->access(Read, source, &result, blockSize);
        checkMem->access(Write, dest, &result, blockSize);
    }
+*/
 }

-
 void
-MemCompleteEvent::process()
+MemTest::doRetry()
 {
-    tester->completeRequest(req, data);
-    delete this;
+    if (cachePort.sendTiming(retryPkt)) {
+        accessRetry = false;
+        retryPkt = NULL;
+    }
 }

-
-const char *
-MemCompleteEvent::description()
-{
-    return "memory access completion";
-}
-
-
 BEGIN_DECLARE_SIM_OBJECT_PARAMS(MemTest)

-    SimObjectParam<BaseCache *> cache;
-    SimObjectParam<FunctionalMemory *> main_mem;
-    SimObjectParam<FunctionalMemory *> check_mem;
+//    SimObjectParam<BaseCache *> cache;
+//    SimObjectParam<PhysicalMemory *> main_mem;
+//    SimObjectParam<PhysicalMemory *> check_mem;
    Param<unsigned> memory_size;
    Param<unsigned> percent_reads;
-    Param<unsigned> percent_copies;
+//    Param<unsigned> percent_copies;
    Param<unsigned> percent_uncacheable;
    Param<unsigned> progress_interval;
    Param<unsigned> percent_source_unaligned;
    Param<unsigned> percent_dest_unaligned;
    Param<Addr> trace_addr;
    Param<Counter> max_loads;
+    Param<bool> atomic;

 END_DECLARE_SIM_OBJECT_PARAMS(MemTest)


 BEGIN_INIT_SIM_OBJECT_PARAMS(MemTest)

-    INIT_PARAM(cache, "L1 cache"),
-    INIT_PARAM(main_mem, "hierarchical memory"),
-    INIT_PARAM(check_mem, "check memory"),
+//    INIT_PARAM(cache, "L1 cache"),
+//    INIT_PARAM(main_mem, "hierarchical memory"),
+//    INIT_PARAM(check_mem, "check memory"),
    INIT_PARAM(memory_size, "memory size"),
    INIT_PARAM(percent_reads, "target read percentage"),
-    INIT_PARAM(percent_copies, "target copy percentage"),
+//    INIT_PARAM(percent_copies, "target copy percentage"),
    INIT_PARAM(percent_uncacheable, "target uncacheable percentage"),
    INIT_PARAM(progress_interval, "progress report interval (in accesses)"),
    INIT_PARAM(percent_source_unaligned,
@@ -426,18 +517,19 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(MemTest)
    INIT_PARAM(percent_dest_unaligned,
               "percent of copy dest address that are unaligned"),
    INIT_PARAM(trace_addr, "address to trace"),
-    INIT_PARAM(max_loads, "terminate when we have reached this load count")
+                              INIT_PARAM(max_loads, "terminate when we have reached this load count"),
+    INIT_PARAM(atomic, "Is the tester testing atomic mode (or timing)")

 END_INIT_SIM_OBJECT_PARAMS(MemTest)


 CREATE_SIM_OBJECT(MemTest)
 {
-    return new MemTest(getInstanceName(), cache->getInterface(), main_mem,
-                       check_mem, memory_size, percent_reads, percent_copies,
+    return new MemTest(getInstanceName(), /*cache->getInterface(),*/ /*main_mem,*/
+                       /*check_mem,*/ memory_size, percent_reads, /*percent_copies,*/
                       percent_uncacheable, progress_interval,
                       percent_source_unaligned, percent_dest_unaligned,
-                       trace_addr, max_loads);
+                       trace_addr, max_loads, atomic);
 }

 REGISTER_SIM_OBJECT("MemTest", MemTest)
--- a/src/cpu/memtest/memtest.hh
+++ b/src/cpu/memtest/memtest.hh
@@ -35,31 +35,36 @@
 #include <set>

 #include "base/statistics.hh"
-#include "mem/functional/functional.hh"
-#include "mem/mem_interface.hh"
+//#include "mem/functional/functional.hh"
+//#include "mem/mem_interface.hh"
 #include "sim/eventq.hh"
 #include "sim/sim_exit.hh"
 #include "sim/sim_object.hh"
 #include "sim/stats.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"

-class ThreadContext;
-class MemTest : public SimObject
+class Packet;
+class MemTest : public MemObject
 {
  public:

    MemTest(const std::string &name,
-            MemInterface *_cache_interface,
-            FunctionalMemory *main_mem,
-            FunctionalMemory *check_mem,
+//	    MemInterface *_cache_interface,
+//	    PhysicalMemory *main_mem,
+//	    PhysicalMemory *check_mem,
            unsigned _memorySize,
            unsigned _percentReads,
-            unsigned _percentCopies,
+//	    unsigned _percentCopies,
            unsigned _percentUncacheable,
            unsigned _progressInterval,
            unsigned _percentSourceUnaligned,
            unsigned _percentDestUnaligned,
            Addr _traceAddr,
-            Counter _max_loads);
+            Counter _max_loads,
+            bool _atomic);
+
+    virtual void init();

    // register statistics
    virtual void regStats();
@@ -69,6 +74,8 @@ class MemTest : public SimObject
    // main simulation loop (one cycle)
    void tick();

+    virtual Port *getPort(const std::string &if_name, int idx = -1);
+
  protected:
    class TickEvent : public Event
    {
@@ -82,16 +89,62 @@ class MemTest : public SimObject
    };

    TickEvent tickEvent;
+    class CpuPort : public Port
+    {

-    MemInterface *cacheInterface;
-    FunctionalMemory *mainMem;
-    FunctionalMemory *checkMem;
-    SimpleThread *thread;
+        MemTest *memtest;
+
+      public:
+
+        CpuPort(const std::string &_name, MemTest *_memtest)
+            : Port(_name), memtest(_memtest)
+        { }
+
+      protected:
+
+        virtual bool recvTiming(Packet *pkt);
+
+        virtual Tick recvAtomic(Packet *pkt);
+
+        virtual void recvFunctional(Packet *pkt);
+
+        virtual void recvStatusChange(Status status);
+
+        virtual void recvRetry();
+
+        virtual void getDeviceAddressRanges(AddrRangeList &resp,
+            AddrRangeList &snoop)
+        { resp.clear(); snoop.clear(); snoop.push_back(RangeSize(0,-1)); }
+    };
+
+    CpuPort cachePort;
+    CpuPort funcPort;
+
+    class MemTestSenderState : public Packet::SenderState
+    {
+      public:
+        /** Constructor. */
+        MemTestSenderState(uint8_t *_data)
+            : data(_data)
+        { }
+
+        // Hold onto data pointer
+        uint8_t *data;
+    };
+
+//    Request *dataReq;
+    Packet  *retryPkt;
+//    MemInterface *cacheInterface;
+//    PhysicalMemory *mainMem;
+//    PhysicalMemory *checkMem;
+//    SimpleThread *thread;
+
+    bool accessRetry;

    unsigned size;		// size of testing memory region

    unsigned percentReads;	// target percentage of read accesses
-    unsigned percentCopies;	// target percentage of copy accesses
+//    unsigned percentCopies;	// target percentage of copy accesses
    unsigned percentUncacheable;

    int id;
@@ -123,36 +176,22 @@ class MemTest : public SimObject

    uint64_t numReads;
    uint64_t maxLoads;
+
+    bool atomic;
+
    Stats::Scalar<> numReadsStat;
    Stats::Scalar<> numWritesStat;
    Stats::Scalar<> numCopiesStat;

-    // called by MemCompleteEvent::process()
-    void completeRequest(MemReqPtr &req, uint8_t *data);
+    // called when the access carried by pkt has completed
+    void completeRequest(Packet *pkt);
+
+    void sendPkt(Packet *pkt);
+
+    void doRetry();

-    friend class MemCompleteEvent;
 };

-
-class MemCompleteEvent : public Event
-{
-    MemReqPtr req;
-    uint8_t *data;
-    MemTest *tester;
-
-  public:
-
-    MemCompleteEvent(MemReqPtr &_req, uint8_t *_data, MemTest *_tester)
-        : Event(&mainEventQueue),
-          req(_req), data(_data), tester(_tester)
-    {
-    }
-
-    void process();
-
-    virtual const char *description();
-};
-
 #endif // __CPU_MEMTEST_MEMTEST_HH__


--- a/src/cpu/o3/commit_impl.hh
+++ b/src/cpu/o3/commit_impl.hh
@@ -342,12 +342,6 @@ DefaultCommit<Impl>::drain()
 {
    drainPending = true;

-    // If it's already drained, return true.
-    if (rob->isEmpty() && !iewStage->hasStoresToWB()) {
-        cpu->signalDrained();
-        return true;
-    }
-
    return false;
 }

@@ -1218,16 +1212,16 @@ DefaultCommit<Impl>::skidInsert()

    for (int inst_num = 0; inst_num < fromRename->size; ++inst_num) {
        DynInstPtr inst = fromRename->insts[inst_num];
-        int tid = inst->threadNumber;

        if (!inst->isSquashed()) {
            DPRINTF(Commit, "Inserting PC %#x [sn:%i] [tid:%i] into ",
-                    "skidBuffer.\n", inst->readPC(), inst->seqNum, tid);
+                    "skidBuffer.\n", inst->readPC(), inst->seqNum,
+                    inst->threadNumber);
            skidBuffer.push(inst);
        } else {
            DPRINTF(Commit, "Instruction PC %#x [sn:%i] [tid:%i] was "
                    "squashed, skipping.\n",
-                    inst->readPC(), inst->seqNum, tid);
+                    inst->readPC(), inst->seqNum, inst->threadNumber);
        }
    }
 }
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -88,7 +88,7 @@ FullO3CPU<Impl>::TickEvent::description()

 template <class Impl>
 FullO3CPU<Impl>::ActivateThreadEvent::ActivateThreadEvent()
-    : Event(&mainEventQueue, CPU_Tick_Pri)
+    : Event(&mainEventQueue, CPU_Switch_Pri)
 {
 }

@@ -135,7 +135,8 @@ void
 FullO3CPU<Impl>::DeallocateContextEvent::process()
 {
    cpu->deactivateThread(tid);
-    cpu->removeThread(tid);
+    if (remove)
+        cpu->removeThread(tid);
 }

 template <class Impl>
@@ -191,7 +192,11 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
      deferRegistration(params->deferRegistration),
      numThreads(number_of_threads)
 {
-    _status = Idle;
+    if (!deferRegistration) {
+        _status = Running;
+    } else {
+        _status = Idle;
+    }

    checker = NULL;

@@ -304,6 +309,9 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)

                            tid,
                            bindRegs);
+
+        activateThreadEvent[tid].init(tid, this);
+        deallocateContextEvent[tid].init(tid, this);
    }

    rename.setRenameMap(renameMap);
@@ -447,13 +455,16 @@ FullO3CPU<Impl>::tick()
    if (!tickEvent.scheduled()) {
        if (_status == SwitchedOut ||
            getState() == SimObject::Drained) {
+            DPRINTF(O3CPU, "Switched out!\n");
            // increment stat
            lastRunningCycle = curTick;
-        } else if (!activityRec.active()) {
+        } else if (!activityRec.active() || _status == Idle) {
+            DPRINTF(O3CPU, "Idle!\n");
            lastRunningCycle = curTick;
            timesIdled++;
        } else {
            tickEvent.schedule(curTick + cycles(1));
+            DPRINTF(O3CPU, "Scheduling next tick!\n");
        }
    }

@@ -512,6 +523,8 @@ FullO3CPU<Impl>::activateThread(unsigned tid)
    list<unsigned>::iterator isActive = find(
        activeThreads.begin(), activeThreads.end(), tid);

+    DPRINTF(O3CPU, "[tid:%i]: Calling activate thread.\n", tid);
+
    if (isActive == activeThreads.end()) {
        DPRINTF(O3CPU, "[tid:%i]: Adding to active threads list\n",
                tid);
@@ -528,6 +541,8 @@ FullO3CPU<Impl>::deactivateThread(unsigned tid)
    list<unsigned>::iterator thread_it =
        find(activeThreads.begin(), activeThreads.end(), tid);

+    DPRINTF(O3CPU, "[tid:%i]: Calling deactivate thread.\n", tid);
+
    if (thread_it != activeThreads.end()) {
        DPRINTF(O3CPU,"[tid:%i]: Removing from active threads list\n",
                tid);
@@ -548,7 +563,7 @@ FullO3CPU<Impl>::activateContext(int tid, int delay)
        activateThread(tid);
    }

-    if(lastActivatedCycle < curTick) {
+    if (lastActivatedCycle < curTick) {
        scheduleTickEvent(delay);

        // Be sure to signal that there's some activity so the CPU doesn't
@@ -563,17 +578,20 @@ FullO3CPU<Impl>::activateContext(int tid, int delay)
 }

 template <class Impl>
-void
-FullO3CPU<Impl>::deallocateContext(int tid, int delay)
+bool
+FullO3CPU<Impl>::deallocateContext(int tid, bool remove, int delay)
 {
    // Schedule removal of thread data from CPU
    if (delay){
        DPRINTF(O3CPU, "[tid:%i]: Scheduling thread context to deallocate "
                "on cycle %d\n", tid, curTick + cycles(delay));
-        scheduleDeallocateContextEvent(tid, delay);
+        scheduleDeallocateContextEvent(tid, remove, delay);
+        return false;
    } else {
        deactivateThread(tid);
-        removeThread(tid);
+        if (remove)
+            removeThread(tid);
+        return true;
    }
 }

@@ -582,8 +600,9 @@ void
 FullO3CPU<Impl>::suspendContext(int tid)
 {
    DPRINTF(O3CPU,"[tid: %i]: Suspending Thread Context.\n", tid);
-    deactivateThread(tid);
-    if (activeThreads.size() == 0)
+    bool deallocated = deallocateContext(tid, false, 1);
+    // If this was the last thread then unschedule the tick event.
+    if ((activeThreads.size() == 1 && !deallocated) || activeThreads.size() == 0)
        unscheduleTickEvent();
    _status = Idle;
 }
@@ -594,7 +613,7 @@ FullO3CPU<Impl>::haltContext(int tid)
 {
    //For now, this is the same as deallocate
    DPRINTF(O3CPU,"[tid:%i]: Halt Context called. Deallocating", tid);
-    deallocateContext(tid, 1);
+    deallocateContext(tid, true, 1);
 }

 template <class Impl>
@@ -682,10 +701,17 @@ FullO3CPU<Impl>::removeThread(unsigned tid)
    assert(iew.ldstQueue.getCount(tid) == 0);

    // Reset ROB/IQ/LSQ Entries
+
+    // Commented out for now.  This should be possible to do by
+    // telling all the pipeline stages to drain first, and then
+    // checking until the drain completes.  Once the pipeline is
+    // drained, call resetEntries(). - 10-09-06 ktlim
+/*
    if (activeThreads.size() >= 1) {
        commit.rob->resetEntries();
        iew.resetEntries();
    }
+*/
 }


@@ -824,7 +850,9 @@ template <class Impl>
 void
 FullO3CPU<Impl>::resume()
 {
+#if FULL_SYSTEM
    assert(system->getMemoryMode() == System::Timing);
+#endif
    fetch.resume();
    decode.resume();
    rename.resume();
@@ -935,6 +963,25 @@ FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
    }
    if (!tickEvent.scheduled())
        tickEvent.schedule(curTick);
+
+    Port *peer;
+    Port *icachePort = fetch.getIcachePort();
+    if (icachePort->getPeer() == NULL) {
+        peer = oldCPU->getPort("icache_port")->getPeer();
+        icachePort->setPeer(peer);
+    } else {
+        peer = icachePort->getPeer();
+    }
+    peer->setPeer(icachePort);
+
+    Port *dcachePort = iew.getDcachePort();
+    if (dcachePort->getPeer() == NULL) {
+        peer = oldCPU->getPort("dcache_port")->getPeer();
+        dcachePort->setPeer(peer);
+    } else {
+        peer = dcachePort->getPeer();
+    }
+    peer->setPeer(dcachePort);
 }

 template <class Impl>
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@@ -202,9 +202,12 @@ class FullO3CPU : public BaseO3CPU
    class DeallocateContextEvent : public Event
    {
      private:
-        /** Number of Thread to Activate */
+        /** Number of Thread to deactivate */
        int tid;

+        /** Should the thread be removed from the CPU? */
+        bool remove;
+
        /** Pointer to the CPU. */
        FullO3CPU<Impl> *cpu;

@@ -218,12 +221,15 @@ class FullO3CPU : public BaseO3CPU
-        /** Processes the event, calling activateThread() on the CPU. */
+        /** Processes the event, calling deactivateThread() on the CPU. */
        void process();

+        /** Sets whether the thread should also be removed from the CPU. */
+        void setRemove(bool _remove) { remove = _remove; }
+
        /** Returns the description of the event. */
        const char *description();
    };

    /** Schedule cpu to deallocate thread context.*/
-    void scheduleDeallocateContextEvent(int tid, int delay)
+    void scheduleDeallocateContextEvent(int tid, bool remove, int delay)
    {
-        // Schedule thread to activate, regardless of its current state.
+        // Schedule thread to deallocate, regardless of its current state.
        if (deallocateContextEvent[tid].squashed())
@@ -296,9 +302,9 @@ class FullO3CPU : public BaseO3CPU
    void suspendContext(int tid);

    /** Remove Thread from Active Threads List &&
-     *  Remove Thread Context from CPU.
+     *  Possibly Remove Thread Context from CPU.
     */
-    void deallocateContext(int tid, int delay = 1);
+    bool deallocateContext(int tid, bool remove, int delay = 1);

    /** Remove Thread from Active Threads List &&
     *  Remove Thread Context from CPU.
@@ -626,11 +632,6 @@ class FullO3CPU : public BaseO3CPU
    /** Pointers to all of the threads in the CPU. */
    std::vector<Thread *> thread;

-    /** Pointer to the icache interface. */
-    MemInterface *icacheInterface;
-    /** Pointer to the dcache interface. */
-    MemInterface *dcacheInterface;
-
    /** Whether or not the CPU should defer its registration. */
    bool deferRegistration;

--- a/src/cpu/o3/fetch.hh
+++ b/src/cpu/o3/fetch.hh
@@ -96,7 +96,7 @@ class DefaultFetch
        /** Returns the address ranges of this device. */
        virtual void getDeviceAddressRanges(AddrRangeList &resp,
                                            AddrRangeList &snoop)
-        { resp.clear(); snoop.clear(); }
+        { resp.clear(); snoop.clear(); snoop.push_back(RangeSize(0,-1)); }

        /** Timing version of receive.  Handles setting fetch to the
         * proper status to start fetching. */
--- a/src/cpu/o3/fetch_impl.hh
+++ b/src/cpu/o3/fetch_impl.hh
@@ -63,7 +63,7 @@ template<class Impl>
 void
 DefaultFetch<Impl>::IcachePort::recvFunctional(PacketPtr pkt)
 {
-    panic("DefaultFetch doesn't expect recvFunctional callback!");
+    warn("Default fetch doesn't update it's state from a functional call.");
 }

 template<class Impl>
@@ -599,7 +599,7 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
    if (fault == NoFault) {
 #if 0
        if (cpu->system->memctrl->badaddr(memReq[tid]->paddr) ||
-            memReq[tid]->flags & UNCACHEABLE) {
+            memReq[tid]->isUncacheable()) {
            DPRINTF(Fetch, "Fetch: Bad address %#x (hopefully on a "
                    "misspeculating path)!",
                    memReq[tid]->paddr);
@@ -623,6 +623,11 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
        // Now do the timing access to see whether or not the instruction
        // exists within the cache.
        if (!icachePort->sendTiming(data_pkt)) {
+            if (data_pkt->result == Packet::BadAddress) {
+                fault = TheISA::genMachineCheckFault();
+                delete mem_req;
+                memReq[tid] = NULL;
+            }
            assert(retryPkt == NULL);
            assert(retryTid == -1);
            DPRINTF(Fetch, "[tid:%i] Out of MSHRs!\n", tid);
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@@ -600,6 +600,11 @@ template<class Impl>
 void
 DefaultIEW<Impl>::instToCommit(DynInstPtr &inst)
 {
+    // This function should not be called after writebackInsts in a
+    // single cycle.  That will cause problems with an instruction
+    // being added to the queue to commit without being processed by
+    // writebackInsts prior to being sent to commit.
+
    // First check the time slot that this instruction will write
    // to.  If there are free write ports at the time, then go ahead
    // and write the instruction to that time.  If there are not,
@@ -1286,6 +1291,7 @@ DefaultIEW<Impl>::executeInsts()
                } else if (fault != NoFault) {
                    // If the instruction faulted, then we need to send it along to commit
                    // without the instruction completing.
+                    DPRINTF(IEW, "Store has fault! [sn:%lli]\n", inst->seqNum);

                    // Send this instruction to commit, also make sure iew stage
                    // realizes there is activity.
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -311,7 +311,7 @@ class LSQ {
        /** Returns the address ranges of this device. */
        virtual void getDeviceAddressRanges(AddrRangeList &resp,
                                            AddrRangeList &snoop)
-        { resp.clear(); snoop.clear(); }
+        { resp.clear(); snoop.clear(); snoop.push_back(RangeSize(0,-1)); }

        /** Timing version of receive.  Handles writing back and
         * completing the load or store that has returned from
--- a/src/cpu/o3/lsq_impl.hh
+++ b/src/cpu/o3/lsq_impl.hh
@@ -46,7 +46,7 @@ template <class Impl>
 void
 LSQ<Impl>::DcachePort::recvFunctional(PacketPtr pkt)
 {
-    panic("O3CPU doesn't expect recvFunctional callback!");
+    warn("O3CPU doesn't update things on a recvFunctional.");
 }

 template <class Impl>
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -492,7 +492,7 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
    // A bit of a hackish way to get uncached accesses to work only if they're
    // at the head of the LSQ and are ready to commit (at the head of the ROB
    // too).
-    if (req->getFlags() & UNCACHEABLE &&
+    if (req->isUncacheable() &&
        (load_idx != loadHead || !load_inst->isAtCommit())) {
        iewStage->rescheduleMemInst(load_inst);
        ++lsqRescheduledLoads;
@@ -509,7 +509,7 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
            load_idx, store_idx, storeHead, req->getPaddr());

 #if FULL_SYSTEM
-    if (req->getFlags() & LOCKED) {
+    if (req->isLocked()) {
        cpu->lockAddr = req->getPaddr();
        cpu->lockFlag = true;
    }
@@ -626,18 +626,30 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)

    ++usedPorts;

-    PacketPtr data_pkt = new Packet(req, Packet::ReadReq, Packet::Broadcast);
-    data_pkt->dataStatic(load_inst->memData);
-
-    LSQSenderState *state = new LSQSenderState;
-    state->isLoad = true;
-    state->idx = load_idx;
-    state->inst = load_inst;
-    data_pkt->senderState = state;
-
    // if we the cache is not blocked, do cache access
    if (!lsq->cacheBlocked()) {
+        PacketPtr data_pkt =
+            new Packet(req, Packet::ReadReq, Packet::Broadcast);
+        data_pkt->dataStatic(load_inst->memData);
+
+        LSQSenderState *state = new LSQSenderState;
+        state->isLoad = true;
+        state->idx = load_idx;
+        state->inst = load_inst;
+        data_pkt->senderState = state;
+
        if (!dcachePort->sendTiming(data_pkt)) {
+            Packet::Result result = data_pkt->result;
+
+            // Delete state and data packet because a load retry
+            // initiates a pipeline restart; it does not retry.
+            delete state;
+            delete data_pkt;
+
+            if (result == Packet::BadAddress) {
+                return TheISA::genMachineCheckFault();
+            }
+
            // If the access didn't succeed, tell the LSQ by setting
            // the retry thread id.
            lsq->setRetryTid(lsqID);
@@ -664,16 +676,6 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
        return NoFault;
    }

-    if (data_pkt->result != Packet::Success) {
-        DPRINTF(LSQUnit, "LSQUnit: D-cache miss!\n");
-        DPRINTF(Activity, "Activity: ld accessing mem miss [sn:%lli]\n",
-                load_inst->seqNum);
-    } else {
-        DPRINTF(LSQUnit, "LSQUnit: D-cache hit!\n");
-        DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n",
-                load_inst->seqNum);
-    }
-
    return NoFault;
 }

--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -416,7 +416,7 @@ LSQUnit<Impl>::executeLoad(DynInstPtr &inst)
        // realizes there is activity.
        // Mark it as executed unless it is an uncached load that
        // needs to hit the head of commit.
-        if (!(inst->req->getFlags() & UNCACHEABLE) || inst->isAtCommit()) {
+        if (!(inst->req->isUncacheable()) || inst->isAtCommit()) {
            inst->setExecuted();
        }
        iewStage->instToCommit(inst);
@@ -608,21 +608,30 @@ LSQUnit<Impl>::writebackStores()

        DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%#x "
                "to Addr:%#x, data:%#x [sn:%lli]\n",
-                storeWBIdx, storeQueue[storeWBIdx].inst->readPC(),
+                storeWBIdx, inst->readPC(),
                req->getPaddr(), *(inst->memData),
-                storeQueue[storeWBIdx].inst->seqNum);
+                inst->seqNum);

        // @todo: Remove this SC hack once the memory system handles it.
-        if (req->getFlags() & LOCKED) {
-            if (req->getFlags() & UNCACHEABLE) {
+        if (req->isLocked()) {
+            if (req->isUncacheable()) {
                req->setScResult(2);
            } else {
                if (cpu->lockFlag) {
                    req->setScResult(1);
+                    DPRINTF(LSQUnit, "Store conditional [sn:%lli] "
+                            "succeeded.\n", inst->seqNum);
                } else {
                    req->setScResult(0);
                    // Hack: Instantly complete this store.
-                    completeDataAccess(data_pkt);
+//                    completeDataAccess(data_pkt);
+                    DPRINTF(LSQUnit, "Store conditional [sn:%lli] failed.  "
+                            "Instantly completing it.\n",
+                            inst->seqNum);
+                    WritebackEvent *wb = new WritebackEvent(inst, data_pkt, this);
+                    wb->schedule(curTick + 1);
+                    delete state;
+                    completeStore(storeWBIdx);
                    incrStIdx(storeWBIdx);
                    continue;
                }
@@ -633,7 +642,13 @@ LSQUnit<Impl>::writebackStores()
        }

        if (!dcachePort->sendTiming(data_pkt)) {
+            if (data_pkt->result == Packet::BadAddress) {
+                panic("LSQ sent out a bad address for a completed store!");
+            }
            // Need to handle becoming blocked on a store.
+            DPRINTF(IEW, "D-Cache became blocked when writing [sn:%lli], will"
+                    " retry later\n",
+                    inst->seqNum);
            isStoreBlocked = true;
            ++lsqCacheBlocked;
            assert(retryPkt == NULL);
@@ -880,6 +895,9 @@ LSQUnit<Impl>::recvRetry()
        assert(retryPkt != NULL);

        if (dcachePort->sendTiming(retryPkt)) {
+            if (retryPkt->result == Packet::BadAddress) {
+                panic("LSQ sent out a bad address for a completed store!");
+            }
            storePostSend(retryPkt);
            retryPkt = NULL;
            isStoreBlocked = false;
--- a/src/cpu/o3/thread_context_impl.hh
+++ b/src/cpu/o3/thread_context_impl.hh
@@ -165,14 +165,14 @@ template <class Impl>
 void
 O3ThreadContext<Impl>::deallocate(int delay)
 {
-    DPRINTF(O3CPU, "Calling deallocate on Thread Context %d\n",
-            getThreadNum());
+    DPRINTF(O3CPU, "Calling deallocate on Thread Context %d delay %d\n",
+            getThreadNum(), delay);

    if (thread->status() == ThreadContext::Unallocated)
        return;

    thread->setStatus(ThreadContext::Unallocated);
-    cpu->deallocateContext(thread->readTid(), delay);
+    cpu->deallocateContext(thread->readTid(), true, delay);
 }

 template <class Impl>
--- a/src/cpu/ozone/back_end.hh
+++ b/src/cpu/ozone/back_end.hh
@@ -493,7 +493,7 @@ BackEnd<Impl>::read(RequestPtr req, T &data, int load_idx)
    }
 */
 /*
-    if (!dcacheInterface && (memReq->flags & UNCACHEABLE))
+    if (!dcacheInterface && (memReq->isUncacheable()))
        recordEvent("Uncached Read");
 */
    return LSQ.read(req, data, load_idx);
@@ -534,7 +534,7 @@ BackEnd<Impl>::write(RequestPtr req, T &data, int store_idx)
        *res = memReq->result;
        */
 /*
-    if (!dcacheInterface && (memReq->flags & UNCACHEABLE))
+    if (!dcacheInterface && (memReq->isUncacheable()))
        recordEvent("Uncached Write");
 */
    return LSQ.write(req, data, store_idx);
--- a/src/cpu/ozone/back_end_impl.hh
+++ b/src/cpu/ozone/back_end_impl.hh
@@ -1256,7 +1256,7 @@ BackEnd<Impl>::executeInsts()

 //                ++iewExecStoreInsts;

-                if (!(inst->req->flags & LOCKED)) {
+                if (!(inst->req->isLocked())) {
                    inst->setExecuted();

                    instToCommit(inst);
--- a/src/cpu/ozone/cpu.hh
+++ b/src/cpu/ozone/cpu.hh
@@ -455,12 +455,12 @@ class OzoneCPU : public BaseCPU
    {
 #if 0
 #if FULL_SYSTEM && defined(TARGET_ALPHA)
-        if (req->flags & LOCKED) {
+        if (req->isLocked()) {
            req->xc->setMiscReg(TheISA::Lock_Addr_DepTag, req->paddr);
            req->xc->setMiscReg(TheISA::Lock_Flag_DepTag, true);
        }
 #endif
-        if (req->flags & LOCKED) {
+        if (req->isLocked()) {
            lockAddrList.insert(req->paddr);
            lockFlag = true;
        }
@@ -489,10 +489,10 @@ class OzoneCPU : public BaseCPU
        ExecContext *xc;

        // If this is a store conditional, act appropriately
-        if (req->flags & LOCKED) {
+        if (req->isLocked()) {
            xc = req->xc;

-            if (req->flags & UNCACHEABLE) {
+            if (req->isUncacheable()) {
                // Don't update result register (see stq_c in isa_desc)
                req->result = 2;
                xc->setStCondFailures(0);//Needed? [RGD]
@@ -532,8 +532,8 @@ class OzoneCPU : public BaseCPU

 #endif

-        if (req->flags & LOCKED) {
-            if (req->flags & UNCACHEABLE) {
+        if (req->isLocked()) {
+            if (req->isUncacheable()) {
                req->result = 2;
            } else {
                if (this->lockFlag) {
--- a/src/cpu/ozone/front_end.hh
+++ b/src/cpu/ozone/front_end.hh
@@ -92,7 +92,7 @@ class FrontEnd
        /** Returns the address ranges of this device. */
        virtual void getDeviceAddressRanges(AddrRangeList &resp,
                                            AddrRangeList &snoop)
-        { resp.clear(); snoop.clear(); }
+        { resp.clear(); snoop.clear(); snoop.push_back(RangeSize(0,-1)); }

        /** Timing version of receive.  Handles setting fetch to the
         * proper status to start fetching. */
--- a/src/cpu/ozone/front_end_impl.hh
+++ b/src/cpu/ozone/front_end_impl.hh
@@ -59,7 +59,7 @@ template<class Impl>
 void
 FrontEnd<Impl>::IcachePort::recvFunctional(PacketPtr pkt)
 {
-    panic("FrontEnd doesn't expect recvFunctional callback!");
+    warn("FrontEnd doesn't update state from functional calls");
 }

 template<class Impl>
@@ -493,7 +493,7 @@ FrontEnd<Impl>::fetchCacheLine()
    if (fault == NoFault) {
 #if 0
        if (cpu->system->memctrl->badaddr(memReq->paddr) ||
-            memReq->flags & UNCACHEABLE) {
+            memReq->isUncacheable()) {
            DPRINTF(FE, "Fetch: Bad address %#x (hopefully on a "
                    "misspeculating path!",
                    memReq->paddr);
--- a/src/cpu/ozone/inorder_back_end.hh
+++ b/src/cpu/ozone/inorder_back_end.hh
@@ -231,7 +231,7 @@ InorderBackEnd<Impl>::read(Addr addr, T &data, unsigned flags)
        }
    }
 /*
-    if (!dcacheInterface && (memReq->flags & UNCACHEABLE))
+    if (!dcacheInterface && (memReq->isUncacheable()))
        recordEvent("Uncached Read");
 */
    return fault;
@@ -243,7 +243,7 @@ Fault
 InorderBackEnd<Impl>::read(MemReqPtr &req, T &data)
 {
 #if FULL_SYSTEM && defined(TARGET_ALPHA)
-    if (req->flags & LOCKED) {
+    if (req->isLocked()) {
        req->xc->setMiscReg(TheISA::Lock_Addr_DepTag, req->paddr);
        req->xc->setMiscReg(TheISA::Lock_Flag_DepTag, true);
    }
@@ -291,7 +291,7 @@ InorderBackEnd<Impl>::write(T data, Addr addr, unsigned flags, uint64_t *res)
    if (res && (fault == NoFault))
        *res = memReq->result;
 /*
-    if (!dcacheInterface && (memReq->flags & UNCACHEABLE))
+    if (!dcacheInterface && (memReq->isUncacheable()))
        recordEvent("Uncached Write");
 */
    return fault;
@@ -306,10 +306,10 @@ InorderBackEnd<Impl>::write(MemReqPtr &req, T &data)
    ExecContext *xc;

    // If this is a store conditional, act appropriately
-    if (req->flags & LOCKED) {
+    if (req->isLocked()) {
        xc = req->xc;

-        if (req->flags & UNCACHEABLE) {
+        if (req->isUncacheable()) {
            // Don't update result register (see stq_c in isa_desc)
            req->result = 2;
            xc->setStCondFailures(0);//Needed? [RGD]
@@ -391,7 +391,7 @@ InorderBackEnd<Impl>::read(MemReqPtr &req, T &data, int load_idx)
    }

 /*
-    if (!dcacheInterface && (req->flags & UNCACHEABLE))
+    if (!dcacheInterface && (req->isUncacheable()))
        recordEvent("Uncached Read");
 */
    return NoFault;
@@ -455,8 +455,8 @@ InorderBackEnd<Impl>::write(MemReqPtr &req, T &data, int store_idx)
        }
    }
 /*
-    if (req->flags & LOCKED) {
-        if (req->flags & UNCACHEABLE) {
+    if (req->isLocked()) {
+        if (req->isUncacheable()) {
            // Don't update result register (see stq_c in isa_desc)
            req->result = 2;
        } else {
@@ -469,7 +469,7 @@ InorderBackEnd<Impl>::write(MemReqPtr &req, T &data, int store_idx)
        *res = req->result;
        */
 /*
-    if (!dcacheInterface && (req->flags & UNCACHEABLE))
+    if (!dcacheInterface && (req->isUncacheable()))
        recordEvent("Uncached Write");
 */
    return NoFault;
--- a/src/cpu/ozone/lsq_unit.hh
+++ b/src/cpu/ozone/lsq_unit.hh
@@ -426,7 +426,7 @@ OzoneLSQ<Impl>::read(MemReqPtr &req, T &data, int load_idx)
    // at the head of the LSQ and are ready to commit (at the head of the ROB
    // too).
    // @todo: Fix uncached accesses.
-    if (req->flags & UNCACHEABLE &&
+    if (req->isUncacheable() &&
        (load_idx != loadHead || !loadQueue[load_idx]->readyToCommit())) {

        return TheISA::genMachineCheckFault();
--- a/src/cpu/ozone/lsq_unit_impl.hh
+++ b/src/cpu/ozone/lsq_unit_impl.hh
@@ -577,7 +577,7 @@ OzoneLSQ<Impl>::writebackStores()
            MemAccessResult result = dcacheInterface->access(req);

            //@todo temp fix for LL/SC (works fine for 1 CPU)
-            if (req->flags & LOCKED) {
+            if (req->isLocked()) {
                req->result=1;
                panic("LL/SC! oh no no support!!!");
            }
@@ -596,7 +596,7 @@ OzoneLSQ<Impl>::writebackStores()
                Event *wb = NULL;
 /*
                typename IEW::LdWritebackEvent *wb = NULL;
-                if (req->flags & LOCKED) {
+                if (req->isLocked()) {
                    // Stx_C does not generate a system port transaction.
                    req->result=0;
                    wb = new typename IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst,
@@ -630,7 +630,7 @@ OzoneLSQ<Impl>::writebackStores()
 //                DPRINTF(Activity, "Active st accessing mem hit [sn:%lli]\n",
 //                        storeQueue[storeWBIdx].inst->seqNum);

-                if (req->flags & LOCKED) {
+                if (req->isLocked()) {
                    // Stx_C does not generate a system port transaction.
                    req->result=1;
                    typename BackEnd::LdWritebackEvent *wb =
--- a/src/cpu/ozone/lw_lsq.hh
+++ b/src/cpu/ozone/lw_lsq.hh
@@ -260,7 +260,7 @@ class OzoneLWLSQ {

        virtual void getDeviceAddressRanges(AddrRangeList &resp,
                                            AddrRangeList &snoop)
-        { resp.clear(); snoop.clear(); }
+        { resp.clear(); snoop.clear(); snoop.push_back(RangeSize(0,-1)); }

        virtual bool recvTiming(PacketPtr pkt);

@@ -507,7 +507,7 @@ OzoneLWLSQ<Impl>::read(RequestPtr req, T &data, int load_idx)
    // at the head of the LSQ and are ready to commit (at the head of the ROB
    // too).
    // @todo: Fix uncached accesses.
-    if (req->getFlags() & UNCACHEABLE &&
+    if (req->isUncacheable() &&
        (inst != loadQueue.back() || !inst->isAtCommit())) {
        DPRINTF(OzoneLSQ, "[sn:%lli] Uncached load and not head of "
                "commit/LSQ!\n",
@@ -659,7 +659,7 @@ OzoneLWLSQ<Impl>::read(RequestPtr req, T &data, int load_idx)
        return NoFault;
    }

-    if (req->getFlags() & LOCKED) {
+    if (req->isLocked()) {
        cpu->lockFlag = true;
    }

--- a/src/cpu/ozone/lw_lsq_impl.hh
+++ b/src/cpu/ozone/lw_lsq_impl.hh
@@ -72,7 +72,7 @@ template <class Impl>
 void
 OzoneLWLSQ<Impl>::DcachePort::recvFunctional(PacketPtr pkt)
 {
-    panic("O3CPU doesn't expect recvFunctional callback!");
+    warn("O3CPU doesn't update things on a recvFunctional");
 }

 template <class Impl>
@@ -394,7 +394,7 @@ OzoneLWLSQ<Impl>::executeLoad(DynInstPtr &inst)
    // Actually probably want the oldest faulting load
    if (load_fault != NoFault) {
        DPRINTF(OzoneLSQ, "Load [sn:%lli] has a fault\n", inst->seqNum);
-        if (!(inst->req->getFlags() & UNCACHEABLE && !inst->isAtCommit())) {
+        if (!(inst->req->isUncacheable() && !inst->isAtCommit())) {
            inst->setExecuted();
        }
        // Maybe just set it as can commit here, although that might cause
@@ -605,8 +605,8 @@ OzoneLWLSQ<Impl>::writebackStores()
                inst->seqNum);

        // @todo: Remove this SC hack once the memory system handles it.
-        if (req->getFlags() & LOCKED) {
-            if (req->getFlags() & UNCACHEABLE) {
+        if (req->isLocked()) {
+            if (req->isUncacheable()) {
                req->setScResult(2);
            } else {
                if (cpu->lockFlag) {
@@ -663,7 +663,7 @@ OzoneLWLSQ<Impl>::writebackStores()
            if (result != MA_HIT && dcacheInterface->doEvents()) {
                store_event->miss = true;
                typename BackEnd::LdWritebackEvent *wb = NULL;
-                if (req->flags & LOCKED) {
+                if (req->isLocked()) {
                    wb = new typename BackEnd::LdWritebackEvent(inst,
                                                            be);
                    store_event->wbEvent = wb;
@@ -690,7 +690,7 @@ OzoneLWLSQ<Impl>::writebackStores()
 //                DPRINTF(Activity, "Active st accessing mem hit [sn:%lli]\n",
 //                        inst->seqNum);

-                if (req->flags & LOCKED) {
+                if (req->isLocked()) {
                    // Stx_C does not generate a system port
                    // transaction in the 21264, but that might be
                    // hard to accomplish in this model.
--- a/src/cpu/simple/atomic.cc
+++ b/src/cpu/simple/atomic.cc
@@ -28,6 +28,7 @@
 * Authors: Steve Reinhardt
 */

+#include "arch/locked_mem.hh"
 #include "arch/utility.hh"
 #include "cpu/exetrace.hh"
 #include "cpu/simple/atomic.hh"
@@ -93,7 +94,7 @@ AtomicSimpleCPU::init()
 bool
 AtomicSimpleCPU::CpuPort::recvTiming(Packet *pkt)
 {
-    panic("AtomicSimpleCPU doesn't expect recvAtomic callback!");
+    panic("AtomicSimpleCPU doesn't expect recvTiming callback!");
    return true;
 }

@@ -107,7 +108,8 @@ AtomicSimpleCPU::CpuPort::recvAtomic(Packet *pkt)
 void
 AtomicSimpleCPU::CpuPort::recvFunctional(Packet *pkt)
 {
-    panic("AtomicSimpleCPU doesn't expect recvFunctional callback!");
+    //No internal storage to update, just return
+    return;
 }

 void
@@ -133,20 +135,19 @@ AtomicSimpleCPU::AtomicSimpleCPU(Params *p)
 {
    _status = Idle;

-    // @todo fix me and get the real cpu id & thread number!!!
    ifetch_req = new Request();
-    ifetch_req->setThreadContext(0,0); //Need CPU/Thread IDS HERE
+    ifetch_req->setThreadContext(p->cpu_id, 0); // Add thread ID if we add MT
    ifetch_pkt = new Packet(ifetch_req, Packet::ReadReq, Packet::Broadcast);
    ifetch_pkt->dataStatic(&inst);

    data_read_req = new Request();
-    data_read_req->setThreadContext(0,0); //Need CPU/Thread IDS HERE
+    data_read_req->setThreadContext(p->cpu_id, 0); // Add thread ID here too
    data_read_pkt = new Packet(data_read_req, Packet::ReadReq,
                               Packet::Broadcast);
    data_read_pkt->dataStatic(&dataReg);

    data_write_req = new Request();
-    data_write_req->setThreadContext(0,0); //Need CPU/Thread IDS HERE
+    data_write_req->setThreadContext(p->cpu_id, 0); // Add thread ID here too
    data_write_pkt = new Packet(data_write_req, Packet::WriteReq,
                                Packet::Broadcast);
 }
@@ -161,9 +162,11 @@ AtomicSimpleCPU::serialize(ostream &os)
 {
    SimObject::State so_state = SimObject::getState();
    SERIALIZE_ENUM(so_state);
+    Status _status = status();
+    SERIALIZE_ENUM(_status);
+    BaseSimpleCPU::serialize(os);
    nameOut(os, csprintf("%s.tickEvent", name()));
    tickEvent.serialize(os);
-    BaseSimpleCPU::serialize(os);
 }

 void
@@ -171,8 +174,9 @@ AtomicSimpleCPU::unserialize(Checkpoint *cp, const string &section)
 {
    SimObject::State so_state;
    UNSERIALIZE_ENUM(so_state);
-    tickEvent.unserialize(cp, csprintf("%s.tickEvent", section));
+    UNSERIALIZE_ENUM(_status);
    BaseSimpleCPU::unserialize(cp, section);
+    tickEvent.unserialize(cp, csprintf("%s.tickEvent", section));
 }

 void
@@ -253,29 +257,36 @@ template <class T>
 Fault
 AtomicSimpleCPU::read(Addr addr, T &data, unsigned flags)
 {
-    data_read_req->setVirt(0, addr, sizeof(T), flags, thread->readPC());
+    // use the CPU's statically allocated read request and packet objects
+    Request *req = data_read_req;
+    Packet  *pkt = data_read_pkt;
+
+    req->setVirt(0, addr, sizeof(T), flags, thread->readPC());

    if (traceData) {
        traceData->setAddr(addr);
    }

    // translate to physical address
-    Fault fault = thread->translateDataReadReq(data_read_req);
+    Fault fault = thread->translateDataReadReq(req);

    // Now do the access.
    if (fault == NoFault) {
-        data_read_pkt->reinitFromRequest();
+        pkt->reinitFromRequest();

-        dcache_latency = dcachePort.sendAtomic(data_read_pkt);
+        dcache_latency = dcachePort.sendAtomic(pkt);
        dcache_access = true;

-        assert(data_read_pkt->result == Packet::Success);
-        data = data_read_pkt->get<T>();
+        assert(pkt->result == Packet::Success);
+        data = pkt->get<T>();

+        if (req->isLocked()) {
+            TheISA::handleLockedRead(thread, req);
+        }
    }

    // This will need a new way to tell if it has a dcache attached.
-    if (data_read_req->getFlags() & UNCACHEABLE)
+    if (req->isUncacheable())
        recordEvent("Uncached Read");

    return fault;
@@ -328,33 +339,52 @@ template <class T>
 Fault
 AtomicSimpleCPU::write(T data, Addr addr, unsigned flags, uint64_t *res)
 {
-    data_write_req->setVirt(0, addr, sizeof(T), flags, thread->readPC());
+    // use the CPU's statically allocated write request and packet objects
+    Request *req = data_write_req;
+    Packet  *pkt = data_write_pkt;
+
+    req->setVirt(0, addr, sizeof(T), flags, thread->readPC());

    if (traceData) {
        traceData->setAddr(addr);
    }

    // translate to physical address
-    Fault fault = thread->translateDataWriteReq(data_write_req);
+    Fault fault = thread->translateDataWriteReq(req);

    // Now do the access.
    if (fault == NoFault) {
-        data = htog(data);
-        data_write_pkt->reinitFromRequest();
-        data_write_pkt->dataStatic(&data);
+        bool do_access = true;  // flag to suppress cache access

-        dcache_latency = dcachePort.sendAtomic(data_write_pkt);
-        dcache_access = true;
+        if (req->isLocked()) {
+            do_access = TheISA::handleLockedWrite(thread, req);
+        }

-        assert(data_write_pkt->result == Packet::Success);
+        if (do_access) {
+            data = htog(data);
+            pkt->reinitFromRequest();
+            pkt->dataStatic(&data);

-        if (res && data_write_req->getFlags() & LOCKED) {
-            *res = data_write_req->getScResult();
+            dcache_latency = dcachePort.sendAtomic(pkt);
+            dcache_access = true;
+
+            assert(pkt->result == Packet::Success);
+        }
+
+        if (req->isLocked()) {
+            uint64_t scResult = req->getScResult();
+            if (scResult != 0) {
+                // clear failure counter
+                thread->setStCondFailures(0);
+            }
+            if (res) {
+                *res = req->getScResult();
+            }
        }
    }

    // This will need a new way to tell if it's hooked up to a cache or not.
-    if (data_write_req->getFlags() & UNCACHEABLE)
+    if (req->isUncacheable())
        recordEvent("Uncached Write");

    // If the write needs to have a fault on the access, consider calling
@@ -467,11 +497,11 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(AtomicSimpleCPU)
    Param<Tick> progress_interval;
    SimObjectParam<MemObject *> mem;
    SimObjectParam<System *> system;
+    Param<int> cpu_id;

 #if FULL_SYSTEM
    SimObjectParam<AlphaITB *> itb;
    SimObjectParam<AlphaDTB *> dtb;
-    Param<int> cpu_id;
    Param<Tick> profile;
 #else
    SimObjectParam<Process *> workload;
@@ -500,11 +530,11 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(AtomicSimpleCPU)
    INIT_PARAM(progress_interval, "Progress interval"),
    INIT_PARAM(mem, "memory"),
    INIT_PARAM(system, "system object"),
+    INIT_PARAM(cpu_id, "processor ID"),

 #if FULL_SYSTEM
    INIT_PARAM(itb, "Instruction TLB"),
    INIT_PARAM(dtb, "Data TLB"),
-    INIT_PARAM(cpu_id, "processor ID"),
    INIT_PARAM(profile, ""),
 #else
    INIT_PARAM(workload, "processes to run"),
@@ -538,11 +568,11 @@ CREATE_SIM_OBJECT(AtomicSimpleCPU)
    params->simulate_stalls = simulate_stalls;
    params->mem = mem;
    params->system = system;
+    params->cpu_id = cpu_id;

 #if FULL_SYSTEM
    params->itb = itb;
    params->dtb = dtb;
-    params->cpu_id = cpu_id;
    params->profile = profile;
 #else
    params->process = workload;
--- a/src/cpu/simple/atomic.hh
+++ b/src/cpu/simple/atomic.hh
@@ -104,9 +104,9 @@ class AtomicSimpleCPU : public BaseSimpleCPU

        virtual void getDeviceAddressRanges(AddrRangeList &resp,
            AddrRangeList &snoop)
-        { resp.clear(); snoop.clear(); }
-    };
+        { resp.clear(); snoop.clear(); snoop.push_back(RangeSize(0,-1)); }

+    };
    CpuPort icachePort;
    CpuPort dcachePort;

--- a/src/cpu/simple/timing.cc
+++ b/src/cpu/simple/timing.cc
@@ -28,6 +28,7 @@
 * Authors: Steve Reinhardt
 */

+#include "arch/locked_mem.hh"
 #include "arch/utility.hh"
 #include "cpu/exetrace.hh"
 #include "cpu/simple/timing.hh"
@@ -73,7 +74,8 @@ TimingSimpleCPU::CpuPort::recvAtomic(Packet *pkt)
 void
 TimingSimpleCPU::CpuPort::recvFunctional(Packet *pkt)
 {
-    panic("TimingSimpleCPU doesn't expect recvFunctional callback!");
+    //No internal storage to update, just return
+    return;
 }

 void
@@ -94,12 +96,14 @@ TimingSimpleCPU::CpuPort::TickEvent::schedule(Packet *_pkt, Tick t)
 }

 TimingSimpleCPU::TimingSimpleCPU(Params *p)
-    : BaseSimpleCPU(p), icachePort(this, p->clock), dcachePort(this, p->clock)
+    : BaseSimpleCPU(p), icachePort(this, p->clock), dcachePort(this, p->clock),
+      cpu_id(p->cpu_id)
 {
    _status = Idle;
    ifetch_pkt = dcache_pkt = NULL;
    drainEvent = NULL;
    fetchEvent = NULL;
+    previousTick = 0;
    changeState(SimObject::Running);
 }

@@ -158,6 +162,7 @@ TimingSimpleCPU::resume()

    assert(system->getMemoryMode() == System::Timing);
    changeState(SimObject::Running);
+    previousTick = curTick;
 }

 void
@@ -165,6 +170,7 @@ TimingSimpleCPU::switchOut()
 {
    assert(status() == Running || status() == Idle);
    _status = SwitchedOut;
+    numCycles += curTick - previousTick;

    // If we've been scheduled to resume but are then told to switch out,
    // we'll need to cancel it.
@@ -187,6 +193,27 @@ TimingSimpleCPU::takeOverFrom(BaseCPU *oldCPU)
            break;
        }
    }
+
+    if (_status != Running) {
+        _status = Idle;
+    }
+
+    Port *peer;
+    if (icachePort.getPeer() == NULL) {
+        peer = oldCPU->getPort("icache_port")->getPeer();
+        icachePort.setPeer(peer);
+    } else {
+        peer = icachePort.getPeer();
+    }
+    peer->setPeer(&icachePort);
+
+    if (dcachePort.getPeer() == NULL) {
+        peer = oldCPU->getPort("dcache_port")->getPeer();
+        dcachePort.setPeer(peer);
+    } else {
+        peer = dcachePort.getPeer();
+    }
+    peer->setPeer(&dcachePort);
 }


@@ -227,35 +254,35 @@ template <class T>
 Fault
 TimingSimpleCPU::read(Addr addr, T &data, unsigned flags)
 {
-    // need to fill in CPU & thread IDs here
-    Request *data_read_req = new Request();
-    data_read_req->setThreadContext(0,0); //Need CPU/Thread IDS HERE
-    data_read_req->setVirt(0, addr, sizeof(T), flags, thread->readPC());
+    Request *req =
+        new Request(/* asid */ 0, addr, sizeof(T), flags, thread->readPC(),
+                    cpu_id, /* thread ID */ 0);

    if (traceData) {
-        traceData->setAddr(data_read_req->getVaddr());
+        traceData->setAddr(req->getVaddr());
    }

   // translate to physical address
-    Fault fault = thread->translateDataReadReq(data_read_req);
+    Fault fault = thread->translateDataReadReq(req);

    // Now do the access.
    if (fault == NoFault) {
-        Packet *data_read_pkt =
-            new Packet(data_read_req, Packet::ReadReq, Packet::Broadcast);
-        data_read_pkt->dataDynamic<T>(new T);
+        Packet *pkt =
+            new Packet(req, Packet::ReadReq, Packet::Broadcast);
+        pkt->dataDynamic<T>(new T);

-        if (!dcachePort.sendTiming(data_read_pkt)) {
+        if (!dcachePort.sendTiming(pkt)) {
            _status = DcacheRetry;
-            dcache_pkt = data_read_pkt;
+            dcache_pkt = pkt;
        } else {
            _status = DcacheWaitResponse;
+            // memory system takes ownership of packet
            dcache_pkt = NULL;
        }
    }

    // This will need a new way to tell if it has a dcache attached.
-    if (data_read_req->getFlags() & UNCACHEABLE)
+    if (req->isUncacheable())
        recordEvent("Uncached Read");

    return fault;
@@ -308,31 +335,39 @@ template <class T>
 Fault
 TimingSimpleCPU::write(T data, Addr addr, unsigned flags, uint64_t *res)
 {
-    // need to fill in CPU & thread IDs here
-    Request *data_write_req = new Request();
-    data_write_req->setThreadContext(0,0); //Need CPU/Thread IDS HERE
-    data_write_req->setVirt(0, addr, sizeof(T), flags, thread->readPC());
+    Request *req =
+        new Request(/* asid */ 0, addr, sizeof(T), flags, thread->readPC(),
+                    cpu_id, /* thread ID */ 0);

    // translate to physical address
-    Fault fault = thread->translateDataWriteReq(data_write_req);
+    Fault fault = thread->translateDataWriteReq(req);
+
    // Now do the access.
    if (fault == NoFault) {
-        Packet *data_write_pkt =
-            new Packet(data_write_req, Packet::WriteReq, Packet::Broadcast);
-        data_write_pkt->allocate();
-        data_write_pkt->set(data);
+        assert(dcache_pkt == NULL);
+        dcache_pkt = new Packet(req, Packet::WriteReq, Packet::Broadcast);
+        dcache_pkt->allocate();
+        dcache_pkt->set(data);

-        if (!dcachePort.sendTiming(data_write_pkt)) {
-            _status = DcacheRetry;
-            dcache_pkt = data_write_pkt;
-        } else {
-            _status = DcacheWaitResponse;
-            dcache_pkt = NULL;
+        bool do_access = true;  // flag to suppress cache access
+
+        if (req->isLocked()) {
+            do_access = TheISA::handleLockedWrite(thread, req);
+        }
+
+        if (do_access) {
+            if (!dcachePort.sendTiming(dcache_pkt)) {
+                _status = DcacheRetry;
+            } else {
+                _status = DcacheWaitResponse;
+                // memory system takes ownership of packet
+                dcache_pkt = NULL;
+            }
        }
    }

    // This will need a new way to tell if it's hooked up to a cache or not.
-    if (data_write_req->getFlags() & UNCACHEABLE)
+    if (req->isUncacheable())
        recordEvent("Uncached Write");

    // If the write needs to have a fault on the access, consider calling
@@ -392,9 +427,8 @@ TimingSimpleCPU::fetch()
 {
    checkForInterrupts();

-    // need to fill in CPU & thread IDs here
    Request *ifetch_req = new Request();
-    ifetch_req->setThreadContext(0,0); //Need CPU/Thread IDS HERE
+    ifetch_req->setThreadContext(cpu_id, /* thread ID */ 0);
    Fault fault = setupFetchRequest(ifetch_req);

    ifetch_pkt = new Packet(ifetch_req, Packet::ReadReq, Packet::Broadcast);
@@ -414,6 +448,9 @@ TimingSimpleCPU::fetch()
        // fetch fault: advance directly to next instruction (fault handler)
        advanceInst(fault);
    }
+
+    numCycles += curTick - previousTick;
+    previousTick = curTick;
 }


@@ -444,6 +481,9 @@ TimingSimpleCPU::completeIfetch(Packet *pkt)
    delete pkt->req;
    delete pkt;

+    numCycles += curTick - previousTick;
+    previousTick = curTick;
+
    if (getState() == SimObject::Draining) {
        completeDrain();
        return;
@@ -453,12 +493,20 @@ TimingSimpleCPU::completeIfetch(Packet *pkt)
    if (curStaticInst->isMemRef() && !curStaticInst->isDataPrefetch()) {
        // load or store: just send to dcache
        Fault fault = curStaticInst->initiateAcc(this, traceData);
-        if (fault == NoFault) {
-            // successfully initiated access: instruction will
-            // complete in dcache response callback
-            assert(_status == DcacheWaitResponse);
+        if (_status != Running) {
+            // instruction will complete in dcache response callback
+            assert(_status == DcacheWaitResponse || _status == DcacheRetry);
+            assert(fault == NoFault);
        } else {
-            // fault: complete now to invoke fault handler
+            if (fault == NoFault) {
+                // early fail on store conditional: complete now
+                assert(dcache_pkt != NULL);
+                fault = curStaticInst->completeAcc(dcache_pkt, this,
+                                                   traceData);
+                delete dcache_pkt->req;
+                delete dcache_pkt;
+                dcache_pkt = NULL;
+            }
            postExecute();
            advanceInst(fault);
        }
@@ -479,8 +527,7 @@ TimingSimpleCPU::IcachePort::ITickEvent::process()
 bool
 TimingSimpleCPU::IcachePort::recvTiming(Packet *pkt)
 {
-    // These next few lines could be replaced with something faster
-    // who knows what though
+    // delay processing of returned data until next CPU clock edge
    Tick time = pkt->req->getTime();
    while (time < curTick)
        time += lat;
@@ -516,21 +563,27 @@ TimingSimpleCPU::completeDataAccess(Packet *pkt)
    assert(_status == DcacheWaitResponse);
    _status = Running;

-    if (getState() == SimObject::Draining) {
-        completeDrain();
-
-        delete pkt->req;
-        delete pkt;
-
-        return;
-    }
+    numCycles += curTick - previousTick;
+    previousTick = curTick;

    Fault fault = curStaticInst->completeAcc(pkt, this, traceData);

+    if (pkt->isRead() && pkt->req->isLocked()) {
+        TheISA::handleLockedRead(thread, pkt->req);
+    }
+
    delete pkt->req;
    delete pkt;

    postExecute();
+
+    if (getState() == SimObject::Draining) {
+        advancePC(fault);
+        completeDrain();
+
+        return;
+    }
+
    advanceInst(fault);
 }

@@ -546,6 +599,7 @@ TimingSimpleCPU::completeDrain()
 bool
 TimingSimpleCPU::DcachePort::recvTiming(Packet *pkt)
 {
+    // delay processing of returned data until next CPU clock edge
    Tick time = pkt->req->getTime();
    while (time < curTick)
        time += lat;
@@ -574,6 +628,7 @@ TimingSimpleCPU::DcachePort::recvRetry()
    Packet *tmp = cpu->dcache_pkt;
    if (sendTiming(tmp)) {
        cpu->_status = DcacheWaitResponse;
+        // memory system takes ownership of packet
        cpu->dcache_pkt = NULL;
    }
 }
@@ -592,11 +647,11 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(TimingSimpleCPU)
    Param<Tick> progress_interval;
    SimObjectParam<MemObject *> mem;
    SimObjectParam<System *> system;
+    Param<int> cpu_id;

 #if FULL_SYSTEM
    SimObjectParam<AlphaITB *> itb;
    SimObjectParam<AlphaDTB *> dtb;
-    Param<int> cpu_id;
    Param<Tick> profile;
 #else
    SimObjectParam<Process *> workload;
@@ -625,11 +680,11 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(TimingSimpleCPU)
    INIT_PARAM(progress_interval, "Progress interval"),
    INIT_PARAM(mem, "memory"),
    INIT_PARAM(system, "system object"),
+    INIT_PARAM(cpu_id, "processor ID"),

 #if FULL_SYSTEM
    INIT_PARAM(itb, "Instruction TLB"),
    INIT_PARAM(dtb, "Data TLB"),
-    INIT_PARAM(cpu_id, "processor ID"),
    INIT_PARAM(profile, ""),
 #else
    INIT_PARAM(workload, "processes to run"),
@@ -661,11 +716,11 @@ CREATE_SIM_OBJECT(TimingSimpleCPU)
    params->functionTraceStart = function_trace_start;
    params->mem = mem;
    params->system = system;
+    params->cpu_id = cpu_id;

 #if FULL_SYSTEM
    params->itb = itb;
    params->dtb = dtb;
-    params->cpu_id = cpu_id;
    params->profile = profile;
 #else
    params->process = workload;
--- a/src/cpu/simple/timing.hh
+++ b/src/cpu/simple/timing.hh
@@ -92,7 +92,7 @@ class TimingSimpleCPU : public BaseSimpleCPU

        virtual void getDeviceAddressRanges(AddrRangeList &resp,
            AddrRangeList &snoop)
-        { resp.clear(); snoop.clear(); }
+        { resp.clear(); snoop.clear(); snoop.push_back(RangeSize(0,-1)); }

        struct TickEvent : public Event
        {
@@ -166,6 +166,9 @@ class TimingSimpleCPU : public BaseSimpleCPU
    Packet *ifetch_pkt;
    Packet *dcache_pkt;

+    int cpu_id;
+    Tick previousTick;
+
  public:

    virtual Port *getPort(const std::string &if_name, int idx = -1);
--- a/src/cpu/simple_thread.hh
+++ b/src/cpu/simple_thread.hh
@@ -237,7 +237,7 @@ class SimpleThread : public ThreadState
    Fault read(RequestPtr &req, T &data)
    {
 #if FULL_SYSTEM && THE_ISA == ALPHA_ISA
-        if (req->flags & LOCKED) {
+        if (req->isLocked()) {
            req->xc->setMiscReg(TheISA::Lock_Addr_DepTag, req->paddr);
            req->xc->setMiscReg(TheISA::Lock_Flag_DepTag, true);
        }
@@ -256,10 +256,10 @@ class SimpleThread : public ThreadState
        ExecContext *xc;

        // If this is a store conditional, act appropriately
-        if (req->flags & LOCKED) {
+        if (req->isLocked()) {
            xc = req->xc;

-            if (req->flags & UNCACHEABLE) {
+            if (req->isUncacheable()) {
                // Don't update result register (see stq_c in isa_desc)
                req->result = 2;
                xc->setStCondFailures(0);//Needed? [RGD]
--- a/src/dev/ide_ctrl.cc
+++ b/src/dev/ide_ctrl.cc
@@ -742,7 +742,6 @@ IdeController::unserialize(Checkpoint *cp, const std::string &section)
    UNSERIALIZE_SCALAR(bm_enabled);
    UNSERIALIZE_ARRAY(cmd_in_progress,
                      sizeof(cmd_in_progress) / sizeof(cmd_in_progress[0]));
-    pioPort->sendStatusChange(Port::RangeChange);
 }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS
--- a/src/dev/pcidev.cc
+++ b/src/dev/pcidev.cc
@@ -302,6 +302,8 @@ PciDev::unserialize(Checkpoint *cp, const std::string &section)
    UNSERIALIZE_ARRAY(BARAddrs, sizeof(BARAddrs) / sizeof(BARAddrs[0]));
    UNSERIALIZE_ARRAY(config.data,
                      sizeof(config.data) / sizeof(config.data[0]));
+    pioPort->sendStatusChange(Port::RangeChange);
+
 }

 #ifndef DOXYGEN_SHOULD_SKIP_THIS
--- a/src/kern/tru64/tru64.hh
+++ b/src/kern/tru64/tru64.hh
@@ -532,16 +532,26 @@ class Tru64 : public OperatingSystem

        argp.copyIn(tc->getMemPort());

+        int stack_size =
+            gtoh(argp->rsize) + gtoh(argp->ysize) + gtoh(argp->gsize);
+
        // if the user chose an address, just let them have it.  Otherwise
        // pick one for them.
-        if (htog(argp->address) == 0) {
-            argp->address = htog(process->next_thread_stack_base);
-            int stack_size = (htog(argp->rsize) + htog(argp->ysize) +
-                    htog(argp->gsize));
+        Addr stack_base = gtoh(argp->address);
+
+        if (stack_base == 0) {
+            stack_base = process->next_thread_stack_base;
            process->next_thread_stack_base -= stack_size;
-            argp.copyOut(tc->getMemPort());
        }

+        stack_base = roundDown(stack_base, VMPageSize);
+
+        // map memory
+        process->pTable->allocate(stack_base, roundUp(stack_size, VMPageSize));
+
+        argp->address = gtoh(stack_base);
+        argp.copyOut(tc->getMemPort());
+
        return 0;
    }

@@ -577,7 +587,7 @@ class Tru64 : public OperatingSystem
            abort();
        }

-        const Addr base_addr = 0x12000; // was 0x3f0000000LL;
+        Addr base_addr = 0x12000; // was 0x3f0000000LL;
        Addr cur_addr = base_addr; // next addresses to use
        // first comes the config_info struct
        Addr config_addr = cur_addr;
@@ -603,8 +613,6 @@ class Tru64 : public OperatingSystem
        config->nxm_slot_state = htog(slot_state_addr);
        config->nxm_rad[0] = htog(rad_state_addr);

-        config.copyOut(tc->getMemPort());
-
        // initialize the slot_state array and copy it out
        TypedBufferArg<Tru64::nxm_slot_state_t> slot_state(slot_state_addr,
                                                           slot_state_size);
@@ -616,8 +624,6 @@ class Tru64 : public OperatingSystem
                (i == 0) ? Tru64::NXM_SLOT_BOUND : Tru64::NXM_SLOT_AVAIL;
        }

-        slot_state.copyOut(tc->getMemPort());
-
        // same for the per-RAD "shared" struct.  Note that we need to
        // allocate extra bytes for the per-VP array which is embedded at
        // the end.
@@ -650,17 +656,20 @@ class Tru64 : public OperatingSystem
            }
        }

-        rad_state.copyOut(tc->getMemPort());
-
        //
        // copy pointer to shared config area out to user
        //
        *configptr_ptr = htog(config_addr);
-        configptr_ptr.copyOut(tc->getMemPort());

        // Register this as a valid address range with the process
-        process->nxm_start = base_addr;
-        process->nxm_end = cur_addr;
+        base_addr = roundDown(base_addr, VMPageSize);
+        int size = cur_addr - base_addr;
+        process->pTable->allocate(base_addr, roundUp(size, VMPageSize));
+
+        config.copyOut(tc->getMemPort());
+        slot_state.copyOut(tc->getMemPort());
+        rad_state.copyOut(tc->getMemPort());
+        configptr_ptr.copyOut(tc->getMemPort());

        return 0;
    }
--- a/src/mem/bus.cc
+++ b/src/mem/bus.cc
@@ -61,12 +61,79 @@ Bus::getPort(const std::string &if_name, int idx)
 void
 Bus::init()
 {
-    std::vector<Port*>::iterator intIter;
+    std::vector<BusPort*>::iterator intIter;

    for (intIter = interfaces.begin(); intIter != interfaces.end(); intIter++)
        (*intIter)->sendStatusChange(Port::RangeChange);
 }

+Bus::BusFreeEvent::BusFreeEvent(Bus *_bus) : Event(&mainEventQueue), bus(_bus)
+{}
+
+void Bus::BusFreeEvent::process()
+{
+    bus->recvRetry(-1);
+}
+
+const char * Bus::BusFreeEvent::description()
+{
+    return "bus became available";
+}
+
+void Bus::occupyBus(PacketPtr pkt)
+{
+    //Bring tickNextIdle up to the present tick
+    //There is some potential ambiguity where a cycle starts, which might make
+    //a difference when devices are acting right around a cycle boundary. Using
+    //a < allows things which happen exactly on a cycle boundary to take up only
+    //the following cycle. Anything that happens later will have to "wait" for
+    //the end of that cycle, and then start using the bus after that.
+    while (tickNextIdle < curTick)
+        tickNextIdle += clock;
+
+    // The packet will be sent. Figure out how long it occupies the bus, and
+    // how much of that time is for the first "word", aka bus width.
+    int numCycles = 0;
+    // Requests need one cycle to send an address
+    if (pkt->isRequest())
+        numCycles++;
+    else if (pkt->isResponse() || pkt->hasData()) {
+        // If a packet has data, it needs ceil(size/width) cycles to send it
+        // We're using the "adding instead of dividing" trick again here
+        if (pkt->hasData()) {
+            int dataSize = pkt->getSize();
+            for (int transmitted = 0; transmitted < dataSize;
+                    transmitted += width) {
+                numCycles++;
+            }
+        } else {
+            // If the packet didn't have data, it must have been a response.
+            // Those use the bus for one cycle to send their data.
+            numCycles++;
+        }
+    }
+
+    // The first word will be delivered after the current tick, the delivery
+    // of the address if any, and one bus cycle to deliver the data
+    pkt->firstWordTime =
+        tickNextIdle +
+        (pkt->isRequest() ? clock : 0) +  // parens: ?: binds looser than +
+        clock;
+
+    //Advance it numCycles bus cycles.
+    //XXX Should this use the repeated addition trick as well?
+    tickNextIdle += (numCycles * clock);
+    if (!busIdle.scheduled()) {
+        busIdle.schedule(tickNextIdle);
+    } else {
+        busIdle.reschedule(tickNextIdle);
+    }
+    DPRINTF(Bus, "The bus is now occupied from tick %d to %d\n",
+            curTick, tickNextIdle);
+
+    // The bus will become idle once the current packet is delivered.
+    pkt->finishTime = tickNextIdle;
+}

 /** Function called by the port when the bus is receiving a Timing
 * transaction.*/
@@ -77,17 +144,40 @@ Bus::recvTiming(Packet *pkt)
    DPRINTF(Bus, "recvTiming: packet src %d dest %d addr 0x%x cmd %s\n",
            pkt->getSrc(), pkt->getDest(), pkt->getAddr(), pkt->cmdString());

+    BusPort *pktPort;
+    if (pkt->getSrc() == defaultId)
+        pktPort = defaultPort;
+    else pktPort = interfaces[pkt->getSrc()];
+
+    // If the bus is busy, or other devices are in line ahead of the current
+    // one, put this device on the retry list.
+    if (tickNextIdle > curTick ||
+            (retryList.size() && (!inRetry || pktPort != retryList.front()))) {
+        addToRetryList(pktPort);
+        return false;
+    }
+
    short dest = pkt->getDest();
    if (dest == Packet::Broadcast) {
-        if ( timingSnoopPhase1(pkt) )
-        {
-            timingSnoopPhase2(pkt);
+        if (timingSnoop(pkt)) {
+            pkt->flags |= SNOOP_COMMIT;
+            bool success = timingSnoop(pkt);
+            assert(success);
+            if (pkt->flags & SATISFIED) {
+                //Cache-Cache transfer occurring
+                if (inRetry) {
+                    retryList.front()->onRetryList(false);
+                    retryList.pop_front();
+                    inRetry = false;
+                }
+                occupyBus(pkt);
+                return true;
+            }
            port = findPort(pkt->getAddr(), pkt->getSrc());
-        }
-        else
-        {
+        } else {
            //Snoop didn't succeed
-            retryList.push_back(interfaces[pkt->getSrc()]);
+            DPRINTF(Bus, "Adding a retry to RETRY list %i\n", pktPort);
+            addToRetryList(pktPort);
            return false;
        }
    } else {
@@ -95,35 +185,60 @@ Bus::recvTiming(Packet *pkt)
        assert(dest != pkt->getSrc()); // catch infinite loops
        port = interfaces[dest];
    }
+
+    occupyBus(pkt);
+
    if (port->sendTiming(pkt))  {
-        // packet was successfully sent, just return true.
+        // Packet was successfully sent. Return true.
+        // Also take care of retries
+        if (inRetry) {
+            DPRINTF(Bus, "Remove retry from list %i\n", retryList.front());
+            retryList.front()->onRetryList(false);
+            retryList.pop_front();
+            inRetry = false;
+        }
        return true;
    }

-    // packet not successfully sent
-    retryList.push_back(interfaces[pkt->getSrc()]);
+    // Packet not successfully sent. Leave or put it on the retry list.
+    DPRINTF(Bus, "Adding a retry to RETRY list %i\n", pktPort);
+    addToRetryList(pktPort);
    return false;
 }

 void
 Bus::recvRetry(int id)
 {
-    // Go through all the elements on the list calling sendRetry on each
-    // This is not very efficient at all but it works. Ultimately we should end
-    // up with something that is more intelligent.
-    int initialSize = retryList.size();
-    int i;
-    Port *p;
+    DPRINTF(Bus, "Received a retry\n");
+    // If there's anything waiting, and the bus isn't busy...
+    if (retryList.size() && curTick >= tickNextIdle) {
+        //retryingPort = retryList.front();
+        inRetry = true;
+        DPRINTF(Bus, "Sending a retry\n");
+        retryList.front()->sendRetry();
+        // If inRetry is still true, sendTiming wasn't called
+        if (inRetry)
+        {
+            retryList.front()->onRetryList(false);
+            retryList.pop_front();
+            inRetry = false;

-    for (i = 0; i < initialSize; i++) {
-        assert(retryList.size() > 0);
-        p = retryList.front();
-        retryList.pop_front();
-        p->sendRetry();
+            //Bring tickNextIdle up to the present
+            while (tickNextIdle < curTick)
+                tickNextIdle += clock;
+
+            //Burn a cycle for the missed grant.
+            tickNextIdle += clock;
+
+            if (!busIdle.scheduled()) {
+                busIdle.schedule(tickNextIdle);
+            } else {
+                busIdle.reschedule(tickNextIdle);
+            }
+        }
    }
 }

-
 Port *
 Bus::findPort(Addr addr, int id)
 {
@@ -174,63 +289,59 @@ Bus::findSnoopPorts(Addr addr, int id)
            //Careful  to not overlap ranges
            //or snoop will be called more than once on the port
            ports.push_back(portSnoopList[i].portId);
-            DPRINTF(Bus, "  found snoop addr %#llx on device%d\n", addr,
-                    portSnoopList[i].portId);
+//            DPRINTF(Bus, "  found snoop addr %#llx on device%d\n", addr,
+//                    portSnoopList[i].portId);
        }
        i++;
    }
    return ports;
 }

-void
+Tick
 Bus::atomicSnoop(Packet *pkt)
+{
+    std::vector<int> ports = findSnoopPorts(pkt->getAddr(), pkt->getSrc());
+    Tick response_time = 0;
+
+    while (!ports.empty())
+    {
+        Tick response = interfaces[ports.back()]->sendAtomic(pkt);
+        if (response) {
+            assert(!response_time);  //Multiple responders
+            response_time = response;
+        }
+        ports.pop_back();
+    }
+    return response_time;
+}
+
+void
+Bus::functionalSnoop(Packet *pkt)
 {
    std::vector<int> ports = findSnoopPorts(pkt->getAddr(), pkt->getSrc());

    while (!ports.empty())
    {
-        interfaces[ports.back()]->sendAtomic(pkt);
+        interfaces[ports.back()]->sendFunctional(pkt);
        ports.pop_back();
    }
 }

 bool
-Bus::timingSnoopPhase1(Packet *pkt)
+Bus::timingSnoop(Packet *pkt)
 {
    std::vector<int> ports = findSnoopPorts(pkt->getAddr(), pkt->getSrc());
    bool success = true;

    while (!ports.empty() && success)
    {
-        snoopCallbacks.push_back(ports.back());
        success = interfaces[ports.back()]->sendTiming(pkt);
        ports.pop_back();
    }
-    if (!success)
-    {
-        while (!snoopCallbacks.empty())
-        {
-            interfaces[snoopCallbacks.back()]->sendStatusChange(Port::SnoopSquash);
-            snoopCallbacks.pop_back();
-        }
-        return false;
-    }
-    return true;
+
+    return success;
 }

-void
-Bus::timingSnoopPhase2(Packet *pkt)
-{
-    bool success;
-    pkt->flags |= SNOOP_COMMIT;
-    while (!snoopCallbacks.empty())
-    {
-        success = interfaces[snoopCallbacks.back()]->sendTiming(pkt);
-        //We should not fail on snoop callbacks
-        assert(success);
-        snoopCallbacks.pop_back();
-    }
-}

 /** Function called by the port when the bus is receiving a Atomic
 * transaction.*/
@@ -240,8 +351,11 @@ Bus::recvAtomic(Packet *pkt)
    DPRINTF(Bus, "recvAtomic: packet src %d dest %d addr 0x%x cmd %s\n",
            pkt->getSrc(), pkt->getDest(), pkt->getAddr(), pkt->cmdString());
    assert(pkt->getDest() == Packet::Broadcast);
-    atomicSnoop(pkt);
-    return findPort(pkt->getAddr(), pkt->getSrc())->sendAtomic(pkt);
+    Tick snoopTime = atomicSnoop(pkt);
+    if (snoopTime)
+        return snoopTime;  //Snoop satisfies it
+    else
+        return findPort(pkt->getAddr(), pkt->getSrc())->sendAtomic(pkt);
 }

 /** Function called by the port when the bus is receiving a Functional
@@ -252,6 +366,7 @@ Bus::recvFunctional(Packet *pkt)
    DPRINTF(Bus, "recvFunctional: packet src %d dest %d addr 0x%x cmd %s\n",
            pkt->getSrc(), pkt->getDest(), pkt->getAddr(), pkt->cmdString());
    assert(pkt->getDest() == Packet::Broadcast);
+    functionalSnoop(pkt);
    findPort(pkt->getAddr(), pkt->getSrc())->sendFunctional(pkt);
 }

@@ -280,7 +395,7 @@ Bus::recvStatusChange(Port::Status status, int id)
        }
    } else {

-        assert((id < interfaces.size() && id >= 0) || id == -1);
+        assert((id < interfaces.size() && id >= 0) || id == defaultId);
        Port *port = interfaces[id];
        std::vector<DevMap>::iterator portIter;
        std::vector<DevMap>::iterator snoopIter;
@@ -380,16 +495,20 @@ Bus::addressRanges(AddrRangeList &resp, AddrRangeList &snoop, int id)
 BEGIN_DECLARE_SIM_OBJECT_PARAMS(Bus)

    Param<int> bus_id;
+    Param<int> clock;
+    Param<int> width;

 END_DECLARE_SIM_OBJECT_PARAMS(Bus)

 BEGIN_INIT_SIM_OBJECT_PARAMS(Bus)
-    INIT_PARAM(bus_id, "a globally unique bus id")
+    INIT_PARAM(bus_id, "a globally unique bus id"),
+    INIT_PARAM(clock, "bus clock speed"),
+    INIT_PARAM(width, "width of the bus (bytes)") // bytes, as in bus.hh
 END_INIT_SIM_OBJECT_PARAMS(Bus)

 CREATE_SIM_OBJECT(Bus)
 {
-    return new Bus(getInstanceName(), bus_id);
+    return new Bus(getInstanceName(), bus_id, clock, width);
 }

 REGISTER_SIM_OBJECT("Bus", Bus)
--- a/src/mem/bus.hh
+++ b/src/mem/bus.hh
@@ -46,13 +46,20 @@
 #include "mem/packet.hh"
 #include "mem/port.hh"
 #include "mem/request.hh"
+#include "sim/eventq.hh"

 class Bus : public MemObject
 {
    /** a globally unique id for this bus. */
    int busId;
+    /** the clock period of the bus, in ticks */
+    int clock;
+    /** the width of the bus in bytes */
+    int width;
+    /** the next tick at which the bus will be idle */
+    Tick tickNextIdle;

-    static const int defaultId = -1;
+    static const int defaultId = -3; //Make it unique from Broadcast

    struct DevMap {
        int portId;
@@ -62,9 +69,6 @@ class Bus : public MemObject
    AddrRangeList defaultRange;
    std::vector<DevMap> portSnoopList;

-    std::vector<int> snoopCallbacks;
-
-
    /** Function called by the port when the bus is recieving a Timing
      transaction.*/
    bool recvTiming(Packet *pkt);
@@ -103,18 +107,16 @@ class Bus : public MemObject
    std::vector<int> findSnoopPorts(Addr addr, int id);

    /** Snoop all relevant ports atomicly. */
-    void atomicSnoop(Packet *pkt);
+    Tick atomicSnoop(Packet *pkt);

-    /** Snoop for NACK and Blocked in phase 1
+    /** Snoop all relevant ports functionally. */
+    void functionalSnoop(Packet *pkt);
+
+    /** Call snoop on caches, be sure to set SNOOP_COMMIT bit if you want
+     * the snoop to happen
     * @return True if succeds.
     */
-    bool timingSnoopPhase1(Packet *pkt);
-
-    /** @todo Don't need to commit all snoops just those that need it
-     *(register somehow). */
-    /** Commit all snoops now that we know if any of them would have blocked.
-     */
-    void timingSnoopPhase2(Packet *pkt);
+    bool timingSnoop(Packet *pkt);

    /** Process address range request.
     * @param resp addresses that we can respond to
@@ -123,11 +125,15 @@ class Bus : public MemObject
     */
    void addressRanges(AddrRangeList &resp, AddrRangeList &snoop, int id);

+    /** Occupy the bus with transmitting the packet pkt */
+    void occupyBus(PacketPtr pkt);

    /** Declaration of the buses port type, one will be instantiated for each
        of the interfaces connecting to the bus. */
    class BusPort : public Port
    {
+        bool _onRetryList;
+
        /** A pointer to the bus to which this port belongs. */
        Bus *bus;

@@ -138,9 +144,15 @@ class Bus : public MemObject

        /** Constructor for the BusPort.*/
        BusPort(const std::string &_name, Bus *_bus, int _id)
-            : Port(_name), bus(_bus), id(_id)
+            : Port(_name), _onRetryList(false), bus(_bus), id(_id)
        { }

+        bool onRetryList()
+        { return _onRetryList; }
+
+        void onRetryList(bool newVal)
+        { _onRetryList = newVal; }
+
      protected:

        /** When reciving a timing request from the peer port (at id),
@@ -181,16 +193,52 @@ class Bus : public MemObject

    };

+    class BusFreeEvent : public Event
+    {
+        Bus * bus;
+
+      public:
+        BusFreeEvent(Bus * _bus);
+        void process();
+        const char *description();
+    };
+
+    BusFreeEvent busIdle;
+
+    bool inRetry;
+
    /** An array of pointers to the peer port interfaces
        connected to this bus.*/
-    std::vector<Port*> interfaces;
+    std::vector<BusPort*> interfaces;

    /** An array of pointers to ports that retry should be called on because the
     * original send failed for whatever reason.*/
-    std::list<Port*> retryList;
+    std::list<BusPort*> retryList;
+
+    void addToRetryList(BusPort * port)
+    {
+        if (!inRetry) {
+            // The device wasn't retrying a packet, or wasn't at an appropriate
+            // time.
+            assert(!port->onRetryList());
+            port->onRetryList(true);
+            retryList.push_back(port);
+        } else {
+            if (port->onRetryList()) {
+                // The device was retrying a packet. It didn't work, so we'll leave
+                // it at the head of the retry list.
+                assert(port == retryList.front());
+                inRetry = false;
+            }
+            else {
+                port->onRetryList(true);
+                retryList.push_back(port);
+            }
+        }
+    }

    /** Port that handles requests that don't match any of the interfaces.*/
-    Port *defaultPort;
+    BusPort *defaultPort;

  public:

@@ -199,8 +247,16 @@ class Bus : public MemObject

    virtual void init();

-    Bus(const std::string &n, int bus_id)
-        : MemObject(n), busId(bus_id), defaultPort(NULL)  {}
+    Bus(const std::string &n, int bus_id, int _clock, int _width)
+        : MemObject(n), busId(bus_id), clock(_clock), width(_width),
+        tickNextIdle(0), busIdle(this), inRetry(false), defaultPort(NULL)
+    {
+        //Both the width and clock period must be positive
+        if (width <= 0)
+            fatal("Bus width must be positive\n");
+        if (clock <= 0)
+            fatal("Bus clock period must be positive\n");
+    }

 };

--- a/src/mem/cache/base_cache.cc
+++ b/src/mem/cache/base_cache.cc
@@ -44,6 +44,8 @@ BaseCache::CachePort::CachePort(const std::string &_name, BaseCache *_cache,
    : Port(_name), cache(_cache), isCpuSide(_isCpuSide)
 {
    blocked = false;
+    cshrRetry = NULL;
+    waitingOnRetry = false;
    //Start ports at null if more than one is created we should panic
    //cpuSidePort = NULL;
    //memSidePort = NULL;
@@ -71,7 +73,23 @@ BaseCache::CachePort::deviceBlockSize()
 bool
 BaseCache::CachePort::recvTiming(Packet *pkt)
 {
-    if (blocked)
+    if (isCpuSide
+        && !pkt->req->isUncacheable()
+        && pkt->isInvalidate()
+        && !pkt->isRead() && !pkt->isWrite()) {
+        //Upgrade or Invalidate
+        //Look into what happens if two slave caches on bus
+        DPRINTF(Cache, "%s %x ? blk_addr: %x\n", pkt->cmdString(),
+                pkt->getAddr() & (((ULL(1))<<48)-1),
+                pkt->getAddr() & ~((Addr)cache->blkSize - 1));
+
+        assert(!(pkt->flags & SATISFIED));
+        pkt->flags |= SATISFIED;
+        //Invalidates/Upgrades need no response if they get the bus
+        return true;
+    }
+
+    if (pkt->isRequest() && blocked)
    {
        DPRINTF(Cache,"Scheduling a retry while blocked\n");
        mustSendRetry = true;
@@ -96,16 +114,44 @@ void
 BaseCache::CachePort::recvRetry()
 {
    Packet *pkt;
+    assert(waitingOnRetry);
+    if (!drainList.empty()) {
+        DPRINTF(CachePort, "%s attempting to send a retry for response\n", name());
+        //We have some responses to drain first
+        if (sendTiming(drainList.front())) {
+            DPRINTF(CachePort, "%s sucessful in sending a retry for response\n", name());
+            drainList.pop_front();
+            if (!drainList.empty() ||
+                !isCpuSide && cache->doMasterRequest() ||
+                isCpuSide && cache->doSlaveRequest()) {

-    if (!isCpuSide)
+                DPRINTF(CachePort, "%s has more responses/requests\n", name());
+                BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this);
+                reqCpu->schedule(curTick + 1);
+            }
+            waitingOnRetry = false;
+        }
+    }
+    else if (!isCpuSide)
    {
+        DPRINTF(CachePort, "%s attempting to send a retry for MSHR\n", name());
+        if (!cache->doMasterRequest()) {
+            //This can happen if I am the owner of a block and see an upgrade
+            //while the block was in my WB Buffers.  I just remove the
+            //wb and de-assert the masterRequest
+            waitingOnRetry = false;
+            return;
+        }
        pkt = cache->getPacket();
+        MSHR* mshr = (MSHR*)pkt->senderState;
        bool success = sendTiming(pkt);
        DPRINTF(Cache, "Address %x was %s in sending the timing request\n",
                pkt->getAddr(), success ? "succesful" : "unsuccesful");
-        cache->sendResult(pkt, success);
+        cache->sendResult(pkt, mshr, success);
+        waitingOnRetry = !success;
        if (success && cache->doMasterRequest())
        {
+            DPRINTF(CachePort, "%s has more requests\n", name());
            //Still more to issue, rerequest in 1 cycle
            pkt = NULL;
            BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this);
@@ -114,17 +160,23 @@ BaseCache::CachePort::recvRetry()
    }
    else
    {
-        pkt = cache->getCoherencePacket();
+        assert(cshrRetry);
+        //pkt = cache->getCoherencePacket();
+        //We save the packet, no reordering on CSHRS
+        pkt = cshrRetry;
        bool success = sendTiming(pkt);
+        waitingOnRetry = !success;
        if (success && cache->doSlaveRequest())
        {
            //Still more to issue, rerequest in 1 cycle
            pkt = NULL;
            BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this);
            reqCpu->schedule(curTick + 1);
+            cshrRetry = NULL;
        }
-
    }
+    if (waitingOnRetry) DPRINTF(CachePort, "%s STILL Waiting on retry\n", name());
+    else DPRINTF(CachePort, "%s no longer waiting on retry\n", name());
    return;
 }
 void
@@ -169,16 +221,47 @@ BaseCache::CacheEvent::process()
 {
    if (!pkt)
    {
-        if (!cachePort->isCpuSide)
-        {
-            //MSHR
+        if (cachePort->waitingOnRetry) return;
+       //We have some responses to drain first
+        if (!cachePort->drainList.empty()) {
+            DPRINTF(CachePort, "%s trying to drain a response\n", cachePort->name());
+            if (cachePort->sendTiming(cachePort->drainList.front())) {
+                DPRINTF(CachePort, "%s drains a response succesfully\n", cachePort->name());
+                cachePort->drainList.pop_front();
+                if (!cachePort->drainList.empty() ||
+                    !cachePort->isCpuSide && cachePort->cache->doMasterRequest() ||
+                    cachePort->isCpuSide && cachePort->cache->doSlaveRequest()) {
+
+                    DPRINTF(CachePort, "%s still has outstanding bus reqs\n", cachePort->name());
+                    this->schedule(curTick + 1);
+                }
+            }
+            else {
+                cachePort->waitingOnRetry = true;
+                DPRINTF(CachePort, "%s now waiting on a retry\n", cachePort->name());
+            }
+        }
+        else if (!cachePort->isCpuSide)
+        {            //MSHR
+            DPRINTF(CachePort, "%s trying to send a MSHR request\n", cachePort->name());
+            if (!cachePort->cache->doMasterRequest()) {
+                //This can happen if I am the owner of a block and see an upgrade
+                //while the block was in my WB Buffers.  I just remove the
+                //wb and de-assert the masterRequest
+                return;
+            }
+
            pkt = cachePort->cache->getPacket();
+            MSHR* mshr = (MSHR*) pkt->senderState;
            bool success = cachePort->sendTiming(pkt);
            DPRINTF(Cache, "Address %x was %s in sending the timing request\n",
                    pkt->getAddr(), success ? "succesful" : "unsuccesful");
-            cachePort->cache->sendResult(pkt, success);
+            cachePort->cache->sendResult(pkt, mshr, success);
+            cachePort->waitingOnRetry = !success;
+            if (cachePort->waitingOnRetry) DPRINTF(CachePort, "%s now waiting on a retry\n", cachePort->name());
            if (success && cachePort->cache->doMasterRequest())
            {
+                DPRINTF(CachePort, "%s still more MSHR requests to send\n", cachePort->name());
                //Still more to issue, rerequest in 1 cycle
                pkt = NULL;
                this->schedule(curTick+1);
@@ -186,10 +269,16 @@ BaseCache::CacheEvent::process()
        }
        else
        {
+            assert(cachePort->cache->doSlaveRequest());
            //CSHR
            pkt = cachePort->cache->getCoherencePacket();
            bool success = cachePort->sendTiming(pkt);
-            if (success && cachePort->cache->doSlaveRequest())
+            if (!success) {
+                //Need to send on a retry
+                cachePort->cshrRetry = pkt;
+                cachePort->waitingOnRetry = true;
+            }
+            else if (cachePort->cache->doSlaveRequest())
            {
                //Still more to issue, rerequest in 1 cycle
                pkt = NULL;
@@ -199,8 +288,24 @@ BaseCache::CacheEvent::process()
        return;
    }
    //Response
-    //Know the packet to send, no need to mark in service (must succed)
-    assert(cachePort->sendTiming(pkt));
+    //Know the packet to send
+    if (pkt->flags & NACKED_LINE)
+        pkt->result = Packet::Nacked;
+    else
+        pkt->result = Packet::Success;
+    pkt->makeTimingResponse();
+    DPRINTF(CachePort, "%s attempting to send a response\n", cachePort->name());
+    if (!cachePort->drainList.empty() || cachePort->waitingOnRetry) {
+        //Already have a list, just append
+        cachePort->drainList.push_back(pkt);
+        DPRINTF(CachePort, "%s appending response onto drain list\n", cachePort->name());
+    }
+    else if (!cachePort->sendTiming(pkt)) {
+        //It failed, save it to list of drain events
+        DPRINTF(CachePort, "%s now waiting for a retry\n", cachePort->name());
+        cachePort->drainList.push_back(pkt);
+        cachePort->waitingOnRetry = true;
+    }
 }

 const char *
--- a/src/mem/cache/base_cache.hh
+++ b/src/mem/cache/base_cache.hh
@@ -72,6 +72,7 @@ enum RequestCause{
    Request_PF
 };

+class MSHR;
 /**
 * A basic cache interface. Implements some common functions for speed.
 */
@@ -110,6 +111,12 @@ class BaseCache : public MemObject
        bool mustSendRetry;

        bool isCpuSide;
+
+        bool waitingOnRetry;
+
+        std::list<Packet *> drainList;
+
+        Packet *cshrRetry;
    };

    struct CacheEvent : public Event
@@ -127,6 +134,8 @@ class BaseCache : public MemObject
    CachePort *cpuSidePort;
    CachePort *memSidePort;

+    bool snoopRangesSent;
+
  public:
    virtual Port *getPort(const std::string &if_name, int idx = -1);

@@ -149,14 +158,15 @@ class BaseCache : public MemObject

    void recvStatusChange(Port::Status status, bool isCpuSide)
    {
-        if (status == Port::RangeChange)
-        {
-            if (!isCpuSide)
-            {
+        if (status == Port::RangeChange){
+            if (!isCpuSide) {
                cpuSidePort->sendStatusChange(Port::RangeChange);
+                if (!snoopRangesSent) {
+                    snoopRangesSent = true;
+                    memSidePort->sendStatusChange(Port::RangeChange);
+                }
            }
-            else
-            {
+            else {
                memSidePort->sendStatusChange(Port::RangeChange);
            }
        }
@@ -172,7 +182,7 @@ class BaseCache : public MemObject
        fatal("No implementation");
    }

-    virtual void sendResult(Packet* &pkt, bool success)
+    virtual void sendResult(Packet* &pkt, MSHR* mshr, bool success)
    {

        fatal("No implementation");
@@ -205,6 +215,7 @@ class BaseCache : public MemObject
    /** True if this cache is connected to the CPU. */
    bool topLevelCache;

+
    /** Stores time the cache blocked for statistics. */
    Tick blockedCycle;

@@ -332,6 +343,7 @@ class BaseCache : public MemObject
        //Start ports at null if more than one is created we should panic
        cpuSidePort = NULL;
        memSidePort = NULL;
+        snoopRangesSent = false;
    }

    virtual void init();
@@ -382,9 +394,14 @@ class BaseCache : public MemObject
            blocked_causes[cause]++;
            blockedCycle = curTick;
        }
-        blocked |= flag;
-        DPRINTF(Cache,"Blocking for cause %s\n", cause);
-        cpuSidePort->setBlocked();
+        int old_state = blocked;
+        if (!(blocked & flag)) {
+            //Wasn't already blocked for this cause
+            blocked |= flag;
+            DPRINTF(Cache,"Blocking for cause %s\n", cause);
+            if (!old_state)
+                cpuSidePort->setBlocked();
+        }
    }

    /**
@@ -395,8 +412,13 @@ class BaseCache : public MemObject
    void setBlockedForSnoop(BlockedCause cause)
    {
        uint8_t flag = 1 << cause;
-        blockedSnoop |= flag;
-        memSidePort->setBlocked();
+        uint8_t old_state = blockedSnoop;
+        if (!(blockedSnoop & flag)) {
+            //Wasn't already blocked for this cause
+            blockedSnoop |= flag;
+            if (!old_state)
+                memSidePort->setBlocked();
+        }
    }

    /**
@@ -445,7 +467,7 @@ class BaseCache : public MemObject
     */
    void setMasterRequest(RequestCause cause, Tick time)
    {
-        if (!doMasterRequest())
+        if (!doMasterRequest() && !memSidePort->waitingOnRetry)
        {
            BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(memSidePort);
            reqCpu->schedule(time);
@@ -503,10 +525,14 @@ class BaseCache : public MemObject
     */
    void respond(Packet *pkt, Tick time)
    {
-        pkt->makeTimingResponse();
-        pkt->result = Packet::Success;
-        CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
-        reqCpu->schedule(time);
+        if (pkt->needsResponse()) {
+            CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
+            reqCpu->schedule(time);
+        }
+        else {
+            if (pkt->cmd == Packet::Writeback) delete pkt->req;
+            delete pkt;
+        }
    }

    /**
@@ -517,22 +543,29 @@ class BaseCache : public MemObject
    void respondToMiss(Packet *pkt, Tick time)
    {
        if (!pkt->req->isUncacheable()) {
-            missLatency[pkt->cmdToIndex()][pkt->req->getThreadNum()] += time - pkt->time;
+            missLatency[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/] += time - pkt->time;
+        }
+        if (pkt->needsResponse()) {
+            CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
+            reqCpu->schedule(time);
+        }
+        else {
+            if (pkt->cmd == Packet::Writeback) delete pkt->req;
+            delete pkt;
        }
-        pkt->makeTimingResponse();
-        pkt->result = Packet::Success;
-        CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
-        reqCpu->schedule(time);
    }

    /**
     * Suppliess the data if cache to cache transfers are enabled.
     * @param pkt The bus transaction to fulfill.
     */
-    void respondToSnoop(Packet *pkt)
+    void respondToSnoop(Packet *pkt, Tick time)
    {
-        assert("Implement\n" && 0);
+//        assert("Implement\n" && 0);
 //	mi->respond(pkt,curTick + hitLatency);
+        assert (pkt->needsResponse());
+        CacheEvent *reqMem = new CacheEvent(memSidePort, pkt);
+        reqMem->schedule(time);
    }

    /**
@@ -551,6 +584,16 @@ class BaseCache : public MemObject
        else
        {
            //This is where snoops get updated
+            AddrRangeList dummy;
+//            if (!topLevelCache)
+//            {
+                cpuSidePort->getPeerAddressRanges(dummy, snoop);
+//            }
+//            else
+//            {
+//                snoop.push_back(RangeSize(0,-1));
+//            }
+
            return;
        }
    }
--- a/src/mem/cache/cache.hh
+++ b/src/mem/cache/cache.hh
@@ -103,6 +103,7 @@ class Cache : public BaseCache
      * Used to append to target list, to cause an invalidation.
      */
    Packet * invalidatePkt;
+    Request *invalidateReq;

    /**
     * Temporarily move a block into a MSHR.
@@ -175,7 +176,7 @@ class Cache : public BaseCache
     * @param pkt The request.
     * @param success True if the request was sent successfully.
     */
-    virtual void sendResult(Packet * &pkt, bool success);
+    virtual void sendResult(Packet * &pkt, MSHR* mshr, bool success);

    /**
     * Handles a response (cache line fill/write ack) from the bus.
@@ -251,7 +252,7 @@ class Cache : public BaseCache
     * request.
     * @return The estimated completion time.
     */
-    Tick probe(Packet * &pkt, bool update);
+    Tick probe(Packet * &pkt, bool update, CachePort * otherSidePort);

    /**
     * Snoop for the provided request in the cache and return the estimated
@@ -262,7 +263,7 @@ class Cache : public BaseCache
     * request.
     * @return The estimated completion time.
     */
-    Tick snoopProbe(Packet * &pkt, bool update);
+    Tick snoopProbe(Packet * &pkt);
 };

 #endif // __CACHE_HH__
--- a/src/mem/cache/cache_impl.hh
+++ b/src/mem/cache/cache_impl.hh
@@ -51,7 +51,7 @@
 #include "mem/cache/miss/mshr.hh"
 #include "mem/cache/prefetch/prefetcher.hh"

-#include "sim/sim_events.hh" // for SimExitEvent
+#include "sim/sim_exit.hh" // for SimExitEvent

 template<class TagStore, class Buffering, class Coherence>
 bool
@@ -60,17 +60,21 @@ doTimingAccess(Packet *pkt, CachePort *cachePort, bool isCpuSide)
 {
    if (isCpuSide)
    {
-        if (pkt->isWrite() && (pkt->req->getFlags() & LOCKED)) {
+        if (pkt->isWrite() && (pkt->req->isLocked())) {
            pkt->req->setScResult(1);
        }
        access(pkt);
+
    }
    else
    {
        if (pkt->isResponse())
            handleResponse(pkt);
-        else
-            snoop(pkt);
+        else {
+            //Check if we should do the snoop
+            if (pkt->flags & SNOOP_COMMIT)
+                snoop(pkt);
+        }
    }
    return true;
 }
@@ -83,11 +87,11 @@ doAtomicAccess(Packet *pkt, bool isCpuSide)
    if (isCpuSide)
    {
        //Temporary solution to LL/SC
-        if (pkt->isWrite() && (pkt->req->getFlags() & LOCKED)) {
+        if (pkt->isWrite() && (pkt->req->isLocked())) {
            pkt->req->setScResult(1);
        }

-        probe(pkt, true);
+        probe(pkt, true, NULL);
        //TEMP ALWAYS SUCCES FOR NOW
        pkt->result = Packet::Success;
    }
@@ -96,7 +100,7 @@ doAtomicAccess(Packet *pkt, bool isCpuSide)
        if (pkt->isResponse())
            handleResponse(pkt);
        else
-            snoopProbe(pkt, true);
+            return snoopProbe(pkt);
    }
    //Fix this timing info
    return hitLatency;
@@ -113,20 +117,17 @@ doFunctionalAccess(Packet *pkt, bool isCpuSide)
        pkt->req->setThreadContext(0,0);

        //Temporary solution to LL/SC
-        if (pkt->isWrite() && (pkt->req->getFlags() & LOCKED)) {
+        if (pkt->isWrite() && (pkt->req->isLocked())) {
            assert("Can't handle LL/SC on functional path\n");
        }

-        probe(pkt, true);
+        probe(pkt, false, memSidePort);
        //TEMP ALWAYS SUCCESFUL FOR NOW
        pkt->result = Packet::Success;
    }
    else
    {
-        if (pkt->isResponse())
-            handleResponse(pkt);
-        else
-            snoopProbe(pkt, true);
+            probe(pkt, false, cpuSidePort);
    }
 }

@@ -147,7 +148,8 @@ Cache(const std::string &_name,
      prefetchAccess(params.prefetchAccess),
      tags(params.tags), missQueue(params.missQueue),
      coherence(params.coherence), prefetcher(params.prefetcher),
-      doCopy(params.doCopy), blockOnCopy(params.blockOnCopy)
+      doCopy(params.doCopy), blockOnCopy(params.blockOnCopy),
+      hitLatency(params.hitLatency)
 {
 //FIX BUS POINTERS
 //    if (params.in == NULL) {
@@ -162,10 +164,8 @@ Cache(const std::string &_name,
    prefetcher->setCache(this);
    prefetcher->setTags(tags);
    prefetcher->setBuffer(missQueue);
-#if 0
-    invalidatePkt = new Packet;
-    invalidatePkt->cmd = Packet::InvalidateReq;
-#endif
+    invalidateReq = new Request((Addr) NULL, blkSize, 0);
+    invalidatePkt = new Packet(invalidateReq, Packet::InvalidateReq, 0);
 }

 template<class TagStore, class Buffering, class Coherence>
@@ -194,20 +194,6 @@ Cache<TagStore,Buffering,Coherence>::access(PacketPtr &pkt)
        prefetcher->handleMiss(pkt, curTick);
    }
    if (!pkt->req->isUncacheable()) {
-        if (pkt->isInvalidate() && !pkt->isRead()
-            && !pkt->isWrite()) {
-            //Upgrade or Invalidate
-            //Look into what happens if two slave caches on bus
-            DPRINTF(Cache, "%s %x ? blk_addr: %x\n", pkt->cmdString(),
-                    pkt->getAddr() & (((ULL(1))<<48)-1),
-                    pkt->getAddr() & ~((Addr)blkSize - 1));
-
-            //@todo Should this return latency have the hit latency in it?
-//	    respond(pkt,curTick+lat);
-            pkt->flags |= SATISFIED;
-//            return MA_HIT; //@todo, return values
-            return true;
-        }
        blk = tags->handleAccess(pkt, lat, writebacks);
    } else {
        size = pkt->getSize();
@@ -234,27 +220,30 @@ Cache<TagStore,Buffering,Coherence>::access(PacketPtr &pkt)
        missQueue->doWriteback(writebacks.front());
        writebacks.pop_front();
    }
-    DPRINTF(Cache, "%s %x %s blk_addr: %x pc %x\n", pkt->cmdString(),
+    DPRINTF(Cache, "%s %x %s blk_addr: %x\n", pkt->cmdString(),
            pkt->getAddr() & (((ULL(1))<<48)-1), (blk) ? "hit" : "miss",
-            pkt->getAddr() & ~((Addr)blkSize - 1), pkt->req->getPC());
+            pkt->getAddr() & ~((Addr)blkSize - 1));
    if (blk) {
        // Hit
-        hits[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+        hits[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
        // clear dirty bit if write through
        if (pkt->needsResponse())
            respond(pkt, curTick+lat);
-//	return MA_HIT;
+        if (pkt->cmd == Packet::Writeback) {
+            //Signal that you can kill the pkt/req
+            pkt->flags |= SATISFIED;
+        }
        return true;
    }

    // Miss
    if (!pkt->req->isUncacheable()) {
-        misses[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+        misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
        /** @todo Move miss count code into BaseCache */
        if (missCount) {
            --missCount;
            if (missCount == 0)
-                new SimLoopExitEvent(curTick, "A cache reached the maximum miss count");
+                exitSimLoop("A cache reached the maximum miss count");
        }
    }
    missQueue->handleMiss(pkt, size, curTick + hitLatency);
@@ -267,10 +256,11 @@ template<class TagStore, class Buffering, class Coherence>
 Packet *
 Cache<TagStore,Buffering,Coherence>::getPacket()
 {
+    assert(missQueue->havePending());
    Packet * pkt = missQueue->getPacket();
    if (pkt) {
        if (!pkt->req->isUncacheable()) {
-            if (pkt->cmd == Packet::HardPFReq) misses[Packet::HardPFReq][pkt->req->getThreadNum()]++;
+            if (pkt->cmd == Packet::HardPFReq) misses[Packet::HardPFReq][0/*pkt->req->getThreadNum()*/]++;
            BlkType *blk = tags->findBlock(pkt);
            Packet::Command cmd = coherence->getBusCmd(pkt->cmd,
                                              (blk)? blk->status : 0);
@@ -285,15 +275,30 @@ Cache<TagStore,Buffering,Coherence>::getPacket()

 template<class TagStore, class Buffering, class Coherence>
 void
-Cache<TagStore,Buffering,Coherence>::sendResult(PacketPtr &pkt, bool success)
+Cache<TagStore,Buffering,Coherence>::sendResult(PacketPtr &pkt, MSHR* mshr, bool success)
 {
-    if (success) {
-        missQueue->markInService(pkt);
-          //Temp Hack for UPGRADES
-          if (pkt->cmd == Packet::UpgradeReq) {
-              handleResponse(pkt);
-          }
+    if (success && !(pkt->flags & NACKED_LINE)) {
+        missQueue->markInService(pkt, mshr);
+        //Temp Hack for UPGRADES
+        if (pkt->cmd == Packet::UpgradeReq) {
+            pkt->flags &= ~CACHE_LINE_FILL;
+            BlkType *blk = tags->findBlock(pkt);
+            CacheBlk::State old_state = (blk) ? blk->status : 0;
+            CacheBlk::State new_state = coherence->getNewState(pkt,old_state);
+            if (old_state != new_state)
+                DPRINTF(Cache, "Block for blk addr %x moving from state %i to %i\n",
+                        pkt->getAddr() & (((ULL(1))<<48)-1), old_state, new_state);
+            //Set the state on the upgrade
+            memcpy(pkt->getPtr<uint8_t>(), blk->data, blkSize);
+            PacketList writebacks;
+            tags->handleFill(blk, mshr, new_state, writebacks, pkt);
+            assert(writebacks.empty());
+            missQueue->handleResponse(pkt, curTick + hitLatency);
+        }
    } else if (pkt && !pkt->req->isUncacheable()) {
+        pkt->flags &= ~NACKED_LINE;
+        pkt->flags &= ~SATISFIED;
+        pkt->flags &= ~SNOOP_COMMIT;
        missQueue->restoreOrigCmd(pkt);
    }
 }
@@ -304,6 +309,14 @@ Cache<TagStore,Buffering,Coherence>::handleResponse(Packet * &pkt)
 {
    BlkType *blk = NULL;
    if (pkt->senderState) {
+        if (pkt->result == Packet::Nacked) {
+            //pkt->reinitFromRequest();
+            warn("NACKs from devices not connected to the same bus not implemented\n");
+            return;
+        }
+        if (pkt->result == Packet::BadAddress) {
+            //@todo Make the response a BadAddress and send it (not handled yet)
+        }
 //	MemDebug::cacheResponse(pkt);
        DPRINTF(Cache, "Handling reponse to %x, blk addr: %x\n",pkt->getAddr(),
                pkt->getAddr() & (((ULL(1))<<48)-1));
@@ -312,11 +325,15 @@ Cache<TagStore,Buffering,Coherence>::handleResponse(Packet * &pkt)
            blk = tags->findBlock(pkt);
            CacheBlk::State old_state = (blk) ? blk->status : 0;
            PacketList writebacks;
+            CacheBlk::State new_state = coherence->getNewState(pkt,old_state);
+            if (old_state != new_state)
+                DPRINTF(Cache, "Block for blk addr %x moving from state %i to %i\n",
+                        pkt->getAddr() & (((ULL(1))<<48)-1), old_state, new_state);
            blk = tags->handleFill(blk, (MSHR*)pkt->senderState,
-                                   coherence->getNewState(pkt,old_state),
-                                   writebacks);
+                                   new_state, writebacks, pkt);
            while (!writebacks.empty()) {
                    missQueue->doWriteback(writebacks.front());
+                    writebacks.pop_front();
            }
        }
        missQueue->handleResponse(pkt, curTick + hitLatency);
@@ -372,7 +389,6 @@ template<class TagStore, class Buffering, class Coherence>
 void
 Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
 {
-
    Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1));
    BlkType *blk = tags->findBlock(pkt);
    MSHR *mshr = missQueue->findMSHR(blk_addr);
@@ -385,7 +401,12 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
                    //If the outstanding request was an invalidate (upgrade,readex,..)
                    //Then we need to ACK the request until we get the data
                    //Also NACK if the outstanding request is not a cachefill (writeback)
+                    assert(!(pkt->flags & SATISFIED));
+                    pkt->flags |= SATISFIED;
                    pkt->flags |= NACKED_LINE;
+                    ///@todo NACK's from other levels
+                    //warn("NACKs from devices not connected to the same bus not implemented\n");
+                    //respondToSnoop(pkt, curTick + hitLatency);
                    return;
                }
                else {
@@ -398,6 +419,7 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
                    //@todo Make it so that a read to a pending read can't be exclusive now.

                    //Set the address so find match works
+                    //panic("Don't have invalidates yet\n");
                    invalidatePkt->addrOverride(pkt->getAddr());

                    //Append the invalidate on
@@ -420,6 +442,7 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
                    if (pkt->isRead()) {
                        //Only Upgrades don't get here
                        //Supply the data
+                        assert(!(pkt->flags & SATISFIED));
                        pkt->flags |= SATISFIED;

                        //If we are in an exclusive protocol, make it ask again
@@ -427,18 +450,18 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
                        pkt->flags |= SHARED_LINE;

                        assert(pkt->isRead());
-                        Addr offset = pkt->getAddr() & ~(blkSize - 1);
+                        Addr offset = pkt->getAddr() & (blkSize - 1);
                        assert(offset < blkSize);
                        assert(pkt->getSize() <= blkSize);
                        assert(offset + pkt->getSize() <=blkSize);
                        memcpy(pkt->getPtr<uint8_t>(), mshr->pkt->getPtr<uint8_t>() + offset, pkt->getSize());

-                        respondToSnoop(pkt);
+                        respondToSnoop(pkt, curTick + hitLatency);
                    }

                    if (pkt->isInvalidate()) {
                        //This must be an upgrade or other cache will take ownership
-                        missQueue->markInService(mshr->pkt);
+                        missQueue->markInService(mshr->pkt, mshr);
                    }
                    return;
                }
@@ -448,10 +471,16 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
    CacheBlk::State new_state;
    bool satisfy = coherence->handleBusRequest(pkt,blk,mshr, new_state);
    if (satisfy) {
+        DPRINTF(Cache, "Cache snooped a %s request for addr %x and now supplying data,"
+                "new state is %i\n",
+                pkt->cmdString(), blk_addr, new_state);
+
        tags->handleSnoop(blk, new_state, pkt);
-        respondToSnoop(pkt);
+        respondToSnoop(pkt, curTick + hitLatency);
        return;
    }
+    if (blk) DPRINTF(Cache, "Cache snooped a %s request for addr %x, new state is %i\n",
+                     pkt->cmdString(), blk_addr, new_state);
    tags->handleSnoop(blk, new_state);
 }

@@ -486,7 +515,7 @@ Cache<TagStore,Buffering,Coherence>::invalidateBlk(Addr addr)
 */
 template<class TagStore, class Buffering, class Coherence>
 Tick
-Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
+Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update, CachePort* otherSidePort)
 {
 //    MemDebug::cacheProbe(pkt);
    if (!pkt->req->isUncacheable()) {
@@ -505,6 +534,10 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
    int lat;
    BlkType *blk = tags->handleAccess(pkt, lat, writebacks, update);

+    DPRINTF(Cache, "%s %x %s blk_addr: %x\n", pkt->cmdString(),
+            pkt->getAddr() & (((ULL(1))<<48)-1), (blk) ? "hit" : "miss",
+            pkt->getAddr() & ~((Addr)blkSize - 1));
+
    if (!blk) {
        // Need to check for outstanding misses and writes
        Addr blk_addr = pkt->getAddr() & ~(blkSize - 1);
@@ -517,7 +550,8 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
        missQueue->findWrites(blk_addr, writes);

        if (!update) {
-            memSidePort->sendFunctional(pkt);
+                otherSidePort->sendFunctional(pkt);
+
            // Check for data in MSHR and writebuffer.
            if (mshr) {
                warn("Found outstanding miss on an non-update probe");
@@ -596,7 +630,7 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
            // update the cache state and statistics
            if (mshr || !writes.empty()){
                // Can't handle it, return pktuest unsatisfied.
-                return 0;
+                panic("Atomic access ran into outstanding MSHR's or WB's!");
            }
            if (!pkt->req->isUncacheable()) {
                // Fetch the cache block to fill
@@ -610,23 +644,46 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)

                busPkt->time = curTick;

+                DPRINTF(Cache, "Sending a atomic %s for %x blk_addr: %x\n",
+                        busPkt->cmdString(),
+                        busPkt->getAddr() & (((ULL(1))<<48)-1),
+                        busPkt->getAddr() & ~((Addr)blkSize - 1));
+
                lat = memSidePort->sendAtomic(busPkt);

+                //Be sure to flip the request to a response for coherence
+                if (busPkt->needsResponse()) {
+                    busPkt->makeAtomicResponse();
+                }
+
 /*		if (!(busPkt->flags & SATISFIED)) {
                    // blocked at a higher level, just return
                    return 0;
                }

-*/		misses[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+*/		misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;

                CacheBlk::State old_state = (blk) ? blk->status : 0;
+                CacheBlk::State new_state = coherence->getNewState(busPkt, old_state);
+                    DPRINTF(Cache, "Receive response:%s for blk addr %x in state %i\n",
+                            busPkt->cmdString(),
+                            busPkt->getAddr() & (((ULL(1))<<48)-1), old_state);
+                if (old_state != new_state)
+                    DPRINTF(Cache, "Block for blk addr %x moving from state %i to %i\n",
+                            busPkt->getAddr() & (((ULL(1))<<48)-1), old_state, new_state);
+
                tags->handleFill(blk, busPkt,
-                                 coherence->getNewState(busPkt, old_state),
+                                 new_state,
                                 writebacks, pkt);
+                //Free the packet
+                delete busPkt;
+
                // Handle writebacks if needed
                while (!writebacks.empty()){
-                    memSidePort->sendAtomic(writebacks.front());
+                    Packet *wbPkt = writebacks.front();
+                    memSidePort->sendAtomic(wbPkt);
                    writebacks.pop_front();
+                    delete wbPkt;
                }
                return lat + hitLatency;
            } else {
@@ -642,12 +699,12 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
        }

        if (update) {
-            hits[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+            hits[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
        } else if (pkt->isWrite()) {
            // Still need to change data in all locations.
-            return memSidePort->sendAtomic(pkt);
+            otherSidePort->sendFunctional(pkt);
        }
-        return curTick + lat;
+        return hitLatency;
    }
    fatal("Probe not handled.\n");
    return 0;
@@ -655,18 +712,24 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)

 template<class TagStore, class Buffering, class Coherence>
 Tick
-Cache<TagStore,Buffering,Coherence>::snoopProbe(PacketPtr &pkt, bool update)
+Cache<TagStore,Buffering,Coherence>::snoopProbe(PacketPtr &pkt)
 {
-    Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1));
-    BlkType *blk = tags->findBlock(pkt);
-    MSHR *mshr = missQueue->findMSHR(blk_addr);
-    CacheBlk::State new_state = 0;
-    bool satisfy = coherence->handleBusRequest(pkt,blk,mshr, new_state);
-    if (satisfy) {
-        tags->handleSnoop(blk, new_state, pkt);
-        return hitLatency;
-    }
-    tags->handleSnoop(blk, new_state);
-    return 0;
+        Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1));
+        BlkType *blk = tags->findBlock(pkt);
+        MSHR *mshr = missQueue->findMSHR(blk_addr);
+        CacheBlk::State new_state = 0;
+        bool satisfy = coherence->handleBusRequest(pkt,blk,mshr, new_state);
+        if (satisfy) {
+            DPRINTF(Cache, "Cache snooped a %s request for addr %x and now supplying data,"
+                    "new state is %i\n",
+                    pkt->cmdString(), blk_addr, new_state);
+
+            tags->handleSnoop(blk, new_state, pkt);
+            return hitLatency;
+        }
+        if (blk) DPRINTF(Cache, "Cache snooped a %s request for addr %x, new state is %i\n",
+                     pkt->cmdString(), blk_addr, new_state);
+        tags->handleSnoop(blk, new_state);
+        return 0;
 }

--- a/src/mem/cache/coherence/coherence_protocol.cc
+++ b/src/mem/cache/coherence/coherence_protocol.cc
@@ -271,7 +271,7 @@ CoherenceProtocol::CoherenceProtocol(const string &name,
    }

    Packet::Command writeToSharedCmd = doUpgrades ? Packet::UpgradeReq : Packet::ReadExReq;
-    Packet::Command writeToSharedResp = doUpgrades ? Packet::UpgradeResp : Packet::ReadExResp;
+    Packet::Command writeToSharedResp = doUpgrades ? Packet::UpgradeReq : Packet::ReadExResp;

 //@todo add in hardware prefetch to this list
    if (protocol == "msi") {
--- a/src/mem/cache/miss/blocking_buffer.cc
+++ b/src/mem/cache/miss/blocking_buffer.cc
@@ -123,12 +123,12 @@ BlockingBuffer::restoreOrigCmd(Packet * &pkt)
 }

 void
-BlockingBuffer::markInService(Packet * &pkt)
+BlockingBuffer::markInService(Packet * &pkt, MSHR* mshr)
 {
    if (!pkt->isCacheFill() && pkt->isWrite()) {
        // Forwarding a write/ writeback, don't need to change
        // the command
-        assert((MSHR*)pkt->senderState == &wb);
+        assert(mshr == &wb);
        cache->clearMasterRequest(Request_WB);
        if (!pkt->needsResponse()) {
            assert(wb.getNumTargets() == 0);
@@ -138,7 +138,7 @@ BlockingBuffer::markInService(Packet * &pkt)
            wb.inService = true;
        }
    } else {
-        assert((MSHR*)pkt->senderState == &miss);
+        assert(mshr == &miss);
        cache->clearMasterRequest(Request_MSHR);
        if (!pkt->needsResponse()) {
            assert(miss.getNumTargets() == 0);
@@ -189,7 +189,7 @@ BlockingBuffer::squash(int threadNum)
    if (miss.threadNum == threadNum) {
        Packet * target = miss.getTarget();
        miss.popTarget();
-        assert(target->req->getThreadNum() == threadNum);
+        assert(0/*target->req->getThreadNum()*/ == threadNum);
        target = NULL;
        assert(!miss.hasTargets());
        miss.ntargets=0;
@@ -218,7 +218,7 @@ BlockingBuffer::doWriteback(Addr addr,
    }

    ///All writebacks charged to same thread @todo figure this out
-    writebacks[pkt->req->getThreadNum()]++;
+    writebacks[0/*pkt->req->getThreadNum()*/]++;

    wb.allocateAsBuffer(pkt);
    cache->setMasterRequest(Request_WB, curTick);
@@ -230,7 +230,7 @@ BlockingBuffer::doWriteback(Addr addr,
 void
 BlockingBuffer::doWriteback(Packet * &pkt)
 {
-    writebacks[pkt->req->getThreadNum()]++;
+    writebacks[0/*pkt->req->getThreadNum()*/]++;

    wb.allocateAsBuffer(pkt);

--- a/src/mem/cache/miss/blocking_buffer.hh
+++ b/src/mem/cache/miss/blocking_buffer.hh
@@ -152,7 +152,7 @@ public:
     * are successfully sent.
     * @param pkt The request that was sent on the bus.
     */
-    void markInService(Packet * &pkt);
+    void markInService(Packet * &pkt, MSHR* mshr);

    /**
     * Frees the resources of the pktuest and unblock the cache.
--- a/src/mem/cache/miss/miss_queue.cc
+++ b/src/mem/cache/miss/miss_queue.cc
@@ -372,7 +372,7 @@ MissQueue::allocateMiss(Packet * &pkt, int size, Tick time)
 MSHR*
 MissQueue::allocateWrite(Packet * &pkt, int size, Tick time)
 {
-    MSHR* mshr = wb.allocate(pkt,blkSize);
+    MSHR* mshr = wb.allocate(pkt,size);
    mshr->order = order++;

 //REMOVING COMPRESSION FOR NOW
@@ -413,8 +413,8 @@ MissQueue::handleMiss(Packet * &pkt, int blkSize, Tick time)
        mshr = mq.findMatch(blkAddr);
        if (mshr) {
            //@todo remove hw_pf here
-            mshr_hits[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
-            if (mshr->threadNum != pkt->req->getThreadNum()) {
+            mshr_hits[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
+            if (mshr->threadNum != 0/*pkt->req->getThreadNum()*/) {
                mshr->threadNum = -1;
            }
            mq.allocateTarget(mshr, pkt);
@@ -434,11 +434,11 @@ MissQueue::handleMiss(Packet * &pkt, int blkSize, Tick time)
            mshr_no_allocate_misses++;
        }
        else {
-            mshr_misses[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+            mshr_misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
        }
    } else {
        //Count uncacheable accesses
-        mshr_uncacheable[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+        mshr_uncacheable[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
        size = pkt->getSize();
    }
    if (pkt->isWrite() && (pkt->req->isUncacheable() || !writeAllocate ||
@@ -446,7 +446,7 @@ MissQueue::handleMiss(Packet * &pkt, int blkSize, Tick time)
        /**
         * @todo Add write merging here.
         */
-        mshr = allocateWrite(pkt, blkSize, time);
+        mshr = allocateWrite(pkt, pkt->getSize(), time);
        return;
    }

@@ -499,7 +499,7 @@ MissQueue::getPacket()
        pkt = prefetcher->getPacket();
        if (pkt) {
            //Update statistic on number of prefetches issued (hwpf_mshr_misses)
-            mshr_misses[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+            mshr_misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
            //It will request the bus for the future, but should clear that immedieatley
            allocateMiss(pkt, pkt->getSize(), curTick);
            pkt = mq.getReq();
@@ -515,6 +515,14 @@ MissQueue::setBusCmd(Packet * &pkt, Packet::Command cmd)
    assert(pkt->senderState != 0);
    MSHR * mshr = (MSHR*)pkt->senderState;
    mshr->originalCmd = pkt->cmd;
+    if (cmd == Packet::UpgradeReq || cmd == Packet::InvalidateReq) {
+        pkt->flags |= NO_ALLOCATE;
+        pkt->flags &= ~CACHE_LINE_FILL;
+    }
+    else if (!pkt->req->isUncacheable() && !pkt->isNoAllocate() &&
+             (cmd & (1 << 6)/*NeedsResponse*/)) {
+        pkt->flags |= CACHE_LINE_FILL;
+    }
    if (pkt->isCacheFill() || pkt->isNoAllocate())
        pkt->cmd = cmd;
 }
@@ -526,9 +534,8 @@ MissQueue::restoreOrigCmd(Packet * &pkt)
 }

 void
-MissQueue::markInService(Packet * &pkt)
+MissQueue::markInService(Packet * &pkt, MSHR* mshr)
 {
-    assert(pkt->senderState != 0);
    bool unblock = false;
    BlockedCause cause = NUM_BLOCKED_CAUSES;

@@ -540,7 +547,7 @@ MissQueue::markInService(Packet * &pkt)
        // Forwarding a write/ writeback, don't need to change
        // the command
        unblock = wb.isFull();
-        wb.markInService((MSHR*)pkt->senderState);
+        wb.markInService(mshr);
        if (!wb.havePending()){
            cache->clearMasterRequest(Request_WB);
        }
@@ -551,11 +558,11 @@ MissQueue::markInService(Packet * &pkt)
        }
    } else {
        unblock = mq.isFull();
-        mq.markInService((MSHR*)pkt->senderState);
+        mq.markInService(mshr);
        if (!mq.havePending()){
            cache->clearMasterRequest(Request_MSHR);
        }
-        if (((MSHR*)(pkt->senderState))->originalCmd == Packet::HardPFReq) {
+        if (mshr->originalCmd == Packet::HardPFReq) {
            DPRINTF(HWPrefetch, "%s:Marking a HW_PF in service\n",
                    cache->name());
            //Also clear pending if need be
@@ -592,7 +599,7 @@ MissQueue::handleResponse(Packet * &pkt, Tick time)
    BlockedCause cause = NUM_BLOCKED_CAUSES;

    if (pkt->isCacheFill() && !pkt->isNoAllocate()) {
-        mshr_miss_latency[mshr->originalCmd][pkt->req->getThreadNum()] +=
+        mshr_miss_latency[mshr->originalCmd][0/*pkt->req->getThreadNum()*/] +=
            curTick - pkt->time;
        // targets were handled in the cache tags
        if (mshr == noTargetMSHR) {
@@ -619,7 +626,7 @@ MissQueue::handleResponse(Packet * &pkt, Tick time)
        }
    } else {
        if (pkt->req->isUncacheable()) {
-            mshr_uncacheable_lat[pkt->cmd][pkt->req->getThreadNum()] +=
+            mshr_uncacheable_lat[pkt->cmd][0/*pkt->req->getThreadNum()*/] +=
                curTick - pkt->time;
        }
        if (mshr->hasTargets() && pkt->req->isUncacheable()) {
@@ -725,7 +732,7 @@ MissQueue::doWriteback(Addr addr,
    }

    ///All writebacks charged to same thread @todo figure this out
-    writebacks[pkt->req->getThreadNum()]++;
+    writebacks[0/*pkt->req->getThreadNum()*/]++;

    allocateWrite(pkt, 0, curTick);
 }
@@ -734,7 +741,7 @@ MissQueue::doWriteback(Addr addr,
 void
 MissQueue::doWriteback(Packet * &pkt)
 {
-    writebacks[pkt->req->getThreadNum()]++;
+    writebacks[0/*pkt->req->getThreadNum()*/]++;
    allocateWrite(pkt, 0, curTick);
 }

--- a/src/mem/cache/miss/miss_queue.hh
+++ b/src/mem/cache/miss/miss_queue.hh
@@ -256,7 +256,7 @@ class MissQueue
     * are successfully sent.
     * @param pkt The request that was sent on the bus.
     */
-    void markInService(Packet * &pkt);
+    void markInService(Packet * &pkt, MSHR* mshr);

    /**
     * Collect statistics and free resources of a satisfied pktuest.
--- a/src/mem/cache/miss/mshr.cc
+++ b/src/mem/cache/miss/mshr.cc
@@ -88,7 +88,7 @@ void
 MSHR::allocateAsBuffer(Packet * &target)
 {
    addr = target->getAddr();
-    threadNum = target->req->getThreadNum();
+    threadNum = 0/*target->req->getThreadNum()*/;
    pkt = new Packet(target->req, target->cmd, -1);
    pkt->allocate();
    pkt->senderState = (Packet::SenderState*)this;
@@ -100,6 +100,7 @@ MSHR::deallocate()
 {
    assert(targets.empty());
    assert(ntargets == 0);
+    delete pkt;
    pkt = NULL;
    inService = false;
    //allocIter = NULL;
--- a/src/mem/cache/miss/mshr_queue.cc
+++ b/src/mem/cache/miss/mshr_queue.cc
@@ -128,6 +128,7 @@ MSHR*
 MSHRQueue::allocate(Packet * &pkt, int size)
 {
    Addr aligned_addr = pkt->getAddr() & ~((Addr)size - 1);
+    assert(!freeList.empty());
    MSHR *mshr = freeList.front();
    assert(mshr->getNumTargets() == 0);
    freeList.pop_front();
@@ -212,8 +213,13 @@ void
 MSHRQueue::markInService(MSHR* mshr)
 {
    //assert(mshr == pendingList.front());
-    if (!mshr->pkt->needsResponse()) {
+    if (!(mshr->pkt->needsResponse() || mshr->pkt->cmd == Packet::UpgradeReq)) {
        assert(mshr->getNumTargets() == 0);
+        if ((mshr->pkt->flags & SATISFIED) && (mshr->pkt->cmd == Packet::Writeback)) {
+            //Writeback hit, so delete the request here;
+            //otherwise the consumer will delete it
+            delete mshr->pkt->req;
+        }
        deallocate(mshr);
        return;
    }
@@ -251,7 +257,7 @@ MSHRQueue::squash(int threadNum)
                Packet * target = mshr->getTarget();
                mshr->popTarget();

-                assert(target->req->getThreadNum() == threadNum);
+                assert(0/*target->req->getThreadNum()*/ == threadNum);
                target = NULL;
            }
            assert(!mshr->hasTargets());
--- a/src/mem/packet.cc
+++ b/src/mem/packet.cc
@@ -39,9 +39,18 @@

 static const std::string ReadReqString("ReadReq");
 static const std::string WriteReqString("WriteReq");
-static const std::string WriteReqNoAckString("WriteReqNoAck");
+static const std::string WriteReqNoAckString("WriteReqNoAck|Writeback");
 static const std::string ReadRespString("ReadResp");
 static const std::string WriteRespString("WriteResp");
+static const std::string SoftPFReqString("SoftPFReq");
+static const std::string SoftPFRespString("SoftPFResp");
+static const std::string HardPFReqString("HardPFReq");
+static const std::string HardPFRespString("HardPFResp");
+static const std::string InvalidateReqString("InvalidateReq");
+static const std::string WriteInvalidateReqString("WriteInvalidateReq");
+static const std::string UpgradeReqString("UpgradeReq");
+static const std::string ReadExReqString("ReadExReq");
+static const std::string ReadExRespString("ReadExResp");
 static const std::string OtherCmdString("<other>");

 const std::string &
@@ -53,6 +62,15 @@ Packet::cmdString() const
      case WriteReqNoAck:   return WriteReqNoAckString;
      case ReadResp:        return ReadRespString;
      case WriteResp:       return WriteRespString;
+      case SoftPFReq:       return SoftPFReqString;
+      case SoftPFResp:      return SoftPFRespString;
+      case HardPFReq:       return HardPFReqString;
+      case HardPFResp:      return HardPFRespString;
+      case InvalidateReq:   return InvalidateReqString;
+      case WriteInvalidateReq:return WriteInvalidateReqString;
+      case UpgradeReq:      return UpgradeReqString;
+      case ReadExReq:       return ReadExReqString;
+      case ReadExResp:      return ReadExRespString;
      default:              return OtherCmdString;
    }
 }
@@ -66,6 +84,15 @@ Packet::cmdIdxToString(Packet::Command idx)
      case WriteReqNoAck:   return WriteReqNoAckString;
      case ReadResp:        return ReadRespString;
      case WriteResp:       return WriteRespString;
+      case SoftPFReq:       return SoftPFReqString;
+      case SoftPFResp:      return SoftPFRespString;
+      case HardPFReq:       return HardPFReqString;
+      case HardPFResp:      return HardPFRespString;
+      case InvalidateReq:   return InvalidateReqString;
+      case WriteInvalidateReq:return WriteInvalidateReqString;
+      case UpgradeReq:      return UpgradeReqString;
+      case ReadExReq:       return ReadExReqString;
+      case ReadExResp:      return ReadExRespString;
      default:              return OtherCmdString;
    }
 }
@@ -102,15 +129,11 @@ bool
 Packet::intersect(Packet *p)
 {
    Addr s1 = getAddr();
-    Addr e1 = getAddr() + getSize();
+    Addr e1 = getAddr() + getSize() - 1;
    Addr s2 = p->getAddr();
-    Addr e2 = p->getAddr() + p->getSize();
+    Addr e2 = p->getAddr() + p->getSize() - 1;

-    if (s1 >= s2 && s1 < e2)
-        return true;
-    if (e1 >= s2 && e1 < e2)
-        return true;
-    return false;
+    return !(s1 > e2 || e1 < s2);
 }

 bool
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -58,10 +58,8 @@ typedef std::list<PacketPtr> PacketList;
 #define NO_ALLOCATE 1 << 5
 #define SNOOP_COMMIT 1 << 6

-//For statistics we need max number of commands, hard code it at
-//20 for now.  @todo fix later
-#define NUM_MEM_CMDS 1 << 9
-
+//For statistics we need the max number of commands; hard-coded for now.  @todo fix later
+#define NUM_MEM_CMDS 1 << 11
 /**
 * A Packet is used to encapsulate a transfer between two objects in
 * the memory system (e.g., the L1 and L2 cache).  (In contrast, a
@@ -94,7 +92,6 @@ class Packet
     *   be called on it rather than simply delete.*/
    bool arrayData;

-
    /** The address of the request.  This address could be virtual or
     *   physical, depending on the system configuration. */
    Addr addr;
@@ -126,6 +123,12 @@ class Packet
    /** Used to calculate latencies for each packet.*/
    Tick time;

+    /** The time at which the packet will be fully transmitted */
+    Tick finishTime;
+
+    /** The time at which the first chunk of the packet will be transmitted */
+    Tick firstWordTime;
+
    /** The special destination address indicating that the packet
     *   should be routed based on its address. */
    static const short Broadcast = -1;
@@ -164,6 +167,8 @@ class Packet

  private:
    /** List of command attributes. */
+    // If you add a new CommandAttribute, make sure to increase NUM_MEM_CMDS
+    // as well.
    enum CommandAttribute
    {
        IsRead		= 1 << 0,
@@ -174,7 +179,9 @@ class Packet
        IsResponse 	= 1 << 5,
        NeedsResponse	= 1 << 6,
        IsSWPrefetch    = 1 << 7,
-        IsHWPrefetch    = 1 << 8
+        IsHWPrefetch    = 1 << 8,
+        IsUpgrade       = 1 << 9,
+        HasData		= 1 << 10
    };

  public:
@@ -183,21 +190,23 @@ class Packet
    {
        InvalidCmd      = 0,
        ReadReq		= IsRead  | IsRequest | NeedsResponse,
-        WriteReq	= IsWrite | IsRequest | NeedsResponse,
-        WriteReqNoAck	= IsWrite | IsRequest,
-        ReadResp	= IsRead  | IsResponse | NeedsResponse,
+        WriteReq	= IsWrite | IsRequest | NeedsResponse | HasData,
+        WriteReqNoAck	= IsWrite | IsRequest | HasData,
+        ReadResp	= IsRead  | IsResponse | NeedsResponse | HasData,
        WriteResp	= IsWrite | IsResponse | NeedsResponse,
-        Writeback       = IsWrite | IsRequest,
+        Writeback       = IsWrite | IsRequest | HasData,
        SoftPFReq       = IsRead  | IsRequest | IsSWPrefetch | NeedsResponse,
        HardPFReq       = IsRead  | IsRequest | IsHWPrefetch | NeedsResponse,
-        SoftPFResp      = IsRead  | IsResponse | IsSWPrefetch | NeedsResponse,
-        HardPFResp      = IsRead  | IsResponse | IsHWPrefetch | NeedsResponse,
+        SoftPFResp      = IsRead  | IsResponse | IsSWPrefetch
+                                | NeedsResponse | HasData,
+        HardPFResp      = IsRead  | IsResponse | IsHWPrefetch
+                                | NeedsResponse | HasData,
        InvalidateReq   = IsInvalidate | IsRequest,
-        WriteInvalidateReq = IsWrite | IsInvalidate | IsRequest,
-        UpgradeReq      = IsInvalidate | IsRequest | NeedsResponse,
-        UpgradeResp     = IsInvalidate | IsResponse | NeedsResponse,
+        WriteInvalidateReq = IsWrite | IsInvalidate | IsRequest | HasData,
+        UpgradeReq      = IsInvalidate | IsRequest | IsUpgrade,
        ReadExReq       = IsRead | IsInvalidate | IsRequest | NeedsResponse,
-        ReadExResp      = IsRead | IsInvalidate | IsResponse | NeedsResponse
+        ReadExResp      = IsRead | IsInvalidate | IsResponse
+                                | NeedsResponse | HasData
    };

    /** Return the string name of the cmd field (for debugging and
@@ -219,6 +228,7 @@ class Packet
    bool isResponse()	 { return (cmd & IsResponse) != 0; }
    bool needsResponse() { return (cmd & NeedsResponse) != 0; }
    bool isInvalidate()  { return (cmd & IsInvalidate) != 0; }
+    bool hasData()	 { return (cmd & HasData) != 0; }

    bool isCacheFill() { return (flags & CACHE_LINE_FILL) != 0; }
    bool isNoAllocate() { return (flags & NO_ALLOCATE) != 0; }
@@ -312,7 +322,7 @@ class Packet
     *   for returning as a response to that request.  Used for timing
     *   accesses only.  For atomic and functional accesses, the
     *   request packet is always implicitly passed back *without*
-     *   modifying the command or destination fields, so this function
+     *   modifying the destination fields, so this function
     *   should not be called. */
    void makeTimingResponse() {
        assert(needsResponse());
@@ -320,11 +330,31 @@ class Packet
        int icmd = (int)cmd;
        icmd &= ~(IsRequest);
        icmd |= IsResponse;
+        if (isRead())
+            icmd |= HasData;
+        if (isWrite())
+            icmd &= ~HasData;
        cmd = (Command)icmd;
        dest = src;
        srcValid = false;
    }

+    /** Take a request packet and modify it in place to be suitable
+     *   for returning as a response to that request.
+     */
+    void makeAtomicResponse() {
+        assert(needsResponse());
+        assert(isRequest());
+        int icmd = (int)cmd;
+        icmd &= ~(IsRequest);
+        icmd |= IsResponse;
+        if (isRead())
+            icmd |= HasData;
+        if (isWrite())
+            icmd &= ~HasData;
+        cmd = (Command)icmd;
+    }
+
    /** Take a request packet that has been returned as NACKED and modify it so
     * that it can be sent out again. Only packets that need a response can be
     * NACKED, so verify that that is true. */
--- a/src/mem/physical.cc
+++ b/src/mem/physical.cc
@@ -110,28 +110,112 @@ PhysicalMemory::calculateLatency(Packet *pkt)
    return lat;
 }

+
+
+// Add load-locked to tracking list.  Should only be called if the
+// operation is a load and the LOCKED flag is set.
+void
+PhysicalMemory::trackLoadLocked(Request *req)
+{
+    Addr paddr = LockedAddr::mask(req->getPaddr());
+
+    // first we check if we already have a locked addr for this
+    // xc.  Since each xc only gets one, we just update the
+    // existing record with the new address.
+    list<LockedAddr>::iterator i;
+
+    for (i = lockedAddrList.begin(); i != lockedAddrList.end(); ++i) {
+        if (i->matchesContext(req)) {
+            DPRINTF(LLSC, "Modifying lock record: cpu %d thread %d addr %#x\n",
+                    req->getCpuNum(), req->getThreadNum(), paddr);
+            i->addr = paddr;
+            return;
+        }
+    }
+
+    // no record for this xc: need to allocate a new one
+    DPRINTF(LLSC, "Adding lock record: cpu %d thread %d addr %#x\n",
+            req->getCpuNum(), req->getThreadNum(), paddr);
+    lockedAddrList.push_front(LockedAddr(req));
+}
+
+
+// Called on *writes* only... both regular stores and
+// store-conditional operations.  Check for conventional stores which
+// conflict with locked addresses, and for success/failure of store
+// conditionals.
+bool
+PhysicalMemory::checkLockedAddrList(Request *req)
+{
+    Addr paddr = LockedAddr::mask(req->getPaddr());
+    bool isLocked = req->isLocked();
+
+    // Initialize return value.  Non-conditional stores always
+    // succeed.  Assume conditional stores will fail until proven
+    // otherwise.
+    bool success = !isLocked;
+
+    // Iterate over list.  Note that there could be multiple matching
+    // records, as more than one context could have done a load locked
+    // to this location.
+    list<LockedAddr>::iterator i = lockedAddrList.begin();
+
+    while (i != lockedAddrList.end()) {
+
+        if (i->addr == paddr) {
+            // we have a matching address
+
+            if (isLocked && i->matchesContext(req)) {
+                // it's a store conditional, and as far as the memory
+                // system can tell, the requesting context's lock is
+                // still valid.
+                DPRINTF(LLSC, "StCond success: cpu %d thread %d addr %#x\n",
+                        req->getCpuNum(), req->getThreadNum(), paddr);
+                success = true;
+            }
+
+            // Get rid of our record of this lock and advance to next
+            DPRINTF(LLSC, "Erasing lock record: cpu %d thread %d addr %#x\n",
+                    i->cpuNum, i->threadNum, paddr);
+            i = lockedAddrList.erase(i);
+        }
+        else {
+            // no match: advance to next record
+            ++i;
+        }
+    }
+
+    if (isLocked) {
+        req->setScResult(success ? 1 : 0);
+    }
+
+    return success;
+}
+
 void
 PhysicalMemory::doFunctionalAccess(Packet *pkt)
 {
-    assert(pkt->getAddr() + pkt->getSize() < params()->addrRange.size());
+    assert(pkt->getAddr() + pkt->getSize() <= params()->addrRange.size());

-    switch (pkt->cmd) {
-      case Packet::ReadReq:
+    if (pkt->isRead()) {
+        if (pkt->req->isLocked()) {
+            trackLoadLocked(pkt->req);
+        }
        memcpy(pkt->getPtr<uint8_t>(),
               pmemAddr + pkt->getAddr() - params()->addrRange.start,
               pkt->getSize());
-        break;
-      case Packet::WriteReq:
-        memcpy(pmemAddr + pkt->getAddr() - params()->addrRange.start,
-               pkt->getPtr<uint8_t>(),
-               pkt->getSize());
-        // temporary hack: will need to add real LL/SC implementation
-        // for cacheless systems later.
-        if (pkt->req->getFlags() & LOCKED) {
-            pkt->req->setScResult(1);
+    }
+    else if (pkt->isWrite()) {
+        if (writeOK(pkt->req)) {
+            memcpy(pmemAddr + pkt->getAddr() - params()->addrRange.start,
+                   pkt->getPtr<uint8_t>(), pkt->getSize());
        }
-        break;
-      default:
+    }
+    else if (pkt->isInvalidate()) {
+        //upgrade or invalidate
+        pkt->flags |= SATISFIED;
+    }
+    else {
        panic("unimplemented");
    }

@@ -147,7 +231,7 @@ PhysicalMemory::getPort(const std::string &if_name, int idx)
        port = new MemoryPort(name() + "-port", this);
        return port;
    } else if (if_name == "functional") {
-        /* special port for functional writes at startup. */
+        /* special port for functional writes at startup. And for memtester */
        return new MemoryPort(name() + "-funcport", this);
    } else {
        panic("PhysicalMemory::getPort: unknown port %s requested", if_name);
--- a/src/mem/physical.hh
+++ b/src/mem/physical.hh
@@ -78,6 +78,68 @@ class PhysicalMemory : public MemObject
    const PhysicalMemory &operator=(const PhysicalMemory &specmem);

  protected:
+
+    class LockedAddr {
+      public:
+        // on alpha, minimum LL/SC granularity is 16 bytes, so lower
+        // bits need to be masked off.
+        static const Addr Addr_Mask = 0xf;
+
+        static Addr mask(Addr paddr) { return (paddr & ~Addr_Mask); }
+
+        Addr addr; 	// locked address
+        int cpuNum;	// locking CPU
+        int threadNum;	// locking thread ID within CPU
+
+        // check for matching execution context
+        bool matchesContext(Request *req)
+        {
+            return (cpuNum == req->getCpuNum() &&
+                    threadNum == req->getThreadNum());
+        }
+
+        LockedAddr(Request *req)
+            : addr(mask(req->getPaddr())),
+              cpuNum(req->getCpuNum()),
+              threadNum(req->getThreadNum())
+        {
+        }
+    };
+
+    std::list<LockedAddr> lockedAddrList;
+
+    // helper function for writeOK(): we really want to
+    // inline a quick check for an empty locked addr list (hopefully
+    // the common case), and do the full list search (if necessary) in
+    // this out-of-line function
+    bool checkLockedAddrList(Request *req);
+
+    // Record the address of a load-locked operation so that we can
+    // clear the execution context's lock flag if a matching store is
+    // performed
+    void trackLoadLocked(Request *req);
+
+    // Compare a store address with any locked addresses so we can
+    // clear the lock flag appropriately.  Return value set to 'false'
+    // if store operation should be suppressed (because it was a
+    // conditional store and the address was no longer locked by the
+    // requesting execution context), 'true' otherwise.  Note that
+    // this method must be called on *all* stores since even
+    // non-conditional stores must clear any matching lock addresses.
+    bool writeOK(Request *req) {
+        if (lockedAddrList.empty()) {
+            // no locked addrs: nothing to check, store_conditional fails
+            bool isLocked = req->isLocked();
+            if (isLocked) {
+                req->setScResult(0);
+            }
+            return !isLocked; // only do write if not an sc
+        } else {
+            // iterate over list...
+            return checkLockedAddrList(req);
+        }
+    }
+
    uint8_t *pmemAddr;
    MemoryPort *port;
    int pagePtr;
--- a/src/mem/port.hh
+++ b/src/mem/port.hh
@@ -106,8 +106,7 @@ class Port
    /** Holds the ports status.  Currently just that a range recomputation needs
     * to be done. */
    enum Status {
-        RangeChange,
-        SnoopSquash
+        RangeChange
    };

    void setName(const std::string &name)
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -232,9 +232,11 @@ class Request
    Addr getPC() { assert(validPC); return pc; }

    /** Accessor Function to Check Cacheability. */
-    bool isUncacheable() { return getFlags() & UNCACHEABLE; }
+    bool isUncacheable() { return (getFlags() & UNCACHEABLE) != 0; }

-    bool isInstRead() { return getFlags() & INST_READ; }
+    bool isInstRead() { return (getFlags() & INST_READ) != 0; }
+
+    bool isLocked() { return (getFlags() & LOCKED) != 0; }

    friend class Packet;
 };
--- a/src/mem/tport.cc
+++ b/src/mem/tport.cc
@@ -47,22 +47,28 @@ SimpleTimingPort::recvTiming(Packet *pkt)
    // if we ever added it back.
    assert(pkt->result != Packet::Nacked);
    Tick latency = recvAtomic(pkt);
-    // turn packet around to go back to requester
-    pkt->makeTimingResponse();
-    sendTimingLater(pkt, latency);
+    // turn packet around to go back to requester if response expected
+    if (pkt->needsResponse()) {
+        pkt->makeTimingResponse();
+        sendTimingLater(pkt, latency);
+    }
    return true;
 }

 void
 SimpleTimingPort::recvRetry()
 {
-    bool result = true;
-    while (result && transmitList.size()) {
-        result = sendTiming(transmitList.front());
-        if (result)
-            transmitList.pop_front();
+    assert(outTiming > 0);
+    assert(!transmitList.empty());
+    if (sendTiming(transmitList.front())) {
+        transmitList.pop_front();
+        outTiming--;
+        DPRINTF(Bus, "No Longer waiting on retry\n");
+        if (!transmitList.empty())
+            sendTimingLater(transmitList.front(), 1);
    }
-    if (transmitList.size() == 0 && drainEvent) {
+
+    if (transmitList.empty() && drainEvent) {
        drainEvent->process();
        drainEvent = NULL;
    }
@@ -71,18 +77,28 @@ SimpleTimingPort::recvRetry()
 void
 SimpleTimingPort::SendEvent::process()
 {
-    port->outTiming--;
-    assert(port->outTiming >= 0);
-    if (port->sendTiming(packet)) {
-        // send successfule
-        if (port->transmitList.size() == 0 && port->drainEvent) {
+    assert(port->outTiming > 0);
+    if (!port->transmitList.empty() && port->transmitList.front() != packet) {
+        //We are not the head of the list
+        port->transmitList.push_back(packet);
+    } else if (port->sendTiming(packet)) {
+        // send successful
+        if (port->transmitList.size()) {
+            port->transmitList.pop_front();
+            port->outTiming--;
+           if (!port->transmitList.empty())
+                port->sendTimingLater(port->transmitList.front(), 1);
+        }
+        if (port->transmitList.empty() && port->drainEvent) {
            port->drainEvent->process();
            port->drainEvent = NULL;
        }
    } else {
        // send unsuccessful (due to flow control).  Will get retry
-        // callback later; save for then.
-        port->transmitList.push_back(packet);
+        // callback later; save for then if not already
+        DPRINTF(Bus, "Waiting on retry\n");
+        if (!(port->transmitList.front() == packet))
+            port->transmitList.push_back(packet);
    }
 }

--- a/src/python/m5/objects/BaseCPU.py
+++ b/src/python/m5/objects/BaseCPU.py
@@ -11,10 +11,11 @@ class BaseCPU(SimObject):
    mem = Param.MemObject("memory")

    system = Param.System(Parent.any, "system object")
+    cpu_id = Param.Int("CPU identifier")
+
    if build_env['FULL_SYSTEM']:
        dtb = Param.AlphaDTB(AlphaDTB(), "Data TLB")
        itb = Param.AlphaITB(AlphaITB(), "Instruction TLB")
-        cpu_id = Param.Int(-1, "CPU identifier")
    else:
        workload = VectorParam.Process("processes to run")

--- a/src/python/m5/objects/Bus.py
+++ b/src/python/m5/objects/Bus.py
@@ -6,3 +6,5 @@ class Bus(MemObject):
    port = VectorPort("vector port for connecting devices")
    default = Port("Default port for requests that aren't handeled by a device.")
    bus_id = Param.Int(0, "blah")
+    clock = Param.Clock("1GHz", "bus clock speed")
+    width = Param.Int(64, "bus width (bytes)")
--- a/src/python/m5/objects/FUPool.py
+++ b/src/python/m5/objects/FUPool.py
@@ -1,6 +1,12 @@
 from m5.SimObject import SimObject
 from m5.params import *
+from FuncUnit import *
+from FuncUnitConfig import *

 class FUPool(SimObject):
    type = 'FUPool'
    FUList = VectorParam.FUDesc("list of FU's for this pool")
+
+class DefaultFUPool(FUPool):
+    FUList = [ IntALU(), IntMultDiv(), FP_ALU(), FP_MultDiv(), ReadPort(),
+               WritePort(), RdWrPort(), IprPort() ]
--- a/src/python/m5/objects/FuncUnitConfig.py
+++ b/src/python/m5/objects/FuncUnitConfig.py
@@ -0,0 +1,41 @@
+from m5.SimObject import SimObject
+from m5.params import *
+from FuncUnit import *
+
+class IntALU(FUDesc):
+    opList = [ OpDesc(opClass='IntAlu') ]
+    count = 6
+
+class IntMultDiv(FUDesc):
+    opList = [ OpDesc(opClass='IntMult', opLat=3),
+               OpDesc(opClass='IntDiv', opLat=20, issueLat=19) ]
+    count=2
+
+class FP_ALU(FUDesc):
+    opList = [ OpDesc(opClass='FloatAdd', opLat=2),
+               OpDesc(opClass='FloatCmp', opLat=2),
+               OpDesc(opClass='FloatCvt', opLat=2) ]
+    count = 4
+
+class FP_MultDiv(FUDesc):
+    opList = [ OpDesc(opClass='FloatMult', opLat=4),
+               OpDesc(opClass='FloatDiv', opLat=12, issueLat=12),
+               OpDesc(opClass='FloatSqrt', opLat=24, issueLat=24) ]
+    count = 2
+
+class ReadPort(FUDesc):
+    opList = [ OpDesc(opClass='MemRead') ]
+    count = 0
+
+class WritePort(FUDesc):
+    opList = [ OpDesc(opClass='MemWrite') ]
+    count = 0
+
+class RdWrPort(FUDesc):
+    opList = [ OpDesc(opClass='MemRead'), OpDesc(opClass='MemWrite') ]
+    count = 4
+
+class IprPort(FUDesc):
+    opList = [ OpDesc(opClass='IprAccess', opLat = 3, issueLat = 3) ]
+    count = 1
+
--- a/src/python/m5/objects/MemTest.py
+++ b/src/python/m5/objects/MemTest.py
@@ -1,13 +1,13 @@
 from m5.SimObject import SimObject
 from m5.params import *
+from m5.proxy import *
+from m5 import build_env
+
 class MemTest(SimObject):
    type = 'MemTest'
-    cache = Param.BaseCache("L1 cache")
-    check_mem = Param.FunctionalMemory("check memory")
-    main_mem = Param.FunctionalMemory("hierarchical memory")
    max_loads = Param.Counter("number of loads to execute")
+    atomic = Param.Bool(False, "Execute tester in atomic mode? (or timing)\n")
    memory_size = Param.Int(65536, "memory size")
-    percent_copies = Param.Percent(0, "target copy percentage")
    percent_dest_unaligned = Param.Percent(50,
        "percent of copy dest address that are unaligned")
    percent_reads = Param.Percent(65, "target read percentage")
@@ -18,3 +18,6 @@ class MemTest(SimObject):
    progress_interval = Param.Counter(1000000,
        "progress report interval (in accesses)")
    trace_addr = Param.Addr(0, "address to trace")
+
+    test = Port("Port to the memory system to test")
+    functional = Port("Port to the functional memory used for verification")
--- a/src/python/m5/objects/O3CPU.py
+++ b/src/python/m5/objects/O3CPU.py
@@ -3,6 +3,7 @@ from m5.proxy import *
 from m5 import build_env
 from BaseCPU import BaseCPU
 from Checker import O3Checker
+from FUPool import *

 class DerivO3CPU(BaseCPU):
    type = 'DerivO3CPU'
@@ -14,11 +15,13 @@ class DerivO3CPU(BaseCPU):
    if build_env['USE_CHECKER']:
        if not build_env['FULL_SYSTEM']:
            checker = Param.BaseCPU(O3Checker(workload=Parent.workload,
-                                              exitOnError=True,
+                                              exitOnError=False,
+                                              updateOnError=True,
                                              warnOnlyOnLoadError=False),
                                    "checker")
        else:
-            checker = Param.BaseCPU(O3Checker(exitOnError=True, warnOnlyOnLoadError=False), "checker")
+            checker = Param.BaseCPU(O3Checker(exitOnError=False, updateOnError=True,
+                                              warnOnlyOnLoadError=False), "checker")
            checker.itb = Parent.itb
            checker.dtb = Parent.dtb

@@ -57,7 +60,7 @@ class DerivO3CPU(BaseCPU):
    issueWidth = Param.Unsigned(8, "Issue width")
    wbWidth = Param.Unsigned(8, "Writeback width")
    wbDepth = Param.Unsigned(1, "Writeback depth")
-    fuPool = Param.FUPool("Functional Unit pool")
+    fuPool = Param.FUPool(DefaultFUPool(), "Functional Unit pool")

    iewToCommitDelay = Param.Unsigned(1, "Issue/Execute/Writeback to commit "
               "delay")
@@ -77,7 +80,7 @@ class DerivO3CPU(BaseCPU):
    localHistoryBits = Param.Unsigned(11, "Bits for the local history")
    globalPredictorSize = Param.Unsigned(8192, "Size of global predictor")
    globalCtrBits = Param.Unsigned(2, "Bits per counter")
-    globalHistoryBits = Param.Unsigned(4096, "Bits of history")
+    globalHistoryBits = Param.Unsigned(13, "Bits of history")
    choicePredictorSize = Param.Unsigned(8192, "Size of choice predictor")
    choiceCtrBits = Param.Unsigned(2, "Bits of choice counters")

--- a/src/python/m5/objects/PhysicalMemory.py
+++ b/src/python/m5/objects/PhysicalMemory.py
@@ -5,6 +5,7 @@ from MemObject import *
 class PhysicalMemory(MemObject):
    type = 'PhysicalMemory'
    port = Port("the access port")
+    functional = Port("Functional Access Port")
    range = Param.AddrRange(AddrRange('128MB'), "Device Address")
    file = Param.String('', "memory mapped file")
    latency = Param.Latency(Parent.clock, "latency of an access")
--- a/src/python/m5/params.py
+++ b/src/python/m5/params.py
@@ -804,7 +804,7 @@ class PortRef(object):
        newRef.simobj = simobj
        assert(isSimObject(newRef.simobj))
        if self.peer and not proxy.isproxy(self.peer):
-            peerObj = memo[self.peer.simobj]
+            peerObj = self.peer.simobj(_memo=memo)
            newRef.peer = self.peer.clone(peerObj, memo)
            assert(not isinstance(newRef.peer, VectorPortRef))
        return newRef
--- a/src/python/m5/proxy.py
+++ b/src/python/m5/proxy.py
@@ -33,6 +33,8 @@
 #
 #####################################################################

+import copy
+
 class BaseProxy(object):
    def __init__(self, search_self, search_up):
        self._search_self = search_self
@@ -129,15 +131,22 @@ class AttrProxy(BaseProxy):
            return super(AttrProxy, self).__getattr__(self, attr)
        if hasattr(self, '_pdesc'):
            raise AttributeError, "Attribute reference on bound proxy"
-        self._modifiers.append(attr)
-        return self
+        # Return a copy of self rather than modifying self in place
+        # since self could be an indirect reference via a variable or
+        # parameter
+        new_self = copy.deepcopy(self)
+        new_self._modifiers.append(attr)
+        return new_self

    # support indexing on proxies (e.g., Self.cpu[0])
    def __getitem__(self, key):
        if not isinstance(key, int):
            raise TypeError, "Proxy object requires integer index"
-        self._modifiers.append(key)
-        return self
+        if hasattr(self, '_pdesc'):
+            raise AttributeError, "Index operation on bound proxy"
+        new_self = copy.deepcopy(self)
+        new_self._modifiers.append(key)
+        return new_self

    def find(self, obj):
        try:
--- a/src/sim/main.cc
+++ b/src/sim/main.cc
@@ -317,8 +317,8 @@ simulate(Tick num_cycles = -1)
    else
        num_cycles = curTick + num_cycles;

-    Event *limit_event = new SimLoopExitEvent(num_cycles,
-                                              "simulate() limit reached");
+    Event *limit_event = schedExitSimLoop("simulate() limit reached",
+                                          num_cycles);

    while (1) {
        // there should always be at least one event (the SimLoopExitEvent
@@ -414,7 +414,12 @@ unserializeAll(const std::string &cpt_dir)
 /**
 * Queue of C++ callbacks to invoke on simulator exit.
 */
-CallbackQueue exitCallbacks;
+CallbackQueue&
+exitCallbacks()
+{
+    static CallbackQueue theQueue;
+    return theQueue;
+}

 /**
 * Register an exit callback.
@@ -422,7 +427,7 @@ CallbackQueue exitCallbacks;
 void
 registerExitCallback(Callback *callback)
 {
-    exitCallbacks.add(callback);
+    exitCallbacks().add(callback);
 }

 BaseCPU *
@@ -442,8 +447,8 @@ convertToBaseCPUPtr(SimObject *obj)
 void
 doExitCleanup()
 {
-    exitCallbacks.process();
-    exitCallbacks.clear();
+    exitCallbacks().process();
+    exitCallbacks().clear();

    cout.flush();

--- a/src/sim/pseudo_inst.cc
+++ b/src/sim/pseudo_inst.cc
@@ -138,14 +138,14 @@ namespace AlphaPseudo
    void
    m5exit_old(ThreadContext *tc)
    {
-        exitSimLoop(curTick, "m5_exit_old instruction encountered");
+        exitSimLoop("m5_exit_old instruction encountered");
    }

    void
    m5exit(ThreadContext *tc, Tick delay)
    {
        Tick when = curTick + delay * Clock::Int::ns;
-        exitSimLoop(when, "m5_exit instruction encountered");
+        schedExitSimLoop("m5_exit instruction encountered", when);
    }

    void
@@ -270,7 +270,11 @@ namespace AlphaPseudo
    {
        if (!doCheckpointInsts)
            return;
-        exitSimLoop("checkpoint");
+
+        Tick when = curTick + delay * Clock::Int::ns;
+        Tick repeat = period * Clock::Int::ns;
+
+        schedExitSimLoop("checkpoint", when, repeat);
    }

    uint64_t
--- a/src/sim/root.cc
+++ b/src/sim/root.cc
@@ -100,7 +100,7 @@ void
 Root::startup()
 {
    if (max_tick != 0)
-        exitSimLoop(curTick + max_tick, "reached maximum cycle count");
+        schedExitSimLoop("reached maximum cycle count", curTick + max_tick);

    if (progress_interval != 0)
        new ProgressEvent(&mainEventQueue, progress_interval);
--- a/src/sim/sim_events.cc
+++ b/src/sim/sim_events.cc
@@ -57,6 +57,11 @@ SimLoopExitEvent::process()

    // otherwise do nothing... the IsExitEvent flag takes care of
    // exiting the simulation loop and returning this object to Python
+
+    // but if you are doing this on intervals, don't forget to make another
+    if (repeat) {
+        schedule(curTick + repeat);
+    }
 }


@@ -66,16 +71,20 @@ SimLoopExitEvent::description()
    return "simulation loop exit";
 }

-void
-exitSimLoop(Tick when, const std::string &message, int exit_code)
+SimLoopExitEvent *
+schedExitSimLoop(const std::string &message, Tick when, Tick repeat,
+                 EventQueue *q, int exit_code)
 {
-    new SimLoopExitEvent(when, message, exit_code);
+    if (q == NULL)
+        q = &mainEventQueue;
+
+    return new SimLoopExitEvent(q, when, repeat, message, exit_code);
 }

 void
 exitSimLoop(const std::string &message, int exit_code)
 {
-    exitSimLoop(curTick, message, exit_code);
+    schedExitSimLoop(message, curTick, 0, NULL, exit_code);
 }

 void
--- a/src/sim/sim_events.hh
+++ b/src/sim/sim_events.hh
@@ -42,6 +42,7 @@ class SimLoopExitEvent : public Event
    // string explaining why we're terminating
    std::string cause;
    int code;
+    Tick repeat;

  public:
    // Default constructor.  Only really used for derived classes.
@@ -49,15 +50,18 @@ class SimLoopExitEvent : public Event
        : Event(&mainEventQueue, Sim_Exit_Pri)
    { }

-    SimLoopExitEvent(Tick _when, const std::string &_cause, int c = 0)
-        : Event(&mainEventQueue, Sim_Exit_Pri), cause(_cause),
-          code(c)
+    SimLoopExitEvent(EventQueue *q,
+                     Tick _when, Tick _repeat, const std::string &_cause,
+                     int c = 0)
+        : Event(q, Sim_Exit_Pri), cause(_cause),
+          code(c), repeat(_repeat)
    { setFlags(IsExitEvent); schedule(_when); }

-    SimLoopExitEvent(EventQueue *q,
-                     Tick _when, const std::string &_cause, int c = 0)
-        : Event(q, Sim_Exit_Pri), cause(_cause), code(c)
-    { setFlags(IsExitEvent); schedule(_when); }
+//     SimLoopExitEvent(EventQueue *q,
+// 		     Tick _when, const std::string &_cause,
+// 		     Tick _repeat = 0, int c = 0)
+// 	: Event(q, Sim_Exit_Pri), cause(_cause), code(c), repeat(_repeat)
+//     { setFlags(IsExitEvent); schedule(_when); }

    std::string getCause() { return cause; }
    int getCode() { return code; }
--- a/src/sim/sim_exit.hh
+++ b/src/sim/sim_exit.hh
@@ -38,6 +38,8 @@

 // forward declaration
 class Callback;
+class EventQueue;
+class SimLoopExitEvent;

 /// Register a callback to be called when Python exits.  Defined in
 /// sim/main.cc.
@@ -47,12 +49,14 @@ void registerExitCallback(Callback *);
 /// Python) at the indicated tick.  The message and exit_code
 /// parameters are saved in the SimLoopExitEvent to indicate why the
 /// exit occurred.
-void exitSimLoop(Tick when, const std::string &message, int exit_code = 0);
+SimLoopExitEvent *schedExitSimLoop(const std::string &message, Tick when,
+                                   Tick repeat = 0, EventQueue *q = NULL,
+                                   int exit_code = 0);

 /// Schedule an event to exit the simulation loop (returning to
 /// Python) at the end of the current cycle (curTick).  The message
 /// and exit_code parameters are saved in the SimLoopExitEvent to
 /// indicate why the exit occurred.
-void exitSimLoop(const std::string &cause, int exit_code = 0);
+void exitSimLoop(const std::string &message, int exit_code = 0);

 #endif // __SIM_EXIT_HH__
--- a/src/sim/stat_control.cc
+++ b/src/sim/stat_control.cc
@@ -186,7 +186,7 @@ StatEvent::process()
        DumpNow();

    if (flags & Stats::Reset) {
-        cprintf("Resetting stats!\n");
+        cprintf("Resetting stats at cycle %d!\n", curTick);
        reset();
    }

--- a/src/sim/system.cc
+++ b/src/sim/system.cc
@@ -219,6 +219,8 @@ System::new_page()
 {
    Addr return_addr = page_ptr << LogVMPageSize;
    ++page_ptr;
+    if (return_addr >= physmem->size())
+        fatal("Out of memory, please increase size of physical memory.");
    return return_addr;
 }
 #endif
--- a/tests/configs/memtest.py
+++ b/tests/configs/memtest.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2006 The Regents of The University of Michigan
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Ron Dreslinski
+
+import m5
+from m5.objects import *
+
+# --------------------
+# Base L1 Cache
+# ====================
+
+class L1(BaseCache):
+    latency = 1
+    block_size = 64
+    mshrs = 12
+    tgts_per_mshr = 8
+    protocol = CoherenceProtocol(protocol='moesi')
+
+# ----------------------
+# Base L2 Cache
+# ----------------------
+
+class L2(BaseCache):
+    block_size = 64
+    latency = 10
+    mshrs = 92
+    tgts_per_mshr = 16
+    write_buffers = 8
+
+#MAX CORES IS 8 with the false sharing method
+nb_cores = 8
+cpus = [ MemTest(max_loads=1e12, percent_uncacheable=0, progress_interval=1000) for i in xrange(nb_cores) ]
+
+# system simulated
+system = System(cpu = cpus, funcmem = PhysicalMemory(),
+                physmem = PhysicalMemory(), membus = Bus(clock="500GHz", width=16))
+
+# l2cache & bus
+system.toL2Bus = Bus(clock="500GHz", width=16)
+system.l2c = L2(size='64kB', assoc=8)
+system.l2c.cpu_side = system.toL2Bus.port
+
+# connect l2c to membus
+system.l2c.mem_side = system.membus.port
+
+which_port = 0
+# add L1 caches
+for cpu in cpus:
+    cpu.l1c = L1(size = '32kB', assoc = 4)
+    cpu.l1c.cpu_side = cpu.test
+    cpu.l1c.mem_side = system.toL2Bus.port
+    if  which_port == 0:
+         system.funcmem.port = cpu.functional
+         which_port = 1
+    else:
+         system.funcmem.functional = cpu.functional
+
+
+# connect memory to membus
+system.physmem.port = system.membus.port
+
+
+# -----------------------
+# run simulation
+# -----------------------
+
+root = Root( system = system )
+root.system.mem_mode = 'timing'
+#root.trace.flags="Cache CachePort Bus"
+#root.trace.cycle=3810800
+
--- a/tests/configs/o3-timing-mp.py
+++ b/tests/configs/o3-timing-mp.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2006 The Regents of The University of Michigan
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Ron Dreslinski
+
+import m5
+from m5.objects import *
+m5.AddToPath('../configs/common')
+
+# --------------------
+# Base L1 Cache
+# --------------------
+
+class L1(BaseCache):
+    latency = 1
+    block_size = 64
+    mshrs = 4
+    tgts_per_mshr = 8
+    protocol = CoherenceProtocol(protocol='moesi')
+
+# ----------------------
+# Base L2 Cache
+# ----------------------
+
+class L2(BaseCache):
+    block_size = 64
+    latency = 100
+    mshrs = 92
+    tgts_per_mshr = 16
+    write_buffers = 8
+
+nb_cores = 4
+cpus = [ DerivO3CPU(cpu_id=i) for i in xrange(nb_cores) ]
+
+# system simulated
+system = System(cpu = cpus, physmem = PhysicalMemory(), membus =
+Bus())
+
+# l2cache & bus
+system.toL2Bus = Bus()
+system.l2c = L2(size='4MB', assoc=8)
+system.l2c.cpu_side = system.toL2Bus.port
+
+# connect l2c to membus
+system.l2c.mem_side = system.membus.port
+
+# add L1 caches
+for cpu in cpus:
+    cpu.addPrivateSplitL1Caches(L1(size = '32kB', assoc = 1),
+                                L1(size = '32kB', assoc = 4))
+    cpu.mem = cpu.dcache
+    # connect cpu level-1 caches to shared level-2 cache
+    cpu.connectMemPorts(system.toL2Bus)
+
+# connect memory to membus
+system.physmem.port = system.membus.port
+
+
+# -----------------------
+# run simulation
+# -----------------------
+
+root = Root( system = system )
+root.system.mem_mode = 'timing'
+#root.trace.flags="Bus Cache"
+#root.trace.flags = "BusAddrRanges"
--- a/tests/configs/o3-timing.py
+++ b/tests/configs/o3-timing.py
@@ -29,7 +29,6 @@
 import m5
 from m5.objects import *
 m5.AddToPath('../configs/common')
-from FullO3Config import *

 class MyCache(BaseCache):
    assoc = 2
@@ -38,7 +37,7 @@ class MyCache(BaseCache):
    mshrs = 10
    tgts_per_mshr = 5

-cpu = DetailedO3CPU()
+cpu = DerivO3CPU()
 cpu.addTwoLevelCacheHierarchy(MyCache(size = '128kB'), MyCache(size = '256kB'),
                              MyCache(size = '2MB'))
 cpu.mem = cpu.dcache
--- a/tests/configs/simple-atomic-mp.py
+++ b/tests/configs/simple-atomic-mp.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2006 The Regents of The University of Michigan
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Ron Dreslinski
+
+import m5
+from m5.objects import *
+
+# --------------------
+# Base L1 Cache
+# --------------------
+
+class L1(BaseCache):
+    latency = 1
+    block_size = 64
+    mshrs = 4
+    tgts_per_mshr = 8
+    protocol = CoherenceProtocol(protocol='moesi')
+
+# ----------------------
+# Base L2 Cache
+# ----------------------
+
+class L2(BaseCache):
+    block_size = 64
+    latency = 100
+    mshrs = 92
+    tgts_per_mshr = 16
+    write_buffers = 8
+
+nb_cores = 4
+cpus = [ AtomicSimpleCPU(cpu_id=i) for i in xrange(nb_cores) ]
+
+# system simulated
+system = System(cpu = cpus, physmem = PhysicalMemory(range = AddrRange('1024MB')), membus =
+Bus())
+
+# l2cache & bus
+system.toL2Bus = Bus()
+system.l2c = L2(size='4MB', assoc=8)
+system.l2c.cpu_side = system.toL2Bus.port
+
+# connect l2c to membus
+system.l2c.mem_side = system.membus.port
+
+# add L1 caches
+for cpu in cpus:
+    cpu.addPrivateSplitL1Caches(L1(size = '32kB', assoc = 1),
+                                L1(size = '32kB', assoc = 4))
+    cpu.mem = cpu.dcache
+    # connect cpu level-1 caches to shared level-2 cache
+    cpu.connectMemPorts(system.toL2Bus)
+
+# connect memory to membus
+system.physmem.port = system.membus.port
+
+
+# -----------------------
+# run simulation
+# -----------------------
+
+root = Root( system = system )
+root.system.mem_mode = 'atomic'
--- a/tests/configs/simple-atomic.py
+++ b/tests/configs/simple-atomic.py
@@ -29,7 +29,7 @@
 import m5
 from m5.objects import *

-system = System(cpu = AtomicSimpleCPU(),
+system = System(cpu = AtomicSimpleCPU(cpu_id=0),
                physmem = PhysicalMemory(),
                membus = Bus())
 system.physmem.port = system.membus.port
--- a/tests/configs/simple-timing-mp.py
+++ b/tests/configs/simple-timing-mp.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2006 The Regents of The University of Michigan
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Ron Dreslinski
+
+import m5
+from m5.objects import *
+
+# --------------------
+# Base L1 Cache
+# --------------------
+
+class L1(BaseCache):
+    latency = 1
+    block_size = 64
+    mshrs = 4
+    tgts_per_mshr = 8
+    protocol = CoherenceProtocol(protocol='moesi')
+
+# ----------------------
+# Base L2 Cache
+# ----------------------
+
+class L2(BaseCache):
+    block_size = 64
+    latency = 100
+    mshrs = 92
+    tgts_per_mshr = 16
+    write_buffers = 8
+
+nb_cores = 4
+cpus = [ TimingSimpleCPU(cpu_id=i) for i in xrange(nb_cores) ]
+
+# system simulated
+system = System(cpu = cpus, physmem = PhysicalMemory(), membus =
+Bus())
+
+# l2cache & bus
+system.toL2Bus = Bus()
+system.l2c = L2(size='4MB', assoc=8)
+system.l2c.cpu_side = system.toL2Bus.port
+
+# connect l2c to membus
+system.l2c.mem_side = system.membus.port
+
+# add L1 caches
+for cpu in cpus:
+    cpu.addPrivateSplitL1Caches(L1(size = '32kB', assoc = 1),
+                                L1(size = '32kB', assoc = 4))
+    cpu.mem = cpu.dcache
+    # connect cpu level-1 caches to shared level-2 cache
+    cpu.connectMemPorts(system.toL2Bus)
+
+# connect memory to membus
+system.physmem.port = system.membus.port
+
+
+# -----------------------
+# run simulation
+# -----------------------
+
+root = Root( system = system )
+root.system.mem_mode = 'timing'
--- a/tests/configs/simple-timing.py
+++ b/tests/configs/simple-timing.py
@@ -36,7 +36,7 @@ class MyCache(BaseCache):
    mshrs = 10
    tgts_per_mshr = 5

-cpu = TimingSimpleCPU()
+cpu = TimingSimpleCPU(cpu_id=0)
 cpu.addTwoLevelCacheHierarchy(MyCache(size = '128kB'), MyCache(size = '256kB'),
                              MyCache(size = '2MB'))
 cpu.mem = cpu.dcache
--- a/tests/configs/tsunami-simple-atomic-dual.py
+++ b/tests/configs/tsunami-simple-atomic-dual.py
@@ -34,7 +34,7 @@ import FSConfig
 AlphaConsole.cpu = Parent.cpu[0]
 IntrControl.cpu = Parent.cpu[0]

-cpus = [ AtomicSimpleCPU() for i in xrange(2) ]
+cpus = [ AtomicSimpleCPU(cpu_id=i) for i in xrange(2) ]
 system = FSConfig.makeLinuxAlphaSystem('atomic')
 system.cpu = cpus
 for c in cpus:
--- a/tests/configs/tsunami-simple-atomic.py
+++ b/tests/configs/tsunami-simple-atomic.py
@@ -31,7 +31,7 @@ from m5.objects import *
 m5.AddToPath('../configs/common')
 import FSConfig

-cpu = AtomicSimpleCPU()
+cpu = AtomicSimpleCPU(cpu_id=0)
 system = FSConfig.makeLinuxAlphaSystem('atomic')
 system.cpu = cpu
 cpu.connectMemPorts(system.membus)
--- a/tests/configs/tsunami-simple-timing-dual.py
+++ b/tests/configs/tsunami-simple-timing-dual.py
@@ -34,7 +34,7 @@ import FSConfig
 AlphaConsole.cpu = Parent.cpu[0]
 IntrControl.cpu = Parent.cpu[0]

-cpus = [ TimingSimpleCPU() for i in xrange(2) ]
+cpus = [ TimingSimpleCPU(cpu_id=i) for i in xrange(2) ]
 system = FSConfig.makeLinuxAlphaSystem('timing')
 system.cpu = cpus
 for c in cpus:
--- a/tests/configs/tsunami-simple-timing.py
+++ b/tests/configs/tsunami-simple-timing.py
@@ -31,7 +31,7 @@ from m5.objects import *
 m5.AddToPath('../configs/common')
 import FSConfig

-cpu = TimingSimpleCPU()
+cpu = TimingSimpleCPU(cpu_id=0)
 system = FSConfig.makeLinuxAlphaSystem('timing')
 system.cpu = cpu
 cpu.connectMemPorts(system.membus)
--- a/tests/quick/00.hello.mp/test.py
+++ b/tests/quick/00.hello.mp/test.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2006 The Regents of The University of Michigan
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Ron Dreslinski
+
+# workload
+benchmarks = [
+    "tests/test-progs/hello/bin/alpha/linux/hello", "'hello'",
+    "tests/test-progs/hello/bin/alpha/linux/hello", "'hello'",
+    "tests/test-progs/hello/bin/alpha/linux/hello", "'hello'",
+    "tests/test-progs/hello/bin/alpha/linux/hello", "'hello'",
+    ]
+
+for i, cpu in zip(range(len(cpus)), root.system.cpu):
+    p            = LiveProcess()
+    p.executable = benchmarks[i*2]
+    p.cmd        = benchmarks[(i*2)+1]
+    root.system.cpu[i].workload = p
+    root.system.cpu[i].max_insts_all_threads = 10000000
+#root.system.cpu.workload = LiveProcess(cmd = 'hello',
+ #                                      executable = binpath('hello'))
--- a/tests/quick/00.hello/ref/alpha/linux/o3-timing/config.ini
+++ b/tests/quick/00.hello/ref/alpha/linux/o3-timing/config.ini
@@ -110,6 +110,7 @@ numROBEntries=192
 numRobs=1
 numThreads=1
 predType=tournament
+progress_interval=0
 renameToDecodeDelay=1
 renameToFetchDelay=1
 renameToIEWDelay=2
@@ -384,20 +385,30 @@ mem_side=system.membus.port[1]
 [system.cpu.toL2Bus]
 type=Bus
 bus_id=0
+clock=1000
+width=64
 port=system.cpu.icache.mem_side system.cpu.dcache.mem_side system.cpu.l2cache.cpu_side

 [system.cpu.workload]
 type=LiveProcess
 cmd=hello
+egid=100
 env=
+euid=100
 executable=tests/test-progs/hello/bin/alpha/linux/hello
+gid=100
 input=cin
 output=cout
+pid=100
+ppid=99
 system=system
+uid=100

 [system.membus]
 type=Bus
 bus_id=0
+clock=1000
+width=64
 port=system.physmem.port system.cpu.l2cache.mem_side

 [system.physmem]
@@ -409,6 +420,7 @@ port=system.membus.port[0]

 [trace]
 bufsize=0
+cycle=0
 dump_on_exit=false
 file=cout
 flags=
--- a/tests/quick/00.hello/ref/alpha/linux/o3-timing/config.out
+++ b/tests/quick/00.hello/ref/alpha/linux/o3-timing/config.out
@@ -19,6 +19,8 @@ mem_mode=atomic
 [system.membus]
 type=Bus
 bus_id=0
+clock=1000
+width=64

 [system.cpu.workload]
 type=LiveProcess
@@ -28,6 +30,12 @@ input=cin
 output=cout
 env=
 system=system
+uid=100
+euid=100
+gid=100
+egid=100
+pid=100
+ppid=99

 [system.cpu.dcache]
 type=BaseCache
@@ -208,6 +216,7 @@ max_insts_any_thread=0
 max_insts_all_threads=0
 max_loads_any_thread=0
 max_loads_all_threads=0
+progress_interval=0
 cachePorts=200
 decodeToFetchDelay=1
 renameToFetchDelay=1
@@ -354,10 +363,13 @@ hit_latency=1
 [system.cpu.toL2Bus]
 type=Bus
 bus_id=0
+clock=1000
+width=64

 [trace]
 flags=
 start=0
+cycle=0
 bufsize=0
 file=cout
 dump_on_exit=false
@@ -401,3 +413,6 @@ trace_system=client
 [debug]
 break_cycles=

+[statsreset]
+reset_cycle=0
+
--- a/tests/quick/00.hello/ref/alpha/linux/o3-timing/m5stats.txt
+++ b/tests/quick/00.hello/ref/alpha/linux/o3-timing/m5stats.txt
--- a/tests/quick/00.hello/ref/alpha/linux/o3-timing/stderr
+++ b/tests/quick/00.hello/ref/alpha/linux/o3-timing/stderr
@@ -1,3 +1,12 @@
 warn: Entering event queue @ 0.  Starting simulation...
 warn: cycle 0: fault (page_table_fault) detected @ PC 0x000000
 warn: Increasing stack 0x11ff92000:0x11ff9b000 to 0x11ff90000:0x11ff9b000 because of access to 0x11ff91ff0
+warn: Default fetch doesn't update it's state from a functional call.
+warn: Default fetch doesn't update it's state from a functional call.
+warn: Default fetch doesn't update it's state from a functional call.
+warn: Default fetch doesn't update it's state from a functional call.
+warn: Default fetch doesn't update it's state from a functional call.
+warn: Default fetch doesn't update it's state from a functional call.
+warn: Default fetch doesn't update it's state from a functional call.
+warn: Default fetch doesn't update it's state from a functional call.
+warn: Default fetch doesn't update it's state from a functional call.
--- a/Show More
+++ b/Show More