From 9a5d900770b8af540a9edac6694030d28a36f671 Mon Sep 17 00:00:00 2001 From: Giacomo Travaglini Date: Tue, 12 Sep 2023 08:10:34 +0100 Subject: [PATCH 1/3] cpu: Stop treating TraceCPU as a BaseCPU This is fixing a recently reported issue [1] where it is not possible to use the TraceCPU to replay elastic traces It requires some architectural data structures (like ArchMMU, ArchDecoder...) which are no longer defined in the BaseCPU class at compilation time. Which Arch version should be used for a class (TraceCPU) that is supposed to be ISA agnostic ? Does it really make sense to define them for the TraceCPU? Those classes are not used anyway during trace replay and their sole purpose would just be to comply to the BaseCPU interface. As there is no elegant way to make things work, this patch stops treating the TraceCPU as a BaseCPU. While it philosophically makes sense to treat the TraceCPU as a common CPU (it sort of replays pre-executed instructions), a case can be made for considering it more like a traffic generator. [1]: https://github.com/gem5/gem5/issues/301 Change-Id: I7438169e8cc7fb6272731efb336ed2cf271c0844 Signed-off-by: Giacomo Travaglini --- src/cpu/trace/TraceCPU.py | 14 ++++++++------ src/cpu/trace/trace_cpu.cc | 27 ++++++++++++++++++++------ src/cpu/trace/trace_cpu.hh | 39 ++++++++++++++++++++++---------------- 3 files changed, 52 insertions(+), 28 deletions(-) diff --git a/src/cpu/trace/TraceCPU.py b/src/cpu/trace/TraceCPU.py index 1be16518d7..586e77caed 100644 --- a/src/cpu/trace/TraceCPU.py +++ b/src/cpu/trace/TraceCPU.py @@ -1,4 +1,4 @@ -# Copyright (c) 2013 - 2016 ARM Limited +# Copyright (c) 2013 - 2016, 2023 Arm Limited # All rights reserved. # # The license below extends only to copyright in the software and shall @@ -34,10 +34,11 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from m5.params import * -from m5.objects.BaseCPU import BaseCPU +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject -class TraceCPU(BaseCPU): +class TraceCPU(ClockedObject): """Trace CPU model which replays traces generated in a prior simulation using DerivO3CPU or its derived classes. It interfaces with L1 caches. """ @@ -54,13 +55,14 @@ class TraceCPU(BaseCPU): def require_caches(cls): return True - def addPMU(self, pmu=None): - pass - @classmethod def support_take_over(cls): return True + system = Param.System(Parent.any, "system object") + + icache_port = RequestPort("Instruction Port") + dcache_port = RequestPort("Data Port") instTraceFile = Param.String("", "Instruction trace file") dataTraceFile = Param.String("", "Data dependency trace file") sizeStoreBuffer = Param.Unsigned( diff --git a/src/cpu/trace/trace_cpu.cc b/src/cpu/trace/trace_cpu.cc index 7399cf6199..ab14c8784a 100644 --- a/src/cpu/trace/trace_cpu.cc +++ b/src/cpu/trace/trace_cpu.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 - 2016 ARM Limited + * Copyright (c) 2013 - 2016, 2023 Arm Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -47,7 +47,8 @@ namespace gem5 int TraceCPU::numTraceCPUs = 0; TraceCPU::TraceCPU(const TraceCPUParams ¶ms) - : BaseCPU(params), + : ClockedObject(params), + cacheLineSize(params.system->cacheLineSize()), icachePort(this), dcachePort(this), instRequestorID(params.system->getRequestorId(this, "inst")), @@ -109,7 +110,7 @@ TraceCPU::init() DPRINTF(TraceCPUData, "Data memory request trace file is \"%s\".\n", dataTraceFile); - BaseCPU::init(); + ClockedObject::init(); // Get the send tick of the first instruction read request Tick first_icache_tick = icacheGen.init(); @@ -176,7 +177,7 @@ TraceCPU::schedDcacheNext() DPRINTF(TraceCPUData, "DcacheGen event.\n"); // Update stat for numCycles - baseStats.numCycles = clockEdge() / clockPeriod(); + traceStats.numCycles = clockEdge() / clockPeriod(); dcacheGen.execute(); if (dcacheGen.isExecComplete()) { @@ -216,7 +217,7 @@ TraceCPU::checkAndSchedExitEvent() ADD_STAT(cpi, statistics::units::Rate< statistics::units::Cycle, statistics::units::Count>::get(), "Cycles per micro-op used as a proxy for CPI", - trace->baseStats.numCycles / numOps) + trace->traceStats.numCycles / numOps) { cpi.precision(6); } @@ -591,7 +592,7 @@ TraceCPU::ElasticDataGen::executeMemReq(GraphNode* node_ptr) // stat counting this is useful to keep a check on how frequently this // happens. If required the code could be revised to mimick splitting such // a request into two. - unsigned blk_size = owner.cacheLineSize(); + unsigned blk_size = owner.cacheLineSize; Addr blk_offset = (node_ptr->physAddr & (Addr)(blk_size - 1)); if (!(blk_offset + node_ptr->size <= blk_size)) { node_ptr->size = blk_size - blk_offset; @@ -1152,6 +1153,20 @@ TraceCPU::schedDcacheNextEvent(Tick when) } +Port & +TraceCPU::getPort(const std::string &if_name, PortID idx) +{ + // Get the right port based on name. This applies to all the + // subclasses of the base CPU and relies on their implementation + // of getDataPort and getInstPort. + if (if_name == "dcache_port") + return getDataPort(); + else if (if_name == "icache_port") + return getInstPort(); + else + return ClockedObject::getPort(if_name, idx); +} + bool TraceCPU::IcachePort::recvTimingResp(PacketPtr pkt) { diff --git a/src/cpu/trace/trace_cpu.hh b/src/cpu/trace/trace_cpu.hh index 87f820fe6d..f3f5c38786 100644 --- a/src/cpu/trace/trace_cpu.hh +++ b/src/cpu/trace/trace_cpu.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 - 2016 ARM Limited + * Copyright (c) 2013 - 2016, 2023 Arm Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -48,10 +48,13 @@ #include "cpu/base.hh" #include "debug/TraceCPUData.hh" #include "debug/TraceCPUInst.hh" +#include "mem/packet.hh" +#include "mem/request.hh" #include "params/TraceCPU.hh" #include "proto/inst_dep_record.pb.h" #include "proto/packet.pb.h" #include "proto/protoio.hh" +#include "sim/clocked_object.hh" #include "sim/sim_events.hh" namespace gem5 @@ -66,8 +69,7 @@ namespace gem5 * simulation compared to the detailed cpu model and good correlation when the * same trace is used for playback on different memory sub-systems. * - * The TraceCPU inherits from BaseCPU so some virtual methods need to be - * defined. It has two port subclasses inherited from RequestPort for + * The TraceCPU has two port subclasses inherited from RequestPort for * instruction and data ports. It issues the memory requests deducing the * timing from the trace and without performing real execution of micro-ops. As * soon as the last dependency for an instruction is complete, its @@ -139,7 +141,7 @@ namespace gem5 * exit. */ -class TraceCPU : public BaseCPU +class TraceCPU : public ClockedObject { public: @@ -147,15 +149,6 @@ class TraceCPU : public BaseCPU void init(); - /** - * This is a pure virtual function in BaseCPU. As we don't know how many - * insts are in the trace but only know how how many micro-ops are we - * cannot count this stat. - * - * @return 0 - */ - Counter totalInsts() const { return 0; } - /** * Return totalOps as the number of committed micro-ops plus the * speculatively issued loads that are modelled in the TraceCPU replay. @@ -170,9 +163,6 @@ class TraceCPU : public BaseCPU */ void updateNumOps(uint64_t rob_num); - /* Pure virtual function in BaseCPU. Do nothing. */ - void wakeup(ThreadID tid=0) { return; } - /* * When resuming from checkpoint in FS mode, the TraceCPU takes over from * the old cpu. This function overrides the takeOverFrom() function in the @@ -303,6 +293,9 @@ class TraceCPU : public BaseCPU TraceCPU* owner; }; + /** Cache the cache line size that we get from the system */ + const unsigned int cacheLineSize; + /** Port to connect to L1 instruction cache. */ IcachePort icachePort; @@ -1112,6 +1105,8 @@ class TraceCPU : public BaseCPU /** Stat for number of simulated micro-ops. */ statistics::Scalar numOps; + /** Number of CPU cycles simulated */ + statistics::Scalar numCycles; /** Stat for the CPI. This is really cycles per * micro-op and not inst. */ statistics::Formula cpi; @@ -1125,6 +1120,18 @@ class TraceCPU : public BaseCPU /** Used to get a reference to the dcache port. */ Port &getDataPort() { return dcachePort; } + /** + * Get a port on this CPU. All CPUs have a data and + * instruction port, and this method uses getDataPort and + * getInstPort of the subclasses to resolve the two ports. + * + * @param if_name the port name + * @param idx ignored index + * + * @return a reference to the port with the given name + */ + Port &getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; } // namespace gem5 From 785eba6ce120cbeee371ab39ce5e4079cb4bf91d Mon Sep 17 00:00:00 2001 From: Giacomo Travaglini Date: Tue, 12 Sep 2023 10:32:31 +0100 Subject: [PATCH 2/3] configs: Reflect TraceCPU changes in the etrace_replay script As we no longer inherit from the BaseCPU, we can't really use CPU generation methods (like Simulation.setCPUClass) and cache generation ones (like CacheConfig.config_cache). This is good news as it allows us to simplify the etrace script and to remove a dependency with the deprecated-to-be common library. Change-Id: Ic89ce2b9d713ee6f6e11bf20c5065426298b3da2 Signed-off-by: Giacomo Travaglini --- configs/example/etrace_replay.py | 67 ++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/configs/example/etrace_replay.py b/configs/example/etrace_replay.py index ddbf01acf5..184934fab9 100644 --- a/configs/example/etrace_replay.py +++ b/configs/example/etrace_replay.py @@ -1,4 +1,4 @@ -# Copyright (c) 2015 ARM Limited +# Copyright (c) 2015, 2023 Arm Limited # All rights reserved. # # The license below extends only to copyright in the software and shall @@ -43,10 +43,42 @@ addToPath("../") from common import Options from common import Simulation -from common import CacheConfig from common import MemConfig from common.Caches import * + +def config_cache(args, system): + """ + Configure the cache hierarchy. Only two configurations are natively + supported as an example: L1(I/D) only or L1 + L2. + """ + from common.CacheConfig import _get_cache_opts + + system.l1i = L1_ICache(**_get_cache_opts("l1i", args)) + system.l1d = L1_DCache(**_get_cache_opts("l1d", args)) + + system.cpu.dcache_port = system.l1d.cpu_side + system.cpu.icache_port = system.l1i.cpu_side + + if args.l2cache: + # Provide a clock for the L2 and the L1-to-L2 bus here as they + # are not connected using addTwoLevelCacheHierarchy. Use the + # same clock as the CPUs. + system.l2 = L2Cache( + clk_domain=system.cpu_clk_domain, **_get_cache_opts("l2", args) + ) + + system.tol2bus = L2XBar(clk_domain=system.cpu_clk_domain) + system.l2.cpu_side = system.tol2bus.mem_side_ports + system.l2.mem_side = system.membus.cpu_side_ports + + system.l1i.mem_side = system.tol2bus.cpu_side_ports + system.l1d.mem_side = system.tol2bus.cpu_side_ports + else: + system.l1i.mem_side = system.membus.cpu_side_ports + system.l1d.mem_side = system.membus.cpu_side_ports + + parser = argparse.ArgumentParser() Options.addCommonOptions(parser) @@ -59,29 +91,18 @@ if "--ruby" in sys.argv: args = parser.parse_args() -numThreads = 1 - -if args.cpu_type != "TraceCPU": - fatal( - "This is a script for elastic trace replay simulation, use " - "--cpu-type=TraceCPU\n" - ) - if args.num_cpus > 1: fatal("This script does not support multi-processor trace replay.\n") -# In this case FutureClass will be None as there is not fast forwarding or -# switching -(CPUClass, test_mem_mode, FutureClass) = Simulation.setCPUClass(args) -CPUClass.numThreads = numThreads - system = System( - cpu=CPUClass(cpu_id=0), - mem_mode=test_mem_mode, + mem_mode=TraceCPU.memory_mode(), mem_ranges=[AddrRange(args.mem_size)], cache_line_size=args.cacheline_size, ) +# Generate the TraceCPU +system.cpu = TraceCPU() + # Create a top-level voltage domain system.voltage_domain = VoltageDomain(voltage=args.sys_voltage) @@ -105,11 +126,6 @@ system.cpu_clk_domain = SrcClockDomain( for cpu in system.cpu: cpu.clk_domain = system.cpu_clk_domain -# BaseCPU no longer has default values for the BaseCPU.isa -# createThreads() is needed to fill in the cpu.isa -for cpu in system.cpu: - cpu.createThreads() - # Assign input trace files to the Trace CPU system.cpu.instTraceFile = args.inst_trace_file system.cpu.dataTraceFile = args.data_trace_file @@ -118,8 +134,11 @@ system.cpu.dataTraceFile = args.data_trace_file MemClass = Simulation.setMemClass(args) system.membus = SystemXBar() system.system_port = system.membus.cpu_side_ports -CacheConfig.config_cache(args, system) + +# Configure the classic cache hierarchy +config_cache(args, system) + MemConfig.config_mem(args, system) root = Root(full_system=False, system=system) -Simulation.run(args, root, system, FutureClass) +Simulation.run(args, root, system, None) From a0a799f4743a007706406bd5114e897343b9b020 Mon Sep 17 00:00:00 2001 From: Giacomo Travaglini Date: Tue, 12 Sep 2023 13:39:22 +0100 Subject: [PATCH 3/3] cpu: Disable CPU switching functionality with TraceCPU Now that the TraceCPU is no longer a BaseCPU we disable CPU switching functionality. AFAICS from the code, it seems like using m5.switchCpus was never really working. The takeOverFrom was described as being used when checkpointing (which is not really the case). Moreover the icache/dcache event loops were not checking if the CPU was switched out so the trace was always been consumed regardless of the BaseCPU state. Note: IMHO the only case where you might want to switch between an execution-driven CPU to the TraceCPU is when you want to warm your caches before the ROI. All other cases don't really make sense as with the TraceCPU there is no architectural state being maintained/updated. Change-Id: I0611359d2b833e1bc0762be72642df24a7c92b1e Signed-off-by: Giacomo Travaglini --- src/cpu/trace/TraceCPU.py | 4 ---- src/cpu/trace/trace_cpu.cc | 9 +-------- src/cpu/trace/trace_cpu.hh | 10 +--------- 3 files changed, 2 insertions(+), 21 deletions(-) diff --git a/src/cpu/trace/TraceCPU.py b/src/cpu/trace/TraceCPU.py index 586e77caed..5e82fd9f9f 100644 --- a/src/cpu/trace/TraceCPU.py +++ b/src/cpu/trace/TraceCPU.py @@ -55,10 +55,6 @@ class TraceCPU(ClockedObject): def require_caches(cls): return True - @classmethod - def support_take_over(cls): - return True - system = Param.System(Parent.any, "system object") icache_port = RequestPort("Instruction Port") diff --git a/src/cpu/trace/trace_cpu.cc b/src/cpu/trace/trace_cpu.cc index ab14c8784a..d5c6b4fb3c 100644 --- a/src/cpu/trace/trace_cpu.cc +++ b/src/cpu/trace/trace_cpu.cc @@ -39,6 +39,7 @@ #include "base/compiler.hh" #include "sim/sim_exit.hh" +#include "sim/system.hh" namespace gem5 { @@ -94,14 +95,6 @@ TraceCPU::updateNumOps(uint64_t rob_num) } } -void -TraceCPU::takeOverFrom(BaseCPU *oldCPU) -{ - // Unbind the ports of the old CPU and bind the ports of the TraceCPU. - getInstPort().takeOverFrom(&oldCPU->getInstPort()); - getDataPort().takeOverFrom(&oldCPU->getDataPort()); -} - void TraceCPU::init() { diff --git a/src/cpu/trace/trace_cpu.hh b/src/cpu/trace/trace_cpu.hh index f3f5c38786..d54b3c4227 100644 --- a/src/cpu/trace/trace_cpu.hh +++ b/src/cpu/trace/trace_cpu.hh @@ -45,10 +45,10 @@ #include #include "base/statistics.hh" -#include "cpu/base.hh" #include "debug/TraceCPUData.hh" #include "debug/TraceCPUInst.hh" #include "mem/packet.hh" +#include "mem/port.hh" #include "mem/request.hh" #include "params/TraceCPU.hh" #include "proto/inst_dep_record.pb.h" @@ -163,14 +163,6 @@ class TraceCPU : public ClockedObject */ void updateNumOps(uint64_t rob_num); - /* - * When resuming from checkpoint in FS mode, the TraceCPU takes over from - * the old cpu. This function overrides the takeOverFrom() function in the - * BaseCPU. It unbinds the ports of the old CPU and binds the ports of the - * TraceCPU. - */ - void takeOverFrom(BaseCPU *oldCPU); - /** * When instruction cache port receives a retry, schedule event * icacheNextEvent.