From 36dc93a5fa09765b9d2bac402bb557d228effcad Mon Sep 17 00:00:00 2001 From: Andreas Hansson Date: Mon, 2 Mar 2015 04:00:47 -0500 Subject: [PATCH] mem: Move crossbar default latencies to subclasses This patch introduces a few subclasses to the CoherentXBar and NoncoherentXBar to distinguish the different uses in the system. We use the crossbar in a wide range of places: interfacing cores to the L2, as a system interconnect, connecting I/O and peripherals, etc. Needless to say, these crossbars have very different performance, and the clock frequency alone is not enough to distinguish these scenarios. Instead of trying to capture every possible case, this patch introduces dedicated subclasses for the three primary use-cases: L2XBar, SystemXBar and IOXBar. More can be added if needed, and the defaults can be overridden. --- configs/common/CacheConfig.py | 6 +-- configs/common/FSConfig.py | 14 +++---- configs/dram/sweep.py | 2 +- configs/example/memcheck.py | 4 +- configs/example/memtest.py | 4 +- configs/example/ruby_mem_test.py | 2 +- configs/example/se.py | 2 +- configs/ruby/Ruby.py | 2 +- configs/splash2/cluster.py | 10 ++--- configs/splash2/run.py | 4 +- src/cpu/BaseCPU.py | 7 +--- src/mem/XBar.py | 51 +++++++++++++++++++++++--- tests/configs/base_config.py | 4 +- tests/configs/memtest-filter.py | 6 +-- tests/configs/memtest.py | 4 +- tests/configs/o3-timing-mp-ruby.py | 2 +- tests/configs/o3-timing-ruby.py | 2 +- tests/configs/simple-atomic-mp-ruby.py | 2 +- tests/configs/tgen-dram-ctrl.py | 2 +- tests/configs/tgen-simple-mem.py | 2 +- 20 files changed, 84 insertions(+), 48 deletions(-) diff --git a/configs/common/CacheConfig.py b/configs/common/CacheConfig.py index f31b3d566a..66fe491e1f 100644 --- a/configs/common/CacheConfig.py +++ b/configs/common/CacheConfig.py @@ -65,14 +65,12 @@ def config_cache(options, system): if options.l2cache: # Provide a clock for the L2 and the L1-to-L2 bus here as they # are not connected using addTwoLevelCacheHierarchy. 
Use the - # same clock as the CPUs, and set the L1-to-L2 bus width to 32 - # bytes (256 bits). + # same clock as the CPUs. system.l2 = l2_cache_class(clk_domain=system.cpu_clk_domain, size=options.l2_size, assoc=options.l2_assoc) - system.tol2bus = CoherentXBar(clk_domain = system.cpu_clk_domain, - width = 32) + system.tol2bus = L2XBar(clk_domain = system.cpu_clk_domain) system.l2.cpu_side = system.tol2bus.master system.l2.mem_side = system.membus.slave diff --git a/configs/common/FSConfig.py b/configs/common/FSConfig.py index cfa6dee4d6..e95fff4241 100644 --- a/configs/common/FSConfig.py +++ b/configs/common/FSConfig.py @@ -50,7 +50,7 @@ class CowIdeDisk(IdeDisk): def childImage(self, ci): self.image.child.image_file = ci -class MemBus(CoherentXBar): +class MemBus(SystemXBar): badaddr_responder = BadAddr() default = Self.badaddr_responder.pio @@ -78,7 +78,7 @@ def makeLinuxAlphaSystem(mem_mode, mdesc=None, ruby=False, cmdline=None): self.tsunami = BaseTsunami() # Create the io bus to connect all device ports - self.iobus = NoncoherentXBar() + self.iobus = IOXBar() self.tsunami.attachIO(self.iobus) self.tsunami.ide.pio = self.iobus.master @@ -143,7 +143,7 @@ def makeSparcSystem(mem_mode, mdesc=None): # generic system mdesc = SysConfig() self.readfile = mdesc.script() - self.iobus = NoncoherentXBar() + self.iobus = IOXBar() self.membus = MemBus() self.bridge = Bridge(delay='50ns') self.t1000 = T1000() @@ -205,7 +205,7 @@ def makeArmSystem(mem_mode, machine_type, num_cpus=1, mdesc=None, mdesc = SysConfig() self.readfile = mdesc.script() - self.iobus = NoncoherentXBar() + self.iobus = IOXBar() self.membus = MemBus() self.membus.badaddr_responder.warn_access = "warn" self.bridge = Bridge(delay='50ns') @@ -311,7 +311,7 @@ def makeLinuxMipsSystem(mem_mode, mdesc=None, cmdline=None): # generic system mdesc = SysConfig() self.readfile = mdesc.script() - self.iobus = NoncoherentXBar() + self.iobus = IOXBar() self.membus = MemBus() self.bridge = Bridge(delay='50ns') 
self.mem_ranges = [AddrRange('1GB')] @@ -358,7 +358,7 @@ def connectX86ClassicSystem(x86_sys, numCPUs): x86_sys.membus = MemBus() # North Bridge - x86_sys.iobus = NoncoherentXBar() + x86_sys.iobus = IOXBar() x86_sys.bridge = Bridge(delay='50ns') x86_sys.bridge.master = x86_sys.iobus.slave x86_sys.bridge.slave = x86_sys.membus.master @@ -394,7 +394,7 @@ def connectX86ClassicSystem(x86_sys, numCPUs): def connectX86RubySystem(x86_sys): # North Bridge - x86_sys.iobus = NoncoherentXBar() + x86_sys.iobus = IOXBar() # add the ide to the list of dma devices that later need to attach to # dma controllers diff --git a/configs/dram/sweep.py b/configs/dram/sweep.py index 18a58b2da4..f0b20dcc5d 100644 --- a/configs/dram/sweep.py +++ b/configs/dram/sweep.py @@ -84,7 +84,7 @@ if args: # start with the system itself, using a multi-layer 1.5 GHz # crossbar, delivering 64 bytes / 5 cycles (one header cycle) # which amounts to 19.2 GByte/s per layer and thus per port -system = System(membus = NoncoherentXBar(width = 16)) +system = System(membus = IOXBar(width = 16)) system.clk_domain = SrcClockDomain(clock = '1.5GHz', voltage_domain = VoltageDomain(voltage = '1V')) diff --git a/configs/example/memcheck.py b/configs/example/memcheck.py index 4f85223d9b..f0bc26e320 100644 --- a/configs/example/memcheck.py +++ b/configs/example/memcheck.py @@ -243,7 +243,7 @@ def make_cache_level(ncaches, prototypes, level, next_cache): if level != 0: # Create a crossbar and add it to the subsystem, note that # we do this even with a single element on this level - xbar = CoherentXBar(width = 32) + xbar = L2XBar(width = 32) subsys.xbar = xbar if next_cache: xbar.master = next_cache.cpu_side @@ -269,7 +269,7 @@ def make_cache_level(ncaches, prototypes, level, next_cache): if ntesters > 1: # Create a crossbar and add it to the subsystem - xbar = CoherentXBar(width = 32) + xbar = L2XBar(width = 32) subsys.xbar = xbar xbar.master = next_cache.cpu_side for tester, checker in zip(testers, checkers): diff --git 
a/configs/example/memtest.py b/configs/example/memtest.py index 6c1e657e46..9a66320d80 100644 --- a/configs/example/memtest.py +++ b/configs/example/memtest.py @@ -233,7 +233,7 @@ def make_cache_level(ncaches, prototypes, level, next_cache): if level != 0: # Create a crossbar and add it to the subsystem, note that # we do this even with a single element on this level - xbar = CoherentXBar(width = 32) + xbar = L2XBar() subsys.xbar = xbar if next_cache: xbar.master = next_cache.cpu_side @@ -258,7 +258,7 @@ def make_cache_level(ncaches, prototypes, level, next_cache): if ntesters > 1: # Create a crossbar and add it to the subsystem - xbar = CoherentXBar(width = 32) + xbar = L2XBar() subsys.xbar = xbar xbar.master = next_cache.cpu_side for tester in testers: diff --git a/configs/example/ruby_mem_test.py b/configs/example/ruby_mem_test.py index f5e6d2a824..e2887410f7 100644 --- a/configs/example/ruby_mem_test.py +++ b/configs/example/ruby_mem_test.py @@ -106,7 +106,7 @@ cpus = [ MemTest(atomic = False, system = System(cpu = cpus, funcmem = SimpleMemory(in_addr_map = False), - funcbus = NoncoherentXBar(), + funcbus = IOXBar(), clk_domain = SrcClockDomain(clock = options.sys_clock), mem_ranges = [AddrRange(options.mem_size)]) diff --git a/configs/example/se.py b/configs/example/se.py index 3f51acdeb3..a582d29762 100644 --- a/configs/example/se.py +++ b/configs/example/se.py @@ -265,7 +265,7 @@ if options.ruby: system.cpu[i].dtb.walker.port = ruby_port.slave else: MemClass = Simulation.setMemClass(options) - system.membus = CoherentXBar() + system.membus = SystemXBar() system.system_port = system.membus.slave CacheConfig.config_cache(options, system) MemConfig.config_mem(options, system) diff --git a/configs/ruby/Ruby.py b/configs/ruby/Ruby.py index e0d53fd6ce..1fa969782c 100644 --- a/configs/ruby/Ruby.py +++ b/configs/ruby/Ruby.py @@ -116,7 +116,7 @@ def setup_memory_controllers(system, ruby, dir_cntrls, options): crossbar = None if len(system.mem_ranges) > 1: - crossbar 
= NoncoherentXBar() + crossbar = IOXBar() crossbars.append(crossbar) dir_cntrl.memory = crossbar.slave diff --git a/configs/splash2/cluster.py b/configs/splash2/cluster.py index b17c8877ee..9fafcb70cc 100644 --- a/configs/splash2/cluster.py +++ b/configs/splash2/cluster.py @@ -171,7 +171,7 @@ if options.timing: for j in xrange(options.numclusters): clusters[j].id = j for cluster in clusters: - cluster.clusterbus = CoherentXBar(clock=busFrequency) + cluster.clusterbus = L2XBar(clock=busFrequency) all_l1buses += [cluster.clusterbus] cluster.cpus = [TimingSimpleCPU(cpu_id = i + cluster.id, clock=options.frequency) @@ -184,7 +184,7 @@ elif options.detailed: for j in xrange(options.numclusters): clusters[j].id = j for cluster in clusters: - cluster.clusterbus = CoherentXBar(clock=busFrequency) + cluster.clusterbus = L2XBar(clock=busFrequency) all_l1buses += [cluster.clusterbus] cluster.cpus = [DerivO3CPU(cpu_id = i + cluster.id, clock=options.frequency) @@ -197,7 +197,7 @@ else: for j in xrange(options.numclusters): clusters[j].id = j for cluster in clusters: - cluster.clusterbus = CoherentXBar(clock=busFrequency) + cluster.clusterbus = L2XBar(clock=busFrequency) all_l1buses += [cluster.clusterbus] cluster.cpus = [AtomicSimpleCPU(cpu_id = i + cluster.id, clock=options.frequency) @@ -211,10 +211,10 @@ else: # ---------------------- system = System(cpu = all_cpus, l1_ = all_l1s, l1bus_ = all_l1buses, physmem = SimpleMemory(), - membus = CoherentXBar(clock = busFrequency)) + membus = SystemXBar(clock = busFrequency)) system.clock = '1GHz' -system.toL2bus = CoherentXBar(clock = busFrequency) +system.toL2bus = L2XBar(clock = busFrequency) system.l2 = L2(size = options.l2size, assoc = 8) # ---------------------- diff --git a/configs/splash2/run.py b/configs/splash2/run.py index d542a94370..14e5f47d4d 100644 --- a/configs/splash2/run.py +++ b/configs/splash2/run.py @@ -196,10 +196,10 @@ else: # Create a system, and add system wide objects # ---------------------- system = 
System(cpu = cpus, physmem = SimpleMemory(), - membus = CoherentXBar(clock = busFrequency)) + membus = SystemXBar(clock = busFrequency)) system.clock = '1GHz' -system.toL2bus = CoherentXBar(clock = busFrequency) +system.toL2bus = L2XBar(clock = busFrequency) system.l2 = L2(size = options.l2size, assoc = 8) # ---------------------- diff --git a/src/cpu/BaseCPU.py b/src/cpu/BaseCPU.py index ee6c05f467..9aa24c97b3 100644 --- a/src/cpu/BaseCPU.py +++ b/src/cpu/BaseCPU.py @@ -47,7 +47,7 @@ from m5.defines import buildEnv from m5.params import * from m5.proxy import * -from XBar import CoherentXBar +from XBar import L2XBar from InstTracer import InstTracer from CPUTracers import ExeTracer from MemObject import MemObject @@ -285,10 +285,7 @@ class BaseCPU(MemObject): def addTwoLevelCacheHierarchy(self, ic, dc, l2c, iwc = None, dwc = None): self.addPrivateSplitL1Caches(ic, dc, iwc, dwc) - # Set a width of 32 bytes (256-bits), which is four times that - # of the default bus. The clock of the CPU is inherited by - # default. - self.toL2Bus = CoherentXBar(width = 32) + self.toL2Bus = L2XBar() self.connectCachedPorts(self.toL2Bus) self.l2cache = l2c self.toL2Bus.master = self.l2cache.cpu_side diff --git a/src/mem/XBar.py b/src/mem/XBar.py index 64910ed726..a445b5e37e 100644 --- a/src/mem/XBar.py +++ b/src/mem/XBar.py @@ -66,12 +66,12 @@ class BaseXBar(MemObject): # is the latency involved once a decision is made to forward the # request. The response latency, is similar to the forward # latency, but for responses rather than requests. 
- frontend_latency = Param.Cycles(3, "Frontend latency") - forward_latency = Param.Cycles(4, "Forward latency") - response_latency = Param.Cycles(2, "Response latency") + frontend_latency = Param.Cycles("Frontend latency") + forward_latency = Param.Cycles("Forward latency") + response_latency = Param.Cycles("Response latency") # Width governing the throughput of the crossbar - width = Param.Unsigned(8, "Datapath width per port (bytes)") + width = Param.Unsigned("Datapath width per port (bytes)") # The default port can be left unconnected, or be used to connect # a default slave port @@ -95,7 +95,7 @@ class CoherentXBar(BaseXBar): # The coherent crossbar additionally has snoop responses that are # forwarded after a specific latency. - snoop_response_latency = Param.Cycles(4, "Snoop response latency") + snoop_response_latency = Param.Cycles("Snoop response latency") # An optional snoop filter snoop_filter = Param.SnoopFilter(NULL, "Selected snoop filter") @@ -111,3 +111,44 @@ class SnoopFilter(SimObject): lookup_latency = Param.Cycles(1, "Lookup latency") system = Param.System(Parent.any, "System that the crossbar belongs to.") + +# We use a coherent crossbar to connect multiple masters to the L2 +# caches. Normally this crossbar would be part of the cache itself. +class L2XBar(CoherentXBar): + # 256-bit crossbar by default + width = 32 + + # Assume that most of this is covered by the cache latencies, with + # no more than a single pipeline stage for any packet. + frontend_latency = 1 + forward_latency = 0 + response_latency = 1 + snoop_response_latency = 1 + +# One of the key coherent crossbar instances is the system +# interconnect, tying together the CPU clusters, GPUs, and any I/O +# coherent masters, and DRAM controllers. +class SystemXBar(CoherentXBar): + # 128-bit crossbar by default + width = 16 + + # A handful of pipeline stages for each portion of the latency + # contributions. 
+ frontend_latency = 3 + forward_latency = 4 + response_latency = 2 + snoop_response_latency = 4 + +# In addition to the system interconnect, we typically also have one +# or more on-chip I/O crossbars. Note that at some point we might want +# to also define an off-chip I/O crossbar such as PCIe. +class IOXBar(NoncoherentXBar): + # 128-bit crossbar by default + width = 16 + + # Assume a simpler datapath than a coherent crossbar, incurring + # fewer pipeline stages for decision making and forwarding of + # requests. + frontend_latency = 2 + forward_latency = 1 + response_latency = 2 diff --git a/tests/configs/base_config.py b/tests/configs/base_config.py index 5637ca3f54..c440d48d96 100644 --- a/tests/configs/base_config.py +++ b/tests/configs/base_config.py @@ -104,7 +104,7 @@ class BaseSystem(object): Returns: A bus that CPUs should use to connect to the shared cache. """ - system.toL2Bus = CoherentXBar(clk_domain=system.cpu_clk_domain) + system.toL2Bus = L2XBar(clk_domain=system.cpu_clk_domain) system.l2c = L2Cache(clk_domain=system.cpu_clk_domain, size='4MB', assoc=8) system.l2c.cpu_side = system.toL2Bus.master @@ -186,7 +186,7 @@ class BaseSESystem(BaseSystem): def create_system(self): system = System(physmem = self.mem_class(), - membus = CoherentXBar(), + membus = SystemXBar(), mem_mode = self.mem_mode) system.system_port = system.membus.slave system.physmem.port = system.membus.master diff --git a/tests/configs/memtest-filter.py b/tests/configs/memtest-filter.py index 42dd056394..34ac75f00e 100644 --- a/tests/configs/memtest-filter.py +++ b/tests/configs/memtest-filter.py @@ -38,7 +38,7 @@ cpus = [ MemTest() for i in xrange(nb_cores) ] # system simulated system = System(cpu = cpus, physmem = SimpleMemory(), - membus = CoherentXBar(width=16, snoop_filter = SnoopFilter())) + membus = SystemXBar(width=16, snoop_filter = SnoopFilter())) # Dummy voltage domain for all our clock domains system.voltage_domain = VoltageDomain() system.clk_domain = SrcClockDomain(clock 
= '1GHz', @@ -49,8 +49,8 @@ system.clk_domain = SrcClockDomain(clock = '1GHz', system.cpu_clk_domain = SrcClockDomain(clock = '2GHz', voltage_domain = system.voltage_domain) -system.toL2Bus = CoherentXBar(clk_domain = system.cpu_clk_domain, width=16, - snoop_filter = SnoopFilter()) +system.toL2Bus = L2XBar(clk_domain = system.cpu_clk_domain, + snoop_filter = SnoopFilter()) system.l2c = L2Cache(clk_domain = system.cpu_clk_domain, size='64kB', assoc=8) system.l2c.cpu_side = system.toL2Bus.master diff --git a/tests/configs/memtest.py b/tests/configs/memtest.py index 42f50ce3b9..5bbfeb774b 100644 --- a/tests/configs/memtest.py +++ b/tests/configs/memtest.py @@ -38,7 +38,7 @@ cpus = [ MemTest() for i in xrange(nb_cores) ] # system simulated system = System(cpu = cpus, physmem = SimpleMemory(), - membus = CoherentXBar(width=16)) + membus = SystemXBar()) # Dummy voltage domain for all our clock domains system.voltage_domain = VoltageDomain() system.clk_domain = SrcClockDomain(clock = '1GHz', @@ -49,7 +49,7 @@ system.clk_domain = SrcClockDomain(clock = '1GHz', system.cpu_clk_domain = SrcClockDomain(clock = '2GHz', voltage_domain = system.voltage_domain) -system.toL2Bus = CoherentXBar(clk_domain = system.cpu_clk_domain, width=16) +system.toL2Bus = L2XBar(clk_domain = system.cpu_clk_domain) system.l2c = L2Cache(clk_domain = system.cpu_clk_domain, size='64kB', assoc=8) system.l2c.cpu_side = system.toL2Bus.master diff --git a/tests/configs/o3-timing-mp-ruby.py b/tests/configs/o3-timing-mp-ruby.py index 3fea4ed711..fb2d56fd1f 100644 --- a/tests/configs/o3-timing-mp-ruby.py +++ b/tests/configs/o3-timing-mp-ruby.py @@ -38,7 +38,7 @@ import ruby_config ruby_memory = ruby_config.generate("TwoLevel_SplitL1UnifiedL2.rb", nb_cores) # system simulated -system = System(cpu = cpus, physmem = ruby_memory, membus = CoherentXBar(), +system = System(cpu = cpus, physmem = ruby_memory, membus = SystemXBar(), mem_mode = "timing", clk_domain = SrcClockDomain(clock = '1GHz')) diff --git 
a/tests/configs/o3-timing-ruby.py b/tests/configs/o3-timing-ruby.py index 68a07e7025..c47d9f355c 100644 --- a/tests/configs/o3-timing-ruby.py +++ b/tests/configs/o3-timing-ruby.py @@ -39,7 +39,7 @@ cpu = DerivO3CPU(cpu_id=0) system = System(cpu = cpu, physmem = ruby_memory, - membus = CoherentXBar(), + membus = SystemXBar(), mem_mode = "timing", clk_domain = SrcClockDomain(clock = '1GHz')) diff --git a/tests/configs/simple-atomic-mp-ruby.py b/tests/configs/simple-atomic-mp-ruby.py index 321cb977fb..bdda6d0051 100644 --- a/tests/configs/simple-atomic-mp-ruby.py +++ b/tests/configs/simple-atomic-mp-ruby.py @@ -38,7 +38,7 @@ import ruby_config ruby_memory = ruby_config.generate("TwoLevel_SplitL1UnifiedL2.rb", nb_cores) # system simulated -system = System(cpu = cpus, physmem = ruby_memory, membus = CoherentXBar(), +system = System(cpu = cpus, physmem = ruby_memory, membus = SystemXBar(), clk_domain = SrcClockDomain(clock = '1GHz')) # Create a seperate clock domain for components that should run at diff --git a/tests/configs/tgen-dram-ctrl.py b/tests/configs/tgen-dram-ctrl.py index d170ac0774..cd6721e6dd 100644 --- a/tests/configs/tgen-dram-ctrl.py +++ b/tests/configs/tgen-dram-ctrl.py @@ -49,7 +49,7 @@ cpu = TrafficGen(config_file = "tests/quick/se/70.tgen/tgen-dram-ctrl.cfg") # system simulated system = System(cpu = cpu, physmem = DDR3_1600_x64(), - membus = NoncoherentXBar(width = 16), + membus = IOXBar(width = 16), clk_domain = SrcClockDomain(clock = '1GHz', voltage_domain = VoltageDomain())) diff --git a/tests/configs/tgen-simple-mem.py b/tests/configs/tgen-simple-mem.py index be700ac7af..edb2f9fcdc 100644 --- a/tests/configs/tgen-simple-mem.py +++ b/tests/configs/tgen-simple-mem.py @@ -49,7 +49,7 @@ cpu = TrafficGen(config_file = "tests/quick/se/70.tgen/tgen-simple-mem.cfg") # system simulated system = System(cpu = cpu, physmem = SimpleMemory(), - membus = NoncoherentXBar(width = 16), + membus = IOXBar(width = 16), clk_domain = SrcClockDomain(clock = '1GHz', 
voltage_domain = VoltageDomain()))