# Copyright (c) 2018-2021 Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""Random tester configuration for the gem5 GPU Ruby protocol.

Builds a ProtocolTester-driven system with a configurable number of
compute units (CUs), SQCs, scalar caches, and DMA engines, connects the
tester's ports to the Ruby sequencers, and simulates until the tester
completes all episodes or a deadlock is detected.
"""
import m5
from m5.objects import *
from m5.defines import buildEnv
from m5.util import addToPath
import os, argparse, sys

addToPath('../')

from common import Options
from ruby import Ruby

#
# Add the ruby specific and protocol specific options
#
parser = argparse.ArgumentParser()
Options.addNoISAOptions(parser)
Ruby.define_options(parser)

# GPU Ruby tester options
parser.add_argument("--cache-size", default="small",
                    choices=["small", "large"],
                    help="Cache sizes to use. Small encourages races between \
                        requests and writebacks. Large stresses write-through \
                        and/or write-back GPU caches.")
parser.add_argument("--system-size", default="small",
                    choices=["small", "medium", "large"],
                    help="This option defines how many CUs, CPUs and cache \
                        components in the test system.")
parser.add_argument("--address-range", default="small",
                    choices=["small", "large"],
                    help="This option defines the number of atomic \
                        locations that affects the working set's size. \
                        A small number of atomic locations encourage more \
                        races among threads. The large option stresses cache \
                        resources.")
parser.add_argument("--episode-length", default="short",
                    choices=["short", "medium", "long"],
                    help="This option defines the number of LDs and \
                        STs in an episode. The small option encourages races \
                        between the start and end of an episode. The long \
                        option encourages races between LDs and STs in the \
                        same episode.")
parser.add_argument("--test-length", type=int, default=1,
                    help="The number of episodes to be executed by each \
                        wavefront. This determines the maximum number, i.e., \
                        val X #WFs, of episodes to be executed in the test.")
parser.add_argument("--debug-tester", action='store_true',
                    help="This option will turn on DRF checker")
parser.add_argument("--random-seed", type=int, default=0,
                    help="Random seed number. Default value (i.e., 0) means \
                        using runtime-specific value")
parser.add_argument("--log-file", type=str, default="gpu-ruby-test.log")
parser.add_argument("--num-dmas", type=int, default=0,
                    help="The number of DMA engines to use in tester config.")

args = parser.parse_args()

#
# Set up cache size - 2 options
#   0: small cache
#   1: large cache
#
if args.cache_size == "small":
    args.tcp_size = "256B"
    args.tcp_assoc = 2
    args.tcc_size = "1kB"
    args.tcc_assoc = 2
elif args.cache_size == "large":
    args.tcp_size = "256kB"
    args.tcp_assoc = 16
    args.tcc_size = "1024kB"
    args.tcc_assoc = 16

#
# Set up system size - 3 options
#
# NOTE(review): each branch overwrites args.num_dmas, so a --num-dmas value
# given on the command line is silently ignored — confirm this is intended.
#
if args.system_size == "small":
    # 1 CU, 1 CPU, 1 SQC, 1 Scalar
    args.wf_size = 1
    args.wavefronts_per_cu = 1
    args.num_cpus = 1
    args.num_dmas = 1
    args.cu_per_sqc = 1
    args.cu_per_scalar_cache = 1
    args.num_compute_units = 1
elif args.system_size == "medium":
    # 4 CUs, 4 CPUs, 1 SQCs, 1 Scalars
    args.wf_size = 16
    args.wavefronts_per_cu = 4
    args.num_cpus = 4
    args.num_dmas = 2
    args.cu_per_sqc = 4
    args.cu_per_scalar_cache = 4
    args.num_compute_units = 4
elif args.system_size == "large":
    # 8 CUs, 4 CPUs, 1 SQCs, 1 Scalars
    args.wf_size = 32
    args.wavefronts_per_cu = 4
    args.num_cpus = 4
    args.num_dmas = 4
    args.cu_per_sqc = 4
    args.cu_per_scalar_cache = 4
    args.num_compute_units = 8

#
# Set address range - 2 options
#   level 0: small
#   level 1: large
# Each location corresponds to a 4-byte piece of data
#
args.mem_size = '1024MB'
if args.address_range == "small":
    num_atomic_locs = 10
    num_regular_locs_per_atomic_loc = 10000
elif args.address_range == "large":
    num_atomic_locs = 100
    num_regular_locs_per_atomic_loc = 100000

#
# Set episode length (# of actions per episode) - 3 options
#   0: 10 actions
#   1: 100 actions
#   2: 500 actions
#
if args.episode_length == "short":
    eps_length = 10
elif args.episode_length == "medium":
    eps_length = 100
elif args.episode_length == "long":
    eps_length = 500

#
# Set Ruby and tester deadlock thresholds. Ruby's deadlock detection is the
# primary check for deadlocks. The tester's deadlock threshold detection is
# a secondary check for deadlock. If there is a bug in RubyPort that causes
# a packet not to return to the tester properly, the tester will issue a
# deadlock panic. We set cache_deadlock_threshold < tester_deadlock_threshold
# to detect deadlock caused by Ruby protocol first before one caused by the
# coalescer. Both units are in Ticks
#
args.cache_deadlock_threshold = 1e8
tester_deadlock_threshold = 1e9

# For now we're testing only GPU protocol, so we force num_cpus to be 0
args.num_cpus = 0

# Number of DMA engines
n_DMAs = args.num_dmas

# Number of CUs
n_CUs = args.num_compute_units

# Set test length, i.e., number of episodes per wavefront * #WFs.
# Test length can be 1x#WFs, 10x#WFs, 100x#WFs, ...
n_WFs = n_CUs * args.wavefronts_per_cu
max_episodes = args.test_length * n_WFs

# Number of SQC and Scalar caches
assert n_CUs % args.cu_per_sqc == 0
n_SQCs = n_CUs // args.cu_per_sqc
args.num_sqc = n_SQCs

assert args.cu_per_scalar_cache != 0
n_Scalars = n_CUs // args.cu_per_scalar_cache
args.num_scalar_cache = n_Scalars

#
# Create GPU Ruby random tester
#
tester = ProtocolTester(cus_per_sqc = args.cu_per_sqc,
                        cus_per_scalar = args.cu_per_scalar_cache,
                        wavefronts_per_cu = args.wavefronts_per_cu,
                        workitems_per_wavefront = args.wf_size,
                        num_atomic_locations = num_atomic_locs,
                        num_normal_locs_per_atomic = \
                                          num_regular_locs_per_atomic_loc,
                        max_num_episodes = max_episodes,
                        episode_length = eps_length,
                        debug_tester = args.debug_tester,
                        random_seed = args.random_seed,
                        log_file = args.log_file)

#
# Create a gem5 system. Note that the memory object isn't actually used by the
# tester, but is included to ensure the gem5 memory size == Ruby memory size
# checks. The system doesn't have real CPUs or CUs. It just has a tester that
# has physical ports to be connected to Ruby
#
system = System(cpu = tester,
                mem_ranges = [AddrRange(args.mem_size)],
                cache_line_size = args.cacheline_size,
                mem_mode = 'timing')

system.voltage_domain = VoltageDomain(voltage = args.sys_voltage)
system.clk_domain = SrcClockDomain(clock = args.sys_clock,
                                   voltage_domain = system.voltage_domain)

#
# Command processor is not needed for the tester since we don't run real
# kernels. Setting it to zero disables the VIPER protocol from creating
# a command processor and its caches.
#
args.num_cp = 0

#
# Make generic DMA sequencer for Ruby to use
#
if n_DMAs > 0:
    # NOTE(review): this list holds n_DMAs references to a single TesterDma
    # instance, not n_DMAs distinct objects — confirm whether distinct DMA
    # devices are intended here.
    dma_devices = [TesterDma()] * n_DMAs
    system.piobus = IOXBar()
    for dma_device in dma_devices:
        dma_device.pio = system.piobus.mem_side_ports
    system.dma_devices = dma_devices

#
# Create the Ruby system
#

# the ruby tester reuses num_cpus to specify the
# number of cpu ports connected to the tester object, which
# is stored in system.cpu. because there is only ever one
# tester object, num_cpus is not necessarily equal to the
# size of system.cpu
cpu_list = [ system.cpu ] * args.num_cpus
Ruby.create_system(args, full_system = False,
                   system = system,
                   dma_ports = system.dma_devices if n_DMAs > 0 else [],
                   cpus = cpu_list)

#
# The tester is most effective when randomization is turned on and
# artifical delay is randomly inserted on messages
#
system.ruby.randomization = True

# Assert that we got the right number of Ruby ports
assert len(system.ruby._cpu_ports) == n_CUs + n_SQCs + n_Scalars

#
# Attach Ruby ports to the tester in the order:
#               cpu_sequencers,
#               vector_coalescers,
#               sqc_sequencers,
#               scalar_sequencers
#
# Note that this requires the protocol to create sequencers in this order
#
print("Attaching ruby ports to the tester")
for i, ruby_port in enumerate(system.ruby._cpu_ports):
    ruby_port.no_retry_on_stall = True
    ruby_port.using_ruby_tester = True

    # piobus is only created if there are DMAs
    if n_DMAs > 0:
        ruby_port.mem_request_port = system.piobus.cpu_side_ports

    # Ports 0..n_CUs-1 are vector coalescers, then SQCs, then scalar caches
    if i < n_CUs:
        tester.cu_vector_ports = ruby_port.in_ports
        tester.cu_token_ports = ruby_port.gmTokenPort
        tester.max_cu_tokens = 4*n_WFs
    elif i < (n_CUs + n_SQCs):
        tester.cu_sqc_ports = ruby_port.in_ports
    else:
        tester.cu_scalar_ports = ruby_port.in_ports

#
# Attach DMA ports. Since Ruby.py doesn't return these they need to be found.
# Connect tester's request port to each DMA sequencer's in_ports. This assumes
# the protocol names these system.dma_cntrl<#>.
#
dma_ports = []
for i in range(n_DMAs):
    dma_cntrl = getattr(system, 'dma_cntrl' + str(i))
    dma_ports.append(dma_cntrl.dma_sequencer.in_ports)
tester.dma_ports = dma_ports

#
# Common variables for all types of threads
#
thread_clock = SrcClockDomain(clock = '1GHz',
                              voltage_domain = system.voltage_domain)
# Global thread id, unique across DMA threads and wavefronts
g_thread_idx = 0

#
# No CPU threads are used for GPU tester
#
tester.cpu_threads = []

#
# Create DMA threads
#
dma_threads = []
print("Creating %i DMAs" % n_DMAs)
for dma_idx in range(n_DMAs):
    dma_threads.append(DmaThread(thread_id = g_thread_idx,
                                 num_lanes = 1, clk_domain = thread_clock,
                                 deadlock_threshold = \
                                        tester_deadlock_threshold))
    g_thread_idx += 1
tester.dma_threads = dma_threads

#
# Create GPU wavefronts
#
wavefronts = []
print("Creating %i WFs attached to %i CUs" % \
                (n_CUs * tester.wavefronts_per_cu, n_CUs))
for cu_idx in range(n_CUs):
    for wf_idx in range(tester.wavefronts_per_cu):
        wavefronts.append(GpuWavefront(thread_id = g_thread_idx,
                                         cu_id = cu_idx,
                                         num_lanes = args.wf_size,
                                         clk_domain = thread_clock,
                                         deadlock_threshold = \
                                                tester_deadlock_threshold))
        g_thread_idx += 1
tester.wavefronts = wavefronts

#
# Run simulation
#
root = Root(full_system = False, system = system)

# Not much point in this being higher than the L1 latency
m5.ticks.setGlobalFrequency('1ns')

# Instantiate configuration
m5.instantiate()

# Simulate until tester completes
exit_event = m5.simulate()

print('Exiting tick: ', m5.curTick())
print('Exiting because ', exit_event.getCause())