tests,configs,mem-ruby: Adding Ruby tester for GPU_VIPER
This patch adds the GPU protocol tester that uses data-race-free operation to discover bugs in GPU protocols including GPU_VIPER. For more information please see the following paper and the README: T. Ta, X. Zhang, A. Gutierrez and B. M. Beckmann, "Autonomous Data-Race-Free GPU Testing," 2019 IEEE International Symposium on Workload Characterization (IISWC), Orlando, FL, USA, 2019, pp. 81-92, doi: 10.1109/IISWC47752.2019.9042019. Change-Id: Ic9939d131a930d1e7014ed0290601140bdd1499f Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/32855 Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com> Reviewed-by: Jason Lowe-Power <power.jg@gmail.com> Maintainer: Matt Sinclair <mattdsinclair@gmail.com> Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
@@ -1,4 +1,4 @@
|
|||||||
# Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
|
# Copyright (c) 2018-2020 Advanced Micro Devices, Inc.
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
# For use for simulation and test purposes only
|
# For use for simulation and test purposes only
|
||||||
@@ -43,145 +43,272 @@ addToPath('../')
|
|||||||
from common import Options
|
from common import Options
|
||||||
from ruby import Ruby
|
from ruby import Ruby
|
||||||
|
|
||||||
# Get paths we might need.
|
|
||||||
config_path = os.path.dirname(os.path.abspath(__file__))
|
|
||||||
config_root = os.path.dirname(config_path)
|
|
||||||
m5_root = os.path.dirname(config_root)
|
|
||||||
|
|
||||||
parser = optparse.OptionParser()
|
|
||||||
Options.addNoISAOptions(parser)
|
|
||||||
|
|
||||||
parser.add_option("--maxloads", metavar="N", default=100,
|
|
||||||
help="Stop after N loads")
|
|
||||||
parser.add_option("-f", "--wakeup_freq", metavar="N", default=10,
|
|
||||||
help="Wakeup every N cycles")
|
|
||||||
parser.add_option("-u", "--num-compute-units", type="int", default=1,
|
|
||||||
help="number of compute units in the GPU")
|
|
||||||
parser.add_option("--num-cp", type="int", default=0,
|
|
||||||
help="Number of GPU Command Processors (CP)")
|
|
||||||
# not super important now, but to avoid putting the number 4 everywhere, make
|
|
||||||
# it an option/knob
|
|
||||||
parser.add_option("--cu-per-sqc", type="int", default=4, help="number of CUs \
|
|
||||||
sharing an SQC (icache, and thus icache TLB)")
|
|
||||||
parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units" \
|
|
||||||
"per CU")
|
|
||||||
parser.add_option("--wf-size", type="int", default=64,
|
|
||||||
help="Wavefront size(in workitems)")
|
|
||||||
parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of " \
|
|
||||||
"WF slots per SIMD")
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Add the ruby specific and protocol specific options
|
# Add the ruby specific and protocol specific options
|
||||||
#
|
#
|
||||||
|
parser = optparse.OptionParser()
|
||||||
|
Options.addNoISAOptions(parser)
|
||||||
Ruby.define_options(parser)
|
Ruby.define_options(parser)
|
||||||
|
|
||||||
exec(compile( \
|
# GPU Ruby tester options
|
||||||
open(os.path.join(config_root, "common", "Options.py")).read(), \
|
parser.add_option("--cache-size", type="choice", default="small",
|
||||||
os.path.join(config_root, "common", "Options.py"), 'exec'))
|
choices=["small", "large"],
|
||||||
|
help="Cache sizes to use. Small encourages races between \
|
||||||
|
requests and writebacks. Large stresses write-through \
|
||||||
|
and/or write-back GPU caches.")
|
||||||
|
parser.add_option("--system-size", type="choice", default="small",
|
||||||
|
choices=["small", "medium", "large"],
|
||||||
|
help="This option defines how many CUs, CPUs and cache \
|
||||||
|
components in the test system.")
|
||||||
|
parser.add_option("--address-range", type="choice", default="small",
|
||||||
|
choices=["small", "large"],
|
||||||
|
help="This option defines the number of atomic \
|
||||||
|
locations that affects the working set's size. \
|
||||||
|
A small number of atomic locations encourage more \
|
||||||
|
races among threads. The large option stresses cache \
|
||||||
|
resources.")
|
||||||
|
parser.add_option("--episode-length", type="choice", default="short",
|
||||||
|
choices=["short", "medium", "long"],
|
||||||
|
help="This option defines the number of LDs and \
|
||||||
|
STs in an episode. The small option encourages races \
|
||||||
|
between the start and end of an episode. The long \
|
||||||
|
option encourages races between LDs and STs in the \
|
||||||
|
same episode.")
|
||||||
|
parser.add_option("--test-length", type="int", default=1,
|
||||||
|
help="The number of episodes to be executed by each \
|
||||||
|
wavefront. This determines the maximum number, i.e., \
|
||||||
|
val X #WFs, of episodes to be executed in the test.")
|
||||||
|
parser.add_option("--debug-tester", action='store_true',
|
||||||
|
help="This option will turn on DRF checker")
|
||||||
|
parser.add_option("--random-seed", type="int", default=0,
|
||||||
|
help="Random seed number. Default value (i.e., 0) means \
|
||||||
|
using runtime-specific value")
|
||||||
|
parser.add_option("--log-file", type="string", default="gpu-ruby-test.log")
|
||||||
|
|
||||||
(options, args) = parser.parse_args()
|
(options, args) = parser.parse_args()
|
||||||
|
|
||||||
#
|
|
||||||
# Set the default cache size and associativity to be very small to encourage
|
|
||||||
# races between requests and writebacks.
|
|
||||||
#
|
|
||||||
options.l1d_size="256B"
|
|
||||||
options.l1i_size="256B"
|
|
||||||
options.l2_size="512B"
|
|
||||||
options.l3_size="1kB"
|
|
||||||
options.l1d_assoc=2
|
|
||||||
options.l1i_assoc=2
|
|
||||||
options.l2_assoc=2
|
|
||||||
options.l3_assoc=2
|
|
||||||
|
|
||||||
# This file can support multiple compute units
|
|
||||||
assert(options.num_compute_units >= 1)
|
|
||||||
n_cu = options.num_compute_units
|
|
||||||
|
|
||||||
options.num_sqc = int((n_cu + options.cu_per_sqc - 1) // options.cu_per_sqc)
|
|
||||||
|
|
||||||
if args:
|
if args:
|
||||||
print("Error: script doesn't take any positional arguments")
|
print("Error: script doesn't take any positional arguments")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Create the ruby random tester
|
# Set up cache size - 2 options
|
||||||
|
# 0: small cache
|
||||||
|
# 1: large cache
|
||||||
#
|
#
|
||||||
|
if (options.cache_size == "small"):
|
||||||
# Check to for the GPU_RfO protocol. Other GPU protocols are non-SC and will
|
options.tcp_size="256B"
|
||||||
# not work with the Ruby random tester.
|
options.tcp_assoc=2
|
||||||
assert(buildEnv['PROTOCOL'] == 'GPU_RfO')
|
options.tcc_size="1kB"
|
||||||
|
options.tcc_assoc=2
|
||||||
# The GPU_RfO protocol does not support cache flushes
|
elif (options.cache_size == "large"):
|
||||||
check_flush = False
|
options.tcp_size="256kB"
|
||||||
|
options.tcp_assoc=16
|
||||||
tester = RubyTester(check_flush=check_flush,
|
options.tcc_size="1024kB"
|
||||||
checks_to_complete=options.maxloads,
|
options.tcc_assoc=16
|
||||||
wakeup_frequency=options.wakeup_freq,
|
|
||||||
deadlock_threshold=1000000)
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Create the M5 system. Note that the Memory Object isn't
|
# Set up system size - 3 options
|
||||||
# actually used by the rubytester, but is included to support the
|
|
||||||
# M5 memory size == Ruby memory size checks
|
|
||||||
#
|
#
|
||||||
system = System(cpu=tester, mem_ranges=[AddrRange(options.mem_size)])
|
if (options.system_size == "small"):
|
||||||
|
# 1 CU, 1 CPU, 1 SQC, 1 Scalar
|
||||||
|
options.wf_size = 1
|
||||||
|
options.wavefronts_per_cu = 1
|
||||||
|
options.num_cpus = 1
|
||||||
|
options.cu_per_sqc = 1
|
||||||
|
options.cu_per_scalar_cache = 1
|
||||||
|
options.num_compute_units = 1
|
||||||
|
elif (options.system_size == "medium"):
|
||||||
|
# 4 CUs, 4 CPUs, 1 SQCs, 1 Scalars
|
||||||
|
options.wf_size = 16
|
||||||
|
options.wavefronts_per_cu = 4
|
||||||
|
options.num_cpus = 4
|
||||||
|
options.cu_per_sqc = 4
|
||||||
|
options.cu_per_scalar_cache = 4
|
||||||
|
options.num_compute_units = 4
|
||||||
|
elif (options.system_size == "large"):
|
||||||
|
# 8 CUs, 4 CPUs, 1 SQCs, 1 Scalars
|
||||||
|
options.wf_size = 32
|
||||||
|
options.wavefronts_per_cu = 4
|
||||||
|
options.num_cpus = 4
|
||||||
|
options.cu_per_sqc = 4
|
||||||
|
options.cu_per_scalar_cache = 4
|
||||||
|
options.num_compute_units = 8
|
||||||
|
|
||||||
# Create a top-level voltage domain and clock domain
|
#
|
||||||
system.voltage_domain = VoltageDomain(voltage=options.sys_voltage)
|
# Set address range - 2 options
|
||||||
|
# level 0: small
|
||||||
|
# level 1: large
|
||||||
|
# Each location corresponds to a 4-byte piece of data
|
||||||
|
#
|
||||||
|
options.mem_size = '1024MB'
|
||||||
|
if (options.address_range == "small"):
|
||||||
|
num_atomic_locs = 10
|
||||||
|
num_regular_locs_per_atomic_loc = 10000
|
||||||
|
elif (options.address_range == "large"):
|
||||||
|
num_atomic_locs = 100
|
||||||
|
num_regular_locs_per_atomic_loc = 100000
|
||||||
|
|
||||||
system.clk_domain = SrcClockDomain(clock=options.sys_clock,
|
#
|
||||||
voltage_domain=system.voltage_domain)
|
# Set episode length (# of actions per episode) - 3 options
|
||||||
|
# 0: 10 actions
|
||||||
|
# 1: 100 actions
|
||||||
|
# 2: 500 actions
|
||||||
|
#
|
||||||
|
if (options.episode_length == "short"):
|
||||||
|
eps_length = 10
|
||||||
|
elif (options.episode_length == "medium"):
|
||||||
|
eps_length = 100
|
||||||
|
elif (options.episode_length == "long"):
|
||||||
|
eps_length = 500
|
||||||
|
|
||||||
|
#
|
||||||
|
# Set Ruby and tester deadlock thresholds. Ruby's deadlock detection is the
|
||||||
|
# primary check for deadlocks. The tester's deadlock threshold detection is
|
||||||
|
# a secondary check for deadlock. If there is a bug in RubyPort that causes
|
||||||
|
# a packet not to return to the tester properly, the tester will issue a
|
||||||
|
# deadlock panic. We set cache_deadlock_threshold < tester_deadlock_threshold
|
||||||
|
# to detect deadlock caused by Ruby protocol first before one caused by the
|
||||||
|
# coalescer. Both units are in Ticks
|
||||||
|
#
|
||||||
|
options.cache_deadlock_threshold = 1e8
|
||||||
|
tester_deadlock_threshold = 1e9
|
||||||
|
|
||||||
|
# For now we're testing only GPU protocol, so we force num_cpus to be 0
|
||||||
|
options.num_cpus = 0
|
||||||
|
|
||||||
|
# Number of CUs
|
||||||
|
n_CUs = options.num_compute_units
|
||||||
|
|
||||||
|
# Set test length, i.e., number of episodes per wavefront * #WFs.
|
||||||
|
# Test length can be 1x#WFs, 10x#WFs, 100x#WFs, ...
|
||||||
|
n_WFs = n_CUs * options.wavefronts_per_cu
|
||||||
|
max_episodes = options.test_length * n_WFs
|
||||||
|
|
||||||
|
# Number of SQC and Scalar caches
|
||||||
|
assert(n_CUs % options.cu_per_sqc == 0)
|
||||||
|
n_SQCs = n_CUs // options.cu_per_sqc
|
||||||
|
options.num_sqc = n_SQCs
|
||||||
|
|
||||||
|
assert(options.cu_per_scalar_cache != 0)
|
||||||
|
n_Scalars = n_CUs // options.cu_per_scalar_cache
|
||||||
|
options.num_scalar_cache = n_Scalars
|
||||||
|
|
||||||
|
#
|
||||||
|
# Create GPU Ruby random tester
|
||||||
|
#
|
||||||
|
tester = ProtocolTester(cus_per_sqc = options.cu_per_sqc,
|
||||||
|
cus_per_scalar = options.cu_per_scalar_cache,
|
||||||
|
wavefronts_per_cu = options.wavefronts_per_cu,
|
||||||
|
workitems_per_wavefront = options.wf_size,
|
||||||
|
num_atomic_locations = num_atomic_locs,
|
||||||
|
num_normal_locs_per_atomic = \
|
||||||
|
num_regular_locs_per_atomic_loc,
|
||||||
|
max_num_episodes = max_episodes,
|
||||||
|
episode_length = eps_length,
|
||||||
|
debug_tester = options.debug_tester,
|
||||||
|
random_seed = options.random_seed,
|
||||||
|
log_file = options.log_file)
|
||||||
|
|
||||||
|
#
|
||||||
|
# Create a gem5 system. Note that the memory object isn't actually used by the
|
||||||
|
# tester, but is included to ensure the gem5 memory size == Ruby memory size
|
||||||
|
# checks. The system doesn't have real CPUs or CUs. It just has a tester that
|
||||||
|
# has physical ports to be connected to Ruby
|
||||||
|
#
|
||||||
|
system = System(cpu = tester,
|
||||||
|
mem_ranges = [AddrRange(options.mem_size)],
|
||||||
|
cache_line_size = options.cacheline_size,
|
||||||
|
mem_mode = 'timing')
|
||||||
|
|
||||||
|
system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
|
||||||
|
system.clk_domain = SrcClockDomain(clock = options.sys_clock,
|
||||||
|
voltage_domain = system.voltage_domain)
|
||||||
|
|
||||||
|
#
|
||||||
|
# Command processor is not needed for the tester since we don't run real
|
||||||
|
# kernels. Setting it to zero disables the VIPER protocol from creating
|
||||||
|
# a command processor and its caches.
|
||||||
|
#
|
||||||
|
options.num_cp = 0
|
||||||
|
|
||||||
|
#
|
||||||
|
# Create the Ruby system
|
||||||
|
#
|
||||||
Ruby.create_system(options, False, system)
|
Ruby.create_system(options, False, system)
|
||||||
|
|
||||||
# Create a seperate clock domain for Ruby
|
|
||||||
system.ruby.clk_domain = SrcClockDomain(clock=options.ruby_clock,
|
|
||||||
voltage_domain=system.voltage_domain)
|
|
||||||
|
|
||||||
tester.num_cpus = len(system.ruby._cpu_ports)
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# The tester is most effective when randomization is turned on and
|
# The tester is most effective when randomization is turned on and
|
||||||
# artifical delay is randomly inserted on messages
|
# artifical delay is randomly inserted on messages
|
||||||
#
|
#
|
||||||
system.ruby.randomization = True
|
system.ruby.randomization = True
|
||||||
|
|
||||||
for ruby_port in system.ruby._cpu_ports:
|
# Assert that we got the right number of Ruby ports
|
||||||
|
assert(len(system.ruby._cpu_ports) == n_CUs + n_SQCs + n_Scalars)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Tie the ruby tester ports to the ruby cpu read and write ports
|
# Attach Ruby ports to the tester in the order:
|
||||||
#
|
# cpu_sequencers,
|
||||||
if ruby_port.support_data_reqs and ruby_port.support_inst_reqs:
|
# vector_coalescers,
|
||||||
tester.cpuInstDataPort = ruby_port.slave
|
# sqc_sequencers,
|
||||||
elif ruby_port.support_data_reqs:
|
# scalar_sequencers
|
||||||
tester.cpuDataPort = ruby_port.slave
|
#
|
||||||
elif ruby_port.support_inst_reqs:
|
# Note that this requires the protocol to create sequencers in this order
|
||||||
tester.cpuInstPort = ruby_port.slave
|
#
|
||||||
|
print("Attaching ruby ports to the tester")
|
||||||
# Do not automatically retry stalled Ruby requests
|
for i, ruby_port in enumerate(system.ruby._cpu_ports):
|
||||||
ruby_port.no_retry_on_stall = True
|
ruby_port.no_retry_on_stall = True
|
||||||
|
|
||||||
#
|
|
||||||
# Tell each sequencer this is the ruby tester so that it
|
|
||||||
# copies the subblock back to the checker
|
|
||||||
#
|
|
||||||
ruby_port.using_ruby_tester = True
|
ruby_port.using_ruby_tester = True
|
||||||
|
|
||||||
# -----------------------
|
if i < n_CUs:
|
||||||
# run simulation
|
tester.cu_vector_ports = ruby_port.in_ports
|
||||||
# -----------------------
|
tester.cu_token_ports = ruby_port.gmTokenPort
|
||||||
|
tester.max_cu_tokens = 4*n_WFs
|
||||||
|
elif i < (n_CUs + n_SQCs):
|
||||||
|
tester.cu_sqc_ports = ruby_port.in_ports
|
||||||
|
else:
|
||||||
|
tester.cu_scalar_ports = ruby_port.in_ports
|
||||||
|
|
||||||
root = Root( full_system = False, system = system )
|
i += 1
|
||||||
root.system.mem_mode = 'timing'
|
|
||||||
|
#
|
||||||
|
# No CPU threads are needed for GPU tester
|
||||||
|
#
|
||||||
|
tester.cpu_threads = []
|
||||||
|
|
||||||
|
#
|
||||||
|
# Create GPU wavefronts
|
||||||
|
#
|
||||||
|
thread_clock = SrcClockDomain(clock = '1GHz',
|
||||||
|
voltage_domain = system.voltage_domain)
|
||||||
|
wavefronts = []
|
||||||
|
g_thread_idx = 0
|
||||||
|
print("Creating %i WFs attached to %i CUs" % \
|
||||||
|
(n_CUs * tester.wavefronts_per_cu, n_CUs))
|
||||||
|
for cu_idx in range(n_CUs):
|
||||||
|
for wf_idx in range(tester.wavefronts_per_cu):
|
||||||
|
wavefronts.append(GpuWavefront(thread_id = g_thread_idx,
|
||||||
|
cu_id = cu_idx,
|
||||||
|
num_lanes = options.wf_size,
|
||||||
|
clk_domain = thread_clock,
|
||||||
|
deadlock_threshold = \
|
||||||
|
tester_deadlock_threshold))
|
||||||
|
g_thread_idx += 1
|
||||||
|
tester.wavefronts = wavefronts
|
||||||
|
|
||||||
|
#
|
||||||
|
# Run simulation
|
||||||
|
#
|
||||||
|
root = Root(full_system = False, system = system)
|
||||||
|
|
||||||
# Not much point in this being higher than the L1 latency
|
# Not much point in this being higher than the L1 latency
|
||||||
m5.ticks.setGlobalFrequency('1ns')
|
m5.ticks.setGlobalFrequency('1ns')
|
||||||
|
|
||||||
# instantiate configuration
|
# Instantiate configuration
|
||||||
m5.instantiate()
|
m5.instantiate()
|
||||||
|
|
||||||
# simulate until program terminates
|
# Simulate until tester completes
|
||||||
exit_event = m5.simulate(options.abs_max_tick)
|
exit_event = m5.simulate()
|
||||||
|
|
||||||
print('Exiting @ tick', m5.curTick(), 'because', exit_event.getCause())
|
print('Exiting tick: ', m5.curTick())
|
||||||
|
print('Exiting because ', exit_event.getCause())
|
||||||
|
|||||||
39
src/cpu/testers/gpu_ruby_test/CpuThread.py
Normal file
39
src/cpu/testers/gpu_ruby_test/CpuThread.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
# Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# For use for simulation and test purposes only
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions are met:
|
||||||
|
#
|
||||||
|
# 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
# this list of conditions and the following disclaimer.
|
||||||
|
#
|
||||||
|
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
# this list of conditions and the following disclaimer in the documentation
|
||||||
|
# and/or other materials provided with the distribution.
|
||||||
|
#
|
||||||
|
# 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
# contributors may be used to endorse or promote products derived from this
|
||||||
|
# software without specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
from m5.params import *
|
||||||
|
from m5.proxy import *
|
||||||
|
|
||||||
|
from m5.objects.GpuThread import GpuThread
|
||||||
|
|
||||||
|
class CpuThread(GpuThread):
|
||||||
|
type = 'CpuThread'
|
||||||
|
cxx_header = "cpu/testers/gpu_ruby_test/cpu_thread.hh"
|
||||||
42
src/cpu/testers/gpu_ruby_test/GpuThread.py
Normal file
42
src/cpu/testers/gpu_ruby_test/GpuThread.py
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
# Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# For use for simulation and test purposes only
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions are met:
|
||||||
|
#
|
||||||
|
# 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
# this list of conditions and the following disclaimer.
|
||||||
|
#
|
||||||
|
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
# this list of conditions and the following disclaimer in the documentation
|
||||||
|
# and/or other materials provided with the distribution.
|
||||||
|
#
|
||||||
|
# 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
# contributors may be used to endorse or promote products derived from this
|
||||||
|
# software without specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
from m5.objects.ClockedObject import ClockedObject
|
||||||
|
from m5.params import *
|
||||||
|
from m5.proxy import *
|
||||||
|
|
||||||
|
class GpuThread(ClockedObject):
|
||||||
|
type = 'GpuThread'
|
||||||
|
abstract = True
|
||||||
|
cxx_header = "cpu/testers/gpu_ruby_test/gpu_thread.hh"
|
||||||
|
thread_id = Param.Int("Unique GpuThread ID")
|
||||||
|
num_lanes = Param.Int("Number of lanes this thread has")
|
||||||
|
deadlock_threshold = Param.Cycles(1000000000, "Deadlock threshold")
|
||||||
40
src/cpu/testers/gpu_ruby_test/GpuWavefront.py
Normal file
40
src/cpu/testers/gpu_ruby_test/GpuWavefront.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
# Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# For use for simulation and test purposes only
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions are met:
|
||||||
|
#
|
||||||
|
# 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
# this list of conditions and the following disclaimer.
|
||||||
|
#
|
||||||
|
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
# this list of conditions and the following disclaimer in the documentation
|
||||||
|
# and/or other materials provided with the distribution.
|
||||||
|
#
|
||||||
|
# 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
# contributors may be used to endorse or promote products derived from this
|
||||||
|
# software without specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
from m5.params import *
|
||||||
|
from m5.proxy import *
|
||||||
|
|
||||||
|
from m5.objects.GpuThread import GpuThread
|
||||||
|
|
||||||
|
class GpuWavefront(GpuThread):
|
||||||
|
type = 'GpuWavefront'
|
||||||
|
cxx_header = "cpu/testers/gpu_ruby_test/gpu_wavefront.hh"
|
||||||
|
cu_id = Param.Int("Compute Unit ID")
|
||||||
64
src/cpu/testers/gpu_ruby_test/ProtocolTester.py
Normal file
64
src/cpu/testers/gpu_ruby_test/ProtocolTester.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
# Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# For use for simulation and test purposes only
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions are met:
|
||||||
|
#
|
||||||
|
# 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
# this list of conditions and the following disclaimer.
|
||||||
|
#
|
||||||
|
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
# this list of conditions and the following disclaimer in the documentation
|
||||||
|
# and/or other materials provided with the distribution.
|
||||||
|
#
|
||||||
|
# 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
# contributors may be used to endorse or promote products derived from this
|
||||||
|
# software without specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
from m5.objects.ClockedObject import ClockedObject
|
||||||
|
from m5.params import *
|
||||||
|
from m5.proxy import *
|
||||||
|
|
||||||
|
class ProtocolTester(ClockedObject):
|
||||||
|
type = 'ProtocolTester'
|
||||||
|
cxx_header = "cpu/testers/gpu_ruby_test/protocol_tester.hh"
|
||||||
|
|
||||||
|
cpu_ports = VectorRequestPort("Ports for CPUs")
|
||||||
|
cu_vector_ports = VectorRequestPort("Vector ports for GPUs")
|
||||||
|
cu_sqc_ports = VectorRequestPort("SQC ports for GPUs")
|
||||||
|
cu_scalar_ports = VectorRequestPort("Scalar ports for GPUs")
|
||||||
|
|
||||||
|
cus_per_sqc = Param.Int(4, "Number of CUs per SQC")
|
||||||
|
cus_per_scalar = Param.Int(4, "Number of CUs per scalar cache")
|
||||||
|
|
||||||
|
wavefronts_per_cu = Param.Int(1, "Number of wavefronts per CU")
|
||||||
|
workitems_per_wavefront = Param.Int(64, "Number of workitems per wf")
|
||||||
|
|
||||||
|
cpu_threads = VectorParam.CpuThread("All cpus")
|
||||||
|
wavefronts = VectorParam.GpuWavefront("All wavefronts")
|
||||||
|
|
||||||
|
num_atomic_locations = Param.Int(2, "Number of atomic locations")
|
||||||
|
num_normal_locs_per_atomic = Param.Int(1000, \
|
||||||
|
"Number of normal locations per atomic")
|
||||||
|
|
||||||
|
episode_length = Param.Int(10, "Number of actions per episode")
|
||||||
|
max_num_episodes = Param.Int(20, "Maximum number of episodes")
|
||||||
|
debug_tester = Param.Bool(False, "Are we debugging the tester?")
|
||||||
|
random_seed = Param.Int(0, "Random seed number. Default value (0) means \
|
||||||
|
using runtime-specific value.")
|
||||||
|
log_file = Param.String("Log file's name")
|
||||||
|
system = Param.System(Parent.any, "System we belong to")
|
||||||
129
src/cpu/testers/gpu_ruby_test/README
Normal file
129
src/cpu/testers/gpu_ruby_test/README
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from this
|
||||||
|
* software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
This directory contains a tester for gem5 GPU protocols. Unlike the Ruby random
|
||||||
|
teter, this tester does not rely on sequential consistency. Instead, it
|
||||||
|
assumes tested protocols supports release consistency.
|
||||||
|
|
||||||
|
----- Getting Started -----
|
||||||
|
|
||||||
|
To start using the tester quickly, you can use the following example command
|
||||||
|
line to get running immediately:
|
||||||
|
|
||||||
|
build/GCN3_X86/gem5.opt configs/example/ruby_gpu_random_test.py \
|
||||||
|
--test-length=1000 --system-size=medium --cache-size=small
|
||||||
|
|
||||||
|
An overview of the main command line options is as follows. For all options
|
||||||
|
use `build/GCN3_X86/gem5.opt configs/example/ruby_gpu_random_test.py --help`
|
||||||
|
or see the configuration file.
|
||||||
|
|
||||||
|
* --cache-size (small, large): Use smaller sizes for testing evict, etc.
|
||||||
|
* --system-size (small, medium, large): Effectively the number of threads in
|
||||||
|
the GPU model. Large size will have more contention. Larger
|
||||||
|
sizes are useful for checking contention.
|
||||||
|
* --episode-length (short, medium, long): Number of loads and stores in an
|
||||||
|
episode. Episodes will also have atomics mixed in. See below
|
||||||
|
for a definition of episode.
|
||||||
|
* --test-length (int): Number of episodes to execute. This will determine the
|
||||||
|
amount of time the tester runs for. Longer time will stress
|
||||||
|
the protocol harder.
|
||||||
|
|
||||||
|
The remainder of this file describes the theory behind the tester design and
|
||||||
|
a link to a more detailed research paper is provided at the end.
|
||||||
|
|
||||||
|
----- Theory Overview -----
|
||||||
|
|
||||||
|
The GPU Ruby tester creates a system consisting of both CPU threads and GPU
|
||||||
|
wavefronts. CPU threads are scalar, so there is one lane per CPU thread. GPU
|
||||||
|
wavefront may have multiple lanes. The number of lanes is initialized when
|
||||||
|
a thread/wavefront is created.
|
||||||
|
|
||||||
|
Each thread/wavefront executes a number of episodes. Each episode is a series
|
||||||
|
of memory actions (i.e., atomic, load, store, acquire and release). In a
|
||||||
|
wavefront, all lanes execute the same sequence of actions, but they may target
|
||||||
|
different addresses. One can think of an episode as a critical section which
|
||||||
|
is bounded by a lock acquire in the beginning and a lock release at the end. An
|
||||||
|
episode consists of actions in the following order:
|
||||||
|
|
||||||
|
1 - Atomic action
|
||||||
|
2 - Acquire action
|
||||||
|
3 - A number of load and store actions
|
||||||
|
4 - Release action
|
||||||
|
5 - Atomic action that targets the same address as (1) does
|
||||||
|
|
||||||
|
There are two separate set of addresses: atomic and non-atomic. Atomic actions
|
||||||
|
target only atomic addresses. Load and store actions target only non-atomic
|
||||||
|
addresses. Memory addresses are all 4-byte aligned in the tester.
|
||||||
|
|
||||||
|
To test false sharing cases in which both atomic and non-atomic addresses are
|
||||||
|
placed in the same cache line, we abstract out the concept of memory addresses
|
||||||
|
from the tester's perspective by introducing the concept of location. Locations
|
||||||
|
are numbered from 0 to N-1 (if there are N addresses). The first X locations
|
||||||
|
[0..X-1] are atomic locations, and the rest are non-atomic locations.
|
||||||
|
The 1-1 mapping between locations and addresses are randomly created when the
|
||||||
|
tester is initialized.
|
||||||
|
|
||||||
|
Per load and store action, its target location is selected so that there is no
|
||||||
|
data race in the generated stream of memory requests at any time during the
|
||||||
|
test. Since in Data-Race-Free model, the memory system's behavior is undefined
|
||||||
|
in data race cases, we exclude data race scenarios from our protocol test.
|
||||||
|
|
||||||
|
Once location per load/store action is determined, each thread/wavefront either
|
||||||
|
loads current value at the location or stores an incremental value to that
|
||||||
|
location. The tester maintains a table tracking all last writers and their
|
||||||
|
written values, so we know what value should be returned from a load and what
|
||||||
|
value should be written next at a particular location. Value returned from a
|
||||||
|
load must match with the value written by the last writer.
|
||||||
|
|
||||||
|
----- Directory Structure -----
|
||||||
|
|
||||||
|
ProtocolTester.hh/cc -- This is the main tester class that orchestrates the
|
||||||
|
entire test.
|
||||||
|
AddressManager.hh/cc -- This manages address space, randomly maps address to
|
||||||
|
location, generates locations for all episodes,
|
||||||
|
maintains per-location last writer and validates
|
||||||
|
values returned from load actions.
|
||||||
|
GpuThread.hh/cc -- This is abstract class for CPU threads and GPU
|
||||||
|
wavefronts. It generates and executes a series of
|
||||||
|
episodes.
|
||||||
|
CpuThread.hh/cc -- Thread class for CPU threads. Not fully implemented yet
|
||||||
|
GpuWavefront.hh/cc -- GpuThread class for GPU wavefronts.
|
||||||
|
Episode.hh/cc -- Class to encapsulate an episode, notably including
|
||||||
|
episode load/store structure and ordering.
|
||||||
|
|
||||||
|
For more detail, please see the following paper:
|
||||||
|
|
||||||
|
T. Ta, X. Zhang, A. Gutierrez and B. M. Beckmann, "Autonomous Data-Race-Free
|
||||||
|
GPU Testing," 2019 IEEE International Symposium on Workload Characterization
|
||||||
|
(IISWC), Orlando, FL, USA, 2019, pp. 81-92, doi:
|
||||||
|
10.1109/IISWC47752.2019.9042019.
|
||||||
54
src/cpu/testers/gpu_ruby_test/SConscript
Normal file
54
src/cpu/testers/gpu_ruby_test/SConscript
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# For use for simulation and test purposes only
|
||||||
|
#
|
||||||
|
# Redistribution and use in source and binary forms, with or without
|
||||||
|
# modification, are permitted provided that the following conditions are met:
|
||||||
|
#
|
||||||
|
# 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
# this list of conditions and the following disclaimer.
|
||||||
|
#
|
||||||
|
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
# this list of conditions and the following disclaimer in the documentation
|
||||||
|
# and/or other materials provided with the distribution.
|
||||||
|
#
|
||||||
|
# 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
# contributors may be used to endorse or promote products derived from this
|
||||||
|
# software without specific prior written permission.
|
||||||
|
#
|
||||||
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
# POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
#
|
||||||
|
|
||||||
|
Import('*')
|
||||||
|
|
||||||
|
if not env['BUILD_GPU']:
|
||||||
|
Return()
|
||||||
|
|
||||||
|
if env['PROTOCOL'] == 'None':
|
||||||
|
Return()
|
||||||
|
|
||||||
|
SimObject('ProtocolTester.py')
|
||||||
|
SimObject('GpuThread.py')
|
||||||
|
SimObject('CpuThread.py')
|
||||||
|
SimObject('GpuWavefront.py')
|
||||||
|
|
||||||
|
Source('address_manager.cc')
|
||||||
|
Source('episode.cc')
|
||||||
|
Source('protocol_tester.cc')
|
||||||
|
Source('gpu_thread.cc')
|
||||||
|
Source('cpu_thread.cc')
|
||||||
|
Source('gpu_wavefront.cc')
|
||||||
|
|
||||||
|
DebugFlag('ProtocolTest')
|
||||||
431
src/cpu/testers/gpu_ruby_test/address_manager.cc
Normal file
431
src/cpu/testers/gpu_ruby_test/address_manager.cc
Normal file
@@ -0,0 +1,431 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from this
|
||||||
|
* software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "cpu/testers/gpu_ruby_test/address_manager.hh"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
#include "base/intmath.hh"
|
||||||
|
#include "base/logging.hh"
|
||||||
|
#include "base/random.hh"
|
||||||
|
#include "base/trace.hh"
|
||||||
|
|
||||||
|
const int AddressManager::INVALID_VALUE = -1;
|
||||||
|
const int AddressManager::INVALID_LOCATION = -1;
|
||||||
|
|
||||||
|
AddressManager::AddressManager(int n_atomic_locs, int n_normal_locs_per_atomic)
|
||||||
|
: numAtomicLocs(n_atomic_locs),
|
||||||
|
numLocsPerAtomic(n_normal_locs_per_atomic)
|
||||||
|
{
|
||||||
|
assert(numAtomicLocs > 0 && numLocsPerAtomic > 0);
|
||||||
|
numNormalLocs = numAtomicLocs * numLocsPerAtomic;
|
||||||
|
|
||||||
|
// generate random address map
|
||||||
|
randAddressMap.resize(numAtomicLocs + numNormalLocs);
|
||||||
|
for (Location i = 0; i < numAtomicLocs + numNormalLocs; ++i) {
|
||||||
|
// all addresses are sizeof(Value) (i.e., 4-byte) aligned
|
||||||
|
randAddressMap[i] = (Addr)((i + 128) << floorLog2(sizeof(Value)));
|
||||||
|
}
|
||||||
|
|
||||||
|
// randomly shuffle randAddressMap
|
||||||
|
std::random_shuffle(randAddressMap.begin(), randAddressMap.end());
|
||||||
|
|
||||||
|
// initialize atomic locations
|
||||||
|
// first and last normal location per atomic location
|
||||||
|
Location first, last;
|
||||||
|
for (Location atomic_loc = 0; atomic_loc < numAtomicLocs; ++atomic_loc) {
|
||||||
|
first = numAtomicLocs + numLocsPerAtomic * atomic_loc;
|
||||||
|
last = first + numLocsPerAtomic - 1;
|
||||||
|
atomicStructs.push_back(new AtomicStruct(atomic_loc, first, last));
|
||||||
|
}
|
||||||
|
|
||||||
|
// initialize log table
|
||||||
|
for (Location loc = 0; loc < numAtomicLocs + numNormalLocs; ++loc) {
|
||||||
|
logTable.push_back(new LastWriter());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
AddressManager::~AddressManager()
|
||||||
|
{
|
||||||
|
for (AtomicStruct* atomic_struct : atomicStructs)
|
||||||
|
delete atomic_struct;
|
||||||
|
for (LastWriter* lw : logTable)
|
||||||
|
delete lw;
|
||||||
|
}
|
||||||
|
|
||||||
|
Addr
|
||||||
|
AddressManager::getAddress(Location loc)
|
||||||
|
{
|
||||||
|
assert(loc < numAtomicLocs + numNormalLocs && loc >= 0);
|
||||||
|
return randAddressMap[loc];
|
||||||
|
}
|
||||||
|
|
||||||
|
AddressManager::Location
|
||||||
|
AddressManager::getAtomicLoc()
|
||||||
|
{
|
||||||
|
Location ret_atomic_loc = random() % numAtomicLocs;
|
||||||
|
atomicStructs[ret_atomic_loc]->startLocSelection();
|
||||||
|
return ret_atomic_loc;
|
||||||
|
}
|
||||||
|
|
||||||
|
AddressManager::Location
|
||||||
|
AddressManager::getLoadLoc(Location atomic_loc)
|
||||||
|
{
|
||||||
|
assert(atomic_loc >= 0 && atomic_loc < numAtomicLocs);
|
||||||
|
return atomicStructs[atomic_loc]->getLoadLoc();
|
||||||
|
}
|
||||||
|
|
||||||
|
AddressManager::Location
|
||||||
|
AddressManager::getStoreLoc(Location atomic_loc)
|
||||||
|
{
|
||||||
|
assert(atomic_loc >= 0 && atomic_loc < numAtomicLocs);
|
||||||
|
return atomicStructs[atomic_loc]->getStoreLoc();
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
AddressManager::finishLocSelection(Location atomic_loc)
|
||||||
|
{
|
||||||
|
assert(atomic_loc >= 0 && atomic_loc < numAtomicLocs);
|
||||||
|
atomicStructs[atomic_loc]->endLocSelection();
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
AddressManager::releaseLocation(Location atomic_loc, Location loc)
|
||||||
|
{
|
||||||
|
assert(atomic_loc >= 0 && atomic_loc < numAtomicLocs);
|
||||||
|
atomicStructs[atomic_loc]->releaseLoc(loc);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string
|
||||||
|
AddressManager::printLastWriter(Location loc) const
|
||||||
|
{
|
||||||
|
return logTable[loc]->print();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ------------------- AtomicStruct --------------------------
|
||||||
|
AddressManager::AtomicStruct::AtomicStruct(Location atomic_loc,
|
||||||
|
Location loc_begin,
|
||||||
|
Location loc_end)
|
||||||
|
{
|
||||||
|
// the location range must have at least 1 location
|
||||||
|
assert(loc_begin <= loc_end);
|
||||||
|
|
||||||
|
atomicLoc = atomic_loc;
|
||||||
|
arraySize = loc_end - loc_begin + 1;
|
||||||
|
locationBase = loc_begin;
|
||||||
|
|
||||||
|
// allocate an array of arrray_size
|
||||||
|
locArray = new Location[arraySize];
|
||||||
|
|
||||||
|
// initialize locArray & locProps
|
||||||
|
Location loc;
|
||||||
|
for (int offset = 0; offset < arraySize; ++offset) {
|
||||||
|
loc = locationBase + offset;
|
||||||
|
locArray[offset] = loc;
|
||||||
|
locProps.push_back(LocProperty(offset, 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
// region (1) and (3) are initially empty
|
||||||
|
firstMark = 0;
|
||||||
|
secondMark = arraySize;
|
||||||
|
// no request made at this location so far
|
||||||
|
requestCount = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
AddressManager::AtomicStruct::~AtomicStruct()
|
||||||
|
{
|
||||||
|
delete[] locArray;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
AddressManager::AtomicStruct::startLocSelection()
|
||||||
|
{
|
||||||
|
assert(firstMark >= 0);
|
||||||
|
assert(firstMark <= secondMark);
|
||||||
|
assert(secondMark <= arraySize);
|
||||||
|
// make sure loadStoreMap has been cleared
|
||||||
|
assert(loadStoreMap.empty());
|
||||||
|
|
||||||
|
// this atomic location is picked for Atomic_ACQ
|
||||||
|
// and Atomic_REL in an episode
|
||||||
|
requestCount += 2;
|
||||||
|
// add two expected values in expectedValues set
|
||||||
|
expectedValues.insert(requestCount - 1);
|
||||||
|
expectedValues.insert(requestCount - 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
AddressManager::Location
|
||||||
|
AddressManager::AtomicStruct::getLoadLoc()
|
||||||
|
{
|
||||||
|
assert(firstMark >= 0);
|
||||||
|
assert(firstMark <= secondMark);
|
||||||
|
assert(secondMark <= arraySize);
|
||||||
|
|
||||||
|
if (firstMark == arraySize) {
|
||||||
|
// no location can be picked for a LD now, so return an empty location
|
||||||
|
return INVALID_LOCATION;
|
||||||
|
} else {
|
||||||
|
// we can pick any location btw
|
||||||
|
// locArray [firstMark : arraySize-1]
|
||||||
|
int range_size = arraySize - firstMark;
|
||||||
|
Location ret_loc = locArray[firstMark + random() % range_size];
|
||||||
|
|
||||||
|
// update loadStoreMap
|
||||||
|
LdStMap::iterator it = loadStoreMap.find(ret_loc);
|
||||||
|
|
||||||
|
if (it == loadStoreMap.end()) {
|
||||||
|
// insert a new entry to the map b/c the entry is not there yet
|
||||||
|
// to mark this location has been picked for a LD
|
||||||
|
loadStoreMap.insert(std::pair<Location, LdStBits>
|
||||||
|
(ret_loc, LdStBits(true,false)));
|
||||||
|
} else {
|
||||||
|
// otherwise, just update the LD bit
|
||||||
|
(it->second).first = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret_loc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
AddressManager::Location
|
||||||
|
AddressManager::AtomicStruct::getStoreLoc()
|
||||||
|
{
|
||||||
|
assert(firstMark >= 0);
|
||||||
|
assert(firstMark <= secondMark);
|
||||||
|
assert(secondMark <= arraySize);
|
||||||
|
|
||||||
|
if (firstMark == secondMark) {
|
||||||
|
// no location can be picked for a ST now, return an invalid location
|
||||||
|
return INVALID_LOCATION;
|
||||||
|
} else {
|
||||||
|
// we can pick any location btw [firstMark : secondMark-1]
|
||||||
|
int range_size = secondMark - firstMark;
|
||||||
|
Location ret_loc = locArray[firstMark + random() % range_size];
|
||||||
|
|
||||||
|
// update loadStoreMap
|
||||||
|
LdStMap::iterator it = loadStoreMap.find(ret_loc);
|
||||||
|
|
||||||
|
if (it == loadStoreMap.end()) {
|
||||||
|
// insert a new entry to the map b/c the entry is not there yet
|
||||||
|
// to mark this location has been picked for a ST
|
||||||
|
loadStoreMap.insert(std::pair<Location, LdStBits>
|
||||||
|
(ret_loc, LdStBits(false,true)));
|
||||||
|
} else {
|
||||||
|
// otherwise, just update the ST bit
|
||||||
|
(it->second).second = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret_loc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// for each entry in loadStoreMap,
|
||||||
|
// if <LD_bit, ST_bit> == <1,0>
|
||||||
|
// - if the location is in (2), then move it to (3)
|
||||||
|
// - if the location is in (3), no move
|
||||||
|
// - otherwise, throw an error
|
||||||
|
// if <LD_bit, ST_bit> == <0,1> or <1,1>
|
||||||
|
// - move it from (2) to (1)
|
||||||
|
void
|
||||||
|
AddressManager::AtomicStruct::endLocSelection()
|
||||||
|
{
|
||||||
|
assert(firstMark >= 0);
|
||||||
|
assert(firstMark <= secondMark);
|
||||||
|
assert(secondMark <= arraySize);
|
||||||
|
|
||||||
|
for (auto& it : loadStoreMap) {
|
||||||
|
Location loc = it.first;
|
||||||
|
LdStBits p = it.second;
|
||||||
|
|
||||||
|
assert(loc >= locationBase && loc < locationBase + arraySize);
|
||||||
|
LocProperty& loc_prop = locProps[loc - locationBase];
|
||||||
|
|
||||||
|
if (p.first && !p.second) {
|
||||||
|
// this location has been picked for LD(s) but not ST
|
||||||
|
// it must be in either region (2) or (3)
|
||||||
|
assert(inSecondRegion(loc_prop.first) ||
|
||||||
|
inThirdRegion(loc_prop.first));
|
||||||
|
|
||||||
|
if (inSecondRegion(loc_prop.first)) {
|
||||||
|
// there is no owner of this location yet
|
||||||
|
assert(loc_prop.second == 0);
|
||||||
|
|
||||||
|
// pick the last location in (2) to swap
|
||||||
|
Location swapped_loc = locArray[secondMark - 1];
|
||||||
|
LocProperty& swapped_loc_prop =
|
||||||
|
locProps[swapped_loc - locationBase];
|
||||||
|
|
||||||
|
// swap loc and swapped_loc
|
||||||
|
swap(loc_prop, swapped_loc_prop);
|
||||||
|
|
||||||
|
// then, expand (3)
|
||||||
|
secondMark--;
|
||||||
|
}
|
||||||
|
|
||||||
|
// increment the location's number of owners
|
||||||
|
loc_prop.second++;
|
||||||
|
} else if (p.second) {
|
||||||
|
// this location has been picked for ST(s) and/or LD(s)
|
||||||
|
// it must be in region (2)
|
||||||
|
assert(inSecondRegion(loc_prop.first) && loc_prop.second == 0);
|
||||||
|
|
||||||
|
// pick the first location in (2) to swap
|
||||||
|
Location swapped_loc = locArray[firstMark];
|
||||||
|
LocProperty& swapped_loc_prop =
|
||||||
|
locProps[swapped_loc - locationBase];
|
||||||
|
|
||||||
|
// swap loc and swapped_loc
|
||||||
|
swap(loc_prop, swapped_loc_prop);
|
||||||
|
|
||||||
|
// then, expand (1)
|
||||||
|
firstMark++;
|
||||||
|
|
||||||
|
// increment the location's number of owners
|
||||||
|
loc_prop.second++;
|
||||||
|
} else {
|
||||||
|
panic("Location in loadStoreMap but wasn't picked in any"
|
||||||
|
" action\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// clear the ld_st_map
|
||||||
|
loadStoreMap.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
AddressManager::AtomicStruct::releaseLoc(Location loc)
|
||||||
|
{
|
||||||
|
assert(loc >= locationBase && loc < locationBase + arraySize);
|
||||||
|
|
||||||
|
LocProperty& loc_prop = locProps[loc - locationBase];
|
||||||
|
|
||||||
|
if (inFirstRegion(loc_prop.first)) {
|
||||||
|
// this location must have exactly 1 owner
|
||||||
|
assert(loc_prop.second == 1);
|
||||||
|
|
||||||
|
// pick the last location in region 1 to swap
|
||||||
|
Location swapped_loc = locArray[firstMark - 1];
|
||||||
|
LocProperty& swapped_loc_prop = locProps[swapped_loc - locationBase];
|
||||||
|
|
||||||
|
// swap loc and swapped_loc
|
||||||
|
swap(loc_prop, swapped_loc_prop);
|
||||||
|
|
||||||
|
// then shrink (1)
|
||||||
|
firstMark--;
|
||||||
|
|
||||||
|
// reset the location's number of owners
|
||||||
|
loc_prop.second = 0;
|
||||||
|
} else if (inThirdRegion(loc_prop.first)) {
|
||||||
|
// this location must have at least 1 owner
|
||||||
|
assert(loc_prop.second >= 1);
|
||||||
|
|
||||||
|
if (loc_prop.second == 1) {
|
||||||
|
// pick the first location in region 3 to swap
|
||||||
|
Location swapped_loc = locArray[secondMark];
|
||||||
|
LocProperty& swapped_loc_prop =
|
||||||
|
locProps[swapped_loc - locationBase];
|
||||||
|
|
||||||
|
// swap loc and swapped_loc
|
||||||
|
swap(loc_prop, swapped_loc_prop);
|
||||||
|
|
||||||
|
// then shrink (3)
|
||||||
|
secondMark++;
|
||||||
|
}
|
||||||
|
// decrement the loc's number of owners
|
||||||
|
loc_prop.second--;
|
||||||
|
} else {
|
||||||
|
// some one else must already reset this counter
|
||||||
|
assert(inSecondRegion(loc_prop.first) && loc_prop.second == 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
AddressManager::AtomicStruct::isExpectedValue(Value val)
|
||||||
|
{
|
||||||
|
ExpectedValueSet::iterator it = expectedValues.find(val);
|
||||||
|
|
||||||
|
if (it == expectedValues.end()) {
|
||||||
|
std::stringstream exp_val_ss;
|
||||||
|
for (auto& val : expectedValues) {
|
||||||
|
exp_val_ss << " " << val;
|
||||||
|
}
|
||||||
|
|
||||||
|
warn("Expected return values are:\n\t%s\n", exp_val_ss.str());
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// erase this value b/c it's done
|
||||||
|
expectedValues.erase(it);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
AddressManager::AtomicStruct::swap(LocProperty& prop_1, LocProperty& prop_2)
|
||||||
|
{
|
||||||
|
int new_idx_1 = prop_2.first;
|
||||||
|
int new_idx_2 = prop_1.first;
|
||||||
|
|
||||||
|
// swap the two locations in locArray
|
||||||
|
Location tmp = locArray[prop_1.first];
|
||||||
|
locArray[prop_1.first] = locArray[prop_2.first];
|
||||||
|
locArray[prop_2.first] = tmp;
|
||||||
|
|
||||||
|
// update their new indices
|
||||||
|
prop_1.first = new_idx_1;
|
||||||
|
prop_2.first = new_idx_2;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ------------------ log table ---------------------
|
||||||
|
void
|
||||||
|
AddressManager::updateLogTable(Location loc, int thread_id, int episode_id,
|
||||||
|
Value new_value, Tick cur_tick, int cu_id)
|
||||||
|
{
|
||||||
|
assert(loc >= 0 && loc < numAtomicLocs + numNormalLocs);
|
||||||
|
logTable[loc]->update(thread_id, cu_id, episode_id, new_value, cur_tick);
|
||||||
|
}
|
||||||
|
|
||||||
|
AddressManager::Value
|
||||||
|
AddressManager::getLoggedValue(Location loc) const
|
||||||
|
{
|
||||||
|
assert(loc >= 0 && loc < numAtomicLocs + numNormalLocs);
|
||||||
|
return logTable[loc]->getLastStoredValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
AddressManager::validateAtomicResp(Location loc, Value ret_val)
|
||||||
|
{
|
||||||
|
assert(loc >= 0 && loc < numAtomicLocs);
|
||||||
|
return atomicStructs[loc]->isExpectedValue(ret_val);
|
||||||
|
}
|
||||||
274
src/cpu/testers/gpu_ruby_test/address_manager.hh
Normal file
274
src/cpu/testers/gpu_ruby_test/address_manager.hh
Normal file
@@ -0,0 +1,274 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from this
|
||||||
|
* software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef CPU_TESTERS_PROTOCOL_TESTER_ADDRESS_MANAGER_HH_
|
||||||
|
#define CPU_TESTERS_PROTOCOL_TESTER_ADDRESS_MANAGER_HH_
|
||||||
|
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <unordered_set>
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "base/types.hh"
|
||||||
|
#include "sim/eventq.hh"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* --- AddressManager has 3 main tasks ---
|
||||||
|
* (1) generate DRF request sequences
|
||||||
|
* (2) maintain internal log table
|
||||||
|
* (3) validate return values against ones in the log table
|
||||||
|
*
|
||||||
|
* A location is an abstract index of a unique real address.
|
||||||
|
* It's used internally within the tester only.
|
||||||
|
* randAddressMap has the mapping between a location and its real address.
|
||||||
|
*
|
||||||
|
* A value is an integer that a location in real memory can store.
|
||||||
|
* for now, we assume a value is 4-byte
|
||||||
|
*
|
||||||
|
* The location range (randAddressMap) has two distinct parts:
|
||||||
|
* Atomic locations: in the 1st part of randAddressMap &
|
||||||
|
* Non-atomic locations (or just locations): in the 2nd part
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* --- DRF request sequence generation ---
|
||||||
|
* Each lane of an episode starts selecting its location by calling:
|
||||||
|
* (1) getAtomicLoc
|
||||||
|
* (2) getLoadLoc/getStoreLoc
|
||||||
|
* (3) finishLocSelection
|
||||||
|
*
|
||||||
|
* Each lane of an episode completes its executing by calling:
|
||||||
|
* releaseLocation for all locations it selected
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* --- Internal structures ---
|
||||||
|
* There are multiple atomic structures, each of which corresponds
|
||||||
|
* to an atomic location.
|
||||||
|
*
|
||||||
|
* Each atomic structure manages a distinct range of locations in locArray
|
||||||
|
* This array is partitioned into 3 parts that are used to select locations
|
||||||
|
* for LDs and STs. Here is the location selecting rule:
|
||||||
|
* | (1) | (2) | (3) |
|
||||||
|
* - all locations in (1) cannot be picked for any LD and ST action
|
||||||
|
* - all locations in (2) can be picked for either LD or ST action
|
||||||
|
* - all locations in (3) can be picked for LD action only
|
||||||
|
*
|
||||||
|
* We maintain the 3 parts by 2 indices firstMark and secondMark.
|
||||||
|
* As locations are moved between partitions, both indices are updated
|
||||||
|
* accordingly.
|
||||||
|
* [0 .. firstMark-1] part (1)
|
||||||
|
* [firstMark .. secondMark-1] part (2)
|
||||||
|
* [secondMark .. arraySize-1] part (3)
|
||||||
|
*
|
||||||
|
* Each location has its context/property. locProps maintains
|
||||||
|
* contexts/properties of all locations. Context/property includes
|
||||||
|
* - current index of a location in locArray
|
||||||
|
* - the number of owners who are currently using the location
|
||||||
|
*
|
||||||
|
* To guarantee DRF constraints, the following conditions must hold
|
||||||
|
* - all locations in (1) have exactly 1 owner
|
||||||
|
* - all locations in (2) have exactly 0 owner
|
||||||
|
* - all locations in (3) have at least 1 owner
|
||||||
|
* - A LD request can randomly pick any location in (2) & (3)
|
||||||
|
* - A ST request can randomly pick any location in (2)
|
||||||
|
*
|
||||||
|
* loadStoreMap maintains all locations already selected for LDs/STs so far
|
||||||
|
*
|
||||||
|
* When endLocSelection is called (i.e., we've picked all locations for an
|
||||||
|
* episode), we need to move each selected location to its right partition.
|
||||||
|
* if LD_bit == 1 && ST_bit == 0 (i.e., picked for LDs), then move the
|
||||||
|
* location to (3) -> future LDs can pick it.
|
||||||
|
* if LD_bit == 0 && ST_bit == 1, then move the location to (1) -> NO future
|
||||||
|
* action can pick it until this episode is done.
|
||||||
|
* if LD_bit == 1 && ST_bit == 1, then move the location to (1) -> NO future
|
||||||
|
* action can pick it until this episode is done.
|
||||||
|
* clear the loadStoreMap
|
||||||
|
*/
|
||||||
|
|
||||||
|
class AddressManager
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
AddressManager(int n_atomic_locs, int numNormalLocsPerAtomic);
|
||||||
|
~AddressManager();
|
||||||
|
|
||||||
|
typedef int32_t Value;
|
||||||
|
typedef int32_t Location;
|
||||||
|
|
||||||
|
// return the unique address mapped to a location
|
||||||
|
Addr getAddress(Location loc);
|
||||||
|
// return a unique atomic location & start picking locations
|
||||||
|
Location getAtomicLoc();
|
||||||
|
// return a random location for LD
|
||||||
|
Location getLoadLoc(Location atomic_loc);
|
||||||
|
// return a random location for ST
|
||||||
|
Location getStoreLoc(Location atomic_loc);
|
||||||
|
// finish picking locations
|
||||||
|
void finishLocSelection(Location atomic_loc);
|
||||||
|
// an episode is done, release location I've picked
|
||||||
|
void releaseLocation(Location atomic_loc, Location loc);
|
||||||
|
// update a log table entry with a given set of values
|
||||||
|
void updateLogTable(Location loc, int threadId, int episodeId,
|
||||||
|
Value new_value, Tick curTick, int cuId = -1);
|
||||||
|
// return the current value in the log table
|
||||||
|
Value getLoggedValue(Location loc) const;
|
||||||
|
// validate atomic response
|
||||||
|
bool validateAtomicResp(Location loc, Value ret_val);
|
||||||
|
|
||||||
|
std::string printLastWriter(Location loc) const;
|
||||||
|
|
||||||
|
static const int INVALID_VALUE;
|
||||||
|
static const int INVALID_LOCATION;
|
||||||
|
|
||||||
|
private:
|
||||||
|
class LastWriter
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
LastWriter()
|
||||||
|
: threadId(-1), cuId(-1), episodeId(-1), value(0),
|
||||||
|
writeTick(0)
|
||||||
|
{ }
|
||||||
|
|
||||||
|
const std::string print() const
|
||||||
|
{
|
||||||
|
return "(GpuThread ID " + std::to_string(threadId) +
|
||||||
|
", CU ID " + std::to_string(cuId) +
|
||||||
|
", Episode ID " + std::to_string(episodeId) +
|
||||||
|
", Value " + std::to_string(value) +
|
||||||
|
", Tick " + std::to_string(writeTick) +
|
||||||
|
")";
|
||||||
|
}
|
||||||
|
|
||||||
|
void update(int _thread, int _cu, int _episode, Value _value,
|
||||||
|
Tick _tick)
|
||||||
|
{
|
||||||
|
threadId = _thread;
|
||||||
|
cuId = _cu;
|
||||||
|
episodeId = _episode;
|
||||||
|
value = _value;
|
||||||
|
writeTick = _tick;
|
||||||
|
}
|
||||||
|
|
||||||
|
Value getLastStoredValue() const { return value; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
int threadId;
|
||||||
|
int cuId;
|
||||||
|
int episodeId;
|
||||||
|
Value value;
|
||||||
|
Tick writeTick;
|
||||||
|
};
|
||||||
|
|
||||||
|
class AtomicStruct
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
AtomicStruct(Location atom_loc, Location loc_begin, Location loc_end);
|
||||||
|
~AtomicStruct();
|
||||||
|
|
||||||
|
// functions picking locations for LD/ST/ATOMIC ops
|
||||||
|
void startLocSelection();
|
||||||
|
Location getLoadLoc();
|
||||||
|
Location getStoreLoc();
|
||||||
|
void endLocSelection();
|
||||||
|
|
||||||
|
// an episode completed its actions
|
||||||
|
// return locations to their correct positions
|
||||||
|
void releaseLoc(Location loc);
|
||||||
|
// is the value what we expect?
|
||||||
|
bool isExpectedValue(Value val);
|
||||||
|
|
||||||
|
private:
|
||||||
|
Location atomicLoc;
|
||||||
|
Location locationBase;
|
||||||
|
|
||||||
|
// array storing all locations this structure is managing
|
||||||
|
Location* locArray;
|
||||||
|
int firstMark, secondMark;
|
||||||
|
int arraySize;
|
||||||
|
|
||||||
|
// a vector of location's properties
|
||||||
|
typedef std::pair<int, int> LocProperty;
|
||||||
|
typedef std::vector<LocProperty> LocPropTable;
|
||||||
|
LocPropTable locProps;
|
||||||
|
|
||||||
|
// a temporary map of location and its LD/ST selection
|
||||||
|
typedef std::pair<bool, bool> LdStBits;
|
||||||
|
typedef std::unordered_map<Location, LdStBits> LdStMap;
|
||||||
|
LdStMap loadStoreMap;
|
||||||
|
|
||||||
|
// number of atomic requests at this location so far
|
||||||
|
int requestCount;
|
||||||
|
// a set of expected values
|
||||||
|
// when we request the first n atomic ops, we expect to receive n
|
||||||
|
// return values from [0 .. n-1]
|
||||||
|
typedef std::unordered_set<Value> ExpectedValueSet;
|
||||||
|
ExpectedValueSet expectedValues;
|
||||||
|
|
||||||
|
// swap two locations in locArray
|
||||||
|
void swap(LocProperty& prop_1, LocProperty& prop_2);
|
||||||
|
|
||||||
|
bool inFirstRegion(int idx) const
|
||||||
|
{
|
||||||
|
return (idx >= 0 && idx < firstMark);
|
||||||
|
}
|
||||||
|
bool inSecondRegion(int idx) const
|
||||||
|
{
|
||||||
|
return (idx >= firstMark && idx < secondMark);
|
||||||
|
}
|
||||||
|
bool inThirdRegion(int idx) const
|
||||||
|
{
|
||||||
|
return (idx >= secondMark && idx < arraySize);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// number of atomic locations
|
||||||
|
int numAtomicLocs;
|
||||||
|
// number of normal/non-atomic locations per atomic structure
|
||||||
|
int numLocsPerAtomic;
|
||||||
|
// total number of non-atomic locations
|
||||||
|
int numNormalLocs;
|
||||||
|
|
||||||
|
// location - address mapping
|
||||||
|
typedef std::vector<Addr> AddressMap;
|
||||||
|
AddressMap randAddressMap;
|
||||||
|
|
||||||
|
// a list of atomic structures
|
||||||
|
typedef std::vector<AtomicStruct*> AtomicStructTable;
|
||||||
|
AtomicStructTable atomicStructs;
|
||||||
|
|
||||||
|
// internal log table
|
||||||
|
typedef std::vector<LastWriter*> LogTable;
|
||||||
|
LogTable logTable;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif /* CPU_TESTERS_PROTOCOL_TESTER_ADDRESS_MANAGER_HH_ */
|
||||||
123
src/cpu/testers/gpu_ruby_test/cpu_thread.cc
Normal file
123
src/cpu/testers/gpu_ruby_test/cpu_thread.cc
Normal file
@@ -0,0 +1,123 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from this
|
||||||
|
* software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "cpu/testers/gpu_ruby_test/cpu_thread.hh"
|
||||||
|
|
||||||
|
#include "debug/ProtocolTest.hh"
|
||||||
|
|
||||||
|
CpuThread::CpuThread(const Params &p)
|
||||||
|
:GpuThread(p)
|
||||||
|
{
|
||||||
|
threadName = "CpuThread(Thread ID " + std::to_string(threadId) + ")";
|
||||||
|
threadEvent.setDesc("CpuThread tick");
|
||||||
|
assert(numLanes == 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
CpuThread*
|
||||||
|
CpuThreadParams::create() const
|
||||||
|
{
|
||||||
|
return new CpuThread(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
CpuThread::issueLoadOps()
|
||||||
|
{
|
||||||
|
assert(curAction);
|
||||||
|
assert(curAction->getType() == Episode::Action::Type::LOAD);
|
||||||
|
// we should not have any outstanding fence or atomic op at this point
|
||||||
|
assert(pendingFenceCount == 0);
|
||||||
|
assert(pendingAtomicCount == 0);
|
||||||
|
|
||||||
|
fatal("CpuThread::issueLoadOps - not yet implemented");
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
CpuThread::issueStoreOps()
|
||||||
|
{
|
||||||
|
assert(curAction);
|
||||||
|
assert(curAction->getType() == Episode::Action::Type::STORE);
|
||||||
|
// we should not have any outstanding fence or atomic op at this point
|
||||||
|
assert(pendingFenceCount == 0);
|
||||||
|
assert(pendingAtomicCount == 0);
|
||||||
|
|
||||||
|
fatal("CpuThread::issueStoreOps - not yet implemented");
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
CpuThread::issueAtomicOps()
|
||||||
|
{
|
||||||
|
assert(curAction);
|
||||||
|
assert(curAction->getType() == Episode::Action::Type::ATOMIC);
|
||||||
|
// we should not have any outstanding ops at this point
|
||||||
|
assert(pendingFenceCount == 0);
|
||||||
|
assert(pendingLdStCount == 0);
|
||||||
|
assert(pendingAtomicCount == 0);
|
||||||
|
|
||||||
|
fatal("CpuThread::issueAtomicOps - not yet implemented");
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
CpuThread::issueAcquireOp()
|
||||||
|
{
|
||||||
|
DPRINTF(ProtocolTest, "Issuing Acquire Op ...\n");
|
||||||
|
|
||||||
|
assert(curAction);
|
||||||
|
assert(curAction->getType() == Episode::Action::Type::ACQUIRE);
|
||||||
|
// we should not have any outstanding ops at this point
|
||||||
|
assert(pendingFenceCount == 0);
|
||||||
|
assert(pendingLdStCount == 0);
|
||||||
|
assert(pendingAtomicCount == 0);
|
||||||
|
|
||||||
|
// no-op: Acquire does not apply to CPU threads
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
CpuThread::issueReleaseOp()
|
||||||
|
{
|
||||||
|
DPRINTF(ProtocolTest, "Issuing Release Op ...\n");
|
||||||
|
|
||||||
|
assert(curAction);
|
||||||
|
assert(curAction->getType() == Episode::Action::Type::RELEASE);
|
||||||
|
// we should not have any outstanding ops at this point
|
||||||
|
assert(pendingFenceCount == 0);
|
||||||
|
assert(pendingLdStCount == 0);
|
||||||
|
assert(pendingAtomicCount == 0);
|
||||||
|
|
||||||
|
// no-op: Release does not apply to CPU threads
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
CpuThread::hitCallback(PacketPtr pkt)
|
||||||
|
{
|
||||||
|
fatal("CpuThread::hitCallback - not yet implemented");
|
||||||
|
}
|
||||||
61
src/cpu/testers/gpu_ruby_test/cpu_thread.hh
Normal file
61
src/cpu/testers/gpu_ruby_test/cpu_thread.hh
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from this
|
||||||
|
* software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef CPU_TESTERS_PROTOCOL_TESTER_CPU_THREAD_HH_
|
||||||
|
#define CPU_TESTERS_PROTOCOL_TESTER_CPU_THREAD_HH_
|
||||||
|
|
||||||
|
#include "cpu/testers/gpu_ruby_test/gpu_thread.hh"
|
||||||
|
#include "params/CpuThread.hh"
|
||||||
|
#include "sim/clocked_object.hh"
|
||||||
|
|
||||||
|
class CpuThread : public GpuThread
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef CpuThreadParams Params;
|
||||||
|
CpuThread(const Params &p);
|
||||||
|
virtual ~CpuThread() = default;
|
||||||
|
|
||||||
|
typedef AddressManager::Location Location;
|
||||||
|
typedef AddressManager::Value Value;
|
||||||
|
|
||||||
|
void hitCallback(PacketPtr pkt);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
void issueLoadOps();
|
||||||
|
void issueStoreOps();
|
||||||
|
void issueAtomicOps();
|
||||||
|
void issueAcquireOp();
|
||||||
|
void issueReleaseOp();
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif /* CPU_TESTERS_PROTOCOL_TESTER_CPU_THREAD_HH_ */
|
||||||
321
src/cpu/testers/gpu_ruby_test/episode.cc
Normal file
321
src/cpu/testers/gpu_ruby_test/episode.cc
Normal file
@@ -0,0 +1,321 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from this
|
||||||
|
* software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "cpu/testers/gpu_ruby_test/episode.hh"
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
#include <unordered_set>
|
||||||
|
|
||||||
|
#include "cpu/testers/gpu_ruby_test/gpu_thread.hh"
|
||||||
|
#include "cpu/testers/gpu_ruby_test/protocol_tester.hh"
|
||||||
|
|
||||||
|
Episode::Episode(ProtocolTester* _tester, GpuThread* _thread, int num_loads,
|
||||||
|
int num_stores)
|
||||||
|
: tester(_tester),
|
||||||
|
thread(_thread),
|
||||||
|
numLoads(num_loads),
|
||||||
|
numStores(num_stores),
|
||||||
|
nextActionIdx(0)
|
||||||
|
{
|
||||||
|
assert(tester && thread);
|
||||||
|
|
||||||
|
episodeId = tester->getNextEpisodeID();
|
||||||
|
numLanes = thread->getNumLanes();
|
||||||
|
assert(numLanes > 0);
|
||||||
|
|
||||||
|
addrManager = tester->getAddressManager();
|
||||||
|
assert(addrManager);
|
||||||
|
|
||||||
|
atomicLocs.resize(numLanes, AddressManager::INVALID_LOCATION);
|
||||||
|
// generate a sequence of actions
|
||||||
|
initActions();
|
||||||
|
isActive = true;
|
||||||
|
|
||||||
|
DPRINTFN("Episode %d\n", episodeId);
|
||||||
|
}
|
||||||
|
|
||||||
|
Episode::~Episode()
|
||||||
|
{
|
||||||
|
for (Episode::Action* action : actions) {
|
||||||
|
assert(action);
|
||||||
|
delete action;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const Episode::Action*
|
||||||
|
Episode::peekCurAction() const
|
||||||
|
{
|
||||||
|
if (nextActionIdx < actions.size())
|
||||||
|
return actions[nextActionIdx];
|
||||||
|
else
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
Episode::popAction()
|
||||||
|
{
|
||||||
|
assert(nextActionIdx < actions.size());
|
||||||
|
nextActionIdx++;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
Episode::initActions()
|
||||||
|
{
|
||||||
|
// first, push Atomic & then Acquire action
|
||||||
|
actions.push_back(new Action(Action::Type::ATOMIC, numLanes));
|
||||||
|
actions.push_back(new Action(Action::Type::ACQUIRE, numLanes));
|
||||||
|
|
||||||
|
// second, push a number of LD/ST actions
|
||||||
|
int num_loads = numLoads;
|
||||||
|
int num_stores = numStores;
|
||||||
|
while ((num_loads + num_stores) > 0) {
|
||||||
|
switch (random() % 2) {
|
||||||
|
case 0: // Load
|
||||||
|
if (num_loads > 0) {
|
||||||
|
actions.push_back(new Action(Action::Type::LOAD,
|
||||||
|
numLanes));
|
||||||
|
num_loads--;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 1: // Store
|
||||||
|
if (num_stores > 0) {
|
||||||
|
actions.push_back(new Action(Action::Type::STORE,
|
||||||
|
numLanes));
|
||||||
|
num_stores--;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
assert(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// last, push an Release & then Atomic action
|
||||||
|
actions.push_back(new Action(Action::Type::RELEASE, numLanes));
|
||||||
|
actions.push_back(new Action(Action::Type::ATOMIC, numLanes));
|
||||||
|
|
||||||
|
// for each lane, pick a list of locations
|
||||||
|
Location normal_loc;
|
||||||
|
|
||||||
|
for (int lane = 0; lane < numLanes; ++lane) {
|
||||||
|
normal_loc = AddressManager::INVALID_LOCATION;
|
||||||
|
|
||||||
|
// first, we select atomic loc for this lane
|
||||||
|
// atomic loc for this lane should not have been picked yet
|
||||||
|
assert(atomicLocs[lane] == AddressManager::INVALID_LOCATION);
|
||||||
|
// pick randomly an atomic location
|
||||||
|
atomicLocs[lane] = addrManager->getAtomicLoc();
|
||||||
|
assert(atomicLocs[lane] >= 0);
|
||||||
|
|
||||||
|
// go through each action in this lane and set its location
|
||||||
|
for (Action* action : actions) {
|
||||||
|
assert(action);
|
||||||
|
|
||||||
|
switch (action->getType()) {
|
||||||
|
case Action::Type::ATOMIC:
|
||||||
|
action->setLocation(lane, atomicLocs[lane]);
|
||||||
|
break;
|
||||||
|
case Action::Type::LOAD:
|
||||||
|
// pick randomly a normal location
|
||||||
|
normal_loc = addrManager->
|
||||||
|
getLoadLoc(atomicLocs[lane]);
|
||||||
|
assert(normal_loc >= AddressManager::INVALID_LOCATION);
|
||||||
|
|
||||||
|
if (normal_loc != AddressManager::INVALID_LOCATION) {
|
||||||
|
// check DRF
|
||||||
|
if (!tester->checkDRF(atomicLocs[lane],
|
||||||
|
normal_loc, false) ||
|
||||||
|
!this->checkDRF(atomicLocs[lane], normal_loc,
|
||||||
|
false, lane)) {
|
||||||
|
panic("GpuTh %d - Data race detected. STOPPED!\n",
|
||||||
|
thread->getGpuThreadId());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
action->setLocation(lane, normal_loc);
|
||||||
|
break;
|
||||||
|
case Action::Type::STORE:
|
||||||
|
// pick randomly a normal location
|
||||||
|
normal_loc = addrManager->
|
||||||
|
getStoreLoc(atomicLocs[lane]);
|
||||||
|
assert(normal_loc >= AddressManager::INVALID_LOCATION);
|
||||||
|
|
||||||
|
if (normal_loc != AddressManager::INVALID_LOCATION) {
|
||||||
|
// check DRF
|
||||||
|
if (!tester->checkDRF(atomicLocs[lane],
|
||||||
|
normal_loc, true) ||
|
||||||
|
!this->checkDRF(atomicLocs[lane], normal_loc,
|
||||||
|
true, lane)) {
|
||||||
|
panic("GpuTh %d - Data race detected. STOPPED!\n",
|
||||||
|
thread->getGpuThreadId());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
action->setLocation(lane, normal_loc);
|
||||||
|
break;
|
||||||
|
case Action::Type::ACQUIRE:
|
||||||
|
case Action::Type::RELEASE:
|
||||||
|
// no op
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
panic("Invalid action type\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
addrManager->finishLocSelection(atomicLocs[lane]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
Episode::completeEpisode()
|
||||||
|
{
|
||||||
|
// release all locations this episode has picked and used
|
||||||
|
Location atomic_loc, normal_loc;
|
||||||
|
for (int lane = 0; lane < numLanes; ++lane) {
|
||||||
|
atomic_loc = AddressManager::INVALID_LOCATION;
|
||||||
|
normal_loc = AddressManager::INVALID_LOCATION;
|
||||||
|
|
||||||
|
std::unordered_set<Location> unique_loc_set;
|
||||||
|
|
||||||
|
for (Action* action : actions) {
|
||||||
|
assert(action);
|
||||||
|
|
||||||
|
if (action->isAtomicAction()) {
|
||||||
|
if (atomic_loc == AddressManager::INVALID_LOCATION) {
|
||||||
|
atomic_loc = action->getLocation(lane);
|
||||||
|
} else {
|
||||||
|
// both atomic ops in the same lane must be
|
||||||
|
// at the same location
|
||||||
|
assert(atomic_loc == action->getLocation(lane));
|
||||||
|
}
|
||||||
|
} else if (!action->isMemFenceAction()) {
|
||||||
|
assert(atomic_loc >= 0);
|
||||||
|
normal_loc = action->getLocation(lane);
|
||||||
|
|
||||||
|
if (normal_loc >= 0)
|
||||||
|
unique_loc_set.insert(normal_loc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// each unique loc can be released only once
|
||||||
|
for (Location loc : unique_loc_set)
|
||||||
|
addrManager->releaseLocation(atomic_loc, loc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// this episode is no longer active
|
||||||
|
isActive = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
Episode::checkDRF(Location atomic_loc, Location loc, bool isStore,
|
||||||
|
int max_lane) const
|
||||||
|
{
|
||||||
|
assert(atomic_loc != AddressManager::INVALID_LOCATION);
|
||||||
|
assert(loc != AddressManager::INVALID_LOCATION);
|
||||||
|
assert(max_lane <= numLanes);
|
||||||
|
|
||||||
|
for (int lane = 0; lane < max_lane; ++lane) {
|
||||||
|
if (atomic_loc == atomicLocs[lane]) {
|
||||||
|
for (const Action* action : actions) {
|
||||||
|
if (!action->isAtomicAction() &&
|
||||||
|
!action->isMemFenceAction()) {
|
||||||
|
if (isStore && loc == action->getLocation(lane)) {
|
||||||
|
warn("ST at location %d races against thread %d\n",
|
||||||
|
loc, thread->getGpuThreadId());
|
||||||
|
return false;
|
||||||
|
} else if (!isStore &&
|
||||||
|
action->getType() == Action::Type::STORE &&
|
||||||
|
loc == action->getLocation(lane)) {
|
||||||
|
warn("LD at location %d races against thread %d\n",
|
||||||
|
loc, thread->getGpuThreadId());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// -------------------- Action class ----------------------------
|
||||||
|
Episode::Action::Action(Type t, int num_lanes)
|
||||||
|
: type(t),
|
||||||
|
numLanes(num_lanes)
|
||||||
|
{
|
||||||
|
assert(numLanes > 0);
|
||||||
|
locations.resize(numLanes);
|
||||||
|
for (Location &loc : locations) loc = AddressManager::INVALID_LOCATION;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
Episode::Action::setLocation(int lane, Location loc)
|
||||||
|
{
|
||||||
|
assert(lane >= 0 && lane < numLanes);
|
||||||
|
locations[lane] = loc;
|
||||||
|
}
|
||||||
|
|
||||||
|
AddressManager::Location
|
||||||
|
Episode::Action::getLocation(int lane) const
|
||||||
|
{
|
||||||
|
assert(lane >= 0 && lane < numLanes);
|
||||||
|
return locations[lane];
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
Episode::Action::isAtomicAction() const
|
||||||
|
{
|
||||||
|
return (type == Type::ATOMIC);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
Episode::Action::isMemFenceAction() const
|
||||||
|
{
|
||||||
|
return (type == Type::ACQUIRE || type == Type::RELEASE);
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string
|
||||||
|
Episode::Action::printType() const
|
||||||
|
{
|
||||||
|
if (type == Type::ACQUIRE)
|
||||||
|
return "ACQUIRE";
|
||||||
|
else if (type == Type::RELEASE)
|
||||||
|
return "RELEASE";
|
||||||
|
else if (type == Type::ATOMIC)
|
||||||
|
return "ATOMIC";
|
||||||
|
else if (type == Type::LOAD)
|
||||||
|
return "LOAD";
|
||||||
|
else if (type == Type::STORE)
|
||||||
|
return "STORE";
|
||||||
|
else
|
||||||
|
panic("Invalid action type\n");
|
||||||
|
}
|
||||||
126
src/cpu/testers/gpu_ruby_test/episode.hh
Normal file
126
src/cpu/testers/gpu_ruby_test/episode.hh
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from this
|
||||||
|
* software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef CPU_TESTERS_PROTOCOL_TESTER_EPISODE_HH_
|
||||||
|
#define CPU_TESTERS_PROTOCOL_TESTER_EPISODE_HH_
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "cpu/testers/gpu_ruby_test/address_manager.hh"
|
||||||
|
|
||||||
|
class ProtocolTester;
|
||||||
|
class GpuThread;
|
||||||
|
|
||||||
|
class Episode
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef AddressManager::Location Location;
|
||||||
|
typedef AddressManager::Value Value;
|
||||||
|
|
||||||
|
class Action {
|
||||||
|
public:
|
||||||
|
enum class Type {
|
||||||
|
ACQUIRE,
|
||||||
|
RELEASE,
|
||||||
|
ATOMIC,
|
||||||
|
LOAD,
|
||||||
|
STORE,
|
||||||
|
};
|
||||||
|
|
||||||
|
Action(Type t, int num_lanes);
|
||||||
|
~Action() {}
|
||||||
|
|
||||||
|
Type getType() const { return type; }
|
||||||
|
void setLocation(int lane, Location loc);
|
||||||
|
Location getLocation(int lane) const;
|
||||||
|
bool isAtomicAction() const;
|
||||||
|
bool isMemFenceAction() const;
|
||||||
|
const std::string printType() const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
Type type;
|
||||||
|
int numLanes;
|
||||||
|
typedef std::vector<Location> LocationList;
|
||||||
|
LocationList locations;
|
||||||
|
};
|
||||||
|
|
||||||
|
Episode(ProtocolTester* tester, GpuThread* thread, int num_loads,
|
||||||
|
int num_stores);
|
||||||
|
~Episode();
|
||||||
|
|
||||||
|
// return episode id
|
||||||
|
int getEpisodeId() const { return episodeId; }
|
||||||
|
// return the action at the head of the action queue
|
||||||
|
const Action* peekCurAction() const;
|
||||||
|
// pop the action at the head of the action queue
|
||||||
|
void popAction();
|
||||||
|
// check if there is more action to be issued in this episode
|
||||||
|
bool hasMoreActions() const { return nextActionIdx < actions.size();}
|
||||||
|
// complete this episode by releasing all locations & updating st effects
|
||||||
|
void completeEpisode();
|
||||||
|
// check if this episode is executing
|
||||||
|
bool isEpsActive() const { return isActive; }
|
||||||
|
// check if the input episode and this one have any data race
|
||||||
|
bool checkDRF(Location atomic_loc, Location loc, bool isStore,
|
||||||
|
int max_lane) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
// pointers to tester, thread and address amanger structures
|
||||||
|
ProtocolTester *tester;
|
||||||
|
GpuThread *thread;
|
||||||
|
AddressManager *addrManager;
|
||||||
|
|
||||||
|
// a unique episode id
|
||||||
|
int episodeId;
|
||||||
|
// list of actions in this episode
|
||||||
|
typedef std::vector<Action*> ActionList;
|
||||||
|
ActionList actions;
|
||||||
|
// list of atomic locations picked for this episode
|
||||||
|
typedef std::vector<Location> AtomicLocationList;
|
||||||
|
AtomicLocationList atomicLocs;
|
||||||
|
|
||||||
|
// is a thread running this episode?
|
||||||
|
bool isActive;
|
||||||
|
// episode length = num_loads + num_stores
|
||||||
|
int numLoads;
|
||||||
|
int numStores;
|
||||||
|
// index of the next action in actions
|
||||||
|
int nextActionIdx;
|
||||||
|
// number of lanes in this thread
|
||||||
|
int numLanes;
|
||||||
|
|
||||||
|
// randomly generate actions in this episode
|
||||||
|
void initActions();
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif /* CPU_TESTERS_PROTOCOL_TESTER_EPISODE_HH_ */
|
||||||
430
src/cpu/testers/gpu_ruby_test/gpu_thread.cc
Normal file
430
src/cpu/testers/gpu_ruby_test/gpu_thread.cc
Normal file
@@ -0,0 +1,430 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from this
|
||||||
|
* software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "cpu/testers/gpu_ruby_test/gpu_thread.hh"
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
|
||||||
|
#include "debug/ProtocolTest.hh"
|
||||||
|
|
||||||
|
GpuThread::GpuThread(const Params &p)
|
||||||
|
: ClockedObject(p),
|
||||||
|
threadEvent(this, "GpuThread tick"),
|
||||||
|
deadlockCheckEvent(this),
|
||||||
|
threadId(p.thread_id),
|
||||||
|
numLanes(p.num_lanes),
|
||||||
|
tester(nullptr), addrManager(nullptr), port(nullptr),
|
||||||
|
scalarPort(nullptr), sqcPort(nullptr), curEpisode(nullptr),
|
||||||
|
curAction(nullptr), pendingLdStCount(0), pendingFenceCount(0),
|
||||||
|
pendingAtomicCount(0), lastActiveCycle(Cycles(0)),
|
||||||
|
deadlockThreshold(p.deadlock_threshold)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuThread::~GpuThread()
|
||||||
|
{
|
||||||
|
for (auto ep : episodeHistory) {
|
||||||
|
assert(ep != nullptr);
|
||||||
|
delete ep;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuThread::wakeup()
|
||||||
|
{
|
||||||
|
// this thread is waken up by one of the following events
|
||||||
|
// - hitCallback is called
|
||||||
|
// - a new episode is created
|
||||||
|
|
||||||
|
// check if this is the first episode in this thread
|
||||||
|
if (curEpisode == nullptr) {
|
||||||
|
issueNewEpisode();
|
||||||
|
assert(curEpisode);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isNextActionReady()) {
|
||||||
|
// isNextActionReady should check if the action list is empty
|
||||||
|
assert(curAction != nullptr);
|
||||||
|
|
||||||
|
// issue the next action
|
||||||
|
issueNextAction();
|
||||||
|
} else {
|
||||||
|
// check for completion of the current episode
|
||||||
|
// completion = no outstanding requests + not having more actions
|
||||||
|
if (!curEpisode->hasMoreActions() &&
|
||||||
|
pendingLdStCount == 0 &&
|
||||||
|
pendingFenceCount == 0 &&
|
||||||
|
pendingAtomicCount == 0) {
|
||||||
|
|
||||||
|
curEpisode->completeEpisode();
|
||||||
|
|
||||||
|
// check if it's time to stop the tester
|
||||||
|
if (tester->checkExit()) {
|
||||||
|
// no more event is scheduled for this thread
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// issue the next episode
|
||||||
|
issueNewEpisode();
|
||||||
|
assert(curEpisode);
|
||||||
|
|
||||||
|
// now we get a new episode
|
||||||
|
// let's wake up the thread in the next cycle
|
||||||
|
if (!threadEvent.scheduled()) {
|
||||||
|
scheduleWakeup();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuThread::scheduleWakeup()
|
||||||
|
{
|
||||||
|
assert(!threadEvent.scheduled());
|
||||||
|
schedule(threadEvent, nextCycle());
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuThread::scheduleDeadlockCheckEvent()
|
||||||
|
{
|
||||||
|
// after this first schedule, the deadlock event is scheduled by itself
|
||||||
|
assert(!deadlockCheckEvent.scheduled());
|
||||||
|
schedule(deadlockCheckEvent, nextCycle());
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuThread::attachGpuThreadToPorts(ProtocolTester *_tester,
|
||||||
|
ProtocolTester::SeqPort *_port,
|
||||||
|
ProtocolTester::SeqPort *_scalarPort,
|
||||||
|
ProtocolTester::SeqPort *_sqcPort)
|
||||||
|
{
|
||||||
|
tester = _tester;
|
||||||
|
port = _port;
|
||||||
|
scalarPort = _scalarPort;
|
||||||
|
sqcPort = _sqcPort;
|
||||||
|
|
||||||
|
assert(tester && port);
|
||||||
|
addrManager = tester->getAddressManager();
|
||||||
|
assert(addrManager);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuThread::issueNewEpisode()
|
||||||
|
{
|
||||||
|
int num_reg_loads = random() % tester->getEpisodeLength();
|
||||||
|
int num_reg_stores = tester->getEpisodeLength() - num_reg_loads;
|
||||||
|
|
||||||
|
// create a new episode
|
||||||
|
curEpisode = new Episode(tester, this, num_reg_loads, num_reg_stores);
|
||||||
|
episodeHistory.push_back(curEpisode);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
GpuThread::isNextActionReady()
|
||||||
|
{
|
||||||
|
if (!curEpisode->hasMoreActions()) {
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
curAction = curEpisode->peekCurAction();
|
||||||
|
|
||||||
|
switch(curAction->getType()) {
|
||||||
|
case Episode::Action::Type::ATOMIC:
|
||||||
|
// an atomic action must wait for all previous requests
|
||||||
|
// to complete
|
||||||
|
if (pendingLdStCount == 0 &&
|
||||||
|
pendingFenceCount == 0 &&
|
||||||
|
pendingAtomicCount == 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
case Episode::Action::Type::ACQUIRE:
|
||||||
|
// we should not see any outstanding ld_st or fence here
|
||||||
|
assert(pendingLdStCount == 0 &&
|
||||||
|
pendingFenceCount == 0);
|
||||||
|
|
||||||
|
// an acquire action must wait for all previous atomic
|
||||||
|
// requests to complete
|
||||||
|
if (pendingAtomicCount == 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
case Episode::Action::Type::RELEASE:
|
||||||
|
// we should not see any outstanding atomic or fence here
|
||||||
|
assert(pendingAtomicCount == 0 &&
|
||||||
|
pendingFenceCount == 0);
|
||||||
|
|
||||||
|
// a release action must wait for all previous ld/st
|
||||||
|
// requests to complete
|
||||||
|
if (pendingLdStCount == 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
case Episode::Action::Type::LOAD:
|
||||||
|
case Episode::Action::Type::STORE:
|
||||||
|
// we should not see any outstanding atomic here
|
||||||
|
assert(pendingAtomicCount == 0);
|
||||||
|
|
||||||
|
// can't issue if there is a pending fence
|
||||||
|
if (pendingFenceCount > 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// a Load or Store is ready if it doesn't overlap
|
||||||
|
// with any outstanding request
|
||||||
|
for (int lane = 0; lane < numLanes; ++lane) {
|
||||||
|
Location loc = curAction->getLocation(lane);
|
||||||
|
|
||||||
|
if (loc != AddressManager::INVALID_LOCATION) {
|
||||||
|
Addr addr = addrManager->getAddress(loc);
|
||||||
|
|
||||||
|
if (outstandingLoads.find(addr) !=
|
||||||
|
outstandingLoads.end()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (outstandingStores.find(addr) !=
|
||||||
|
outstandingStores.end()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (outstandingAtomics.find(addr) !=
|
||||||
|
outstandingAtomics.end()) {
|
||||||
|
// this is not an atomic action, so the address
|
||||||
|
// should not be in outstandingAtomics list
|
||||||
|
assert(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
default:
|
||||||
|
panic("The tester got an invalid action\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuThread::issueNextAction()
|
||||||
|
{
|
||||||
|
switch(curAction->getType()) {
|
||||||
|
case Episode::Action::Type::ATOMIC:
|
||||||
|
issueAtomicOps();
|
||||||
|
break;
|
||||||
|
case Episode::Action::Type::ACQUIRE:
|
||||||
|
issueAcquireOp();
|
||||||
|
break;
|
||||||
|
case Episode::Action::Type::RELEASE:
|
||||||
|
issueReleaseOp();
|
||||||
|
break;
|
||||||
|
case Episode::Action::Type::LOAD:
|
||||||
|
issueLoadOps();
|
||||||
|
break;
|
||||||
|
case Episode::Action::Type::STORE:
|
||||||
|
issueStoreOps();
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
panic("The tester got an invalid action\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// the current action has been issued, pop it from the action list
|
||||||
|
curEpisode->popAction();
|
||||||
|
lastActiveCycle = curCycle();
|
||||||
|
|
||||||
|
// we may be able to schedule the next action
|
||||||
|
// just wake up this thread in the next cycle
|
||||||
|
if (!threadEvent.scheduled()) {
|
||||||
|
scheduleWakeup();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuThread::addOutstandingReqs(OutstandingReqTable& req_table, Addr address,
|
||||||
|
int lane, Location loc, Value stored_val)
|
||||||
|
{
|
||||||
|
OutstandingReqTable::iterator it = req_table.find(address);
|
||||||
|
OutstandingReq req(lane, loc, stored_val, curCycle());
|
||||||
|
|
||||||
|
if (it == req_table.end()) {
|
||||||
|
// insert a new list of requests for this address
|
||||||
|
req_table.insert(std::pair<Addr, OutstandingReqList>(address,
|
||||||
|
OutstandingReqList(1, req)));
|
||||||
|
} else {
|
||||||
|
// add a new request
|
||||||
|
(it->second).push_back(req);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuThread::OutstandingReq
|
||||||
|
GpuThread::popOutstandingReq(OutstandingReqTable& req_table, Addr addr)
|
||||||
|
{
|
||||||
|
OutstandingReqTable::iterator it = req_table.find(addr);
|
||||||
|
|
||||||
|
// there must be exactly one list of requests for this address in the table
|
||||||
|
assert(it != req_table.end());
|
||||||
|
|
||||||
|
// get the request list
|
||||||
|
OutstandingReqList& req_list = it->second;
|
||||||
|
assert(!req_list.empty());
|
||||||
|
|
||||||
|
// save a request
|
||||||
|
OutstandingReq ret_req = req_list.back();
|
||||||
|
|
||||||
|
// remove the request from the list
|
||||||
|
req_list.pop_back();
|
||||||
|
|
||||||
|
// if the list is now empty, remove it from req_table
|
||||||
|
if (req_list.empty()) {
|
||||||
|
req_table.erase(it);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret_req;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuThread::validateAtomicResp(Location loc, int lane, Value ret_val)
|
||||||
|
{
|
||||||
|
if (!addrManager->validateAtomicResp(loc, ret_val)) {
|
||||||
|
std::stringstream ss;
|
||||||
|
Addr addr = addrManager->getAddress(loc);
|
||||||
|
|
||||||
|
// basic info
|
||||||
|
ss << threadName << ": Atomic Op returned unexpected value\n"
|
||||||
|
<< "\tEpisode " << curEpisode->getEpisodeId() << "\n"
|
||||||
|
<< "\tLane ID " << lane << "\n"
|
||||||
|
<< "\tAddress " << printAddress(addr) << "\n"
|
||||||
|
<< "\tAtomic Op's return value " << ret_val << "\n";
|
||||||
|
|
||||||
|
// print out basic info
|
||||||
|
warn("%s\n", ss.str());
|
||||||
|
|
||||||
|
// TODO add more detailed info
|
||||||
|
|
||||||
|
// dump all error info and exit the simulation
|
||||||
|
tester->dumpErrorLog(ss);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuThread::validateLoadResp(Location loc, int lane, Value ret_val)
|
||||||
|
{
|
||||||
|
if (ret_val != addrManager->getLoggedValue(loc)) {
|
||||||
|
std::stringstream ss;
|
||||||
|
Addr addr = addrManager->getAddress(loc);
|
||||||
|
|
||||||
|
// basic info
|
||||||
|
ss << threadName << ": Loaded value is not consistent with "
|
||||||
|
<< "the last stored value\n"
|
||||||
|
<< "\tGpuThread " << threadId << "\n"
|
||||||
|
<< "\tEpisode " << curEpisode->getEpisodeId() << "\n"
|
||||||
|
<< "\tLane ID " << lane << "\n"
|
||||||
|
<< "\tAddress " << printAddress(addr) << "\n"
|
||||||
|
<< "\tLoaded value " << ret_val << "\n"
|
||||||
|
<< "\tLast writer " << addrManager->printLastWriter(loc) << "\n";
|
||||||
|
|
||||||
|
// print out basic info
|
||||||
|
warn("%s\n", ss.str());
|
||||||
|
|
||||||
|
// TODO add more detailed info
|
||||||
|
|
||||||
|
// dump all error info and exit the simulation
|
||||||
|
tester->dumpErrorLog(ss);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
GpuThread::checkDRF(Location atomic_loc, Location loc, bool isStore) const
|
||||||
|
{
|
||||||
|
if (curEpisode && curEpisode->isEpsActive()) {
|
||||||
|
// check against the current episode this thread is executing
|
||||||
|
return curEpisode->checkDRF(atomic_loc, loc, isStore, numLanes);
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuThread::checkDeadlock()
|
||||||
|
{
|
||||||
|
if ((curCycle() - lastActiveCycle) > deadlockThreshold) {
|
||||||
|
// deadlock detected
|
||||||
|
std::stringstream ss;
|
||||||
|
|
||||||
|
ss << threadName << ": Deadlock detected\n"
|
||||||
|
<< "\tLast active cycle: " << lastActiveCycle << "\n"
|
||||||
|
<< "\tCurrent cycle: " << curCycle() << "\n"
|
||||||
|
<< "\tDeadlock threshold: " << deadlockThreshold << "\n";
|
||||||
|
|
||||||
|
// print out basic info
|
||||||
|
warn("%s\n", ss.str());
|
||||||
|
|
||||||
|
// dump all error info and exit the simulation
|
||||||
|
tester->dumpErrorLog(ss);
|
||||||
|
} else if (!tester->checkExit()) {
|
||||||
|
// schedule a future deadlock check event
|
||||||
|
assert(!deadlockCheckEvent.scheduled());
|
||||||
|
schedule(deadlockCheckEvent,
|
||||||
|
deadlockThreshold * clockPeriod() + curTick());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuThread::printOutstandingReqs(const OutstandingReqTable& table,
|
||||||
|
std::stringstream& ss) const
|
||||||
|
{
|
||||||
|
Cycles cur_cycle = curCycle();
|
||||||
|
|
||||||
|
for (const auto& m : table) {
|
||||||
|
for (const auto& req : m.second) {
|
||||||
|
ss << "\t\t\tAddr " << printAddress(m.first)
|
||||||
|
<< ": delta (curCycle - issueCycle) = "
|
||||||
|
<< (cur_cycle - req.issueCycle) << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuThread::printAllOutstandingReqs(std::stringstream& ss) const
|
||||||
|
{
|
||||||
|
// dump all outstanding requests of this thread
|
||||||
|
ss << "\t\tOutstanding Loads:\n";
|
||||||
|
printOutstandingReqs(outstandingLoads, ss);
|
||||||
|
ss << "\t\tOutstanding Stores:\n";
|
||||||
|
printOutstandingReqs(outstandingStores, ss);
|
||||||
|
ss << "\t\tOutstanding Atomics:\n";
|
||||||
|
printOutstandingReqs(outstandingAtomics, ss);
|
||||||
|
ss << "\t\tNumber of outstanding acquires & releases: "
|
||||||
|
<< pendingFenceCount << std::endl;
|
||||||
|
}
|
||||||
199
src/cpu/testers/gpu_ruby_test/gpu_thread.hh
Normal file
199
src/cpu/testers/gpu_ruby_test/gpu_thread.hh
Normal file
@@ -0,0 +1,199 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from this
|
||||||
|
* software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* GPU thread issues requests to and receives responses from Ruby memory
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef CPU_TESTERS_PROTOCOL_TESTER_GPU_THREAD_HH_
|
||||||
|
#define CPU_TESTERS_PROTOCOL_TESTER_GPU_THREAD_HH_
|
||||||
|
|
||||||
|
#include "cpu/testers/gpu_ruby_test/address_manager.hh"
|
||||||
|
#include "cpu/testers/gpu_ruby_test/episode.hh"
|
||||||
|
#include "cpu/testers/gpu_ruby_test/protocol_tester.hh"
|
||||||
|
#include "gpu-compute/gpu_dyn_inst.hh"
|
||||||
|
#include "sim/clocked_object.hh"
|
||||||
|
|
||||||
|
class GpuThread : public ClockedObject
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef GpuThreadParams Params;
|
||||||
|
GpuThread(const Params &p);
|
||||||
|
virtual ~GpuThread();
|
||||||
|
|
||||||
|
typedef AddressManager::Location Location;
|
||||||
|
typedef AddressManager::Value Value;
|
||||||
|
|
||||||
|
void wakeup();
|
||||||
|
void scheduleWakeup();
|
||||||
|
void checkDeadlock();
|
||||||
|
void scheduleDeadlockCheckEvent();
|
||||||
|
|
||||||
|
void attachGpuThreadToPorts(ProtocolTester *_tester,
|
||||||
|
ProtocolTester::SeqPort *_port,
|
||||||
|
ProtocolTester::SeqPort *_sqcPort = nullptr,
|
||||||
|
ProtocolTester::SeqPort *_scalarPort = nullptr);
|
||||||
|
|
||||||
|
const std::string& getName() const { return threadName; }
|
||||||
|
|
||||||
|
// must be implemented by a child class
|
||||||
|
virtual void hitCallback(PacketPtr pkt) = 0;
|
||||||
|
|
||||||
|
int getGpuThreadId() const { return threadId; }
|
||||||
|
int getNumLanes() const { return numLanes; }
|
||||||
|
// check if the input location would satisfy DRF constraint
|
||||||
|
bool checkDRF(Location atomic_loc, Location loc, bool isStore) const;
|
||||||
|
|
||||||
|
void printAllOutstandingReqs(std::stringstream& ss) const;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
class GpuThreadEvent : public Event
|
||||||
|
{
|
||||||
|
private:
|
||||||
|
GpuThread* thread;
|
||||||
|
std::string desc;
|
||||||
|
|
||||||
|
public:
|
||||||
|
GpuThreadEvent(GpuThread* _thread, std::string _description)
|
||||||
|
: Event(CPU_Tick_Pri), thread(_thread), desc(_description)
|
||||||
|
{}
|
||||||
|
void setDesc(std::string _description) { desc = _description; }
|
||||||
|
void process() { thread->wakeup(); }
|
||||||
|
const std::string name() { return desc; }
|
||||||
|
};
|
||||||
|
|
||||||
|
GpuThreadEvent threadEvent;
|
||||||
|
|
||||||
|
class DeadlockCheckEvent : public Event
|
||||||
|
{
|
||||||
|
private:
|
||||||
|
GpuThread* thread;
|
||||||
|
|
||||||
|
public:
|
||||||
|
DeadlockCheckEvent(GpuThread* _thread)
|
||||||
|
: Event(CPU_Tick_Pri), thread(_thread)
|
||||||
|
{}
|
||||||
|
void process() { thread->checkDeadlock(); }
|
||||||
|
const std::string name() const { return "Tester deadlock check"; }
|
||||||
|
};
|
||||||
|
|
||||||
|
DeadlockCheckEvent deadlockCheckEvent;
|
||||||
|
|
||||||
|
struct OutstandingReq
|
||||||
|
{
|
||||||
|
int lane;
|
||||||
|
Location origLoc;
|
||||||
|
Value storedValue;
|
||||||
|
Cycles issueCycle;
|
||||||
|
|
||||||
|
OutstandingReq(int _lane, Location _loc, Value _val, Cycles _cycle)
|
||||||
|
: lane(_lane), origLoc(_loc), storedValue(_val), issueCycle(_cycle)
|
||||||
|
{}
|
||||||
|
|
||||||
|
~OutstandingReq()
|
||||||
|
{}
|
||||||
|
};
|
||||||
|
|
||||||
|
// the unique global id of this thread
|
||||||
|
int threadId;
|
||||||
|
// width of this thread (1 for cpu thread & wf size for gpu wavefront)
|
||||||
|
int numLanes;
|
||||||
|
// thread name
|
||||||
|
std::string threadName;
|
||||||
|
// pointer to the main tester
|
||||||
|
ProtocolTester *tester;
|
||||||
|
// pointer to the address manager
|
||||||
|
AddressManager *addrManager;
|
||||||
|
|
||||||
|
ProtocolTester::SeqPort *port; // main data port (GPU-vector data)
|
||||||
|
ProtocolTester::SeqPort *scalarPort; // nullptr for CPU
|
||||||
|
ProtocolTester::SeqPort *sqcPort; // nullptr for CPU
|
||||||
|
|
||||||
|
// a list of issued episodes sorted by time
|
||||||
|
// the last episode in the list is the current episode
|
||||||
|
typedef std::vector<Episode*> EpisodeHistory;
|
||||||
|
EpisodeHistory episodeHistory;
|
||||||
|
// pointer to the current episode
|
||||||
|
Episode *curEpisode;
|
||||||
|
// pointer to the current action
|
||||||
|
const Episode::Action *curAction;
|
||||||
|
|
||||||
|
// number of outstanding requests that are waiting for their responses
|
||||||
|
int pendingLdStCount;
|
||||||
|
int pendingFenceCount;
|
||||||
|
int pendingAtomicCount;
|
||||||
|
|
||||||
|
// last cycle when there is an event in this thread
|
||||||
|
Cycles lastActiveCycle;
|
||||||
|
Cycles deadlockThreshold;
|
||||||
|
|
||||||
|
// a per-address list of outstanding requests
|
||||||
|
typedef std::vector<OutstandingReq> OutstandingReqList;
|
||||||
|
typedef std::unordered_map<Addr, OutstandingReqList> OutstandingReqTable;
|
||||||
|
OutstandingReqTable outstandingLoads;
|
||||||
|
OutstandingReqTable outstandingStores;
|
||||||
|
OutstandingReqTable outstandingAtomics;
|
||||||
|
|
||||||
|
void issueNewEpisode();
|
||||||
|
// check if the next action in the current episode satisfies all wait_cnt
|
||||||
|
// constraints and is ready to issue
|
||||||
|
bool isNextActionReady();
|
||||||
|
void issueNextAction();
|
||||||
|
|
||||||
|
// issue Ops to Ruby memory
|
||||||
|
// must be implemented by a child class
|
||||||
|
virtual void issueLoadOps() = 0;
|
||||||
|
virtual void issueStoreOps() = 0;
|
||||||
|
virtual void issueAtomicOps() = 0;
|
||||||
|
virtual void issueAcquireOp() = 0;
|
||||||
|
virtual void issueReleaseOp() = 0;
|
||||||
|
|
||||||
|
// add an outstanding request to its corresponding table
|
||||||
|
void addOutstandingReqs(OutstandingReqTable& req_table, Addr addr,
|
||||||
|
int lane, Location loc,
|
||||||
|
Value stored_val = AddressManager::INVALID_VALUE);
|
||||||
|
|
||||||
|
// pop an outstanding request from the input table
|
||||||
|
OutstandingReq popOutstandingReq(OutstandingReqTable& req_table,
|
||||||
|
Addr address);
|
||||||
|
|
||||||
|
// validate all atomic responses
|
||||||
|
void validateAtomicResp(Location loc, int lane, Value ret_val);
|
||||||
|
// validate all Load responses
|
||||||
|
void validateLoadResp(Location loc, int lane, Value ret_val);
|
||||||
|
|
||||||
|
void printOutstandingReqs(const OutstandingReqTable& table,
|
||||||
|
std::stringstream& ss) const;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif /* CPU_TESTERS_PROTOCOL_TESTER_GPU_THREAD_HH_ */
|
||||||
377
src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
Normal file
377
src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
Normal file
@@ -0,0 +1,377 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from this
|
||||||
|
* software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "cpu/testers/gpu_ruby_test/gpu_wavefront.hh"
|
||||||
|
|
||||||
|
#include "debug/ProtocolTest.hh"
|
||||||
|
|
||||||
|
GpuWavefront::GpuWavefront(const Params &p)
|
||||||
|
: GpuThread(p), cuId(p.cu_id)
|
||||||
|
{
|
||||||
|
threadName = "GpuWavefront(GpuThread ID = " + std::to_string(threadId) +
|
||||||
|
", CU ID = " + std::to_string(cuId) + ")";
|
||||||
|
threadEvent.setDesc("GpuWavefront tick");
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuWavefront::~GpuWavefront()
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuWavefront*
|
||||||
|
GpuWavefrontParams::create() const
|
||||||
|
{
|
||||||
|
return new GpuWavefront(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuWavefront::issueLoadOps()
|
||||||
|
{
|
||||||
|
assert(curAction);
|
||||||
|
assert(curAction->getType() == Episode::Action::Type::LOAD);
|
||||||
|
// we should not have any outstanding fence or atomic op at this point
|
||||||
|
assert(pendingFenceCount == 0);
|
||||||
|
assert(pendingAtomicCount == 0);
|
||||||
|
|
||||||
|
for (int lane = 0; lane < numLanes; ++lane) {
|
||||||
|
Location location = curAction->getLocation(lane);
|
||||||
|
assert(location >= AddressManager::INVALID_LOCATION);
|
||||||
|
|
||||||
|
// Make a request if we do not get an INVALID_LOCATION for this lane.
|
||||||
|
if (location >= 0) {
|
||||||
|
Addr address = addrManager->getAddress(location);
|
||||||
|
DPRINTF(ProtocolTest, "%s Episode %d: Issuing Load - Addr %s\n",
|
||||||
|
this->getName(), curEpisode->getEpisodeId(),
|
||||||
|
printAddress(address));
|
||||||
|
|
||||||
|
int load_size = sizeof(Value);
|
||||||
|
|
||||||
|
// for now, assert address is 4-byte aligned
|
||||||
|
assert(address % load_size == 0);
|
||||||
|
|
||||||
|
auto req = std::make_shared<Request>(address, load_size,
|
||||||
|
0, tester->requestorId(),
|
||||||
|
0, threadId, nullptr);
|
||||||
|
req->setPaddr(address);
|
||||||
|
req->setReqInstSeqNum(tester->getActionSeqNum());
|
||||||
|
// set protocol-specific flags
|
||||||
|
setExtraRequestFlags(req);
|
||||||
|
|
||||||
|
PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
|
||||||
|
uint8_t* data = new uint8_t[load_size];
|
||||||
|
pkt->dataDynamic(data);
|
||||||
|
pkt->senderState = new ProtocolTester::SenderState(this);
|
||||||
|
|
||||||
|
// increment the number of outstanding ld_st requests
|
||||||
|
pendingLdStCount++;
|
||||||
|
|
||||||
|
if (!port->sendTimingReq(pkt)) {
|
||||||
|
panic("Not expected failed sendTimingReq\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// insert an outstanding load
|
||||||
|
addOutstandingReqs(outstandingLoads, address, lane, location);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuWavefront::issueStoreOps()
|
||||||
|
{
|
||||||
|
assert(curAction);
|
||||||
|
assert(curAction->getType() == Episode::Action::Type::STORE);
|
||||||
|
// we should not have any outstanding fence or atomic op at this point
|
||||||
|
assert(pendingFenceCount == 0);
|
||||||
|
assert(pendingAtomicCount == 0);
|
||||||
|
|
||||||
|
for (int lane = 0; lane < numLanes; ++lane) {
|
||||||
|
Location location = curAction->getLocation(lane);
|
||||||
|
assert(location >= AddressManager::INVALID_LOCATION);
|
||||||
|
|
||||||
|
// Make a request if we do not get an INVALID_LOCATION for this lane.
|
||||||
|
if (location >= 0) {
|
||||||
|
// prepare the next value to store
|
||||||
|
Value new_value = addrManager->getLoggedValue(location) + 1;
|
||||||
|
|
||||||
|
Addr address = addrManager->getAddress(location);
|
||||||
|
// must be aligned with store size
|
||||||
|
assert(address % sizeof(Value) == 0);
|
||||||
|
|
||||||
|
DPRINTF(ProtocolTest, "%s Episode %d: Issuing Store - Addr %s - "
|
||||||
|
"Value %d\n", this->getName(),
|
||||||
|
curEpisode->getEpisodeId(), printAddress(address),
|
||||||
|
new_value);
|
||||||
|
|
||||||
|
auto req = std::make_shared<Request>(address, sizeof(Value),
|
||||||
|
0, tester->requestorId(), 0,
|
||||||
|
threadId, nullptr);
|
||||||
|
req->setPaddr(address);
|
||||||
|
req->setReqInstSeqNum(tester->getActionSeqNum());
|
||||||
|
// set protocol-specific flags
|
||||||
|
setExtraRequestFlags(req);
|
||||||
|
|
||||||
|
PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
|
||||||
|
uint8_t *writeData = new uint8_t[sizeof(Value)];
|
||||||
|
for (int j = 0; j < sizeof(Value); ++j) {
|
||||||
|
writeData[j] = ((uint8_t*)&new_value)[j];
|
||||||
|
}
|
||||||
|
pkt->dataDynamic(writeData);
|
||||||
|
pkt->senderState = new ProtocolTester::SenderState(this);
|
||||||
|
|
||||||
|
// increment the number of outstanding ld_st requests
|
||||||
|
pendingLdStCount++;
|
||||||
|
|
||||||
|
if (!port->sendTimingReq(pkt)) {
|
||||||
|
panic("Not expecting a failed sendTimingReq\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// add an outstanding store
|
||||||
|
addOutstandingReqs(outstandingStores, address, lane, location,
|
||||||
|
new_value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuWavefront::issueAtomicOps()
|
||||||
|
{
|
||||||
|
assert(curAction);
|
||||||
|
assert(curAction->getType() == Episode::Action::Type::ATOMIC);
|
||||||
|
// we should not have any outstanding ops at this point
|
||||||
|
assert(pendingFenceCount == 0);
|
||||||
|
assert(pendingLdStCount == 0);
|
||||||
|
assert(pendingAtomicCount == 0);
|
||||||
|
|
||||||
|
// we use atomic_inc in the tester
|
||||||
|
Request::Flags flags = Request::ATOMIC_RETURN_OP;
|
||||||
|
|
||||||
|
for (int lane = 0; lane < numLanes; ++lane) {
|
||||||
|
Location location = curAction->getLocation(lane);
|
||||||
|
assert(location >= 0);
|
||||||
|
|
||||||
|
Addr address = addrManager->getAddress(location);
|
||||||
|
|
||||||
|
DPRINTF(ProtocolTest, "%s Episode %d: Issuing Atomic_Inc - Addr %s\n",
|
||||||
|
this->getName(), curEpisode->getEpisodeId(),
|
||||||
|
printAddress(address));
|
||||||
|
|
||||||
|
// must be aligned with store size
|
||||||
|
assert(address % sizeof(Value) == 0);
|
||||||
|
AtomicOpFunctor *amo_op = new AtomicOpInc<Value>();
|
||||||
|
auto req = std::make_shared<Request>(address, sizeof(Value),
|
||||||
|
flags, tester->requestorId(),
|
||||||
|
0, threadId,
|
||||||
|
AtomicOpFunctorPtr(amo_op));
|
||||||
|
req->setPaddr(address);
|
||||||
|
req->setReqInstSeqNum(tester->getActionSeqNum());
|
||||||
|
// set protocol-specific flags
|
||||||
|
setExtraRequestFlags(req);
|
||||||
|
|
||||||
|
PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
|
||||||
|
uint8_t* data = new uint8_t[sizeof(Value)];
|
||||||
|
pkt->dataDynamic(data);
|
||||||
|
pkt->senderState = new ProtocolTester::SenderState(this);
|
||||||
|
|
||||||
|
if (!port->sendTimingReq(pkt)) {
|
||||||
|
panic("Not expecting failed sendTimingReq\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// increment the number of outstanding atomic ops
|
||||||
|
pendingAtomicCount++;
|
||||||
|
|
||||||
|
// add an outstanding atomic
|
||||||
|
addOutstandingReqs(outstandingAtomics, address, lane, location);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Issue an acquire fence on behalf of this wavefront. An acquire is modeled
// as a zero-sized MemSyncReq packet tagged with Request::ACQUIRE; the
// response is counted against pendingFenceCount in hitCallback.
void
GpuWavefront::issueAcquireOp()
{
    DPRINTF(ProtocolTest, "%s Episode %d: Issuing Acquire\n", this->getName(),
            curEpisode->getEpisodeId());

    // the episode must currently be at an ACQUIRE action
    assert(curAction);
    assert(curAction->getType() == Episode::Action::Type::ACQUIRE);
    // we should not have any outstanding ops at this point
    assert(pendingFenceCount == 0);
    assert(pendingLdStCount == 0);
    assert(pendingAtomicCount == 0);

    // zero-sized request at address 0: the fence carries no data
    auto acq_req = std::make_shared<Request>(0, 0, 0,
                                             tester->requestorId(), 0,
                                             threadId, nullptr);
    acq_req->setPaddr(0);
    // unique sequence number so the coalescer can distinguish this request
    acq_req->setReqInstSeqNum(tester->getActionSeqNum());
    acq_req->setFlags(Request::ACQUIRE);
    // set protocol-specific flags
    setExtraRequestFlags(acq_req);

    PacketPtr pkt = new Packet(acq_req, MemCmd::MemSyncReq);
    // remember the issuing thread so SeqPort::recvTimingResp can route the
    // response back to this wavefront
    pkt->senderState = new ProtocolTester::SenderState(this);

    // increment the number of outstanding fence requests
    pendingFenceCount++;

    // the tester's ports never apply back-pressure, so a failed send is a bug
    if (!port->sendTimingReq(pkt)) {
        panic("Not expecting failed sendTimingReq\n");
    }
}
|
||||||
|
|
||||||
|
void
|
||||||
|
GpuWavefront::issueReleaseOp()
|
||||||
|
{
|
||||||
|
DPRINTF(ProtocolTest, "%s Episode %d: Issuing Release\n", this->getName(),
|
||||||
|
curEpisode->getEpisodeId());
|
||||||
|
|
||||||
|
// A release fence simply waits for all previous stores to complete. All
|
||||||
|
// previous loads and stores were done before this release operation is
|
||||||
|
// issued, so issueReleaseOp is just a no-op in this tester.
|
||||||
|
|
||||||
|
// we may be able to issue an action. Let's check
|
||||||
|
if (!threadEvent.scheduled()) {
|
||||||
|
scheduleWakeup();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle a memory response routed back from SeqPort::recvTimingResp.
// Dispatches on the response command: fence completions, load/store/atomic
// responses, and write-completion ACKs each update their own pending counter
// and outstanding-request table, and load/atomic data is validated against
// the address manager's log table.
void
GpuWavefront::hitCallback(PacketPtr pkt)
{
    assert(pkt);
    MemCmd resp_cmd = pkt->cmd;
    // WriteCompleteResp packets carry no address; use 0 as a placeholder
    Addr addr = (resp_cmd == MemCmd::WriteCompleteResp) ? 0 : pkt->getAddr();

    DPRINTF(ProtocolTest, "%s Episode %d: hitCallback - Command %s - "
            "Addr %s\n", this->getName(),
            curEpisode->getEpisodeId(), resp_cmd.toString(),
            printAddress(addr));

    // whether the transaction is done after this hitCallback
    bool isTransactionDone = true;

    if (resp_cmd == MemCmd::MemSyncResp) {
        // response to a pending fence
        // no validation needed for fence responses
        assert(pendingFenceCount > 0);
        assert(pendingLdStCount == 0);
        assert(pendingAtomicCount == 0);
        pendingFenceCount--;
    } else if (resp_cmd == MemCmd::ReadResp) {
        // response to a pending read
        assert(pendingLdStCount > 0);
        assert(pendingAtomicCount == 0);
        assert(outstandingLoads.count(addr) > 0);

        // get return data
        Value value = *(pkt->getPtr<Value>());
        OutstandingReq req = popOutstandingReq(outstandingLoads, addr);
        // check the loaded value against the address manager's log table
        validateLoadResp(req.origLoc, req.lane, value);

        // this Read is done
        pendingLdStCount--;
    } else if (resp_cmd == MemCmd::WriteResp) {
        // response to a pending write
        assert(pendingLdStCount > 0);
        assert(pendingAtomicCount == 0);

        // no need to validate Write response
        // just pop it from the outstanding req table so that subsequent
        // requests dependent on this write can proceed
        // note that we don't decrement pendingLdStCount here yet since
        // the write is not yet completed in downstream memory. Instead, we
        // decrement the counter when we receive the write completion ack
        assert(outstandingStores.count(addr) > 0);
        OutstandingReq req = popOutstandingReq(outstandingStores, addr);
        assert(req.storedValue != AddressManager::INVALID_VALUE);

        // update log table
        addrManager->updateLogTable(req.origLoc, threadId,
                                    curEpisode->getEpisodeId(),
                                    req.storedValue,
                                    curTick(),
                                    cuId);

        // the transaction is not done yet. Waiting for write completion ack
        // NOTE(review): senderState is intentionally kept alive here (only
        // freed when isTransactionDone) — presumably reused by the matching
        // WriteCompleteResp; confirm in the coalescer that it is not leaked.
        isTransactionDone = false;
    } else if (resp_cmd == MemCmd::SwapResp) {
        // response to a pending atomic
        assert(pendingAtomicCount > 0);
        assert(pendingLdStCount == 0);
        assert(outstandingAtomics.count(addr) > 0);

        // get return data
        Value value = *(pkt->getPtr<Value>());

        // validate atomic op return
        OutstandingReq req = popOutstandingReq(outstandingAtomics, addr);
        validateAtomicResp(req.origLoc, req.lane, value);

        // update log table
        addrManager->updateLogTable(req.origLoc, threadId,
                                    curEpisode->getEpisodeId(), value,
                                    curTick(),
                                    cuId);

        // this Atomic is done
        pendingAtomicCount--;
    } else if (resp_cmd == MemCmd::WriteCompleteResp) {
        // write completion ACK: the store issued earlier is now globally done
        assert(pendingLdStCount > 0);
        assert(pendingAtomicCount == 0);

        // the Write is now done
        pendingLdStCount--;
    } else {
        panic("Unsupported MemCmd response type");
    }

    if (isTransactionDone) {
        // no need to keep senderState and request around
        delete pkt->senderState;
    }

    delete pkt;

    // record the last active cycle to check for deadlock
    lastActiveCycle = curCycle();

    // we may be able to issue an action. Let's check
    if (!threadEvent.scheduled()) {
        scheduleWakeup();
    }
}
|
||||||
|
|
||||||
|
// Base-class hook for protocol-specific request flags. Child classes of
// GpuWavefront override this to tag requests before they are issued; the
// default implementation deliberately does nothing.
void
GpuWavefront::setExtraRequestFlags(RequestPtr req)
{
    // No extra request flag is set
}
|
||||||
68
src/cpu/testers/gpu_ruby_test/gpu_wavefront.hh
Normal file
68
src/cpu/testers/gpu_ruby_test/gpu_wavefront.hh
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from this
|
||||||
|
* software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef CPU_TESTERS_PROTOCOL_TESTER_GPU_WAVEFRONT_HH_
|
||||||
|
#define CPU_TESTERS_PROTOCOL_TESTER_GPU_WAVEFRONT_HH_
|
||||||
|
|
||||||
|
#include "cpu/testers/gpu_ruby_test/gpu_thread.hh"
|
||||||
|
#include "params/GpuWavefront.hh"
|
||||||
|
#include "sim/clocked_object.hh"
|
||||||
|
|
||||||
|
/**
 * A GPU wavefront thread in the Ruby protocol tester. Issues per-lane
 * loads, stores and atomics plus acquire/release fences through the vector,
 * SQC and scalar ports it is attached to, and validates the responses.
 */
class GpuWavefront : public GpuThread
{
  public:
    typedef GpuWavefrontParams Params;
    GpuWavefront(const Params &p);
    virtual ~GpuWavefront();

    // shorthands for the address manager's location/value types
    typedef AddressManager::Location Location;
    typedef AddressManager::Value Value;

    // called by ProtocolTester::SeqPort when a memory response arrives
    virtual void hitCallback(PacketPtr pkt);

  protected:
    void issueLoadOps();
    void issueStoreOps();
    void issueAtomicOps();
    // acquire and release ops are protocol-specific, so their issue functions
    // may be redefined by a child class of GpuWavefront
    virtual void issueAcquireOp();
    virtual void issueReleaseOp();
    // set extra request flags that is specific to a target protocol
    // (no-op in this base class)
    virtual void setExtraRequestFlags(RequestPtr req);

  protected:
    int cuId; // compute unit associated with this wavefront
};
|
||||||
|
|
||||||
|
#endif /* CPU_TESTERS_PROTOCOL_TESTER_GPU_WAVEFRONT_HH_ */
|
||||||
312
src/cpu/testers/gpu_ruby_test/protocol_tester.cc
Normal file
312
src/cpu/testers/gpu_ruby_test/protocol_tester.cc
Normal file
@@ -0,0 +1,312 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from this
|
||||||
|
* software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "cpu/testers/gpu_ruby_test/protocol_tester.hh"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <ctime>
|
||||||
|
#include <fstream>
|
||||||
|
#include <random>
|
||||||
|
|
||||||
|
#include "cpu/testers/gpu_ruby_test/cpu_thread.hh"
|
||||||
|
#include "cpu/testers/gpu_ruby_test/gpu_thread.hh"
|
||||||
|
#include "cpu/testers/gpu_ruby_test/gpu_wavefront.hh"
|
||||||
|
#include "debug/ProtocolTest.hh"
|
||||||
|
#include "mem/request.hh"
|
||||||
|
#include "sim/sim_exit.hh"
|
||||||
|
#include "sim/system.hh"
|
||||||
|
|
||||||
|
// Construct the tester: cache all python-side parameters, create one request
// port per connected CPU/CU cache interface, create the shared address
// manager, seed the RNG, and open the log file with a dump of the test
// configuration.
ProtocolTester::ProtocolTester(const Params &p)
      : ClockedObject(p),
        _requestorId(p.system->getRequestorId(this)),
        numCpuPorts(p.port_cpu_ports_connection_count),
        numVectorPorts(p.port_cu_vector_ports_connection_count),
        numSqcPorts(p.port_cu_sqc_ports_connection_count),
        numScalarPorts(p.port_cu_scalar_ports_connection_count),
        numCusPerSqc(p.cus_per_sqc),
        numCusPerScalar(p.cus_per_scalar),
        numWfsPerCu(p.wavefronts_per_cu),
        numWisPerWf(p.workitems_per_wavefront),
        numAtomicLocs(p.num_atomic_locations),
        numNormalLocsPerAtomic(p.num_normal_locs_per_atomic),
        episodeLength(p.episode_length),
        maxNumEpisodes(p.max_num_episodes),
        debugTester(p.debug_tester),
        cpuThreads(p.cpu_threads),
        wfs(p.wavefronts)
{
    int idx = 0; // global port index, unique across all four port kinds

    numCpus = numCpuPorts; // 1 cpu port per CPU
    numCus = numVectorPorts; // 1 vector port per CU

    // create all physical cpu's data ports
    for (int i = 0; i < numCpuPorts; ++i) {
        DPRINTF(ProtocolTest, "Creating %s\n",
                csprintf("%s-cpuPort%d", name(), i));
        cpuPorts.push_back(new SeqPort(csprintf("%s-cpuPort%d", name(), i),
                                       this, i, idx));
        idx++;
    }

    // create all physical gpu's data ports
    for (int i = 0; i < numVectorPorts; ++i) {
        DPRINTF(ProtocolTest, "Creating %s\n",
                csprintf("%s-cuVectorPort%d", name(), i));
        cuVectorPorts.push_back(new SeqPort(csprintf("%s-cuVectorPort%d",
                                                     name(), i),
                                            this, i, idx));
        idx++;
    }

    // one scalar port per group of numCusPerScalar CUs
    for (int i = 0; i < numScalarPorts; ++i) {
        DPRINTF(ProtocolTest, "Creating %s\n",
                csprintf("%s-cuScalarPort%d", name(), i));
        cuScalarPorts.push_back(new SeqPort(csprintf("%s-cuScalarPort%d",
                                                     name(), i),
                                            this, i, idx));
        idx++;
    }

    // one SQC (instruction cache) port per group of numCusPerSqc CUs
    for (int i = 0; i < numSqcPorts; ++i) {
        DPRINTF(ProtocolTest, "Creating %s\n",
                csprintf("%s-cuSqcPort%d", name(), i));
        cuSqcPorts.push_back(new SeqPort(csprintf("%s-cuSqcPort%d",
                                                  name(), i),
                                         this, i, idx));
        idx++;
    }

    // create an address manager: generates DRF request streams and keeps
    // the log table used to validate responses
    addrManager = new AddressManager(numAtomicLocs,
                                     numNormalLocsPerAtomic);
    nextEpisodeId = 0;

    if (!debugTester)
        warn("Data race check is not enabled\n");

    sentExitSignal = false;

    // set random seed number; 0 means "seed from wall-clock time"
    if (p.random_seed != 0) {
        srand(p.random_seed);
    } else {
        srand(time(NULL));
    }

    actionCount = 0;

    // create a new log file
    logFile = simout.create(p.log_file);
    assert(logFile);

    // print test configs
    std::stringstream ss;
    ss << "GPU Ruby test's configurations" << std::endl
       << "\tNumber of CPUs: " << numCpus << std::endl
       << "\tNumber of CUs: " << numCus << std::endl
       << "\tNumber of wavefronts per CU: " << numWfsPerCu << std::endl
       << "\tWavefront size: " << numWisPerWf << std::endl
       << "\tNumber of atomic locations: " << numAtomicLocs << std::endl
       << "\tNumber of non-atomic locations: "
       << numNormalLocsPerAtomic * numAtomicLocs << std::endl
       << "\tEpisode length: " << episodeLength << std::endl
       << "\tTest length (max number of episodes): " << maxNumEpisodes
       << std::endl
       << "\tRandom seed: " << p.random_seed
       << std::endl;

    ccprintf(*(logFile->stream()), "%s", ss.str());
    logFile->stream()->flush();
}
|
||||||
|
|
||||||
|
// Tear down the tester: release the ports and address manager it owns and
// close its log file.
ProtocolTester::~ProtocolTester()
{
    // the tester owns every port it created in the constructor
    for (auto *port : cpuPorts)
        delete port;
    for (auto *port : cuVectorPorts)
        delete port;
    for (auto *port : cuScalarPorts)
        delete port;
    for (auto *port : cuSqcPorts)
        delete port;
    delete addrManager;

    // close the log file
    simout.close(logFile);
}
|
||||||
|
|
||||||
|
void
|
||||||
|
ProtocolTester::init()
|
||||||
|
{
|
||||||
|
DPRINTF(ProtocolTest, "Attach threads to ports\n");
|
||||||
|
|
||||||
|
// connect cpu threads to cpu's ports
|
||||||
|
for (int cpu_id = 0; cpu_id < numCpus; ++cpu_id) {
|
||||||
|
cpuThreads[cpu_id]->attachGpuThreadToPorts(this,
|
||||||
|
static_cast<SeqPort*>(cpuPorts[cpu_id]));
|
||||||
|
cpuThreads[cpu_id]->scheduleWakeup();
|
||||||
|
cpuThreads[cpu_id]->scheduleDeadlockCheckEvent();
|
||||||
|
}
|
||||||
|
|
||||||
|
// connect gpu wavefronts to gpu's ports
|
||||||
|
int wfId = 0;
|
||||||
|
int vectorPortId = 0;
|
||||||
|
int sqcPortId = 0;
|
||||||
|
int scalarPortId = 0;
|
||||||
|
|
||||||
|
for (int cu_id = 0; cu_id < numCus; ++cu_id) {
|
||||||
|
vectorPortId = cu_id;
|
||||||
|
sqcPortId = cu_id/numCusPerSqc;
|
||||||
|
scalarPortId = cu_id/numCusPerScalar;
|
||||||
|
|
||||||
|
for (int i = 0; i < numWfsPerCu; ++i) {
|
||||||
|
wfId = cu_id * numWfsPerCu + i;
|
||||||
|
wfs[wfId]->attachGpuThreadToPorts(this,
|
||||||
|
static_cast<SeqPort*>(cuVectorPorts[vectorPortId]),
|
||||||
|
static_cast<SeqPort*>(cuSqcPorts[sqcPortId]),
|
||||||
|
static_cast<SeqPort*>(cuScalarPorts[scalarPortId]));
|
||||||
|
wfs[wfId]->scheduleWakeup();
|
||||||
|
wfs[wfId]->scheduleDeadlockCheckEvent();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Return the requestor port with the given interface name and index.
 *
 * @param if_name one of "cpu_ports", "cu_vector_ports", "cu_sqc_ports" or
 *                "cu_scalar_ports"; any other name is forwarded to
 *                ClockedObject::getPort.
 * @param idx     index into the corresponding port vector.
 */
Port&
ProtocolTester::getPort(const std::string &if_name, PortID idx)
{
    if (if_name != "cpu_ports" && if_name != "cu_vector_ports" &&
        if_name != "cu_sqc_ports" && if_name != "cu_scalar_ports") {
        // pass along to super class
        return ClockedObject::getPort(if_name, idx);
    } else {
        if (if_name == "cpu_ports") {
            // valid indices are [0, numCpuPorts): the previous '>' check
            // let idx == numCpuPorts through and indexed past the end
            if (idx >= numCpuPorts)
                panic("ProtocolTester: unknown cpu port %d\n", idx);
            return *cpuPorts[idx];
        } else if (if_name == "cu_vector_ports") {
            if (idx >= numVectorPorts)
                panic("ProtocolTester: unknown cu vect port %d\n", idx);
            return *cuVectorPorts[idx];
        } else if (if_name == "cu_sqc_ports") {
            if (idx >= numSqcPorts)
                panic("ProtocolTester: unknown cu sqc port %d\n", idx);
            return *cuSqcPorts[idx];
        } else {
            assert(if_name == "cu_scalar_ports");
            if (idx >= numScalarPorts)
                panic("ProtocolTester: unknown cu scal port %d\n", idx);
            return *cuScalarPorts[idx];
        }
    }

    // unreachable: every branch above returns or panics
    assert(false);
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
ProtocolTester::checkExit()
|
||||||
|
{
|
||||||
|
if (nextEpisodeId > maxNumEpisodes) {
|
||||||
|
if (!sentExitSignal) {
|
||||||
|
// all done
|
||||||
|
inform("Total completed episodes: %d\n", nextEpisodeId - 1);
|
||||||
|
exitSimLoop("GPU Ruby Tester: Passed!");
|
||||||
|
sentExitSignal = true;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
ProtocolTester::checkDRF(Location atomic_loc,
|
||||||
|
Location loc, bool isStore) const
|
||||||
|
{
|
||||||
|
if (debugTester) {
|
||||||
|
// go through all active episodes in all threads
|
||||||
|
for (const GpuThread* th : wfs) {
|
||||||
|
if (!th->checkDRF(atomic_loc, loc, isStore))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const GpuThread* th : cpuThreads) {
|
||||||
|
if (!th->checkDRF(atomic_loc, loc, isStore))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
ProtocolTester::dumpErrorLog(std::stringstream& ss)
|
||||||
|
{
|
||||||
|
if (!sentExitSignal) {
|
||||||
|
// go through all threads and dump their outstanding requests
|
||||||
|
for (auto t : cpuThreads) {
|
||||||
|
t->printAllOutstandingReqs(ss);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto t : wfs) {
|
||||||
|
t->printAllOutstandingReqs(ss);
|
||||||
|
}
|
||||||
|
|
||||||
|
// dump error log into a file
|
||||||
|
assert(logFile);
|
||||||
|
ccprintf(*(logFile->stream()), "%s", ss.str());
|
||||||
|
logFile->stream()->flush();
|
||||||
|
|
||||||
|
sentExitSignal = true;
|
||||||
|
// terminate the simulation
|
||||||
|
panic("GPU Ruby Tester: Failed!\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
ProtocolTester::SeqPort::recvTimingResp(PacketPtr pkt)
|
||||||
|
{
|
||||||
|
// get the requesting thread from the original sender state
|
||||||
|
ProtocolTester::SenderState* senderState =
|
||||||
|
safe_cast<ProtocolTester::SenderState*>(pkt->senderState);
|
||||||
|
GpuThread *th = senderState->th;
|
||||||
|
|
||||||
|
th->hitCallback(pkt);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Factory hook called by gem5's python parameter system to instantiate the
// tester SimObject from its params struct.
ProtocolTester*
ProtocolTesterParams::create() const
{
    return new ProtocolTester(*this);
}
|
||||||
178
src/cpu/testers/gpu_ruby_test/protocol_tester.hh
Normal file
178
src/cpu/testers/gpu_ruby_test/protocol_tester.hh
Normal file
@@ -0,0 +1,178 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* For use for simulation and test purposes only
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from this
|
||||||
|
* software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef CPU_TESTERS_PROTOCOL_TESTER_PROTOCOL_TESTER_HH_
|
||||||
|
#define CPU_TESTERS_PROTOCOL_TESTER_PROTOCOL_TESTER_HH_
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The tester includes the main ProtocolTester that manages all ports to the
|
||||||
|
* memory system.
|
||||||
|
* GpuThreads are mapped to certain data port(s)
|
||||||
|
*
|
||||||
|
* GpuThreads inject memory requests through their data ports.
|
||||||
|
* The tester receives and validates responses from the memory.
|
||||||
|
*
|
||||||
|
* Main components
|
||||||
|
* - AddressManager: generate DRF request streams &
|
||||||
|
* validate data response against an internal log_table
|
||||||
|
* - Episode: a sequence of requests
|
||||||
|
* - Thread: either GPU wavefront or CPU thread
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <map>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "base/types.hh"
|
||||||
|
#include "cpu/testers/gpu_ruby_test/address_manager.hh"
|
||||||
|
#include "mem/packet.hh"
|
||||||
|
#include "mem/ruby/system/RubyPort.hh"
|
||||||
|
#include "params/ProtocolTester.hh"
|
||||||
|
|
||||||
|
class GpuThread;
|
||||||
|
class CpuThread;
|
||||||
|
class GpuWavefront;
|
||||||
|
|
||||||
|
/**
 * Top-level GPU Ruby protocol tester. Owns all request ports into the memory
 * system, maps CPU threads and GPU wavefronts onto them, and coordinates the
 * shared AddressManager used to generate DRF request streams and validate
 * responses.
 */
class ProtocolTester : public ClockedObject
{
  public:
    // request port that forwards timing responses to the issuing thread
    class SeqPort : public RequestPort
    {
      public:
        // NOTE(review): _index is accepted but never used — confirm whether
        // it can be dropped or should be stored
        SeqPort(const std::string &_name, ProtocolTester *_tester, PortID _id,
                PortID _index)
            : RequestPort(_name, _tester, _id)
        {}

      protected:
        virtual bool recvTimingResp(PacketPtr pkt);
        // the tester never retries: it expects sends to always succeed
        virtual void recvReqRetry()
        { panic("%s does not expect a retry\n", name()); }
    };

    // attached to every packet so the response can be routed back to the
    // thread that issued the request
    struct SenderState : public Packet::SenderState
    {
        GpuThread* th;
        SenderState(GpuThread* _th)
        {
            assert(_th);
            th = _th;
        }

        ~SenderState()
        {}
    };

  public:
    typedef ProtocolTesterParams Params;
    ProtocolTester(const Params &p);
    ~ProtocolTester();

    // shorthands for the address manager's location/value types
    typedef AddressManager::Location Location;
    typedef AddressManager::Value Value;

    void init();
    RequestorID requestorId() { return _requestorId; };
    Port& getPort(const std::string &if_name,
                  PortID idx=InvalidPortID) override;

    int getEpisodeLength() const { return episodeLength; }
    // return pointer to the address manager
    AddressManager* getAddressManager() const { return addrManager; }
    // return true if the tester should stop issuing new episodes
    bool checkExit();
    // verify if a location to be picked for LD/ST will satisfy
    // data race free requirement
    bool checkDRF(Location atomic_loc, Location loc, bool isStore) const;
    // return the next episode id and increment it
    int getNextEpisodeID() { return nextEpisodeId++; }
    // get action sequence number (increments on every call)
    int getActionSeqNum() { return actionCount++; }

    // dump error log into a file and exit the simulation
    void dumpErrorLog(std::stringstream& ss);

  private:
    RequestorID _requestorId;

    // list of parameters taken from python scripts
    int numCpuPorts;
    int numVectorPorts;
    int numSqcPorts;
    int numScalarPorts;
    int numCusPerSqc;
    int numCusPerScalar;
    int numWfsPerCu;
    int numWisPerWf;
    // parameters controlling the address range that the tester can access
    int numAtomicLocs;
    int numNormalLocsPerAtomic;
    // the number of actions in an episode (episodeLength +- random number)
    int episodeLength;
    // the maximum number of episodes to be completed by this tester
    int maxNumEpisodes;
    // are we debugging the tester (enables the data race check)
    bool debugTester;

    // all available requestor ports connected to Ruby
    std::vector<RequestPort*> cpuPorts; // cpu data ports
    std::vector<RequestPort*> cuVectorPorts; // ports to GPU vector cache
    std::vector<RequestPort*> cuSqcPorts; // ports to GPU inst cache
    std::vector<RequestPort*> cuScalarPorts; // ports to GPU scalar cache
    // all CPU and GPU threads
    std::vector<CpuThread*> cpuThreads;
    std::vector<GpuWavefront*> wfs;

    // address manager that (1) generates DRF sequences of requests,
    // (2) manages an internal log table and
    // (3) validate response data
    AddressManager* addrManager;

    // number of CPUs and CUs
    int numCpus;
    int numCus;
    // unique id of the next episode
    int nextEpisodeId;

    // global action count. Overflow is fine. It's used to uniquely identify
    // per-wave & per-instruction memory requests in the coalescer
    int actionCount;

    // if an exit signal was already sent
    bool sentExitSignal;

    // output stream for the tester's configuration and error log
    OutputStream* logFile;
};
|
||||||
|
|
||||||
|
#endif /* CPU_TESTERS_PROTOCOL_TESTER_PROTOCOL_TESTER_HH_ */
|
||||||
Reference in New Issue
Block a user