tests,configs,mem-ruby: Adding Ruby tester for GPU_VIPER

This patch adds the GPU protocol tester that uses data-race-free
operation to discover bugs in GPU protocols including GPU_VIPER. For
more information please see the following paper and the README:

T. Ta, X. Zhang, A. Gutierrez and B. M. Beckmann, "Autonomous
Data-Race-Free GPU Testing," 2019 IEEE International Symposium on
Workload Characterization (IISWC), Orlando, FL, USA, 2019, pp. 81-92,
doi: 10.1109/IISWC47752.2019.9042019.

Change-Id: Ic9939d131a930d1e7014ed0290601140bdd1499f
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/32855
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Matthew Poremba
2020-09-24 14:53:13 -05:00
parent 1a2b677728
commit f36817c367
19 changed files with 3498 additions and 103 deletions

View File

@@ -1,4 +1,4 @@
# Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
# Copyright (c) 2018-2020 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
@@ -43,145 +43,272 @@ addToPath('../')
from common import Options
from ruby import Ruby
# Get paths we might need.
config_path = os.path.dirname(os.path.abspath(__file__))
config_root = os.path.dirname(config_path)
m5_root = os.path.dirname(config_root)
parser = optparse.OptionParser()
Options.addNoISAOptions(parser)
parser.add_option("--maxloads", metavar="N", default=100,
help="Stop after N loads")
parser.add_option("-f", "--wakeup_freq", metavar="N", default=10,
help="Wakeup every N cycles")
parser.add_option("-u", "--num-compute-units", type="int", default=1,
help="number of compute units in the GPU")
parser.add_option("--num-cp", type="int", default=0,
help="Number of GPU Command Processors (CP)")
# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4, help="number of CUs \
sharing an SQC (icache, and thus icache TLB)")
parser.add_option("--simds-per-cu", type="int", default=4, help="SIMD units" \
"per CU")
parser.add_option("--wf-size", type="int", default=64,
help="Wavefront size(in workitems)")
parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of " \
"WF slots per SIMD")
#
# Add the ruby specific and protocol specific options
#
parser = optparse.OptionParser()
Options.addNoISAOptions(parser)
Ruby.define_options(parser)
exec(compile( \
open(os.path.join(config_root, "common", "Options.py")).read(), \
os.path.join(config_root, "common", "Options.py"), 'exec'))
# GPU Ruby tester options
parser.add_option("--cache-size", type="choice", default="small",
choices=["small", "large"],
help="Cache sizes to use. Small encourages races between \
requests and writebacks. Large stresses write-through \
and/or write-back GPU caches.")
parser.add_option("--system-size", type="choice", default="small",
choices=["small", "medium", "large"],
help="This option defines how many CUs, CPUs and cache \
components in the test system.")
parser.add_option("--address-range", type="choice", default="small",
choices=["small", "large"],
help="This option defines the number of atomic \
locations that affects the working set's size. \
A small number of atomic locations encourage more \
races among threads. The large option stresses cache \
resources.")
parser.add_option("--episode-length", type="choice", default="short",
choices=["short", "medium", "long"],
help="This option defines the number of LDs and \
STs in an episode. The small option encourages races \
between the start and end of an episode. The long \
option encourages races between LDs and STs in the \
same episode.")
parser.add_option("--test-length", type="int", default=1,
help="The number of episodes to be executed by each \
wavefront. This determines the maximum number, i.e., \
val X #WFs, of episodes to be executed in the test.")
parser.add_option("--debug-tester", action='store_true',
help="This option will turn on DRF checker")
parser.add_option("--random-seed", type="int", default=0,
help="Random seed number. Default value (i.e., 0) means \
using runtime-specific value")
parser.add_option("--log-file", type="string", default="gpu-ruby-test.log")
(options, args) = parser.parse_args()
#
# Set the default cache size and associativity to be very small to encourage
# races between requests and writebacks.
#
options.l1d_size="256B"
options.l1i_size="256B"
options.l2_size="512B"
options.l3_size="1kB"
options.l1d_assoc=2
options.l1i_assoc=2
options.l2_assoc=2
options.l3_assoc=2
# This file can support multiple compute units
assert(options.num_compute_units >= 1)
n_cu = options.num_compute_units
options.num_sqc = int((n_cu + options.cu_per_sqc - 1) // options.cu_per_sqc)
if args:
print("Error: script doesn't take any positional arguments")
sys.exit(1)
#
# Create the ruby random tester
# Set up cache size - 2 options
# 0: small cache
# 1: large cache
#
# Check to for the GPU_RfO protocol. Other GPU protocols are non-SC and will
# not work with the Ruby random tester.
assert(buildEnv['PROTOCOL'] == 'GPU_RfO')
# The GPU_RfO protocol does not support cache flushes
check_flush = False
tester = RubyTester(check_flush=check_flush,
checks_to_complete=options.maxloads,
wakeup_frequency=options.wakeup_freq,
deadlock_threshold=1000000)
if (options.cache_size == "small"):
options.tcp_size="256B"
options.tcp_assoc=2
options.tcc_size="1kB"
options.tcc_assoc=2
elif (options.cache_size == "large"):
options.tcp_size="256kB"
options.tcp_assoc=16
options.tcc_size="1024kB"
options.tcc_assoc=16
#
# Create the M5 system. Note that the Memory Object isn't
# actually used by the rubytester, but is included to support the
# M5 memory size == Ruby memory size checks
# Set up system size - 3 options
#
system = System(cpu=tester, mem_ranges=[AddrRange(options.mem_size)])
if (options.system_size == "small"):
# 1 CU, 1 CPU, 1 SQC, 1 Scalar
options.wf_size = 1
options.wavefronts_per_cu = 1
options.num_cpus = 1
options.cu_per_sqc = 1
options.cu_per_scalar_cache = 1
options.num_compute_units = 1
elif (options.system_size == "medium"):
# 4 CUs, 4 CPUs, 1 SQCs, 1 Scalars
options.wf_size = 16
options.wavefronts_per_cu = 4
options.num_cpus = 4
options.cu_per_sqc = 4
options.cu_per_scalar_cache = 4
options.num_compute_units = 4
elif (options.system_size == "large"):
# 8 CUs, 4 CPUs, 2 SQCs, 2 Scalars
options.wf_size = 32
options.wavefronts_per_cu = 4
options.num_cpus = 4
options.cu_per_sqc = 4
options.cu_per_scalar_cache = 4
options.num_compute_units = 8
# Create a top-level voltage domain and clock domain
system.voltage_domain = VoltageDomain(voltage=options.sys_voltage)
#
# Set address range - 2 options
# level 0: small
# level 1: large
# Each location corresponds to a 4-byte piece of data
#
options.mem_size = '1024MB'
if (options.address_range == "small"):
num_atomic_locs = 10
num_regular_locs_per_atomic_loc = 10000
elif (options.address_range == "large"):
num_atomic_locs = 100
num_regular_locs_per_atomic_loc = 100000
system.clk_domain = SrcClockDomain(clock=options.sys_clock,
voltage_domain=system.voltage_domain)
#
# Set episode length (# of actions per episode) - 3 options
# 0: 10 actions
# 1: 100 actions
# 2: 500 actions
#
if (options.episode_length == "short"):
eps_length = 10
elif (options.episode_length == "medium"):
eps_length = 100
elif (options.episode_length == "long"):
eps_length = 500
#
# Set Ruby and tester deadlock thresholds. Ruby's deadlock detection is the
# primary check for deadlocks. The tester's deadlock threshold detection is
# a secondary check for deadlock. If there is a bug in RubyPort that causes
# a packet not to return to the tester properly, the tester will issue a
# deadlock panic. We set cache_deadlock_threshold < tester_deadlock_threshold
# to detect deadlock caused by Ruby protocol first before one caused by the
# coalescer. Both units are in Ticks
#
options.cache_deadlock_threshold = 1e8
tester_deadlock_threshold = 1e9
# For now we're testing only GPU protocol, so we force num_cpus to be 0
options.num_cpus = 0
# Number of CUs
n_CUs = options.num_compute_units
# Set test length, i.e., number of episodes per wavefront * #WFs.
# Test length can be 1x#WFs, 10x#WFs, 100x#WFs, ...
n_WFs = n_CUs * options.wavefronts_per_cu
max_episodes = options.test_length * n_WFs
# Number of SQC and Scalar caches
assert(n_CUs % options.cu_per_sqc == 0)
n_SQCs = n_CUs // options.cu_per_sqc
options.num_sqc = n_SQCs
assert(options.cu_per_scalar_cache != 0)
n_Scalars = n_CUs // options.cu_per_scalar_cache
options.num_scalar_cache = n_Scalars
#
# Create GPU Ruby random tester
#
tester = ProtocolTester(cus_per_sqc = options.cu_per_sqc,
cus_per_scalar = options.cu_per_scalar_cache,
wavefronts_per_cu = options.wavefronts_per_cu,
workitems_per_wavefront = options.wf_size,
num_atomic_locations = num_atomic_locs,
num_normal_locs_per_atomic = \
num_regular_locs_per_atomic_loc,
max_num_episodes = max_episodes,
episode_length = eps_length,
debug_tester = options.debug_tester,
random_seed = options.random_seed,
log_file = options.log_file)
#
# Create a gem5 system. Note that the memory object isn't actually used by the
# tester, but is included to ensure the gem5 memory size == Ruby memory size
# checks. The system doesn't have real CPUs or CUs. It just has a tester that
# has physical ports to be connected to Ruby
#
system = System(cpu = tester,
mem_ranges = [AddrRange(options.mem_size)],
cache_line_size = options.cacheline_size,
mem_mode = 'timing')
system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
system.clk_domain = SrcClockDomain(clock = options.sys_clock,
voltage_domain = system.voltage_domain)
#
# Command processor is not needed for the tester since we don't run real
# kernels. Setting it to zero disables the VIPER protocol from creating
# a command processor and its caches.
#
options.num_cp = 0
#
# Create the Ruby system
#
Ruby.create_system(options, False, system)
# Create a separate clock domain for Ruby
system.ruby.clk_domain = SrcClockDomain(clock=options.ruby_clock,
voltage_domain=system.voltage_domain)
tester.num_cpus = len(system.ruby._cpu_ports)
#
# The tester is most effective when randomization is turned on and
# artificial delay is randomly inserted on messages
#
system.ruby.randomization = True
for ruby_port in system.ruby._cpu_ports:
# Assert that we got the right number of Ruby ports
assert(len(system.ruby._cpu_ports) == n_CUs + n_SQCs + n_Scalars)
#
# Tie the ruby tester ports to the ruby cpu read and write ports
#
if ruby_port.support_data_reqs and ruby_port.support_inst_reqs:
tester.cpuInstDataPort = ruby_port.slave
elif ruby_port.support_data_reqs:
tester.cpuDataPort = ruby_port.slave
elif ruby_port.support_inst_reqs:
tester.cpuInstPort = ruby_port.slave
# Do not automatically retry stalled Ruby requests
#
# Attach Ruby ports to the tester in the order:
# cpu_sequencers,
# vector_coalescers,
# sqc_sequencers,
# scalar_sequencers
#
# Note that this requires the protocol to create sequencers in this order
#
print("Attaching ruby ports to the tester")
for i, ruby_port in enumerate(system.ruby._cpu_ports):
ruby_port.no_retry_on_stall = True
#
# Tell each sequencer this is the ruby tester so that it
# copies the subblock back to the checker
#
ruby_port.using_ruby_tester = True
# -----------------------
# run simulation
# -----------------------
if i < n_CUs:
tester.cu_vector_ports = ruby_port.in_ports
tester.cu_token_ports = ruby_port.gmTokenPort
tester.max_cu_tokens = 4*n_WFs
elif i < (n_CUs + n_SQCs):
tester.cu_sqc_ports = ruby_port.in_ports
else:
tester.cu_scalar_ports = ruby_port.in_ports
root = Root( full_system = False, system = system )
root.system.mem_mode = 'timing'
i += 1
#
# No CPU threads are needed for GPU tester
#
tester.cpu_threads = []
#
# Create GPU wavefronts
#
thread_clock = SrcClockDomain(clock = '1GHz',
voltage_domain = system.voltage_domain)
wavefronts = []
g_thread_idx = 0
print("Creating %i WFs attached to %i CUs" % \
(n_CUs * tester.wavefronts_per_cu, n_CUs))
for cu_idx in range(n_CUs):
for wf_idx in range(tester.wavefronts_per_cu):
wavefronts.append(GpuWavefront(thread_id = g_thread_idx,
cu_id = cu_idx,
num_lanes = options.wf_size,
clk_domain = thread_clock,
deadlock_threshold = \
tester_deadlock_threshold))
g_thread_idx += 1
tester.wavefronts = wavefronts
#
# Run simulation
#
root = Root(full_system = False, system = system)
# Not much point in this being higher than the L1 latency
m5.ticks.setGlobalFrequency('1ns')
# instantiate configuration
# Instantiate configuration
m5.instantiate()
# simulate until program terminates
exit_event = m5.simulate(options.abs_max_tick)
# Simulate until tester completes
exit_event = m5.simulate()
print('Exiting @ tick', m5.curTick(), 'because', exit_event.getCause())
print('Exiting tick: ', m5.curTick())
print('Exiting because ', exit_event.getCause())

View File

@@ -0,0 +1,39 @@
# Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from m5.params import *
from m5.proxy import *
from m5.objects.GpuThread import GpuThread
class CpuThread(GpuThread):
    # Tester thread modeling a scalar CPU thread (a single lane).
    # Per the tester README, the C++ side is not fully implemented yet.
    type = 'CpuThread'
    cxx_header = "cpu/testers/gpu_ruby_test/cpu_thread.hh"

View File

@@ -0,0 +1,42 @@
# Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from m5.objects.ClockedObject import ClockedObject
from m5.params import *
from m5.proxy import *
class GpuThread(ClockedObject):
    """Abstract base class for tester threads.

    Concrete subclasses are CpuThread (scalar, one lane) and GpuWavefront
    (multiple lanes). Each thread executes a series of data-race-free
    episodes generated by the ProtocolTester.
    """
    type = 'GpuThread'
    abstract = True
    cxx_header = "cpu/testers/gpu_ruby_test/gpu_thread.hh"

    # Globally unique ID identifying this thread within the tester.
    thread_id = Param.Int("Unique GpuThread ID")
    # Number of SIMD lanes executing this thread's action stream.
    num_lanes = Param.Int("Number of lanes this thread has")
    # Cycles without forward progress before the thread panics; a secondary
    # deadlock check behind Ruby's own detection.
    deadlock_threshold = Param.Cycles(1000000000, "Deadlock threshold")

View File

@@ -0,0 +1,40 @@
# Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from m5.params import *
from m5.proxy import *
from m5.objects.GpuThread import GpuThread
class GpuWavefront(GpuThread):
    # Tester thread modeling a GPU wavefront; all lanes execute the same
    # sequence of actions, possibly targeting different addresses.
    type = 'GpuWavefront'
    cxx_header = "cpu/testers/gpu_ruby_test/gpu_wavefront.hh"

    # ID of the compute unit this wavefront belongs to.
    cu_id = Param.Int("Compute Unit ID")

View File

@@ -0,0 +1,64 @@
# Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from m5.objects.ClockedObject import ClockedObject
from m5.params import *
from m5.proxy import *
class ProtocolTester(ClockedObject):
    """Top-level GPU protocol tester.

    Orchestrates CPU threads and GPU wavefronts that issue data-race-free
    episodes (atomics, loads, stores, acquires, releases) against the Ruby
    memory system to stress release-consistency protocols such as GPU_VIPER.
    """
    type = 'ProtocolTester'
    cxx_header = "cpu/testers/gpu_ruby_test/protocol_tester.hh"

    # Request ports connected to Ruby sequencers. The config script attaches
    # them in the order: cpu, vector (CU), sqc, scalar.
    cpu_ports = VectorRequestPort("Ports for CPUs")
    cu_vector_ports = VectorRequestPort("Vector ports for GPUs")
    cu_sqc_ports = VectorRequestPort("SQC ports for GPUs")
    cu_scalar_ports = VectorRequestPort("Scalar ports for GPUs")

    # System topology knobs: cache sharing ratios and thread counts.
    cus_per_sqc = Param.Int(4, "Number of CUs per SQC")
    cus_per_scalar = Param.Int(4, "Number of CUs per scalar cache")
    wavefronts_per_cu = Param.Int(1, "Number of wavefronts per CU")
    workitems_per_wavefront = Param.Int(64, "Number of workitems per wf")

    # The threads/wavefronts driven by this tester.
    cpu_threads = VectorParam.CpuThread("All cpus")
    wavefronts = VectorParam.GpuWavefront("All wavefronts")

    # Working-set shape: atomic locations plus normal (non-atomic) locations
    # grouped per atomic location. Fewer locations encourage more races.
    num_atomic_locations = Param.Int(2, "Number of atomic locations")
    num_normal_locs_per_atomic = Param.Int(1000, \
        "Number of normal locations per atomic")

    # Episode structure and overall test length.
    episode_length = Param.Int(10, "Number of actions per episode")
    max_num_episodes = Param.Int(20, "Maximum number of episodes")

    # Enables the data-race-free (DRF) checker.
    debug_tester = Param.Bool(False, "Are we debugging the tester?")
    random_seed = Param.Int(0, "Random seed number. Default value (0) means \
        using runtime-specific value.")
    log_file = Param.String("Log file's name")
    system = Param.System(Parent.any, "System we belong to")

View File

@@ -0,0 +1,129 @@
/*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
This directory contains a tester for gem5 GPU protocols. Unlike the Ruby random
tester, this tester does not rely on sequential consistency. Instead, it
assumes tested protocols support release consistency.
----- Getting Started -----
To start using the tester quickly, you can use the following example command
line to get running immediately:
build/GCN3_X86/gem5.opt configs/example/ruby_gpu_random_test.py \
--test-length=1000 --system-size=medium --cache-size=small
An overview of the main command line options is as follows. For all options
use `build/GCN3_X86/gem5.opt configs/example/ruby_gpu_random_test.py --help`
or see the configuration file.
* --cache-size (small, large): Use smaller sizes for testing evict, etc.
* --system-size (small, medium, large): Effectively the number of threads in
the GPU model. Large size will have more contention. Larger
sizes are useful for checking contention.
* --episode-length (short, medium, long): Number of loads and stores in an
episode. Episodes will also have atomics mixed in. See below
for a definition of episode.
* --test-length (int): Number of episodes to execute. This will determine the
amount of time the tester runs for. Longer time will stress
the protocol harder.
The remainder of this file describes the theory behind the tester design and
a link to a more detailed research paper is provided at the end.
----- Theory Overview -----
The GPU Ruby tester creates a system consisting of both CPU threads and GPU
wavefronts. CPU threads are scalar, so there is one lane per CPU thread. GPU
wavefront may have multiple lanes. The number of lanes is initialized when
a thread/wavefront is created.
Each thread/wavefront executes a number of episodes. Each episode is a series
of memory actions (i.e., atomic, load, store, acquire and release). In a
wavefront, all lanes execute the same sequence of actions, but they may target
different addresses. One can think of an episode as a critical section which
is bounded by a lock acquire in the beginning and a lock release at the end. An
episode consists of actions in the following order:
1 - Atomic action
2 - Acquire action
3 - A number of load and store actions
4 - Release action
5 - Atomic action that targets the same address as (1) does
There are two separate set of addresses: atomic and non-atomic. Atomic actions
target only atomic addresses. Load and store actions target only non-atomic
addresses. Memory addresses are all 4-byte aligned in the tester.
To test false sharing cases in which both atomic and non-atomic addresses are
placed in the same cache line, we abstract out the concept of memory addresses
from the tester's perspective by introducing the concept of location. Locations
are numbered from 0 to N-1 (if there are N addresses). The first X locations
[0..X-1] are atomic locations, and the rest are non-atomic locations.
The 1-1 mapping between locations and addresses is randomly created when the
tester is initialized.
Per load and store action, its target location is selected so that there is no
data race in the generated stream of memory requests at any time during the
test. Since in Data-Race-Free model, the memory system's behavior is undefined
in data race cases, we exclude data race scenarios from our protocol test.
Once the location of a load/store action is determined, each thread/wavefront either
loads current value at the location or stores an incremental value to that
location. The tester maintains a table tracking all last writers and their
written values, so we know what value should be returned from a load and what
value should be written next at a particular location. Value returned from a
load must match with the value written by the last writer.
----- Directory Structure -----
ProtocolTester.hh/cc -- This is the main tester class that orchestrates the
entire test.
AddressManager.hh/cc -- This manages address space, randomly maps address to
location, generates locations for all episodes,
maintains per-location last writer and validates
values returned from load actions.
GpuThread.hh/cc -- This is abstract class for CPU threads and GPU
wavefronts. It generates and executes a series of
episodes.
CpuThread.hh/cc -- Thread class for CPU threads. Not fully implemented yet
GpuWavefront.hh/cc -- GpuThread class for GPU wavefronts.
Episode.hh/cc -- Class to encapsulate an episode, notably including
episode load/store structure and ordering.
For more detail, please see the following paper:
T. Ta, X. Zhang, A. Gutierrez and B. M. Beckmann, "Autonomous Data-Race-Free
GPU Testing," 2019 IEEE International Symposium on Workload Characterization
(IISWC), Orlando, FL, USA, 2019, pp. 81-92, doi:
10.1109/IISWC47752.2019.9042019.

View File

@@ -0,0 +1,54 @@
#
# Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
Import('*')

# The GPU Ruby tester requires a GPU build with a Ruby protocol; skip the
# whole directory otherwise.
if not env['BUILD_GPU']:
    Return()

if env['PROTOCOL'] == 'None':
    Return()

# SimObject wrappers for the tester and its thread types.
SimObject('ProtocolTester.py')
SimObject('GpuThread.py')
SimObject('CpuThread.py')
SimObject('GpuWavefront.py')

# C++ implementation of the tester.
Source('address_manager.cc')
Source('episode.cc')
Source('protocol_tester.cc')
Source('gpu_thread.cc')
Source('cpu_thread.cc')
Source('gpu_wavefront.cc')

DebugFlag('ProtocolTest')

View File

@@ -0,0 +1,431 @@
/*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "cpu/testers/gpu_ruby_test/address_manager.hh"

#include <algorithm>
#include <random>

#include "base/intmath.hh"
#include "base/logging.hh"
#include "base/random.hh"
#include "base/trace.hh"
// Sentinel values: "no value recorded" / "no such location".
const int AddressManager::INVALID_VALUE = -1;
const int AddressManager::INVALID_LOCATION = -1;
/**
 * Build the tester's address space: numAtomicLocs atomic locations followed
 * by numAtomicLocs * n_normal_locs_per_atomic normal locations, each mapped
 * to a unique, randomly shuffled, sizeof(Value)-aligned address. One
 * AtomicStruct per atomic location manages its normal-location range, and
 * one LastWriter log entry is created per location.
 */
AddressManager::AddressManager(int n_atomic_locs, int n_normal_locs_per_atomic)
      : numAtomicLocs(n_atomic_locs),
        numLocsPerAtomic(n_normal_locs_per_atomic)
{
    assert(numAtomicLocs > 0 && numLocsPerAtomic > 0);
    numNormalLocs = numAtomicLocs * numLocsPerAtomic;

    // generate random address map
    randAddressMap.resize(numAtomicLocs + numNormalLocs);
    for (Location i = 0; i < numAtomicLocs + numNormalLocs; ++i) {
        // all addresses are sizeof(Value) (i.e., 4-byte) aligned
        randAddressMap[i] = (Addr)((i + 128) << floorLog2(sizeof(Value)));
    }

    // randomly shuffle randAddressMap. std::random_shuffle was deprecated
    // in C++14 and removed in C++17, so use std::shuffle with an explicit
    // generator seeded from the tester's random() stream to stay
    // reproducible for a given seed.
    std::shuffle(randAddressMap.begin(), randAddressMap.end(),
                 std::mt19937(random()));

    // initialize atomic locations
    // first and last normal location per atomic location
    Location first, last;
    for (Location atomic_loc = 0; atomic_loc < numAtomicLocs; ++atomic_loc) {
        first = numAtomicLocs + numLocsPerAtomic * atomic_loc;
        last = first + numLocsPerAtomic - 1;
        atomicStructs.push_back(new AtomicStruct(atomic_loc, first, last));
    }

    // initialize log table
    for (Location loc = 0; loc < numAtomicLocs + numNormalLocs; ++loc) {
        logTable.push_back(new LastWriter());
    }
}
AddressManager::~AddressManager()
{
    // the constructor heap-allocated every AtomicStruct and LastWriter
    for (auto atomic_ptr : atomicStructs) {
        delete atomic_ptr;
    }
    for (auto writer_ptr : logTable) {
        delete writer_ptr;
    }
}
// Translate a tester-internal location index into its real address.
Addr
AddressManager::getAddress(Location loc)
{
    // loc must index into the shuffled address map
    assert(loc >= 0);
    assert(loc < numAtomicLocs + numNormalLocs);
    return randAddressMap[loc];
}
// Uniformly pick an atomic location and open its location-selection phase.
AddressManager::Location
AddressManager::getAtomicLoc()
{
    Location picked = random() % numAtomicLocs;
    atomicStructs[picked]->startLocSelection();
    return picked;
}
// Pick a DRF-safe normal location for a LD, delegating to the
// AtomicStruct that owns this atomic location.
AddressManager::Location
AddressManager::getLoadLoc(Location atomic_loc)
{
    assert(atomic_loc >= 0);
    assert(atomic_loc < numAtomicLocs);
    return atomicStructs[atomic_loc]->getLoadLoc();
}
// Pick a DRF-safe normal location for a ST, delegating to the
// AtomicStruct that owns this atomic location.
AddressManager::Location
AddressManager::getStoreLoc(Location atomic_loc)
{
    assert(atomic_loc >= 0);
    assert(atomic_loc < numAtomicLocs);
    return atomicStructs[atomic_loc]->getStoreLoc();
}
// Close the location-selection phase opened by getAtomicLoc().
void
AddressManager::finishLocSelection(Location atomic_loc)
{
    assert(atomic_loc >= 0);
    assert(atomic_loc < numAtomicLocs);
    atomicStructs[atomic_loc]->endLocSelection();
}
// Return a previously selected normal location to its owning AtomicStruct
// once the episode using it has completed.
void
AddressManager::releaseLocation(Location atomic_loc, Location loc)
{
    assert(atomic_loc >= 0);
    assert(atomic_loc < numAtomicLocs);
    atomicStructs[atomic_loc]->releaseLoc(loc);
}
// Pretty-print the last-writer record for a location (for error reports).
std::string
AddressManager::printLastWriter(Location loc) const
{
    const LastWriter* writer = logTable[loc];
    return writer->print();
}
// ------------------- AtomicStruct --------------------------
// Set up the per-atomic-location structure managing the normal-location
// range [loc_begin, loc_end]. Partitions (1) and (3) start empty, so
// firstMark = 0 and secondMark = arraySize.
AddressManager::AtomicStruct::AtomicStruct(Location atomic_loc,
                                           Location loc_begin,
                                           Location loc_end)
    : atomicLoc(atomic_loc),
      locationBase(loc_begin),
      locArray(new Location[loc_end - loc_begin + 1]),
      firstMark(0),
      secondMark(loc_end - loc_begin + 1),
      requestCount(0)
{
    // the location range must have at least 1 location
    assert(loc_begin <= loc_end);
    arraySize = loc_end - loc_begin + 1;

    // fill locArray with the identity mapping and give every location its
    // initial property: <index in locArray, zero owners>
    for (int offset = 0; offset < arraySize; ++offset) {
        locArray[offset] = locationBase + offset;
        locProps.push_back(LocProperty(offset, 0));
    }
}
AddressManager::AtomicStruct::~AtomicStruct()
{
    // locArray is the only heap allocation owned by this structure
    delete[] locArray;
}
// Begin an episode's location-selection phase at this atomic location.
void
AddressManager::AtomicStruct::startLocSelection()
{
    // the three partitions must always be valid and ordered
    assert(firstMark >= 0);
    assert(firstMark <= secondMark);
    assert(secondMark <= arraySize);
    // the previous episode must have cleared its LD/ST selection map
    assert(loadStoreMap.empty());

    // an episode issues one Atomic_ACQ and one Atomic_REL at this location,
    // so two more return values become expected (the next two counts)
    expectedValues.insert(requestCount);
    expectedValues.insert(requestCount + 1);
    requestCount += 2;
}
// Pick a random location eligible for a LD, i.e., anything in partitions
// (2) or (3), and record the selection in loadStoreMap.
AddressManager::Location
AddressManager::AtomicStruct::getLoadLoc()
{
    assert(firstMark >= 0);
    assert(firstMark <= secondMark);
    assert(secondMark <= arraySize);

    if (firstMark == arraySize) {
        // partitions (2) and (3) are both empty: no LD candidate exists
        return INVALID_LOCATION;
    }

    // any entry of locArray[firstMark .. arraySize-1] is eligible
    int num_eligible = arraySize - firstMark;
    Location picked = locArray[firstMark + random() % num_eligible];

    // mark this location as LD-selected; operator[] value-initializes a
    // fresh entry to <false,false> before we set the LD bit, which matches
    // inserting LdStBits(true, false)
    loadStoreMap[picked].first = true;

    return picked;
}
// Pick a random location eligible for a ST, i.e., anything in partition
// (2) only, and record the selection in loadStoreMap.
AddressManager::Location
AddressManager::AtomicStruct::getStoreLoc()
{
    assert(firstMark >= 0);
    assert(firstMark <= secondMark);
    assert(secondMark <= arraySize);

    if (firstMark == secondMark) {
        // partition (2) is empty: no ST candidate exists
        return INVALID_LOCATION;
    }

    // any entry of locArray[firstMark .. secondMark-1] is eligible
    int num_eligible = secondMark - firstMark;
    Location picked = locArray[firstMark + random() % num_eligible];

    // mark this location as ST-selected; operator[] value-initializes a
    // fresh entry to <false,false> before we set the ST bit, which matches
    // inserting LdStBits(false, true)
    loadStoreMap[picked].second = true;

    return picked;
}
// for each entry in loadStoreMap,
//      if <LD_bit, ST_bit> == <1,0>
//          - if the location is in (2), then move it to (3)
//          - if the location is in (3), no move
//          - otherwise, throw an error
//      if <LD_bit, ST_bit> == <0,1> or <1,1>
//          - move it from (2) to (1)
void
AddressManager::AtomicStruct::endLocSelection()
{
    assert(firstMark >= 0);
    assert(firstMark <= secondMark);
    assert(secondMark <= arraySize);

    for (auto& it : loadStoreMap) {
        Location loc = it.first;
        LdStBits p = it.second;

        assert(loc >= locationBase && loc < locationBase + arraySize);
        LocProperty& loc_prop = locProps[loc - locationBase];

        if (p.first && !p.second) {
            // this location has been picked for LD(s) but not ST
            // it must be in either region (2) or (3)
            assert(inSecondRegion(loc_prop.first) ||
                   inThirdRegion(loc_prop.first));

            if (inSecondRegion(loc_prop.first)) {
                // there is no owner of this location yet
                assert(loc_prop.second == 0);

                // swap with the last location of (2) so that shrinking (2)
                // by one (secondMark--) grows (3) to include this loc
                // pick the last location in (2) to swap
                Location swapped_loc = locArray[secondMark - 1];
                LocProperty& swapped_loc_prop =
                                    locProps[swapped_loc - locationBase];

                // swap loc and swapped_loc
                swap(loc_prop, swapped_loc_prop);

                // then, expand (3)
                secondMark--;
            }
            // increment the location's number of owners
            loc_prop.second++;
        } else if (p.second) {
            // this location has been picked for ST(s) and/or LD(s)
            // it must be in region (2)
            assert(inSecondRegion(loc_prop.first) && loc_prop.second == 0);

            // swap with the first location of (2) so that growing (1)
            // by one (firstMark++) absorbs this loc
            // pick the first location in (2) to swap
            Location swapped_loc = locArray[firstMark];
            LocProperty& swapped_loc_prop =
                                locProps[swapped_loc - locationBase];

            // swap loc and swapped_loc
            swap(loc_prop, swapped_loc_prop);

            // then, expand (1)
            firstMark++;

            // increment the location's number of owners
            loc_prop.second++;
        } else {
            panic("Location in loadStoreMap but wasn't picked in any"
                  " action\n");
        }
    }

    // clear the ld_st_map so the next episode starts fresh
    loadStoreMap.clear();
}
// Release one normal location after its episode completed, moving it back
// to partition (2) once its last owner releases it:
//   - in (1): exactly one owner (ST-selected); swap it to the boundary and
//     shrink (1)
//   - in (3): >= 1 owners (LD-selected); only move back when the last owner
//     releases
//   - in (2): already released by another lane of the same wavefront
void
AddressManager::AtomicStruct::releaseLoc(Location loc)
{
    assert(loc >= locationBase && loc < locationBase + arraySize);

    LocProperty& loc_prop = locProps[loc - locationBase];

    if (inFirstRegion(loc_prop.first)) {
        // this location must have exactly 1 owner
        assert(loc_prop.second == 1);

        // pick the last location in region 1 to swap
        Location swapped_loc = locArray[firstMark - 1];
        LocProperty& swapped_loc_prop = locProps[swapped_loc - locationBase];

        // swap loc and swapped_loc
        swap(loc_prop, swapped_loc_prop);

        // then shrink (1)
        firstMark--;

        // reset the location's number of owners
        loc_prop.second = 0;
    } else if (inThirdRegion(loc_prop.first)) {
        // this location must have at least 1 owner
        assert(loc_prop.second >= 1);

        if (loc_prop.second == 1) {
            // last owner: move the location back to (2)
            // pick the first location in region 3 to swap
            Location swapped_loc = locArray[secondMark];
            LocProperty& swapped_loc_prop =
                                locProps[swapped_loc - locationBase];

            // swap loc and swapped_loc
            swap(loc_prop, swapped_loc_prop);

            // then shrink (3)
            secondMark++;
        }
        // decrement the loc's number of owners
        loc_prop.second--;
    } else {
        // someone else must have already reset this counter
        assert(inSecondRegion(loc_prop.first) && loc_prop.second == 0);
    }
}
/**
 * Check an atomic response value against the set of outstanding expected
 * values and consume it on success. Returns false (after warning with the
 * remaining expected values) if val was never expected or was already
 * consumed.
 *
 * Fix: the original range-for variable was also named 'val', shadowing the
 * function parameter; renamed so the parameter stays visible in this scope.
 */
bool
AddressManager::AtomicStruct::isExpectedValue(Value val)
{
    ExpectedValueSet::iterator it = expectedValues.find(val);
    if (it == expectedValues.end()) {
        std::stringstream exp_val_ss;
        for (auto& exp_val : expectedValues) {
            exp_val_ss << " " << exp_val;
        }
        warn("Expected return values are:\n\t%s\n", exp_val_ss.str());
        return false;
    }
    // erase this value b/c it's done (each expected value is consumed once)
    expectedValues.erase(it);
    return true;
}
// Exchange two locations' slots in locArray and keep each property's
// recorded index consistent with its new slot.
void
AddressManager::AtomicStruct::swap(LocProperty& prop_1, LocProperty& prop_2)
{
    int idx_1 = prop_1.first;
    int idx_2 = prop_2.first;

    // exchange the two entries in locArray
    Location tmp_loc = locArray[idx_1];
    locArray[idx_1] = locArray[idx_2];
    locArray[idx_2] = tmp_loc;

    // update each property's recorded index
    prop_1.first = idx_2;
    prop_2.first = idx_1;
}
// ------------------ log table ---------------------
// Record the most recent writer of a location: which thread (and CU, -1 for
// CPU threads), in which episode, wrote which value, and at what tick.
void
AddressManager::updateLogTable(Location loc, int thread_id, int episode_id,
                               Value new_value, Tick cur_tick, int cu_id)
{
    assert(loc >= 0 && loc < numAtomicLocs + numNormalLocs);
    logTable[loc]->update(thread_id, cu_id, episode_id, new_value, cur_tick);
}
// Return the value most recently logged for this location (what a
// well-synchronized load is expected to observe).
AddressManager::Value
AddressManager::getLoggedValue(Location loc) const
{
    assert(loc >= 0 && loc < numAtomicLocs + numNormalLocs);
    return logTable[loc]->getLastStoredValue();
}
// Validate an atomic response against the owning AtomicStruct's
// expected-value set; loc must be an atomic location.
bool
AddressManager::validateAtomicResp(Location loc, Value ret_val)
{
    assert(loc >= 0 && loc < numAtomicLocs);
    return atomicStructs[loc]->isExpectedValue(ret_val);
}

View File

@@ -0,0 +1,274 @@
/*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CPU_TESTERS_PROTOCOL_TESTER_ADDRESS_MANAGER_HH_
#define CPU_TESTERS_PROTOCOL_TESTER_ADDRESS_MANAGER_HH_
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "base/types.hh"
#include "sim/eventq.hh"
/*
* --- AddressManager has 3 main tasks ---
* (1) generate DRF request sequences
* (2) maintain internal log table
* (3) validate return values against ones in the log table
*
* A location is an abstract index of a unique real address.
* It's used internally within the tester only.
* randAddressMap has the mapping between a location and its real address.
*
* A value is an integer that a location in real memory can store.
* for now, we assume a value is 4-byte
*
* The location range (randAddressMap) has two distinct parts:
* Atomic locations: in the 1st part of randAddressMap &
* Non-atomic locations (or just locations): in the 2nd part
*/
/*
* --- DRF request sequence generation ---
* Each lane of an episode starts selecting its location by calling:
* (1) getAtomicLoc
* (2) getLoadLoc/getStoreLoc
* (3) finishLocSelection
*
 * Each lane of an episode completes its execution by calling:
* releaseLocation for all locations it selected
*/
/*
* --- Internal structures ---
* There are multiple atomic structures, each of which corresponds
* to an atomic location.
*
* Each atomic structure manages a distinct range of locations in locArray
* This array is partitioned into 3 parts that are used to select locations
* for LDs and STs. Here is the location selecting rule:
* | (1) | (2) | (3) |
* - all locations in (1) cannot be picked for any LD and ST action
* - all locations in (2) can be picked for either LD or ST action
* - all locations in (3) can be picked for LD action only
*
* We maintain the 3 parts by 2 indices firstMark and secondMark.
* As locations are moved between partitions, both indices are updated
* accordingly.
* [0 .. firstMark-1] part (1)
* [firstMark .. secondMark-1] part (2)
* [secondMark .. arraySize-1] part (3)
*
* Each location has its context/property. locProps maintains
* contexts/properties of all locations. Context/property includes
* - current index of a location in locArray
* - the number of owners who are currently using the location
*
* To guarantee DRF constraints, the following conditions must hold
* - all locations in (1) have exactly 1 owner
* - all locations in (2) have exactly 0 owner
* - all locations in (3) have at least 1 owner
* - A LD request can randomly pick any location in (2) & (3)
* - A ST request can randomly pick any location in (2)
*
* loadStoreMap maintains all locations already selected for LDs/STs so far
*
* When endLocSelection is called (i.e., we've picked all locations for an
* episode), we need to move each selected location to its right partition.
* if LD_bit == 1 && ST_bit == 0 (i.e., picked for LDs), then move the
* location to (3) -> future LDs can pick it.
* if LD_bit == 0 && ST_bit == 1, then move the location to (1) -> NO future
* action can pick it until this episode is done.
* if LD_bit == 1 && ST_bit == 1, then move the location to (1) -> NO future
* action can pick it until this episode is done.
* clear the loadStoreMap
*/
/*
 * Central bookkeeper for the protocol tester: maps tester-internal
 * locations to real addresses, hands out DRF-safe locations to episodes,
 * logs last writers, and validates atomic return values. See the
 * file-level comments above for the full partitioning scheme.
 */
class AddressManager
{
  public:
    AddressManager(int n_atomic_locs, int numNormalLocsPerAtomic);
    ~AddressManager();

    typedef int32_t Value;
    typedef int32_t Location;

    // return the unique address mapped to a location
    Addr getAddress(Location loc);
    // return a unique atomic location & start picking locations
    Location getAtomicLoc();
    // return a random location for LD
    Location getLoadLoc(Location atomic_loc);
    // return a random location for ST
    Location getStoreLoc(Location atomic_loc);
    // finish picking locations
    void finishLocSelection(Location atomic_loc);
    // an episode is done, release location I've picked
    void releaseLocation(Location atomic_loc, Location loc);
    // update a log table entry with a given set of values
    void updateLogTable(Location loc, int threadId, int episodeId,
                        Value new_value, Tick curTick, int cuId = -1);
    // return the current value in the log table
    Value getLoggedValue(Location loc) const;
    // validate atomic response
    bool validateAtomicResp(Location loc, Value ret_val);

    std::string printLastWriter(Location loc) const;

    // sentinel constants (both defined as -1 in address_manager.cc)
    static const int INVALID_VALUE;
    static const int INVALID_LOCATION;

  private:
    // record of the most recent store to one location: who wrote it
    // (thread/CU/episode), what value, and at what tick
    class LastWriter
    {
      public:
        LastWriter()
            : threadId(-1), cuId(-1), episodeId(-1), value(0),
              writeTick(0)
        { }

        const std::string print() const
        {
            return "(GpuThread ID " + std::to_string(threadId) +
                   ", CU ID " + std::to_string(cuId) +
                   ", Episode ID " + std::to_string(episodeId) +
                   ", Value " + std::to_string(value) +
                   ", Tick " + std::to_string(writeTick) +
                   ")";
        }

        void update(int _thread, int _cu, int _episode, Value _value,
                    Tick _tick)
        {
            threadId = _thread;
            cuId = _cu;
            episodeId = _episode;
            value = _value;
            writeTick = _tick;
        }

        Value getLastStoredValue() const { return value; }

      private:
        int threadId;
        int cuId;
        int episodeId;
        Value value;
        Tick writeTick;
    };

    // per-atomic-location structure that manages its range of normal
    // locations via the 3-partition scheme described in the file comments
    class AtomicStruct
    {
      public:
        AtomicStruct(Location atom_loc, Location loc_begin, Location loc_end);
        ~AtomicStruct();

        // functions picking locations for LD/ST/ATOMIC ops
        void startLocSelection();
        Location getLoadLoc();
        Location getStoreLoc();
        void endLocSelection();

        // an episode completed its actions
        // return locations to their correct positions
        void releaseLoc(Location loc);
        // is the value what we expect?
        bool isExpectedValue(Value val);

      private:
        Location atomicLoc;
        Location locationBase;

        // array storing all locations this structure is managing
        Location* locArray;
        // partition boundaries: [0..firstMark-1] = (1),
        // [firstMark..secondMark-1] = (2), [secondMark..arraySize-1] = (3)
        int firstMark, secondMark;
        int arraySize;

        // a vector of location's properties:
        // <current index in locArray, number of owners>
        typedef std::pair<int, int> LocProperty;
        typedef std::vector<LocProperty> LocPropTable;
        LocPropTable locProps;

        // a temporary map of location and its LD/ST selection
        // pair is <picked-for-LD bit, picked-for-ST bit>
        typedef std::pair<bool, bool> LdStBits;
        typedef std::unordered_map<Location, LdStBits> LdStMap;
        LdStMap loadStoreMap;

        // number of atomic requests at this location so far
        int requestCount;
        // a set of expected values
        // when we request the first n atomic ops, we expect to receive n
        // return values from [0 .. n-1]
        typedef std::unordered_set<Value> ExpectedValueSet;
        ExpectedValueSet expectedValues;

        // swap two locations in locArray
        void swap(LocProperty& prop_1, LocProperty& prop_2);

        bool inFirstRegion(int idx) const
        {
            return (idx >= 0 && idx < firstMark);
        }
        bool inSecondRegion(int idx) const
        {
            return (idx >= firstMark && idx < secondMark);
        }
        bool inThirdRegion(int idx) const
        {
            return (idx >= secondMark && idx < arraySize);
        }
    };

    // number of atomic locations
    int numAtomicLocs;
    // number of normal/non-atomic locations per atomic structure
    int numLocsPerAtomic;
    // total number of non-atomic locations
    int numNormalLocs;

    // location - address mapping
    typedef std::vector<Addr> AddressMap;
    AddressMap randAddressMap;

    // a list of atomic structures
    typedef std::vector<AtomicStruct*> AtomicStructTable;
    AtomicStructTable atomicStructs;

    // internal log table
    typedef std::vector<LastWriter*> LogTable;
    LogTable logTable;
};
#endif /* CPU_TESTERS_PROTOCOL_TESTER_ADDRESS_MANAGER_HH_ */

View File

@@ -0,0 +1,123 @@
/*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "cpu/testers/gpu_ruby_test/cpu_thread.hh"
#include "debug/ProtocolTest.hh"
// A CpuThread models a scalar CPU-side tester thread on top of the generic
// GpuThread machinery.
CpuThread::CpuThread(const Params &p)
    :GpuThread(p)
{
    threadName = "CpuThread(Thread ID " + std::to_string(threadId) + ")";
    threadEvent.setDesc("CpuThread tick");
    // CPU threads are scalar: exactly one lane per thread
    assert(numLanes == 1);
}
// Params factory hook: constructs the SimObject from its generated params.
CpuThread*
CpuThreadParams::create() const
{
    return new CpuThread(*this);
}
// Issue the current LOAD action. CPU-side issue is a stub for now; the
// asserts capture the invariants a future implementation must uphold.
void
CpuThread::issueLoadOps()
{
    assert(curAction);
    assert(curAction->getType() == Episode::Action::Type::LOAD);
    // we should not have any outstanding fence or atomic op at this point
    assert(pendingFenceCount == 0);
    assert(pendingAtomicCount == 0);

    fatal("CpuThread::issueLoadOps - not yet implemented");
}
// Issue the current STORE action. CPU-side issue is a stub for now; the
// asserts capture the invariants a future implementation must uphold.
void
CpuThread::issueStoreOps()
{
    assert(curAction);
    assert(curAction->getType() == Episode::Action::Type::STORE);
    // we should not have any outstanding fence or atomic op at this point
    assert(pendingFenceCount == 0);
    assert(pendingAtomicCount == 0);

    fatal("CpuThread::issueStoreOps - not yet implemented");
}
// Issue the current ATOMIC action. CPU-side issue is a stub for now; the
// asserts capture the invariants a future implementation must uphold.
void
CpuThread::issueAtomicOps()
{
    assert(curAction);
    assert(curAction->getType() == Episode::Action::Type::ATOMIC);
    // we should not have any outstanding ops at this point
    assert(pendingFenceCount == 0);
    assert(pendingLdStCount == 0);
    assert(pendingAtomicCount == 0);

    fatal("CpuThread::issueAtomicOps - not yet implemented");
}
// Acquire is a no-op for CPU threads; only the episode-state invariants
// are checked.
void
CpuThread::issueAcquireOp()
{
    DPRINTF(ProtocolTest, "Issuing Acquire Op ...\n");

    assert(curAction);
    assert(curAction->getType() == Episode::Action::Type::ACQUIRE);
    // we should not have any outstanding ops at this point
    assert(pendingFenceCount == 0);
    assert(pendingLdStCount == 0);
    assert(pendingAtomicCount == 0);

    // no-op: Acquire does not apply to CPU threads
}
// Release is a no-op for CPU threads; only the episode-state invariants
// are checked.
void
CpuThread::issueReleaseOp()
{
    DPRINTF(ProtocolTest, "Issuing Release Op ...\n");

    assert(curAction);
    assert(curAction->getType() == Episode::Action::Type::RELEASE);
    // we should not have any outstanding ops at this point
    assert(pendingFenceCount == 0);
    assert(pendingLdStCount == 0);
    assert(pendingAtomicCount == 0);

    // no-op: Release does not apply to CPU threads
}
// Memory-response handler; not implemented for CPU threads yet.
void
CpuThread::hitCallback(PacketPtr pkt)
{
    fatal("CpuThread::hitCallback - not yet implemented");
}

View File

@@ -0,0 +1,61 @@
/*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CPU_TESTERS_PROTOCOL_TESTER_CPU_THREAD_HH_
#define CPU_TESTERS_PROTOCOL_TESTER_CPU_THREAD_HH_
#include "cpu/testers/gpu_ruby_test/gpu_thread.hh"
#include "params/CpuThread.hh"
#include "sim/clocked_object.hh"
// A CpuThread is a scalar tester thread driving the protocol tester from
// the CPU side. LD/ST/Atomic issue and hitCallback are stubs (they fatal
// in cpu_thread.cc); Acquire/Release are no-ops for CPU threads.
class CpuThread : public GpuThread
{
  public:
    typedef CpuThreadParams Params;
    CpuThread(const Params &p);
    virtual ~CpuThread() = default;

    typedef AddressManager::Location Location;
    typedef AddressManager::Value Value;

    // invoked when a memory response returns (not yet implemented)
    void hitCallback(PacketPtr pkt);

  protected:
    // issue hooks driven by the GpuThread episode state machine
    // NOTE(review): these presumably override virtuals declared in
    // GpuThread -- confirm against gpu_thread.hh before adding 'override'
    void issueLoadOps();
    void issueStoreOps();
    void issueAtomicOps();
    void issueAcquireOp();
    void issueReleaseOp();
};
#endif /* CPU_TESTERS_PROTOCOL_TESTER_CPU_THREAD_HH_ */

View File

@@ -0,0 +1,321 @@
/*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "cpu/testers/gpu_ruby_test/episode.hh"
#include <fstream>
#include <unordered_set>
#include "cpu/testers/gpu_ruby_test/gpu_thread.hh"
#include "cpu/testers/gpu_ruby_test/protocol_tester.hh"
// An Episode is one data-race-free unit of work for a thread: an action
// sequence Atomic-Acquire-{LD/ST...}-Release-Atomic, with per-lane
// locations chosen so no data race is possible.
Episode::Episode(ProtocolTester* _tester, GpuThread* _thread, int num_loads,
                 int num_stores)
    : tester(_tester),
      thread(_thread),
      numLoads(num_loads),
      numStores(num_stores),
      nextActionIdx(0)
{
    assert(tester && thread);

    // globally unique episode id, handed out by the tester
    episodeId = tester->getNextEpisodeID();
    numLanes = thread->getNumLanes();
    assert(numLanes > 0);

    addrManager = tester->getAddressManager();
    assert(addrManager);

    // no lane has picked its atomic location yet
    atomicLocs.resize(numLanes, AddressManager::INVALID_LOCATION);

    // generate a sequence of actions
    initActions();
    isActive = true;

    // NOTE(review): DPRINTFN prints regardless of any debug flag; consider
    // DPRINTF(ProtocolTest, ...) if this trace line should be gated
    DPRINTFN("Episode %d\n", episodeId);
}
Episode::~Episode()
{
    // every Action was heap-allocated by initActions()
    for (auto action_ptr : actions) {
        assert(action_ptr);
        delete action_ptr;
    }
}
// Return the next pending action without consuming it, or nullptr once
// the episode has run out of actions.
const Episode::Action*
Episode::peekCurAction() const
{
    return (nextActionIdx < actions.size()) ? actions[nextActionIdx]
                                            : nullptr;
}
// Consume the current action; only legal while actions remain.
void
Episode::popAction()
{
    assert(nextActionIdx < actions.size());
    nextActionIdx++;
}
// Build the episode's action sequence
// (Atomic, Acquire, random LD/ST mix, Release, Atomic) and then, per lane,
// pick an atomic location and a DRF-checked normal location for every
// LD/ST action.
void
Episode::initActions()
{
    // first, push Atomic & then Acquire action
    actions.push_back(new Action(Action::Type::ATOMIC, numLanes));
    actions.push_back(new Action(Action::Type::ACQUIRE, numLanes));

    // second, push a number of LD/ST actions in a random interleaving
    int num_loads = numLoads;
    int num_stores = numStores;
    while ((num_loads + num_stores) > 0) {
        switch (random() % 2) {
            case 0: // Load
                if (num_loads > 0) {
                    actions.push_back(new Action(Action::Type::LOAD,
                                                 numLanes));
                    num_loads--;
                }
                break;
            case 1: // Store
                if (num_stores > 0) {
                    actions.push_back(new Action(Action::Type::STORE,
                                                 numLanes));
                    num_stores--;
                }
                break;
            default:
                assert(false);
        }
    }

    // last, push a Release & then an Atomic action
    actions.push_back(new Action(Action::Type::RELEASE, numLanes));
    actions.push_back(new Action(Action::Type::ATOMIC, numLanes));

    // for each lane, pick a list of locations
    Location normal_loc;

    for (int lane = 0; lane < numLanes; ++lane) {
        normal_loc = AddressManager::INVALID_LOCATION;

        // first, we select atomic loc for this lane
        // atomic loc for this lane should not have been picked yet
        assert(atomicLocs[lane] == AddressManager::INVALID_LOCATION);
        // pick randomly an atomic location
        atomicLocs[lane] = addrManager->getAtomicLoc();
        assert(atomicLocs[lane] >= 0);

        // go through each action in this lane and set its location
        for (Action* action : actions) {
            assert(action);

            switch (action->getType()) {
                case Action::Type::ATOMIC:
                    // both atomic actions target the lane's atomic location
                    action->setLocation(lane, atomicLocs[lane]);
                    break;
                case Action::Type::LOAD:
                    // pick randomly a normal location
                    // (may be INVALID if nothing is LD-eligible right now)
                    normal_loc = addrManager->
                                            getLoadLoc(atomicLocs[lane]);
                    assert(normal_loc >= AddressManager::INVALID_LOCATION);

                    if (normal_loc != AddressManager::INVALID_LOCATION) {
                        // check DRF against other threads and prior lanes
                        if (!tester->checkDRF(atomicLocs[lane],
                                              normal_loc, false) ||
                            !this->checkDRF(atomicLocs[lane], normal_loc,
                                            false, lane)) {
                            panic("GpuTh %d - Data race detected. STOPPED!\n",
                                  thread->getGpuThreadId());
                        }
                    }

                    action->setLocation(lane, normal_loc);
                    break;
                case Action::Type::STORE:
                    // pick randomly a normal location
                    // (may be INVALID if nothing is ST-eligible right now)
                    normal_loc = addrManager->
                                            getStoreLoc(atomicLocs[lane]);
                    assert(normal_loc >= AddressManager::INVALID_LOCATION);

                    if (normal_loc != AddressManager::INVALID_LOCATION) {
                        // check DRF against other threads and prior lanes
                        if (!tester->checkDRF(atomicLocs[lane],
                                              normal_loc, true) ||
                            !this->checkDRF(atomicLocs[lane], normal_loc,
                                            true, lane)) {
                            panic("GpuTh %d - Data race detected. STOPPED!\n",
                                  thread->getGpuThreadId());
                        }
                    }

                    action->setLocation(lane, normal_loc);
                    break;
                case Action::Type::ACQUIRE:
                case Action::Type::RELEASE:
                    // no op: fences carry no location
                    break;
                default:
                    panic("Invalid action type\n");
            }
        }

        addrManager->finishLocSelection(atomicLocs[lane]);
    }
}
// Called when the episode has finished all actions: release every unique
// normal location each lane used back to the address manager, then mark
// the episode inactive.
void
Episode::completeEpisode()
{
    // release all locations this episode has picked and used
    Location atomic_loc, normal_loc;

    for (int lane = 0; lane < numLanes; ++lane) {
        atomic_loc = AddressManager::INVALID_LOCATION;
        normal_loc = AddressManager::INVALID_LOCATION;
        // set of unique normal locations this lane touched
        std::unordered_set<Location> unique_loc_set;

        for (Action* action : actions) {
            assert(action);

            if (action->isAtomicAction()) {
                if (atomic_loc == AddressManager::INVALID_LOCATION) {
                    atomic_loc = action->getLocation(lane);
                } else {
                    // both atomic ops in the same lane must be
                    // at the same location
                    assert(atomic_loc == action->getLocation(lane));
                }
            } else if (!action->isMemFenceAction()) {
                // LD/ST action; the leading atomic must have been seen first
                assert(atomic_loc >= 0);
                normal_loc = action->getLocation(lane);

                if (normal_loc >= 0)
                    unique_loc_set.insert(normal_loc);
            }
        }

        // each unique loc can be released only once
        for (Location loc : unique_loc_set)
            addrManager->releaseLocation(atomic_loc, loc);
    }

    // this episode is no longer active
    isActive = false;
}
// Intra-episode DRF check across lanes: for every lane with index
// < max_lane that shares this atomic location, a new ST to 'loc' conflicts
// with any earlier LD/ST to 'loc', and a new LD conflicts with any earlier
// ST to 'loc'. Returns false (after a warning) on a race.
bool
Episode::checkDRF(Location atomic_loc, Location loc, bool isStore,
                  int max_lane) const
{
    assert(atomic_loc != AddressManager::INVALID_LOCATION);
    assert(loc != AddressManager::INVALID_LOCATION);
    assert(max_lane <= numLanes);

    for (int lane = 0; lane < max_lane; ++lane) {
        if (atomic_loc == atomicLocs[lane]) {
            for (const Action* action : actions) {
                // only LD/ST actions carry normal locations
                if (!action->isAtomicAction() &&
                    !action->isMemFenceAction()) {
                    if (isStore && loc == action->getLocation(lane)) {
                        warn("ST at location %d races against thread %d\n",
                             loc, thread->getGpuThreadId());
                        return false;
                    } else if (!isStore &&
                               action->getType() == Action::Type::STORE &&
                               loc == action->getLocation(lane)) {
                        warn("LD at location %d races against thread %d\n",
                             loc, thread->getGpuThreadId());
                        return false;
                    }
                }
            }
        }
    }

    return true;
}
// -------------------- Action class ----------------------------
Episode::Action::Action(Type t, int num_lanes)
    : type(t),
      numLanes(num_lanes)
{
    assert(numLanes > 0);
    // every lane starts out with no location assigned
    locations.assign(numLanes, AddressManager::INVALID_LOCATION);
}
void
Episode::Action::setLocation(int lane, Location loc)
{
    // record the target location for a single lane of this action;
    // the lane index must be within this action's width
    assert(lane >= 0 && lane < numLanes);
    locations[lane] = loc;
}
AddressManager::Location
Episode::Action::getLocation(int lane) const
{
    // look up the location previously assigned to this lane (may be
    // INVALID_LOCATION if the lane is inactive for this action)
    assert(lane >= 0 && lane < numLanes);
    return locations[lane];
}
bool
Episode::Action::isAtomicAction() const
{
    // true only for read-modify-write actions
    return type == Type::ATOMIC;
}
bool
Episode::Action::isMemFenceAction() const
{
    // acquire and release are the two fence flavors used by the tester
    return (type == Type::ACQUIRE) || (type == Type::RELEASE);
}
const std::string
Episode::Action::printType() const
{
    // human-readable name of this action's type, used in debug/error logs
    switch (type) {
      case Type::ACQUIRE:
        return "ACQUIRE";
      case Type::RELEASE:
        return "RELEASE";
      case Type::ATOMIC:
        return "ATOMIC";
      case Type::LOAD:
        return "LOAD";
      case Type::STORE:
        return "STORE";
      default:
        panic("Invalid action type\n");
    }
}

View File

@@ -0,0 +1,126 @@
/*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CPU_TESTERS_PROTOCOL_TESTER_EPISODE_HH_
#define CPU_TESTERS_PROTOCOL_TESTER_EPISODE_HH_
#include <vector>
#include "cpu/testers/gpu_ruby_test/address_manager.hh"
class ProtocolTester;
class GpuThread;
// An Episode is one self-contained burst of memory activity generated by a
// single tester thread: a randomized mix of loads/stores bracketed by
// atomics and fences, all drawn against locations leased from the
// AddressManager.
class Episode
{
  public:
    typedef AddressManager::Location Location;
    typedef AddressManager::Value Value;

    // One step of an episode: a fence (ACQUIRE/RELEASE), an ATOMIC, or a
    // per-lane LOAD/STORE. An action holds one location per lane
    // (INVALID_LOCATION for lanes that sit the action out).
    class Action {
      public:
        enum class Type {
            ACQUIRE,
            RELEASE,
            ATOMIC,
            LOAD,
            STORE,
        };

        Action(Type t, int num_lanes);
        ~Action() {}

        Type getType() const { return type; }
        // assign/read the per-lane target location
        void setLocation(int lane, Location loc);
        Location getLocation(int lane) const;
        bool isAtomicAction() const;
        // true for ACQUIRE and RELEASE
        bool isMemFenceAction() const;
        // type name as a string, for logs and error dumps
        const std::string printType() const;

      private:
        Type type;
        int numLanes;
        // per-lane target locations, sized to numLanes
        typedef std::vector<Location> LocationList;
        LocationList locations;
    };

    Episode(ProtocolTester* tester, GpuThread* thread, int num_loads,
            int num_stores);
    ~Episode();

    // return episode id
    int getEpisodeId() const { return episodeId; }
    // return the action at the head of the action queue
    const Action* peekCurAction() const;
    // pop the action at the head of the action queue
    void popAction();
    // check if there is more action to be issued in this episode
    bool hasMoreActions() const { return nextActionIdx < actions.size();}
    // complete this episode by releasing all locations & updating st effects
    void completeEpisode();
    // check if this episode is executing
    bool isEpsActive() const { return isActive; }
    // check if the input episode and this one have any data race
    bool checkDRF(Location atomic_loc, Location loc, bool isStore,
                  int max_lane) const;

  private:
    // pointers to tester, thread and address amanger structures
    // (non-owning; the tester owns these objects)
    ProtocolTester *tester;
    GpuThread *thread;
    AddressManager *addrManager;

    // a unique episode id
    int episodeId;

    // list of actions in this episode (owned; freed by the destructor,
    // presumably — confirm in episode.cc)
    typedef std::vector<Action*> ActionList;
    ActionList actions;

    // list of atomic locations picked for this episode, one per lane
    typedef std::vector<Location> AtomicLocationList;
    AtomicLocationList atomicLocs;

    // is a thread running this episode?
    bool isActive;

    // episode length = num_loads + num_stores
    int numLoads;
    int numStores;

    // index of the next action in actions
    int nextActionIdx;

    // number of lanes in this thread (1 for a CPU thread, wavefront
    // width for a GPU thread)
    int numLanes;

    // randomly generate actions in this episode
    void initActions();
};
#endif /* CPU_TESTERS_PROTOCOL_TESTER_EPISODE_HH_ */

View File

@@ -0,0 +1,430 @@
/*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "cpu/testers/gpu_ruby_test/gpu_thread.hh"
#include <fstream>
#include "debug/ProtocolTest.hh"
// All tester/port/address-manager pointers start out null; they are wired
// up later through attachGpuThreadToPorts() before the thread runs.
GpuThread::GpuThread(const Params &p)
    : ClockedObject(p),
      threadEvent(this, "GpuThread tick"),
      deadlockCheckEvent(this),
      threadId(p.thread_id),
      numLanes(p.num_lanes),
      tester(nullptr), addrManager(nullptr), port(nullptr),
      scalarPort(nullptr), sqcPort(nullptr), curEpisode(nullptr),
      curAction(nullptr), pendingLdStCount(0), pendingFenceCount(0),
      pendingAtomicCount(0), lastActiveCycle(Cycles(0)),
      deadlockThreshold(p.deadlock_threshold)
{
}
GpuThread::~GpuThread()
{
    // the thread owns every episode it has ever issued; reclaim them all
    for (std::size_t i = 0; i < episodeHistory.size(); ++i) {
        assert(episodeHistory[i] != nullptr);
        delete episodeHistory[i];
    }
}
void
GpuThread::wakeup()
{
    // Main tick of the thread. It runs when either:
    //  - hitCallback was called (a response came back), or
    //  - a new episode was created.
    // It tries to issue the next ready action; failing that, it retires
    // the current episode once no actions and no requests remain.

    // check if this is the first episode in this thread
    if (curEpisode == nullptr) {
        issueNewEpisode();
        assert(curEpisode);
    }

    if (isNextActionReady()) {
        // isNextActionReady should check if the action list is empty
        assert(curAction != nullptr);

        // issue the next action
        issueNextAction();
    } else {
        // check for completion of the current episode
        // completion = no outstanding requests + not having more actions
        if (!curEpisode->hasMoreActions() &&
            pendingLdStCount == 0 &&
            pendingFenceCount == 0 &&
            pendingAtomicCount == 0) {

            curEpisode->completeEpisode();

            // check if it's time to stop the tester
            if (tester->checkExit()) {
                // no more event is scheduled for this thread
                return;
            }

            // issue the next episode
            issueNewEpisode();
            assert(curEpisode);

            // now we get a new episode
            // let's wake up the thread in the next cycle
            if (!threadEvent.scheduled()) {
                scheduleWakeup();
            }
        }
        // otherwise: requests are still in flight; hitCallback will
        // re-schedule this wakeup when a response arrives
    }
}
void
GpuThread::scheduleWakeup()
{
    // callers must guarantee the tick event is not already pending
    assert(!threadEvent.scheduled());
    // fire on this object's next clock edge
    schedule(threadEvent, nextCycle());
}
void
GpuThread::scheduleDeadlockCheckEvent()
{
    // after this first schedule, the deadlock event re-schedules itself
    // from checkDeadlock(); it must not already be pending here
    assert(!deadlockCheckEvent.scheduled());
    schedule(deadlockCheckEvent, nextCycle());
}
// Late binding of this thread to the tester and its memory ports.
//
// NOTE(review): the header declares the trailing parameters in the order
// (_sqcPort = nullptr, _scalarPort = nullptr), while this definition names
// them (_scalarPort, _sqcPort). The compiler matches only by type, so a
// caller passing ports positionally per the declaration would silently get
// the two ports swapped here. Confirm against the call sites and unify the
// parameter order between declaration and definition.
void
GpuThread::attachGpuThreadToPorts(ProtocolTester *_tester,
                                  ProtocolTester::SeqPort *_port,
                                  ProtocolTester::SeqPort *_scalarPort,
                                  ProtocolTester::SeqPort *_sqcPort)
{
    tester = _tester;
    port = _port;
    scalarPort = _scalarPort;
    sqcPort = _sqcPort;

    // the tester and the main data port are mandatory; the scalar and
    // SQC ports may be null (e.g. for CPU threads)
    assert(tester && port);
    addrManager = tester->getAddressManager();
    assert(addrManager);
}
void
GpuThread::issueNewEpisode()
{
    // split the fixed episode length randomly between loads and stores
    const int episode_len = tester->getEpisodeLength();
    const int num_reg_loads = random() % episode_len;
    const int num_reg_stores = episode_len - num_reg_loads;

    // the newest episode becomes this thread's current episode
    curEpisode = new Episode(tester, this, num_reg_loads, num_reg_stores);
    episodeHistory.push_back(curEpisode);
}
bool
GpuThread::isNextActionReady()
{
    // Peek at the head of the current episode's action queue and decide
    // whether it may issue now under the tester's wait-count rules (each
    // action class must wait for certain prior requests to drain).
    // Side effect: sets curAction to the head action when one exists.
    if (!curEpisode->hasMoreActions()) {
        return false;
    } else {
        curAction = curEpisode->peekCurAction();

        switch(curAction->getType()) {
            case Episode::Action::Type::ATOMIC:
                // an atomic action must wait for all previous requests
                // to complete
                if (pendingLdStCount == 0 &&
                    pendingFenceCount == 0 &&
                    pendingAtomicCount == 0) {
                    return true;
                }

                return false;
            case Episode::Action::Type::ACQUIRE:
                // we should not see any outstanding ld_st or fence here
                assert(pendingLdStCount == 0 &&
                       pendingFenceCount == 0);

                // an acquire action must wait for all previous atomic
                // requests to complete
                if (pendingAtomicCount == 0) {
                    return true;
                }

                return false;
            case Episode::Action::Type::RELEASE:
                // we should not see any outstanding atomic or fence here
                assert(pendingAtomicCount == 0 &&
                       pendingFenceCount == 0);

                // a release action must wait for all previous ld/st
                // requests to complete
                if (pendingLdStCount == 0) {
                    return true;
                }

                return false;
            case Episode::Action::Type::LOAD:
            case Episode::Action::Type::STORE:
                // we should not see any outstanding atomic here
                assert(pendingAtomicCount == 0);

                // can't issue if there is a pending fence
                if (pendingFenceCount > 0) {
                    return false;
                }

                // a Load or Store is ready if it doesn't overlap
                // with any outstanding request
                for (int lane = 0; lane < numLanes; ++lane) {
                    Location loc = curAction->getLocation(lane);

                    // INVALID_LOCATION marks an inactive lane
                    if (loc != AddressManager::INVALID_LOCATION) {
                        Addr addr = addrManager->getAddress(loc);

                        if (outstandingLoads.find(addr) !=
                            outstandingLoads.end()) {
                            return false;
                        }

                        if (outstandingStores.find(addr) !=
                            outstandingStores.end()) {
                            return false;
                        }

                        if (outstandingAtomics.find(addr) !=
                            outstandingAtomics.end()) {
                            // this is not an atomic action, so the address
                            // should not be in outstandingAtomics list
                            assert(false);
                        }
                    }
                }

                return true;
            default:
                panic("The tester got an invalid action\n");
        }
    }
}
void
GpuThread::issueNextAction()
{
switch(curAction->getType()) {
case Episode::Action::Type::ATOMIC:
issueAtomicOps();
break;
case Episode::Action::Type::ACQUIRE:
issueAcquireOp();
break;
case Episode::Action::Type::RELEASE:
issueReleaseOp();
break;
case Episode::Action::Type::LOAD:
issueLoadOps();
break;
case Episode::Action::Type::STORE:
issueStoreOps();
break;
default:
panic("The tester got an invalid action\n");
}
// the current action has been issued, pop it from the action list
curEpisode->popAction();
lastActiveCycle = curCycle();
// we may be able to schedule the next action
// just wake up this thread in the next cycle
if (!threadEvent.scheduled()) {
scheduleWakeup();
}
}
void
GpuThread::addOutstandingReqs(OutstandingReqTable& req_table, Addr address,
                              int lane, Location loc, Value stored_val)
{
    // Record an in-flight request so later actions to the same address are
    // held back and the eventual response can be validated and popped.
    //
    // operator[] default-constructs an empty request list on first use of
    // an address, which replaces the original find-then-insert dance with
    // a single lookup; emplace_back builds the OutstandingReq in place.
    req_table[address].emplace_back(lane, loc, stored_val, curCycle());
}
GpuThread::OutstandingReq
GpuThread::popOutstandingReq(OutstandingReqTable& req_table, Addr addr)
{
    // Remove and return one pending request for addr (most recently added
    // first). The address must currently have at least one request.
    auto entry = req_table.find(addr);
    assert(entry != req_table.end());

    OutstandingReqList& reqs = entry->second;
    assert(!reqs.empty());

    // detach the most recently added request
    OutstandingReq popped = reqs.back();
    reqs.pop_back();

    // drop the per-address list once it drains so the table only holds
    // addresses that truly have requests in flight
    if (reqs.empty())
        req_table.erase(entry);

    return popped;
}
void
GpuThread::validateAtomicResp(Location loc, int lane, Value ret_val)
{
    // a correct atomic response needs no further work
    if (addrManager->validateAtomicResp(loc, ret_val))
        return;

    // mismatch: report it and terminate the test with an error dump
    Addr addr = addrManager->getAddress(loc);
    std::stringstream ss;
    ss << threadName << ": Atomic Op returned unexpected value\n"
       << "\tEpisode " << curEpisode->getEpisodeId() << "\n"
       << "\tLane ID " << lane << "\n"
       << "\tAddress " << printAddress(addr) << "\n"
       << "\tAtomic Op's return value " << ret_val << "\n";

    // print out basic info
    warn("%s\n", ss.str());

    // TODO add more detailed info

    // dump all error info and exit the simulation
    tester->dumpErrorLog(ss);
}
void
GpuThread::validateLoadResp(Location loc, int lane, Value ret_val)
{
    // the load must observe the last value logged for this location
    if (ret_val == addrManager->getLoggedValue(loc))
        return;

    // mismatch: report it and terminate the test with an error dump
    Addr addr = addrManager->getAddress(loc);
    std::stringstream ss;
    ss << threadName << ": Loaded value is not consistent with "
       << "the last stored value\n"
       << "\tGpuThread " << threadId << "\n"
       << "\tEpisode " << curEpisode->getEpisodeId() << "\n"
       << "\tLane ID " << lane << "\n"
       << "\tAddress " << printAddress(addr) << "\n"
       << "\tLoaded value " << ret_val << "\n"
       << "\tLast writer " << addrManager->printLastWriter(loc) << "\n";

    // print out basic info
    warn("%s\n", ss.str());

    // TODO add more detailed info

    // dump all error info and exit the simulation
    tester->dumpErrorLog(ss);
}
bool
GpuThread::checkDRF(Location atomic_loc, Location loc, bool isStore) const
{
    // with no active episode there is nothing to conflict with
    if (curEpisode == nullptr || !curEpisode->isEpsActive())
        return true;

    // delegate the race check to the current episode across all lanes
    return curEpisode->checkDRF(atomic_loc, loc, isStore, numLanes);
}
void
GpuThread::checkDeadlock()
{
if ((curCycle() - lastActiveCycle) > deadlockThreshold) {
// deadlock detected
std::stringstream ss;
ss << threadName << ": Deadlock detected\n"
<< "\tLast active cycle: " << lastActiveCycle << "\n"
<< "\tCurrent cycle: " << curCycle() << "\n"
<< "\tDeadlock threshold: " << deadlockThreshold << "\n";
// print out basic info
warn("%s\n", ss.str());
// dump all error info and exit the simulation
tester->dumpErrorLog(ss);
} else if (!tester->checkExit()) {
// schedule a future deadlock check event
assert(!deadlockCheckEvent.scheduled());
schedule(deadlockCheckEvent,
deadlockThreshold * clockPeriod() + curTick());
}
}
void
GpuThread::printOutstandingReqs(const OutstandingReqTable& table,
std::stringstream& ss) const
{
Cycles cur_cycle = curCycle();
for (const auto& m : table) {
for (const auto& req : m.second) {
ss << "\t\t\tAddr " << printAddress(m.first)
<< ": delta (curCycle - issueCycle) = "
<< (cur_cycle - req.issueCycle) << std::endl;
}
}
}
void
GpuThread::printAllOutstandingReqs(std::stringstream& ss) const
{
    // dump every request this thread still waits on, grouped by class,
    // followed by the count of pending fences
    ss << "\t\tOutstanding Loads:\n";
    printOutstandingReqs(outstandingLoads, ss);

    ss << "\t\tOutstanding Stores:\n";
    printOutstandingReqs(outstandingStores, ss);

    ss << "\t\tOutstanding Atomics:\n";
    printOutstandingReqs(outstandingAtomics, ss);

    ss << "\t\tNumber of outstanding acquires & releases: "
       << pendingFenceCount << std::endl;
}

View File

@@ -0,0 +1,199 @@
/*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* GPU thread issues requests to and receives responses from Ruby memory
*/
#ifndef CPU_TESTERS_PROTOCOL_TESTER_GPU_THREAD_HH_
#define CPU_TESTERS_PROTOCOL_TESTER_GPU_THREAD_HH_
#include "cpu/testers/gpu_ruby_test/address_manager.hh"
#include "cpu/testers/gpu_ruby_test/episode.hh"
#include "cpu/testers/gpu_ruby_test/protocol_tester.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "sim/clocked_object.hh"
// Abstract base for one tester thread (CPU thread or GPU wavefront). It
// drives a stream of Episodes against the Ruby memory ports and tracks all
// in-flight requests so responses can be validated and deadlock detected.
class GpuThread : public ClockedObject
{
  public:
    typedef GpuThreadParams Params;
    GpuThread(const Params &p);
    virtual ~GpuThread();

    typedef AddressManager::Location Location;
    typedef AddressManager::Value Value;

    // main tick: issue the next ready action or retire the episode
    void wakeup();
    // schedule wakeup() for the next cycle (must not already be pending)
    void scheduleWakeup();
    // report a deadlock if this thread has been idle past the threshold
    void checkDeadlock();
    // arm the first deadlock check; it re-arms itself afterwards
    void scheduleDeadlockCheckEvent();

    // NOTE(review): the gpu_thread.cc definition names the last two
    // parameters in the opposite order (_scalarPort, _sqcPort). Since the
    // compiler matches parameters only by type, positional callers would
    // silently swap the two ports — confirm call sites and unify.
    void attachGpuThreadToPorts(ProtocolTester *_tester,
                                ProtocolTester::SeqPort *_port,
                                ProtocolTester::SeqPort *_sqcPort = nullptr,
                                ProtocolTester::SeqPort *_scalarPort = nullptr);

    const std::string& getName() const { return threadName; }

    // response handler
    // must be implemented by a child class
    virtual void hitCallback(PacketPtr pkt) = 0;

    int getGpuThreadId() const { return threadId; }
    int getNumLanes() const { return numLanes; }
    // check if the input location would satisfy DRF constraint
    bool checkDRF(Location atomic_loc, Location loc, bool isStore) const;

    void printAllOutstandingReqs(std::stringstream& ss) const;

  protected:
    // self-wakeup event driving wakeup()
    class GpuThreadEvent : public Event
    {
      private:
        GpuThread* thread;
        std::string desc;

      public:
        GpuThreadEvent(GpuThread* _thread, std::string _description)
            : Event(CPU_Tick_Pri), thread(_thread), desc(_description)
        {}
        void setDesc(std::string _description) { desc = _description; }
        void process() { thread->wakeup(); }
        // NOTE(review): non-const here, unlike DeadlockCheckEvent::name()
        // below — confirm whether it actually overrides Event::name()
        const std::string name() { return desc; }
    };

    GpuThreadEvent threadEvent;

    // periodic self-rescheduling deadlock probe (see checkDeadlock)
    class DeadlockCheckEvent : public Event
    {
      private:
        GpuThread* thread;

      public:
        DeadlockCheckEvent(GpuThread* _thread)
            : Event(CPU_Tick_Pri), thread(_thread)
        {}
        void process() { thread->checkDeadlock(); }
        const std::string name() const { return "Tester deadlock check"; }
    };

    DeadlockCheckEvent deadlockCheckEvent;

    // bookkeeping for one request awaiting its response
    struct OutstandingReq
    {
        int lane;           // issuing lane within the thread
        Location origLoc;   // tester location the request targeted
        Value storedValue;  // value written (stores only)
        Cycles issueCycle;  // when the request was issued

        OutstandingReq(int _lane, Location _loc, Value _val, Cycles _cycle)
            : lane(_lane), origLoc(_loc), storedValue(_val), issueCycle(_cycle)
        {}

        ~OutstandingReq()
        {}
    };

    // the unique global id of this thread
    int threadId;
    // width of this thread (1 for cpu thread & wf size for gpu wavefront)
    int numLanes;
    // thread name
    std::string threadName;
    // pointer to the main tester
    ProtocolTester *tester;
    // pointer to the address manager
    AddressManager *addrManager;
    ProtocolTester::SeqPort *port;       // main data port (GPU-vector data)
    ProtocolTester::SeqPort *scalarPort; // nullptr for CPU
    ProtocolTester::SeqPort *sqcPort;    // nullptr for CPU
    // a list of issued episodes sorted by time
    // the last episode in the list is the current episode
    typedef std::vector<Episode*> EpisodeHistory;
    EpisodeHistory episodeHistory;
    // pointer to the current episode
    Episode *curEpisode;
    // pointer to the current action
    const Episode::Action *curAction;
    // number of outstanding requests that are waiting for their responses
    int pendingLdStCount;
    int pendingFenceCount;
    int pendingAtomicCount;
    // last cycle when there is an event in this thread
    Cycles lastActiveCycle;
    Cycles deadlockThreshold;
    // a per-address list of outstanding requests
    typedef std::vector<OutstandingReq> OutstandingReqList;
    typedef std::unordered_map<Addr, OutstandingReqList> OutstandingReqTable;
    OutstandingReqTable outstandingLoads;
    OutstandingReqTable outstandingStores;
    OutstandingReqTable outstandingAtomics;

    void issueNewEpisode();
    // check if the next action in the current episode satisfies all wait_cnt
    // constraints and is ready to issue
    bool isNextActionReady();
    void issueNextAction();

    // issue Ops to Ruby memory
    // must be implemented by a child class
    virtual void issueLoadOps() = 0;
    virtual void issueStoreOps() = 0;
    virtual void issueAtomicOps() = 0;
    virtual void issueAcquireOp() = 0;
    virtual void issueReleaseOp() = 0;

    // add an outstanding request to its corresponding table
    void addOutstandingReqs(OutstandingReqTable& req_table, Addr addr,
                            int lane, Location loc,
                            Value stored_val = AddressManager::INVALID_VALUE);

    // pop an outstanding request from the input table
    OutstandingReq popOutstandingReq(OutstandingReqTable& req_table,
                                     Addr address);

    // validate all atomic responses
    void validateAtomicResp(Location loc, int lane, Value ret_val);
    // validate all Load responses
    void validateLoadResp(Location loc, int lane, Value ret_val);
    void printOutstandingReqs(const OutstandingReqTable& table,
                              std::stringstream& ss) const;
};
#endif /* CPU_TESTERS_PROTOCOL_TESTER_GPU_THREAD_HH_ */

View File

@@ -0,0 +1,377 @@
/*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "cpu/testers/gpu_ruby_test/gpu_wavefront.hh"

#include <cstring>

#include "debug/ProtocolTest.hh"
GpuWavefront::GpuWavefront(const Params &p)
    : GpuThread(p), cuId(p.cu_id)
{
    // give this wavefront a descriptive name for logs and error dumps
    std::string name = "GpuWavefront(GpuThread ID = ";
    name += std::to_string(threadId);
    name += ", CU ID = ";
    name += std::to_string(cuId);
    name += ")";
    threadName = name;

    threadEvent.setDesc("GpuWavefront tick");
}
// nothing wavefront-specific to clean up; the base class handles the
// episode history
GpuWavefront::~GpuWavefront() = default;
// standard gem5 SimObject factory: build a wavefront from its params
GpuWavefront*
GpuWavefrontParams::create() const
{
    return new GpuWavefront(*this);
}
void
GpuWavefront::issueLoadOps()
{
    // Issue one timing ReadReq per active lane of the current LOAD action.
    // Each response is later checked against the last logged value for its
    // location (see validateLoadResp via hitCallback).
    assert(curAction);
    assert(curAction->getType() == Episode::Action::Type::LOAD);
    // we should not have any outstanding fence or atomic op at this point
    assert(pendingFenceCount == 0);
    assert(pendingAtomicCount == 0);

    for (int lane = 0; lane < numLanes; ++lane) {
        Location location = curAction->getLocation(lane);
        assert(location >= AddressManager::INVALID_LOCATION);

        // Make a request if we do not get an INVALID_LOCATION for this lane.
        if (location >= 0) {
            Addr address = addrManager->getAddress(location);

            DPRINTF(ProtocolTest, "%s Episode %d: Issuing Load - Addr %s\n",
                    this->getName(), curEpisode->getEpisodeId(),
                    printAddress(address));

            int load_size = sizeof(Value);

            // for now, assert address is 4-byte aligned
            assert(address % load_size == 0);

            auto req = std::make_shared<Request>(address, load_size,
                                                 0, tester->requestorId(),
                                                 0, threadId, nullptr);
            req->setPaddr(address);
            req->setReqInstSeqNum(tester->getActionSeqNum());
            // set protocol-specific flags
            setExtraRequestFlags(req);

            PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
            // dataDynamic hands buffer ownership to the packet
            uint8_t* data = new uint8_t[load_size];
            pkt->dataDynamic(data);
            pkt->senderState = new ProtocolTester::SenderState(this);

            // increment the number of outstanding ld_st requests
            pendingLdStCount++;

            if (!port->sendTimingReq(pkt)) {
                panic("Not expected failed sendTimingReq\n");
            }

            // insert an outstanding load
            addOutstandingReqs(outstandingLoads, address, lane, location);
        }
    }
}
void
GpuWavefront::issueStoreOps()
{
    // Issue one timing WriteReq per active lane of the current STORE
    // action. Each store writes "last logged value + 1" so that a later
    // load can be validated against a single, known expected value.
    assert(curAction);
    assert(curAction->getType() == Episode::Action::Type::STORE);
    // we should not have any outstanding fence or atomic op at this point
    assert(pendingFenceCount == 0);
    assert(pendingAtomicCount == 0);

    for (int lane = 0; lane < numLanes; ++lane) {
        Location location = curAction->getLocation(lane);
        assert(location >= AddressManager::INVALID_LOCATION);

        // Make a request if we do not get an INVALID_LOCATION for this lane.
        if (location >= 0) {
            // prepare the next value to store
            Value new_value = addrManager->getLoggedValue(location) + 1;
            Addr address = addrManager->getAddress(location);
            // must be aligned with store size
            assert(address % sizeof(Value) == 0);

            DPRINTF(ProtocolTest, "%s Episode %d: Issuing Store - Addr %s - "
                    "Value %d\n", this->getName(),
                    curEpisode->getEpisodeId(), printAddress(address),
                    new_value);

            auto req = std::make_shared<Request>(address, sizeof(Value),
                                                 0, tester->requestorId(), 0,
                                                 threadId, nullptr);
            req->setPaddr(address);
            req->setReqInstSeqNum(tester->getActionSeqNum());
            // set protocol-specific flags
            setExtraRequestFlags(req);

            PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
            // copy the store payload into a packet-owned buffer;
            // std::memcpy replaces the original manual per-byte loop,
            // which also mixed a signed loop index with an unsigned
            // sizeof bound
            uint8_t *writeData = new uint8_t[sizeof(Value)];
            std::memcpy(writeData, &new_value, sizeof(Value));
            pkt->dataDynamic(writeData);
            pkt->senderState = new ProtocolTester::SenderState(this);

            // increment the number of outstanding ld_st requests
            pendingLdStCount++;

            if (!port->sendTimingReq(pkt)) {
                panic("Not expecting a failed sendTimingReq\n");
            }

            // add an outstanding store
            addOutstandingReqs(outstandingStores, address, lane, location,
                               new_value);
        }
    }
}
void
GpuWavefront::issueAtomicOps()
{
    // Issue one timing atomic-increment (SwapReq + AtomicOpInc) per lane
    // of the current ATOMIC action. The returned (pre-increment) value is
    // validated against the AddressManager's log in hitCallback.
    assert(curAction);
    assert(curAction->getType() == Episode::Action::Type::ATOMIC);
    // we should not have any outstanding ops at this point
    assert(pendingFenceCount == 0);
    assert(pendingLdStCount == 0);
    assert(pendingAtomicCount == 0);

    // we use atomic_inc in the tester; request the old value back
    Request::Flags flags = Request::ATOMIC_RETURN_OP;

    for (int lane = 0; lane < numLanes; ++lane) {
        Location location = curAction->getLocation(lane);
        // unlike loads/stores, every lane of an atomic action has a
        // valid location
        assert(location >= 0);

        Addr address = addrManager->getAddress(location);

        DPRINTF(ProtocolTest, "%s Episode %d: Issuing Atomic_Inc - Addr %s\n",
                this->getName(), curEpisode->getEpisodeId(),
                printAddress(address));

        // must be aligned with store size
        assert(address % sizeof(Value) == 0);
        AtomicOpFunctor *amo_op = new AtomicOpInc<Value>();
        // the AtomicOpFunctorPtr takes ownership of amo_op
        auto req = std::make_shared<Request>(address, sizeof(Value),
                                             flags, tester->requestorId(),
                                             0, threadId,
                                             AtomicOpFunctorPtr(amo_op));
        req->setPaddr(address);
        req->setReqInstSeqNum(tester->getActionSeqNum());
        // set protocol-specific flags
        setExtraRequestFlags(req);

        PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
        // buffer for the returned old value; owned by the packet
        uint8_t* data = new uint8_t[sizeof(Value)];
        pkt->dataDynamic(data);
        pkt->senderState = new ProtocolTester::SenderState(this);

        if (!port->sendTimingReq(pkt)) {
            panic("Not expecting failed sendTimingReq\n");
        }

        // increment the number of outstanding atomic ops
        pendingAtomicCount++;

        // add an outstanding atomic
        addOutstandingReqs(outstandingAtomics, address, lane, location);
    }
}
void
GpuWavefront::issueAcquireOp()
{
    // Issue a single MemSyncReq tagged with Request::ACQUIRE. The fence
    // carries no payload and a dummy address of 0; completion arrives as
    // a MemSyncResp in hitCallback.
    DPRINTF(ProtocolTest, "%s Episode %d: Issuing Acquire\n", this->getName(),
            curEpisode->getEpisodeId());

    assert(curAction);
    assert(curAction->getType() == Episode::Action::Type::ACQUIRE);
    // we should not have any outstanding ops at this point
    assert(pendingFenceCount == 0);
    assert(pendingLdStCount == 0);
    assert(pendingAtomicCount == 0);

    auto acq_req = std::make_shared<Request>(0, 0, 0,
                                             tester->requestorId(), 0,
                                             threadId, nullptr);
    acq_req->setPaddr(0);
    acq_req->setReqInstSeqNum(tester->getActionSeqNum());
    acq_req->setFlags(Request::ACQUIRE);
    // set protocol-specific flags
    setExtraRequestFlags(acq_req);

    PacketPtr pkt = new Packet(acq_req, MemCmd::MemSyncReq);
    pkt->senderState = new ProtocolTester::SenderState(this);

    // increment the number of outstanding fence requests
    pendingFenceCount++;

    if (!port->sendTimingReq(pkt)) {
        panic("Not expecting failed sendTimingReq\n");
    }
}
void
GpuWavefront::issueReleaseOp()
{
    DPRINTF(ProtocolTest, "%s Episode %d: Issuing Release\n", this->getName(),
            curEpisode->getEpisodeId());

    // A release fence simply waits for all previous stores to complete. All
    // previous loads and stores were done before this release operation is
    // issued, so issueReleaseOp is just a no-op in this tester.

    // since nothing was sent to memory, the thread can try its next
    // action as soon as the next cycle
    if (!threadEvent.scheduled()) {
        scheduleWakeup();
    }
}
void
GpuWavefront::hitCallback(PacketPtr pkt)
{
    // Response handler for every request this wavefront issued. It
    // validates load/atomic return values, logs completed stores/atomics
    // in the AddressManager, updates the pending counters, and finally
    // wakes the thread so the next action can be considered.
    assert(pkt);
    MemCmd resp_cmd = pkt->cmd;
    // write-completion acks carry no address; use 0 for the debug print
    Addr addr = (resp_cmd == MemCmd::WriteCompleteResp) ? 0 : pkt->getAddr();

    DPRINTF(ProtocolTest, "%s Episode %d: hitCallback - Command %s - "
            "Addr %s\n", this->getName(),
            curEpisode->getEpisodeId(), resp_cmd.toString(),
            printAddress(addr));

    // whether the transaction is done after this hitCallback
    bool isTransactionDone = true;

    if (resp_cmd == MemCmd::MemSyncResp) {
        // response to a pending fence
        // no validation needed for fence responses
        assert(pendingFenceCount > 0);
        assert(pendingLdStCount == 0);
        assert(pendingAtomicCount == 0);
        pendingFenceCount--;
    } else if (resp_cmd == MemCmd::ReadResp) {
        // response to a pending read
        assert(pendingLdStCount > 0);
        assert(pendingAtomicCount == 0);
        assert(outstandingLoads.count(addr) > 0);

        // get return data
        Value value = *(pkt->getPtr<Value>());
        OutstandingReq req = popOutstandingReq(outstandingLoads, addr);
        validateLoadResp(req.origLoc, req.lane, value);

        // this Read is done
        pendingLdStCount--;
    } else if (resp_cmd == MemCmd::WriteResp) {
        // response to a pending write
        assert(pendingLdStCount > 0);
        assert(pendingAtomicCount == 0);

        // no need to validate Write response
        // just pop it from the outstanding req table so that subsequent
        // requests dependent on this write can proceed
        // note that we don't decrement pendingLdStCount here yet since
        // the write is not yet completed in downstream memory. Instead, we
        // decrement the counter when we receive the write completion ack
        assert(outstandingStores.count(addr) > 0);
        OutstandingReq req = popOutstandingReq(outstandingStores, addr);
        assert(req.storedValue != AddressManager::INVALID_VALUE);

        // update log table
        addrManager->updateLogTable(req.origLoc, threadId,
                                    curEpisode->getEpisodeId(),
                                    req.storedValue,
                                    curTick(),
                                    cuId);

        // the transaction is not done yet. Waiting for write completion ack
        // NOTE(review): on this path pkt is deleted below without deleting
        // pkt->senderState — confirm the SenderState is not leaked for
        // writes (the completion ack arrives as a separate packet).
        isTransactionDone = false;
    } else if (resp_cmd == MemCmd::SwapResp) {
        // response to a pending atomic
        assert(pendingAtomicCount > 0);
        assert(pendingLdStCount == 0);
        assert(outstandingAtomics.count(addr) > 0);

        // get return data (the pre-op value of the location)
        Value value = *(pkt->getPtr<Value>());

        // validate atomic op return
        OutstandingReq req = popOutstandingReq(outstandingAtomics, addr);
        validateAtomicResp(req.origLoc, req.lane, value);

        // update log table
        addrManager->updateLogTable(req.origLoc, threadId,
                                    curEpisode->getEpisodeId(), value,
                                    curTick(),
                                    cuId);

        // this Atomic is done
        pendingAtomicCount--;
    } else if (resp_cmd == MemCmd::WriteCompleteResp) {
        // write completion ACK
        assert(pendingLdStCount > 0);
        assert(pendingAtomicCount == 0);

        // the Write is now done
        pendingLdStCount--;
    } else {
        panic("Unsupported MemCmd response type");
    }

    if (isTransactionDone) {
        // no need to keep senderState and request around
        delete pkt->senderState;
    }

    delete pkt;

    // record the last active cycle to check for deadlock
    lastActiveCycle = curCycle();

    // we may be able to issue an action. Let's check
    if (!threadEvent.scheduled()) {
        scheduleWakeup();
    }
}
// Default hook for tagging requests with protocol-specific flags.
// The base wavefront adds nothing; protocol-specific subclasses override
// this to set whatever extra flags their target protocol requires.
void
GpuWavefront::setExtraRequestFlags(RequestPtr req)
{
    // No extra request flag is set
}

View File

@@ -0,0 +1,68 @@
/*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CPU_TESTERS_PROTOCOL_TESTER_GPU_WAVEFRONT_HH_
#define CPU_TESTERS_PROTOCOL_TESTER_GPU_WAVEFRONT_HH_

#include "cpu/testers/gpu_ruby_test/gpu_thread.hh"
#include "params/GpuWavefront.hh"
#include "sim/clocked_object.hh"

// A GpuWavefront is a GpuThread that injects GPU-side memory requests
// (loads, stores, atomics, and acquire/release fences) into the tester's
// CU data ports and validates the responses via hitCallback.
class GpuWavefront : public GpuThread
{
  public:
    typedef GpuWavefrontParams Params;
    GpuWavefront(const Params &p);
    virtual ~GpuWavefront();

    typedef AddressManager::Location Location;
    typedef AddressManager::Value Value;

    // process a memory response routed back from the tester's port
    virtual void hitCallback(PacketPtr pkt);

  protected:
    void issueLoadOps();
    void issueStoreOps();
    void issueAtomicOps();
    // acquire and release ops are protocol-specific, so their issue functions
    // may be redefined by a child class of GpuWavefront
    virtual void issueAcquireOp();
    virtual void issueReleaseOp();
    // set extra request flags that are specific to a target protocol;
    // the base implementation sets none
    virtual void setExtraRequestFlags(RequestPtr req);

  protected:
    int cuId; // compute unit associated with this wavefront
};

#endif /* CPU_TESTERS_PROTOCOL_TESTER_GPU_WAVEFRONT_HH_ */

View File

@@ -0,0 +1,312 @@
/*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "cpu/testers/gpu_ruby_test/protocol_tester.hh"
#include <algorithm>
#include <ctime>
#include <fstream>
#include <random>
#include "cpu/testers/gpu_ruby_test/cpu_thread.hh"
#include "cpu/testers/gpu_ruby_test/gpu_thread.hh"
#include "cpu/testers/gpu_ruby_test/gpu_wavefront.hh"
#include "debug/ProtocolTest.hh"
#include "mem/request.hh"
#include "sim/sim_exit.hh"
#include "sim/system.hh"
// Construct the tester: create one request port per connected CPU/CU
// interface, build the address manager that generates DRF request streams
// and validates responses, seed the RNG, and open the log file with a
// summary of the test configuration.
ProtocolTester::ProtocolTester(const Params &p)
      : ClockedObject(p),
        _requestorId(p.system->getRequestorId(this)),
        numCpuPorts(p.port_cpu_ports_connection_count),
        numVectorPorts(p.port_cu_vector_ports_connection_count),
        numSqcPorts(p.port_cu_sqc_ports_connection_count),
        numScalarPorts(p.port_cu_scalar_ports_connection_count),
        numCusPerSqc(p.cus_per_sqc),
        numCusPerScalar(p.cus_per_scalar),
        numWfsPerCu(p.wavefronts_per_cu),
        numWisPerWf(p.workitems_per_wavefront),
        numAtomicLocs(p.num_atomic_locations),
        numNormalLocsPerAtomic(p.num_normal_locs_per_atomic),
        episodeLength(p.episode_length),
        maxNumEpisodes(p.max_num_episodes),
        debugTester(p.debug_tester),
        cpuThreads(p.cpu_threads),
        wfs(p.wavefronts)
{
    int idx = 0;  // global port index, unique across all port vectors

    numCpus = numCpuPorts;     // 1 cpu port per CPU
    numCus = numVectorPorts;   // 1 vector port per CU

    // create all physical cpu's data ports
    for (int i = 0; i < numCpuPorts; ++i) {
        DPRINTF(ProtocolTest, "Creating %s\n",
                csprintf("%s-cpuPort%d", name(), i));
        cpuPorts.push_back(new SeqPort(csprintf("%s-cpuPort%d", name(), i),
                                       this, i, idx));
        idx++;
    }

    // create all physical gpu's data ports
    for (int i = 0; i < numVectorPorts; ++i) {
        DPRINTF(ProtocolTest, "Creating %s\n",
                csprintf("%s-cuVectorPort%d", name(), i));
        cuVectorPorts.push_back(new SeqPort(csprintf("%s-cuVectorPort%d",
                                                     name(), i),
                                            this, i, idx));
        idx++;
    }

    // scalar ports may be shared by multiple CUs (see numCusPerScalar)
    for (int i = 0; i < numScalarPorts; ++i) {
        DPRINTF(ProtocolTest, "Creating %s\n",
                csprintf("%s-cuScalarPort%d", name(), i));
        cuScalarPorts.push_back(new SeqPort(csprintf("%s-cuScalarPort%d",
                                                     name(), i),
                                            this, i, idx));
        idx++;
    }

    // SQC (instruction cache) ports may also be shared (see numCusPerSqc)
    for (int i = 0; i < numSqcPorts; ++i) {
        DPRINTF(ProtocolTest, "Creating %s\n",
                csprintf("%s-cuSqcPort%d", name(), i));
        cuSqcPorts.push_back(new SeqPort(csprintf("%s-cuSqcPort%d",
                                                  name(), i),
                                         this, i, idx));
        idx++;
    }

    // create an address manager
    addrManager = new AddressManager(numAtomicLocs,
                                     numNormalLocsPerAtomic);
    nextEpisodeId = 0;

    if (!debugTester)
        warn("Data race check is not enabled\n");

    sentExitSignal = false;

    // set random seed number; 0 means "seed from the current time"
    if (p.random_seed != 0) {
        srand(p.random_seed);
    } else {
        srand(time(NULL));
    }

    actionCount = 0;

    // create a new log file
    logFile = simout.create(p.log_file);
    assert(logFile);

    // print test configs
    std::stringstream ss;
    ss << "GPU Ruby test's configurations" << std::endl
       << "\tNumber of CPUs: " << numCpus << std::endl
       << "\tNumber of CUs: " << numCus << std::endl
       << "\tNumber of wavefronts per CU: " << numWfsPerCu << std::endl
       << "\tWavefront size: " << numWisPerWf << std::endl
       << "\tNumber of atomic locations: " << numAtomicLocs << std::endl
       << "\tNumber of non-atomic locations: "
       << numNormalLocsPerAtomic * numAtomicLocs << std::endl
       << "\tEpisode length: " << episodeLength << std::endl
       << "\tTest length (max number of episodes): " << maxNumEpisodes
       << std::endl
       << "\tRandom seed: " << p.random_seed
       << std::endl;

    ccprintf(*(logFile->stream()), "%s", ss.str());
    logFile->stream()->flush();
}
// Release all heap-allocated ports and the address manager, then close
// the tester's log file.
ProtocolTester::~ProtocolTester()
{
    for (auto* port : cpuPorts)
        delete port;
    for (auto* port : cuVectorPorts)
        delete port;
    for (auto* port : cuScalarPorts)
        delete port;
    for (auto* port : cuSqcPorts)
        delete port;

    delete addrManager;

    // close the log file
    simout.close(logFile);
}
void
ProtocolTester::init()
{
DPRINTF(ProtocolTest, "Attach threads to ports\n");
// connect cpu threads to cpu's ports
for (int cpu_id = 0; cpu_id < numCpus; ++cpu_id) {
cpuThreads[cpu_id]->attachGpuThreadToPorts(this,
static_cast<SeqPort*>(cpuPorts[cpu_id]));
cpuThreads[cpu_id]->scheduleWakeup();
cpuThreads[cpu_id]->scheduleDeadlockCheckEvent();
}
// connect gpu wavefronts to gpu's ports
int wfId = 0;
int vectorPortId = 0;
int sqcPortId = 0;
int scalarPortId = 0;
for (int cu_id = 0; cu_id < numCus; ++cu_id) {
vectorPortId = cu_id;
sqcPortId = cu_id/numCusPerSqc;
scalarPortId = cu_id/numCusPerScalar;
for (int i = 0; i < numWfsPerCu; ++i) {
wfId = cu_id * numWfsPerCu + i;
wfs[wfId]->attachGpuThreadToPorts(this,
static_cast<SeqPort*>(cuVectorPorts[vectorPortId]),
static_cast<SeqPort*>(cuSqcPorts[sqcPortId]),
static_cast<SeqPort*>(cuScalarPorts[scalarPortId]));
wfs[wfId]->scheduleWakeup();
wfs[wfId]->scheduleDeadlockCheckEvent();
}
}
}
// Resolve a named port of this tester.
//
// @param if_name one of "cpu_ports", "cu_vector_ports", "cu_sqc_ports",
//                "cu_scalar_ports"; anything else is forwarded to the
//                base class.
// @param idx     index within the named port vector.
// @return        reference to the requested port.
//
// Bug fix: the bounds checks previously used `idx > numXPorts`, which let
// `idx == numXPorts` slip through and index one element past the end of
// the corresponding port vector. They now use `>=`. The unreachable
// trailing `assert(false)` (every branch already returns) was removed.
Port&
ProtocolTester::getPort(const std::string &if_name, PortID idx)
{
    if (if_name != "cpu_ports" && if_name != "cu_vector_ports" &&
        if_name != "cu_sqc_ports" && if_name != "cu_scalar_ports") {
        // pass along to super class
        return ClockedObject::getPort(if_name, idx);
    } else {
        if (if_name == "cpu_ports") {
            if (idx >= numCpuPorts)
                panic("ProtocolTester: unknown cpu port %d\n", idx);
            return *cpuPorts[idx];
        } else if (if_name == "cu_vector_ports") {
            if (idx >= numVectorPorts)
                panic("ProtocolTester: unknown cu vect port %d\n", idx);
            return *cuVectorPorts[idx];
        } else if (if_name == "cu_sqc_ports") {
            if (idx >= numSqcPorts)
                panic("ProtocolTester: unknown cu sqc port %d\n", idx);
            return *cuSqcPorts[idx];
        } else {
            assert(if_name == "cu_scalar_ports");
            if (idx >= numScalarPorts)
                panic("ProtocolTester: unknown cu scal port %d\n", idx);
            return *cuScalarPorts[idx];
        }
    }
}
// Return true once all requested episodes have been issued. On the first
// such call, report the episode count and schedule the simulation exit.
bool
ProtocolTester::checkExit()
{
    // still below the episode budget: keep going
    if (nextEpisodeId <= maxNumEpisodes)
        return false;

    if (!sentExitSignal) {
        // all done
        inform("Total completed episodes: %d\n", nextEpisodeId - 1);
        exitSimLoop("GPU Ruby Tester: Passed!");
        sentExitSignal = true;
    }
    return true;
}
// Check whether picking location `loc` (guarded by `atomic_loc`) for a
// load or store would keep all active episodes data-race free. Always
// true when the tester is not in debug mode (no race checking).
bool
ProtocolTester::checkDRF(Location atomic_loc,
                         Location loc, bool isStore) const
{
    if (!debugTester)
        return true;

    // a single conflicting active episode makes the pick unsafe
    for (const GpuThread* wavefront : wfs) {
        if (!wavefront->checkDRF(atomic_loc, loc, isStore))
            return false;
    }
    for (const GpuThread* cpu_thread : cpuThreads) {
        if (!cpu_thread->checkDRF(atomic_loc, loc, isStore))
            return false;
    }

    return true;
}
void
ProtocolTester::dumpErrorLog(std::stringstream& ss)
{
if (!sentExitSignal) {
// go through all threads and dump their outstanding requests
for (auto t : cpuThreads) {
t->printAllOutstandingReqs(ss);
}
for (auto t : wfs) {
t->printAllOutstandingReqs(ss);
}
// dump error log into a file
assert(logFile);
ccprintf(*(logFile->stream()), "%s", ss.str());
logFile->stream()->flush();
sentExitSignal = true;
// terminate the simulation
panic("GPU Ruby Tester: Failed!\n");
}
}
// Route a timing response back to the thread that issued the request,
// which is recorded in the packet's sender state.
bool
ProtocolTester::SeqPort::recvTimingResp(PacketPtr pkt)
{
    auto* sender_state =
        safe_cast<ProtocolTester::SenderState*>(pkt->senderState);
    sender_state->th->hitCallback(pkt);
    return true;
}
// Factory hook called by the Python configuration system to instantiate
// a ProtocolTester from its generated parameter struct.
ProtocolTester*
ProtocolTesterParams::create() const
{
    return new ProtocolTester(*this);
}

View File

@@ -0,0 +1,178 @@
/*
* Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef CPU_TESTERS_PROTOCOL_TESTER_PROTOCOL_TESTER_HH_
#define CPU_TESTERS_PROTOCOL_TESTER_PROTOCOL_TESTER_HH_
/*
* The tester includes the main ProtocolTester that manages all ports to the
* memory system.
* GpuThreads are mapped to certain data port(s)
*
* GpuThreads inject memory requests through their data ports.
* The tester receives and validates responses from the memory.
*
* Main components
* - AddressManager: generate DRF request streams &
* validate data response against an internal log_table
* - Episode: a sequence of requests
* - Thread: either GPU wavefront or CPU thread
*/
#include <iostream>
#include <map>
#include <string>
#include <vector>
#include "base/types.hh"
#include "cpu/testers/gpu_ruby_test/address_manager.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubyPort.hh"
#include "params/ProtocolTester.hh"
class GpuThread;
class CpuThread;
class GpuWavefront;
// Top-level SimObject of the GPU Ruby tester. It owns all request ports
// into the Ruby memory system, hands them out to CPU threads and GPU
// wavefronts, and centralizes episode numbering, data-race-freedom checks,
// and error-log dumping.
class ProtocolTester : public ClockedObject
{
  public:
    // Request port that forwards timing responses back to the issuing
    // thread. Retries are not expected (threads throttle themselves),
    // so recvReqRetry panics.
    class SeqPort : public RequestPort
    {
      public:
        SeqPort(const std::string &_name, ProtocolTester *_tester, PortID _id,
                PortID _index)
            : RequestPort(_name, _tester, _id)
        {}

      protected:
        virtual bool recvTimingResp(PacketPtr pkt);
        virtual void recvReqRetry()
        { panic("%s does not expect a retry\n", name()); }
    };

    // Attached to every request packet so that the response can be routed
    // back to the GpuThread that issued the request.
    struct SenderState : public Packet::SenderState
    {
        GpuThread* th;  // issuing thread; never null (asserted below)

        SenderState(GpuThread* _th)
        {
            assert(_th);
            th = _th;
        }

        ~SenderState()
        {}
    };

  public:
    typedef ProtocolTesterParams Params;
    ProtocolTester(const Params &p);
    ~ProtocolTester();

    typedef AddressManager::Location Location;
    typedef AddressManager::Value Value;

    // attach threads to ports and schedule their initial events
    void init();
    // requestor ID used for all memory requests issued by this tester
    RequestorID requestorId() { return _requestorId; };
    Port& getPort(const std::string &if_name,
                  PortID idx=InvalidPortID) override;

    int getEpisodeLength() const { return episodeLength; }
    // return pointer to the address manager
    AddressManager* getAddressManager() const { return addrManager; }
    // return true if the tester should stop issuing new episodes
    bool checkExit();
    // verify if a location to be picked for LD/ST will satisfy
    // data race free requirement
    bool checkDRF(Location atomic_loc, Location loc, bool isStore) const;
    // return the next episode id and increment it
    int getNextEpisodeID() { return nextEpisodeId++; }
    // get action sequence number
    int getActionSeqNum() { return actionCount++; }
    // dump error log into a file and exit the simulation
    void dumpErrorLog(std::stringstream& ss);

  private:
    RequestorID _requestorId;
    // list of parameters taken from python scripts
    int numCpuPorts;
    int numVectorPorts;
    int numSqcPorts;
    int numScalarPorts;
    int numCusPerSqc;      // CUs sharing one SQC (inst cache) port
    int numCusPerScalar;   // CUs sharing one scalar cache port
    int numWfsPerCu;
    int numWisPerWf;
    // parameters controlling the address range that the tester can access
    int numAtomicLocs;
    int numNormalLocsPerAtomic;
    // the number of actions in an episode (episodeLength +- random number)
    int episodeLength;
    // the maximum number of episodes to be completed by this tester
    int maxNumEpisodes;
    // are we debugging the tester (enables the data-race-freedom checks)
    bool debugTester;
    // all available requestor ports connected to Ruby
    std::vector<RequestPort*> cpuPorts;      // cpu data ports
    std::vector<RequestPort*> cuVectorPorts; // ports to GPU vector cache
    std::vector<RequestPort*> cuSqcPorts;    // ports to GPU inst cache
    std::vector<RequestPort*> cuScalarPorts; // ports to GPU scalar cache
    // all CPU and GPU threads
    std::vector<CpuThread*> cpuThreads;
    std::vector<GpuWavefront*> wfs;
    // address manager that (1) generates DRF sequences of requests,
    // (2) manages an internal log table and
    // (3) validate response data
    AddressManager* addrManager;
    // number of CPUs and CUs
    int numCpus;
    int numCus;
    // unique id of the next episode
    int nextEpisodeId;
    // global action count. Overflow is fine. It's used to uniquely identify
    // per-wave & per-instruction memory requests in the coalescer
    int actionCount;
    // if an exit signal was already sent
    bool sentExitSignal;
    OutputStream* logFile;  // tester config + error report output
};
#endif /* CPU_TESTERS_PROTOCOL_TESTER_PROTOCOL_TESTER_HH_ */