diff --git a/configs/example/ruby_gpu_random_test.py b/configs/example/ruby_gpu_random_test.py
index ae4ff0bf12..133c13a4bb 100644
--- a/configs/example/ruby_gpu_random_test.py
+++ b/configs/example/ruby_gpu_random_test.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2020 Advanced Micro Devices, Inc.
+# Copyright (c) 2018-2021 Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # For use for simulation and test purposes only
@@ -81,6 +81,8 @@ parser.add_option("--random-seed", type="int", default=0,
                   help="Random seed number. Default value (i.e., 0) means \
                         using runtime-specific value")
 parser.add_option("--log-file", type="string", default="gpu-ruby-test.log")
+parser.add_option("--num-dmas", type="int", default=0,
+                  help="The number of DMA engines to use in tester config.")
 
 (options, args) = parser.parse_args()
 
@@ -112,6 +114,7 @@ if (options.system_size == "small"):
     options.wf_size = 1
     options.wavefronts_per_cu = 1
     options.num_cpus = 1
+    options.num_dmas = 1
     options.cu_per_sqc = 1
     options.cu_per_scalar_cache = 1
     options.num_compute_units = 1
@@ -120,6 +123,7 @@ elif (options.system_size == "medium"):
     options.wf_size = 16
     options.wavefronts_per_cu = 4
     options.num_cpus = 4
+    options.num_dmas = 2
     options.cu_per_sqc = 4
     options.cu_per_scalar_cache = 4
     options.num_compute_units = 4
@@ -128,6 +132,7 @@ elif (options.system_size == "large"):
     options.wf_size = 32
     options.wavefronts_per_cu = 4
     options.num_cpus = 4
+    options.num_dmas = 4
     options.cu_per_sqc = 4
     options.cu_per_scalar_cache = 4
     options.num_compute_units = 8
@@ -174,6 +179,9 @@ tester_deadlock_threshold = 1e9
 # For now we're testing only GPU protocol, so we force num_cpus to be 0
 options.num_cpus = 0
 
+# Number of DMA engines
+n_DMAs = options.num_dmas
+
 # Number of CUs
 n_CUs = options.num_compute_units
 
@@ -229,10 +237,20 @@ system.clk_domain = SrcClockDomain(clock = options.sys_clock,
 #
 options.num_cp = 0
 
+# Make generic DMA devices for Ruby to use. Build each one separately:
+# "[TesterDma()] * n" would alias a single SimObject n times.
+# All devices hang off one shared PIO crossbar.
+dma_devices = [TesterDma() for _ in range(n_DMAs)]
+system.piobus = IOXBar()
+for dma_device in dma_devices:
+    dma_device.pio = system.piobus.mem_side_ports
+system.dma_devices = dma_devices
+
 #
 # Create the Ruby system
 #
-Ruby.create_system(options, False, system)
+Ruby.create_system(options = options, full_system = False,
+                   system = system, dma_ports = system.dma_devices)
 
 #
 # The tester is most effective when randomization is turned on and
@@ -256,6 +274,7 @@ print("Attaching ruby ports to the tester")
 for i, ruby_port in enumerate(system.ruby._cpu_ports):
     ruby_port.no_retry_on_stall = True
     ruby_port.using_ruby_tester = True
+    ruby_port.mem_request_port = system.piobus.cpu_side_ports
 
     if i < n_CUs:
         tester.cu_vector_ports = ruby_port.in_ports
@@ -269,17 +288,45 @@ for i, ruby_port in enumerate(system.ruby._cpu_ports):
     i += 1
 
 #
-# No CPU threads are needed for GPU tester
+# Attach DMA ports. Since Ruby.py doesn't return these they need to be found.
+# Connect tester's request port to each DMA sequencer's in_ports. This assumes
+# the protocol names these system.dma_cntrl<#>.
+#
+dma_ports = []
+for i in range(n_DMAs):
+    dma_cntrl = getattr(system, 'dma_cntrl' + str(i))
+    dma_ports.append(dma_cntrl.dma_sequencer.in_ports)
+tester.dma_ports = dma_ports
+
+#
+# Common variables for all types of threads
+#
+thread_clock = SrcClockDomain(clock = '1GHz',
+                              voltage_domain = system.voltage_domain)
+g_thread_idx = 0
+
+#
+# No CPU threads are used for GPU tester
 #
 tester.cpu_threads = []
 
+#
+# Create DMA threads
+#
+dma_threads = []
+print("Creating %i DMAs" % n_DMAs)
+for dma_idx in range(n_DMAs):
+    dma_threads.append(DmaThread(thread_id = g_thread_idx,
+                                 num_lanes = 1, clk_domain = thread_clock,
+                                 deadlock_threshold = \
+                                         tester_deadlock_threshold))
+    g_thread_idx += 1
+tester.dma_threads = dma_threads
+
 #
 # Create GPU wavefronts
 #
-thread_clock = SrcClockDomain(clock = '1GHz',
-                              voltage_domain = system.voltage_domain)
 wavefronts = []
-g_thread_idx = 0
 print("Creating %i WFs attached to %i CUs" % \
                 (n_CUs * tester.wavefronts_per_cu, n_CUs))
 for cu_idx in range(n_CUs):
diff --git a/src/cpu/testers/gpu_ruby_test/DmaThread.py b/src/cpu/testers/gpu_ruby_test/DmaThread.py
new file mode 100644
index 0000000000..570c6ae399
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/DmaThread.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2021 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+from m5.params import *
+from m5.proxy import *
+
+from m5.objects.TesterThread import TesterThread
+
+class DmaThread(TesterThread):  # DMA flavour of the tester thread
+    type = 'DmaThread'  # C++ class name; declared in the header below
+    cxx_header = "cpu/testers/gpu_ruby_test/dma_thread.hh"
diff --git a/src/cpu/testers/gpu_ruby_test/ProtocolTester.py b/src/cpu/testers/gpu_ruby_test/ProtocolTester.py
index ed0e0a88ee..a1b55c866e 100644
--- a/src/cpu/testers/gpu_ruby_test/ProtocolTester.py
+++ b/src/cpu/testers/gpu_ruby_test/ProtocolTester.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2020 Advanced Micro Devices, Inc.
+# Copyright (c) 2017-2021 Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # For use for simulation and test purposes only
@@ -38,6 +38,7 @@ class ProtocolTester(ClockedObject):
     cxx_header = "cpu/testers/gpu_ruby_test/protocol_tester.hh"
 
     cpu_ports = VectorRequestPort("Ports for CPUs")
+    dma_ports = VectorRequestPort("Ports for DMAs")
     cu_vector_ports = VectorRequestPort("Vector ports for GPUs")
     cu_sqc_ports = VectorRequestPort("SQC ports for GPUs")
     cu_scalar_ports = VectorRequestPort("Scalar ports for GPUs")
@@ -55,6 +56,7 @@ class ProtocolTester(ClockedObject):
                                  " coalescer.")
 
     cpu_threads = VectorParam.CpuThread("All cpus")
+    dma_threads = VectorParam.DmaThread("All DMAs")
     wavefronts = VectorParam.GpuWavefront("All wavefronts")
 
     num_atomic_locations = Param.Int(2, "Number of atomic locations")
diff --git a/src/cpu/testers/gpu_ruby_test/SConscript b/src/cpu/testers/gpu_ruby_test/SConscript
index 28c8006452..5dcfbcbc09 100644
--- a/src/cpu/testers/gpu_ruby_test/SConscript
+++ b/src/cpu/testers/gpu_ruby_test/SConscript
@@ -41,13 +41,16 @@ if env['PROTOCOL'] == 'None':
 
 SimObject('ProtocolTester.py')
 SimObject('CpuThread.py')
+SimObject('DmaThread.py')
 SimObject('GpuWavefront.py')
 SimObject('TesterThread.py')
+SimObject('TesterDma.py')
 
 Source('address_manager.cc')
 Source('episode.cc')
 Source('protocol_tester.cc')
 Source('cpu_thread.cc')
+Source('dma_thread.cc')
 Source('gpu_wavefront.cc')
 Source('tester_thread.cc')
 
diff --git a/src/cpu/testers/gpu_ruby_test/TesterDma.py b/src/cpu/testers/gpu_ruby_test/TesterDma.py
new file mode 100644
index 0000000000..2f669c069d
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/TesterDma.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2021 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from this
+# software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+from m5.objects.Device import DmaDevice
+
+class TesterDma(DmaDevice):  # placeholder DMA device used by the tester
+    type = 'TesterDma'  # C++ class name; declared in the header below
+    cxx_header = "cpu/testers/gpu_ruby_test/tester_dma.hh"
diff --git a/src/cpu/testers/gpu_ruby_test/dma_thread.cc b/src/cpu/testers/gpu_ruby_test/dma_thread.cc
new file mode 100644
index 0000000000..254158dc00
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/dma_thread.cc
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "cpu/testers/gpu_ruby_test/dma_thread.hh"
+
+#include "debug/ProtocolTest.hh"
+
+DmaThread::DmaThread(const Params& _params)
+    : TesterThread(_params)
+{
+    threadName = "DmaThread(Thread ID " + std::to_string(threadId) + ")";
+    threadEvent.setDesc("DmaThread tick");
+    assert(numLanes == 1);  // DMA threads are scalar: exactly one lane
+}
+
+DmaThread::~DmaThread()
+{
+    // intentionally empty
+}
+
+DmaThread*
+DmaThreadParams::create() const
+{
+    return new DmaThread(*this);  // build the thread from these params
+}
+
+void
+DmaThread::issueLoadOps()  // send a timing ReadReq for the current LOAD
+{
+    assert(curAction);
+    assert(curAction->getType() == Episode::Action::Type::LOAD);
+    // we should not have any outstanding fence or atomic op at this point
+    assert(pendingFenceCount == 0);
+    assert(pendingAtomicCount == 0);
+
+    // DMA thread is a scalar thread so always set lane to zero. This allows
+    // us to reuse the API for GPU threads rather than have a specific API
+    // for scalar tester threads
+    int lane = 0;
+
+    Location location = curAction->getLocation(lane);
+    assert(location >= AddressManager::INVALID_LOCATION);
+    // a negative location means there is nothing to load for this action
+    if (location >= 0) {
+        Addr address = addrManager->getAddress(location);
+        DPRINTF(ProtocolTest, "%s Episode %d: Issuing Load - Addr %s\n",
+                this->getName(), curEpisode->getEpisodeId(),
+                printAddress(address));
+
+        int load_size = sizeof(Value);
+
+        // for now, assert address is aligned to the load size
+        assert(address % load_size == 0);
+
+        auto req = std::make_shared<Request>(address, load_size,
+                                             0, tester->requestorId(),
+                                             0, threadId, nullptr);
+        req->setPaddr(address);
+        req->setReqInstSeqNum(tester->getActionSeqNum());
+        // attach a fresh data buffer and tag the packet with this thread
+        PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
+        uint8_t* data = new uint8_t[load_size];
+        pkt->dataDynamic(data);
+        pkt->senderState = new ProtocolTester::SenderState(this);
+        // a rejected timing request is treated as fatal (no retry path)
+        if (!port->sendTimingReq(pkt)) {
+            panic("Not expected failed sendTimingReq\n");
+        }
+
+        // insert an outstanding load
+        addOutstandingReqs(outstandingLoads, address, lane, location);
+
+        // increment the number of outstanding ld_st requests
+        pendingLdStCount++;
+    }
+}
+
+void
+DmaThread::issueStoreOps()  // send a timing WriteReq for the current STORE
+{
+    assert(curAction);
+    assert(curAction->getType() == Episode::Action::Type::STORE);
+    // we should not have any outstanding fence or atomic op at this point
+    assert(pendingFenceCount == 0);
+    assert(pendingAtomicCount == 0);
+
+    // DMA thread is a scalar thread so always set lane to zero. This allows
+    // us to reuse the API for GPU threads rather than have a specific API
+    // for scalar tester threads
+    int lane = 0;
+
+    Location location = curAction->getLocation(lane);
+    assert(location >= AddressManager::INVALID_LOCATION);
+    // a negative location means there is nothing to store for this action
+    if (location >= 0) {
+        // prepare the next value to store: last logged value plus one
+        Value new_value = addrManager->getLoggedValue(location) + 1;
+
+        Addr address = addrManager->getAddress(location);
+        // must be aligned with store size
+        assert(address % sizeof(Value) == 0);
+
+        DPRINTF(ProtocolTest, "%s Episode %d: Issuing Store - Addr %s - "
+                "Value %d\n", this->getName(),
+                curEpisode->getEpisodeId(), printAddress(address),
+                new_value);
+
+        auto req = std::make_shared<Request>(address, sizeof(Value),
+                                             0, tester->requestorId(), 0,
+                                             threadId, nullptr);
+        req->setPaddr(address);
+        req->setReqInstSeqNum(tester->getActionSeqNum());
+        // copy new_value byte by byte into a fresh buffer for the packet
+        PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
+        uint8_t *writeData = new uint8_t[sizeof(Value)];
+        for (int j = 0; j < sizeof(Value); ++j) {
+            writeData[j] = ((uint8_t*)&new_value)[j];
+        }
+        pkt->dataDynamic(writeData);
+        pkt->senderState = new ProtocolTester::SenderState(this);
+        // a rejected timing request is treated as fatal (no retry path)
+        if (!port->sendTimingReq(pkt)) {
+            panic("Not expecting a failed sendTimingReq\n");
+        }
+
+        // add an outstanding store; new_value is kept for the log update
+        addOutstandingReqs(outstandingStores, address, lane, location,
+                           new_value);
+
+        // increment the number of outstanding ld_st requests
+        pendingLdStCount++;
+    }
+}
+
+void
+DmaThread::issueAtomicOps()
+{
+    DPRINTF(ProtocolTest, "Issuing Atomic Op ...\n");
+
+    assert(curAction);
+    assert(curAction->getType() == Episode::Action::Type::ATOMIC);
+    // we should not have any outstanding ops at this point
+    assert(pendingFenceCount == 0);
+    assert(pendingLdStCount == 0);
+    assert(pendingAtomicCount == 0);
+    // no-op: No DMA protocol exists with Atomics, so no request is sent
+    // and pendingAtomicCount is left untouched
+}
+
+void
+DmaThread::issueAcquireOp()
+{
+    DPRINTF(ProtocolTest, "Issuing Acquire Op ...\n");
+
+    assert(curAction);
+    assert(curAction->getType() == Episode::Action::Type::ACQUIRE);
+    // we should not have any outstanding ops at this point
+    assert(pendingFenceCount == 0);
+    assert(pendingLdStCount == 0);
+    assert(pendingAtomicCount == 0);
+    // no-op: Acquire does not apply to DMA threads, so no request is sent
+    // and pendingFenceCount is left untouched
+}
+
+void
+DmaThread::issueReleaseOp()
+{
+    DPRINTF(ProtocolTest, "Issuing Release Op ...\n");
+
+    assert(curAction);
+    assert(curAction->getType() == Episode::Action::Type::RELEASE);
+    // we should not have any outstanding ops at this point
+    assert(pendingFenceCount == 0);
+    assert(pendingLdStCount == 0);
+    assert(pendingAtomicCount == 0);
+    // no-op: Release does not apply to DMA threads, so no request is sent
+    // and pendingFenceCount is left untouched
+}
+
+void
+DmaThread::hitCallback(PacketPtr pkt)  // consume one memory response
+{
+    assert(pkt);
+    MemCmd resp_cmd = pkt->cmd;
+    Addr addr = pkt->getAddr();
+
+    DPRINTF(ProtocolTest, "%s Episode %d: hitCallback - Command %s -"
+            " Addr %s\n", this->getName(), curEpisode->getEpisodeId(),
+            resp_cmd.toString(), printAddress(addr));
+
+    if (resp_cmd == MemCmd::SwapResp) {
+        // response to a pending atomic; issueAtomicOps() never sends one
+        assert(pendingAtomicCount > 0);
+        assert(pendingLdStCount == 0);
+        assert(outstandingAtomics.count(addr) > 0);
+
+        // get return data
+        Value value = *(pkt->getPtr<Value>());
+
+        // validate atomic op return
+        OutstandingReq req = popOutstandingReq(outstandingAtomics, addr);
+        assert(req.lane == 0);
+        validateAtomicResp(req.origLoc, req.lane, value);
+
+        // update log table
+        addrManager->updateLogTable(req.origLoc, threadId,
+                                    curEpisode->getEpisodeId(), value,
+                                    curTick(),
+                                    0);
+
+        // this Atomic is done
+        pendingAtomicCount--;
+    } else if (resp_cmd == MemCmd::ReadResp) {
+        // response to a pending read
+        assert(pendingLdStCount > 0);
+        assert(pendingAtomicCount == 0);
+        assert(outstandingLoads.count(addr) > 0);
+
+        // get return data and check it against the expected value
+        Value value = *(pkt->getPtr<Value>());
+        OutstandingReq req = popOutstandingReq(outstandingLoads, addr);
+        assert(req.lane == 0);
+        validateLoadResp(req.origLoc, req.lane, value);
+
+        // this Read is done
+        pendingLdStCount--;
+    } else if (resp_cmd == MemCmd::WriteResp) {
+        // response to a pending write
+        assert(pendingLdStCount > 0);
+        assert(pendingAtomicCount == 0);
+
+        // no need to validate Write response
+        // just pop it from the outstanding req table so that subsequent
+        // requests dependent on this write can proceed
+        // note that unlike GpuWavefront we do decrement pendingLdStCount here
+        // since the write is guaranteed to be completed in downstream memory.
+        assert(outstandingStores.count(addr) > 0);
+        OutstandingReq req = popOutstandingReq(outstandingStores, addr);
+        assert(req.storedValue != AddressManager::INVALID_VALUE);
+
+        // update log table with the value recorded when the store was issued
+        addrManager->updateLogTable(req.origLoc, threadId,
+                                    curEpisode->getEpisodeId(),
+                                    req.storedValue,
+                                    curTick(),
+                                    0);
+
+        // the Write is now done
+        pendingLdStCount--;
+    } else {
+        panic("UnsupportedMemCmd response type: %s",
+              resp_cmd.toString().c_str());
+    }
+
+    delete pkt->senderState;  // allocated when the request was issued
+    delete pkt;
+
+    // record the last active cycle to check for deadlock
+    lastActiveCycle = curCycle();
+
+    // we may be able to issue an action. Let's check
+    if (!threadEvent.scheduled()) {
+        scheduleWakeup();
+    }
+}
diff --git a/src/cpu/testers/gpu_ruby_test/dma_thread.hh b/src/cpu/testers/gpu_ruby_test/dma_thread.hh
new file mode 100644
index 0000000000..1b6fd2b576
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/dma_thread.hh
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef CPU_TESTERS_PROTOCOL_TESTER_DMATHREAD_HH_
+#define CPU_TESTERS_PROTOCOL_TESTER_DMATHREAD_HH_
+
+#include "cpu/testers/gpu_ruby_test/tester_thread.hh"
+#include "params/DmaThread.hh"
+
+class DmaThread : public TesterThread
+{
+  public:
+    typedef DmaThreadParams Params;
+    DmaThread(const Params& _params);
+    virtual ~DmaThread();
+
+    typedef AddressManager::Location Location;
+    typedef AddressManager::Value Value;
+    // handle a read/write (or atomic) response for a request of this thread
+    void hitCallback(PacketPtr pkt);
+
+  protected:
+    void issueLoadOps();
+    void issueStoreOps();
+    void issueAtomicOps();  // no-op for DMA threads
+    void issueAcquireOp();  // no-op for DMA threads
+    void issueReleaseOp();  // no-op for DMA threads
+};
+
+#endif /* CPU_TESTERS_PROTOCOL_TESTER_DMATHREAD_HH_ */
diff --git a/src/cpu/testers/gpu_ruby_test/protocol_tester.cc b/src/cpu/testers/gpu_ruby_test/protocol_tester.cc
index dd50bb26e8..a8f84081c4 100644
--- a/src/cpu/testers/gpu_ruby_test/protocol_tester.cc
+++ b/src/cpu/testers/gpu_ruby_test/protocol_tester.cc
@@ -39,6 +39,7 @@
 #include <random>
 
 #include "cpu/testers/gpu_ruby_test/cpu_thread.hh"
+#include "cpu/testers/gpu_ruby_test/dma_thread.hh"
 #include "cpu/testers/gpu_ruby_test/gpu_wavefront.hh"
 #include "cpu/testers/gpu_ruby_test/tester_thread.hh"
 #include "debug/ProtocolTest.hh"
@@ -50,6 +51,7 @@ ProtocolTester::ProtocolTester(const Params &p)
       : ClockedObject(p),
         _requestorId(p.system->getRequestorId(this)),
         numCpuPorts(p.port_cpu_ports_connection_count),
+        numDmaPorts(p.port_dma_ports_connection_count),
         numVectorPorts(p.port_cu_vector_ports_connection_count),
         numSqcPorts(p.port_cu_sqc_ports_connection_count),
         numScalarPorts(p.port_cu_scalar_ports_connection_count),
@@ -65,11 +67,13 @@ ProtocolTester::ProtocolTester(const Params &p)
         maxNumEpisodes(p.max_num_episodes),
         debugTester(p.debug_tester),
         cpuThreads(p.cpu_threads),
+        dmaThreads(p.dma_threads),
         wfs(p.wavefronts)
 {
     int idx = 0;  // global port index
 
     numCpus = numCpuPorts;     // 1 cpu port per CPU
+    numDmas = numDmaPorts;     // 1 dma port per DMA
     numCus = numVectorPorts;   // 1 vector port per CU
 
     // create all physical cpu's data ports
@@ -81,6 +85,15 @@ ProtocolTester::ProtocolTester(const Params &p)
         idx++;
     }
 
+    // create all physical DMA data ports
+    for (int i = 0; i < numDmaPorts; ++i) {
+        DPRINTF(ProtocolTest, "Creating %s\n",
+                csprintf("%s-dmaPort%d", name(), i));
+        dmaPorts.push_back(new SeqPort(csprintf("%s-dmaPort%d", name(), i),
+                                       this, i, idx));
+        idx++;
+    }
+
     // create all physical gpu's data ports
     for (int i = 0; i < numVectorPorts; ++i) {
         DPRINTF(ProtocolTest, "Creating %s\n",
@@ -144,6 +157,7 @@ ProtocolTester::ProtocolTester(const Params &p)
     std::stringstream ss;
     ss << "GPU Ruby test's configurations" << std::endl
        << "\tNumber of CPUs: " << numCpus << std::endl
+       << "\tNumber of DMAs: " << numDmas << std::endl
        << "\tNumber of CUs: " << numCus << std::endl
        << "\tNumber of wavefronts per CU: " << numWfsPerCu << std::endl
        << "\tWavefront size: " << numWisPerWf << std::endl
@@ -164,6 +178,8 @@ ProtocolTester::~ProtocolTester()
 {
     for (int i = 0; i < cpuPorts.size(); ++i)
         delete cpuPorts[i];
+    for (int i = 0; i < dmaPorts.size(); ++i)
+        delete dmaPorts[i];
     for (int i = 0; i < cuVectorPorts.size(); ++i)
         delete cuVectorPorts[i];
     for (int i = 0; i < cuScalarPorts.size(); ++i)
@@ -189,6 +205,14 @@ ProtocolTester::init()
         cpuThreads[cpu_id]->scheduleDeadlockCheckEvent();
     }
 
+    // connect dma threads to dma's ports
+    for (int dma_id = 0; dma_id < numDmas; ++dma_id) {
+        dmaThreads[dma_id]->attachTesterThreadToPorts(this,
+                                      static_cast<SeqPort*>(dmaPorts[dma_id]));
+        dmaThreads[dma_id]->scheduleWakeup();
+        dmaThreads[dma_id]->scheduleDeadlockCheckEvent();
+    }
+
     // connect gpu wavefronts to gpu's ports
     int wfId = 0;
     int vectorPortId = 0;
@@ -216,9 +240,9 @@ ProtocolTester::init()
 Port&
 ProtocolTester::getPort(const std::string &if_name, PortID idx)
 {
-    if (if_name != "cpu_ports" && if_name != "cu_vector_ports" &&
-        if_name != "cu_sqc_ports" && if_name != "cu_scalar_ports" &&
-        if_name != "cu_token_ports") {
+    if (if_name != "cpu_ports" && if_name != "dma_ports" &&
+        if_name != "cu_vector_ports" && if_name != "cu_sqc_ports" &&
+        if_name != "cu_scalar_ports" && if_name != "cu_token_ports") {
         // pass along to super class
         return ClockedObject::getPort(if_name, idx);
     } else {
@@ -226,6 +250,10 @@ ProtocolTester::getPort(const std::string &if_name, PortID idx)
             if (idx > numCpuPorts)
                 panic("ProtocolTester: unknown cpu port %d\n", idx);
             return *cpuPorts[idx];
+        } else if (if_name == "dma_ports") {
+            if (idx < 0 || idx >= numDmaPorts)  // idx is 0-based
+                panic("ProtocolTester: unknown dma port %d\n", idx);
+            return *dmaPorts[idx];
         } else if (if_name == "cu_vector_ports") {
             if (idx > numVectorPorts)
                 panic("ProtocolTester: unknown cu vect port %d\n", idx);
@@ -279,6 +307,11 @@ ProtocolTester::checkDRF(Location atomic_loc,
             if (!th->checkDRF(atomic_loc, loc, isStore))
                 return false;
         }
+
+        for (const TesterThread* th : dmaThreads) {
+            if (!th->checkDRF(atomic_loc, loc, isStore))
+                return false;
+        }
     }
 
     return true;
@@ -293,6 +326,10 @@ ProtocolTester::dumpErrorLog(std::stringstream& ss)
             t->printAllOutstandingReqs(ss);
         }
 
+        for (auto t : dmaThreads) {
+            t->printAllOutstandingReqs(ss);
+        }
+
         for (auto t : wfs) {
             t->printAllOutstandingReqs(ss);
         }
diff --git a/src/cpu/testers/gpu_ruby_test/protocol_tester.hh b/src/cpu/testers/gpu_ruby_test/protocol_tester.hh
index 3ecc06780a..57e9d1887b 100644
--- a/src/cpu/testers/gpu_ruby_test/protocol_tester.hh
+++ b/src/cpu/testers/gpu_ruby_test/protocol_tester.hh
@@ -143,6 +143,7 @@ class ProtocolTester : public ClockedObject
 
     // list of parameters taken from python scripts
     int numCpuPorts;
+    int numDmaPorts;
     int numVectorPorts;
     int numSqcPorts;
     int numScalarPorts;
@@ -164,13 +165,15 @@ class ProtocolTester : public ClockedObject
 
     // all available requestor ports connected to Ruby
     std::vector<RequestPort*> cpuPorts;      // cpu data ports
+    std::vector<RequestPort*> dmaPorts;      // DMA data ports
     std::vector<RequestPort*> cuVectorPorts; // ports to GPU vector cache
     std::vector<RequestPort*> cuSqcPorts;    // ports to GPU inst cache
     std::vector<RequestPort*> cuScalarPorts; // ports to GPU scalar cache
     std::vector<TokenManager*> cuTokenManagers;
     std::vector<GMTokenPort*> cuTokenPorts;
-    // all CPU and GPU threads
+    // all CPU, DMA, and GPU threads
     std::vector<CpuThread*> cpuThreads;
+    std::vector<DmaThread*> dmaThreads;
     std::vector<GpuWavefront*> wfs;
 
     // address manager that (1) generates DRF sequences of requests,
@@ -180,6 +183,7 @@ class ProtocolTester : public ClockedObject
 
     // number of CPUs and CUs
     int numCpus;
+    int numDmas;
     int numCus;
     // unique id of the next episode
     int nextEpisodeId;
diff --git a/src/cpu/testers/gpu_ruby_test/tester_dma.hh b/src/cpu/testers/gpu_ruby_test/tester_dma.hh
new file mode 100644
index 0000000000..772745b939
--- /dev/null
+++ b/src/cpu/testers/gpu_ruby_test/tester_dma.hh
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This is a fake DMA device to pass to Ruby.py so it will create the DMA
+ * sequencers and controllers without being protocol specific. It otherwise
+ * does nothing.
+ */
+
+#ifndef __CPU_TESTERS_GPU_RUBY_TEST_TESTER_DMA_HH__
+#define __CPU_TESTERS_GPU_RUBY_TEST_TESTER_DMA_HH__
+
+#include "dev/dma_device.hh"
+#include "params/TesterDma.hh"
+
+class TesterDma : public DmaDevice
+{
+  public:
+    typedef TesterDmaParams Params;
+    TesterDma(const Params &p) : DmaDevice(p) { }
+    virtual ~TesterDma() { }
+
+    // The tester does not use a huge memory range. The range itself is
+    // chosen arbitrarily.
+    AddrRangeList
+    getAddrRanges() const override
+    {
+        AddrRangeList ranges;
+        ranges.push_back(RangeSize(0, 0xc0000000));
+        return ranges;
+    }
+
+    // These latencies are not important. Return any integer.
+    Tick read(PacketPtr) override { return 10; }
+    Tick write(PacketPtr) override { return 10; }
+};
+
+#endif /* __CPU_TESTERS_GPU_RUBY_TEST_TESTER_DMA_HH__ */
diff --git a/src/cpu/testers/gpu_ruby_test/tester_thread.cc b/src/cpu/testers/gpu_ruby_test/tester_thread.cc
index 9d12489729..0164b5e81f 100644
--- a/src/cpu/testers/gpu_ruby_test/tester_thread.cc
+++ b/src/cpu/testers/gpu_ruby_test/tester_thread.cc
@@ -159,6 +159,10 @@ TesterThread::isNextActionReady()
     } else {
         curAction = curEpisode->peekCurAction();
 
+        // Only GPU wavefront threads have a token port. For all other types
+        // of threads evaluate to true.
+        bool haveTokens = tokenPort ? tokenPort->haveTokens(numLanes) : true;
+
         switch(curAction->getType()) {
             case Episode::Action::Type::ATOMIC:
                 // an atomic action must wait for all previous requests
@@ -166,7 +170,7 @@ TesterThread::isNextActionReady()
                 if (pendingLdStCount == 0 &&
                     pendingFenceCount == 0 &&
                     pendingAtomicCount == 0 &&
-                    tokenPort->haveTokens(numLanes)) {
+                    haveTokens) {
                     return true;
                 }
 
@@ -201,8 +205,7 @@ TesterThread::isNextActionReady()
                 assert(pendingAtomicCount == 0);
 
                 // can't issue if there is a pending fence
-                if (pendingFenceCount > 0 ||
-                    !tokenPort->haveTokens(numLanes)) {
+                if (pendingFenceCount > 0 || !haveTokens) {
                     return false;
                 }
 
@@ -245,7 +248,9 @@ TesterThread::issueNextAction()
 {
     switch(curAction->getType()) {
         case Episode::Action::Type::ATOMIC:
-            tokenPort->acquireTokens(numLanes);
+            if (tokenPort) {
+                tokenPort->acquireTokens(numLanes);
+            }
             issueAtomicOps();
             break;
         case Episode::Action::Type::ACQUIRE:
@@ -255,11 +260,15 @@ TesterThread::issueNextAction()
             issueReleaseOp();
             break;
         case Episode::Action::Type::LOAD:
-            tokenPort->acquireTokens(numLanes);
+            if (tokenPort) {
+                tokenPort->acquireTokens(numLanes);
+            }
             issueLoadOps();
             break;
         case Episode::Action::Type::STORE:
-            tokenPort->acquireTokens(numLanes);
+            if (tokenPort) {
+                tokenPort->acquireTokens(numLanes);
+            }
             issueStoreOps();
             break;
         default: