diff --git a/configs/example/ruby_gpu_random_test.py b/configs/example/ruby_gpu_random_test.py index ae4ff0bf12..133c13a4bb 100644 --- a/configs/example/ruby_gpu_random_test.py +++ b/configs/example/ruby_gpu_random_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020 Advanced Micro Devices, Inc. +# Copyright (c) 2018-2021 Advanced Micro Devices, Inc. # All rights reserved. # # For use for simulation and test purposes only @@ -81,6 +81,8 @@ parser.add_option("--random-seed", type="int", default=0, help="Random seed number. Default value (i.e., 0) means \ using runtime-specific value") parser.add_option("--log-file", type="string", default="gpu-ruby-test.log") +parser.add_option("--num-dmas", type="int", default=0, + help="The number of DMA engines to use in tester config.") (options, args) = parser.parse_args() @@ -112,6 +114,7 @@ if (options.system_size == "small"): options.wf_size = 1 options.wavefronts_per_cu = 1 options.num_cpus = 1 + options.num_dmas = 1 options.cu_per_sqc = 1 options.cu_per_scalar_cache = 1 options.num_compute_units = 1 @@ -120,6 +123,7 @@ elif (options.system_size == "medium"): options.wf_size = 16 options.wavefronts_per_cu = 4 options.num_cpus = 4 + options.num_dmas = 2 options.cu_per_sqc = 4 options.cu_per_scalar_cache = 4 options.num_compute_units = 4 @@ -128,6 +132,7 @@ elif (options.system_size == "large"): options.wf_size = 32 options.wavefronts_per_cu = 4 options.num_cpus = 4 + options.num_dmas = 4 options.cu_per_sqc = 4 options.cu_per_scalar_cache = 4 options.num_compute_units = 8 @@ -174,6 +179,9 @@ tester_deadlock_threshold = 1e9 # For now we're testing only GPU protocol, so we force num_cpus to be 0 options.num_cpus = 0 +# Number of DMA engines +n_DMAs = options.num_dmas + # Number of CUs n_CUs = options.num_compute_units @@ -229,10 +237,20 @@ system.clk_domain = SrcClockDomain(clock = options.sys_clock, # options.num_cp = 0 +# +# Make generic DMA sequencer for Ruby to use +# +dma_devices = [TesterDma() for _ in range(n_DMAs)] +system.piobus = 
IOXBar() +for dma_device in dma_devices: + dma_device.pio = system.piobus.mem_side_ports +system.dma_devices = dma_devices + # # Create the Ruby system # -Ruby.create_system(options, False, system) +Ruby.create_system(options = options, full_system = False, + system = system, dma_ports = system.dma_devices) # # The tester is most effective when randomization is turned on and @@ -256,6 +274,7 @@ print("Attaching ruby ports to the tester") for i, ruby_port in enumerate(system.ruby._cpu_ports): ruby_port.no_retry_on_stall = True ruby_port.using_ruby_tester = True + ruby_port.mem_request_port = system.piobus.cpu_side_ports if i < n_CUs: tester.cu_vector_ports = ruby_port.in_ports @@ -269,17 +288,45 @@ for i, ruby_port in enumerate(system.ruby._cpu_ports): i += 1 # -# No CPU threads are needed for GPU tester +# Attach DMA ports. Since Ruby.py doesn't return these they need to be found. +# Connect tester's request port to each DMA sequencer's in_ports. This assumes +# the protocol names these system.dma_cntrl<#>. 
+# +dma_ports = [] +for i in range(n_DMAs): + dma_cntrl = getattr(system, 'dma_cntrl' + str(i)) + dma_ports.append(dma_cntrl.dma_sequencer.in_ports) +tester.dma_ports = dma_ports + +# +# Common variables for all types of threads +# +thread_clock = SrcClockDomain(clock = '1GHz', + voltage_domain = system.voltage_domain) +g_thread_idx = 0 + +# +# No CPU threads are used for GPU tester # tester.cpu_threads = [] +# +# Create DMA threads +# +dma_threads = [] +print("Creating %i DMAs" % n_DMAs) +for dma_idx in range(n_DMAs): + dma_threads.append(DmaThread(thread_id = g_thread_idx, + num_lanes = 1, clk_domain = thread_clock, + deadlock_threshold = \ + tester_deadlock_threshold)) + g_thread_idx += 1 +tester.dma_threads = dma_threads + # # Create GPU wavefronts # -thread_clock = SrcClockDomain(clock = '1GHz', - voltage_domain = system.voltage_domain) wavefronts = [] -g_thread_idx = 0 print("Creating %i WFs attached to %i CUs" % \ (n_CUs * tester.wavefronts_per_cu, n_CUs)) for cu_idx in range(n_CUs): diff --git a/src/cpu/testers/gpu_ruby_test/DmaThread.py b/src/cpu/testers/gpu_ruby_test/DmaThread.py new file mode 100644 index 0000000000..570c6ae399 --- /dev/null +++ b/src/cpu/testers/gpu_ruby_test/DmaThread.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. 
Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * + +from m5.objects.TesterThread import TesterThread + +class DmaThread(TesterThread): + type = 'DmaThread' + cxx_header = "cpu/testers/gpu_ruby_test/dma_thread.hh" diff --git a/src/cpu/testers/gpu_ruby_test/ProtocolTester.py b/src/cpu/testers/gpu_ruby_test/ProtocolTester.py index ed0e0a88ee..a1b55c866e 100644 --- a/src/cpu/testers/gpu_ruby_test/ProtocolTester.py +++ b/src/cpu/testers/gpu_ruby_test/ProtocolTester.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020 Advanced Micro Devices, Inc. +# Copyright (c) 2017-2021 Advanced Micro Devices, Inc. # All rights reserved. 
# # For use for simulation and test purposes only @@ -38,6 +38,7 @@ class ProtocolTester(ClockedObject): cxx_header = "cpu/testers/gpu_ruby_test/protocol_tester.hh" cpu_ports = VectorRequestPort("Ports for CPUs") + dma_ports = VectorRequestPort("Ports for DMAs") cu_vector_ports = VectorRequestPort("Vector ports for GPUs") cu_sqc_ports = VectorRequestPort("SQC ports for GPUs") cu_scalar_ports = VectorRequestPort("Scalar ports for GPUs") @@ -55,6 +56,7 @@ class ProtocolTester(ClockedObject): " coalescer.") cpu_threads = VectorParam.CpuThread("All cpus") + dma_threads = VectorParam.DmaThread("All DMAs") wavefronts = VectorParam.GpuWavefront("All wavefronts") num_atomic_locations = Param.Int(2, "Number of atomic locations") diff --git a/src/cpu/testers/gpu_ruby_test/SConscript b/src/cpu/testers/gpu_ruby_test/SConscript index 28c8006452..5dcfbcbc09 100644 --- a/src/cpu/testers/gpu_ruby_test/SConscript +++ b/src/cpu/testers/gpu_ruby_test/SConscript @@ -41,13 +41,16 @@ if env['PROTOCOL'] == 'None': SimObject('ProtocolTester.py') SimObject('CpuThread.py') +SimObject('DmaThread.py') SimObject('GpuWavefront.py') SimObject('TesterThread.py') +SimObject('TesterDma.py') Source('address_manager.cc') Source('episode.cc') Source('protocol_tester.cc') Source('cpu_thread.cc') +Source('dma_thread.cc') Source('gpu_wavefront.cc') Source('tester_thread.cc') diff --git a/src/cpu/testers/gpu_ruby_test/TesterDma.py b/src/cpu/testers/gpu_ruby_test/TesterDma.py new file mode 100644 index 0000000000..2f669c069d --- /dev/null +++ b/src/cpu/testers/gpu_ruby_test/TesterDma.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from m5.objects.Device import DmaDevice + +class TesterDma(DmaDevice): + type = 'TesterDma' + cxx_header = "cpu/testers/gpu_ruby_test/tester_dma.hh" diff --git a/src/cpu/testers/gpu_ruby_test/dma_thread.cc b/src/cpu/testers/gpu_ruby_test/dma_thread.cc new file mode 100644 index 0000000000..254158dc00 --- /dev/null +++ b/src/cpu/testers/gpu_ruby_test/dma_thread.cc @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "cpu/testers/gpu_ruby_test/dma_thread.hh" + +#include "debug/ProtocolTest.hh" + +DmaThread::DmaThread(const Params& _params) + : TesterThread(_params) +{ + threadName = "DmaThread(Thread ID " + std::to_string(threadId) + ")"; + threadEvent.setDesc("DmaThread tick"); + assert(numLanes == 1); +} + +DmaThread::~DmaThread() +{ + +} + +DmaThread* +DmaThreadParams::create() const +{ + return new DmaThread(*this); +} + +void +DmaThread::issueLoadOps() +{ + assert(curAction); + assert(curAction->getType() == Episode::Action::Type::LOAD); + // we should not have any outstanding fence or atomic op at this point + assert(pendingFenceCount == 0); + assert(pendingAtomicCount == 0); + + // DMA thread is a scalar thread so always set lane to zero. This allows + // us to reuse the API for GPU threads rather than have a specific API + // for scalar tester threads + int lane = 0; + + Location location = curAction->getLocation(lane); + assert(location >= AddressManager::INVALID_LOCATION); + + if (location >= 0) { + Addr address = addrManager->getAddress(location); + DPRINTF(ProtocolTest, "%s Episode %d: Issuing Load - Addr %s\n", + this->getName(), curEpisode->getEpisodeId(), + printAddress(address)); + + int load_size = sizeof(Value); + + // for now, assert address is aligned to the load size + assert(address % load_size == 0); + + auto req = std::make_shared<Request>(address, load_size, + 0, tester->requestorId(), + 0, threadId, nullptr); + req->setPaddr(address); + req->setReqInstSeqNum(tester->getActionSeqNum()); + + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + uint8_t* data = new uint8_t[load_size]; + pkt->dataDynamic(data); + pkt->senderState = new ProtocolTester::SenderState(this); + + if (!port->sendTimingReq(pkt)) { + panic("Not expected failed sendTimingReq\n"); + } + + // insert an outstanding load + addOutstandingReqs(outstandingLoads, address, lane, location); + + // increment the number of outstanding ld_st requests + pendingLdStCount++; + } +} + +void 
+DmaThread::issueStoreOps() +{ + assert(curAction); + assert(curAction->getType() == Episode::Action::Type::STORE); + // we should not have any outstanding fence or atomic op at this point + assert(pendingFenceCount == 0); + assert(pendingAtomicCount == 0); + + // DMA thread is a scalar thread so always set lane to zero. This allows + // us to reuse the API for GPU threads rather than have a specific API + // for scalar tester threads + int lane = 0; + + Location location = curAction->getLocation(lane); + assert(location >= AddressManager::INVALID_LOCATION); + + if (location >= 0) { + // prepare the next value to store + Value new_value = addrManager->getLoggedValue(location) + 1; + + Addr address = addrManager->getAddress(location); + // must be aligned with store size + assert(address % sizeof(Value) == 0); + + DPRINTF(ProtocolTest, "%s Episode %d: Issuing Store - Addr %s - " + "Value %d\n", this->getName(), + curEpisode->getEpisodeId(), printAddress(address), + new_value); + + auto req = std::make_shared<Request>(address, sizeof(Value), + 0, tester->requestorId(), 0, + threadId, nullptr); + req->setPaddr(address); + req->setReqInstSeqNum(tester->getActionSeqNum()); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + uint8_t *writeData = new uint8_t[sizeof(Value)]; + for (size_t j = 0; j < sizeof(Value); ++j) { + writeData[j] = ((uint8_t*)&new_value)[j]; + } + pkt->dataDynamic(writeData); + pkt->senderState = new ProtocolTester::SenderState(this); + + if (!port->sendTimingReq(pkt)) { + panic("Not expecting a failed sendTimingReq\n"); + } + + // add an outstanding store + addOutstandingReqs(outstandingStores, address, lane, location, + new_value); + + // increment the number of outstanding ld_st requests + pendingLdStCount++; + } +} + +void +DmaThread::issueAtomicOps() +{ + DPRINTF(ProtocolTest, "Issuing Atomic Op ...\n"); + + assert(curAction); + assert(curAction->getType() == Episode::Action::Type::ATOMIC); + // we should not have any outstanding ops at this point + 
assert(pendingFenceCount == 0); + assert(pendingLdStCount == 0); + assert(pendingAtomicCount == 0); + + // no-op: No DMA protocol exists with Atomics +} + +void +DmaThread::issueAcquireOp() +{ + DPRINTF(ProtocolTest, "Issuing Acquire Op ...\n"); + + assert(curAction); + assert(curAction->getType() == Episode::Action::Type::ACQUIRE); + // we should not have any outstanding ops at this point + assert(pendingFenceCount == 0); + assert(pendingLdStCount == 0); + assert(pendingAtomicCount == 0); + + // no-op: Acquire does not apply to DMA threads +} + +void +DmaThread::issueReleaseOp() +{ + DPRINTF(ProtocolTest, "Issuing Release Op ...\n"); + + assert(curAction); + assert(curAction->getType() == Episode::Action::Type::RELEASE); + // we should not have any outstanding ops at this point + assert(pendingFenceCount == 0); + assert(pendingLdStCount == 0); + assert(pendingAtomicCount == 0); + + // no-op: Release does not apply to DMA threads +} + +void +DmaThread::hitCallback(PacketPtr pkt) +{ + assert(pkt); + MemCmd resp_cmd = pkt->cmd; + Addr addr = pkt->getAddr(); + + DPRINTF(ProtocolTest, "%s Episode %d: hitCallback - Command %s -" + " Addr %s\n", this->getName(), curEpisode->getEpisodeId(), + resp_cmd.toString(), printAddress(addr)); + + if (resp_cmd == MemCmd::SwapResp) { + // response to a pending atomic + assert(pendingAtomicCount > 0); + assert(pendingLdStCount == 0); + assert(outstandingAtomics.count(addr) > 0); + + // get return data + Value value = *(pkt->getPtr<Value>()); + + // validate atomic op return + OutstandingReq req = popOutstandingReq(outstandingAtomics, addr); + assert(req.lane == 0); + validateAtomicResp(req.origLoc, req.lane, value); + + // update log table + addrManager->updateLogTable(req.origLoc, threadId, + curEpisode->getEpisodeId(), value, + curTick(), + 0); + + // this Atomic is done + pendingAtomicCount--; + } else if (resp_cmd == MemCmd::ReadResp) { + // response to a pending read + assert(pendingLdStCount > 0); + assert(pendingAtomicCount == 0); + 
assert(outstandingLoads.count(addr) > 0); + + // get return data + Value value = *(pkt->getPtr<Value>()); + OutstandingReq req = popOutstandingReq(outstandingLoads, addr); + assert(req.lane == 0); + validateLoadResp(req.origLoc, req.lane, value); + + // this Read is done + pendingLdStCount--; + } else if (resp_cmd == MemCmd::WriteResp) { + // response to a pending write + assert(pendingLdStCount > 0); + assert(pendingAtomicCount == 0); + + // no need to validate Write response + // just pop it from the outstanding req table so that subsequent + // requests dependent on this write can proceed + // note that unlike GpuWavefront we do decrement pendingLdStCount here + // since the write is guaranteed to be completed in downstream memory. + assert(outstandingStores.count(addr) > 0); + OutstandingReq req = popOutstandingReq(outstandingStores, addr); + assert(req.storedValue != AddressManager::INVALID_VALUE); + + // update log table + addrManager->updateLogTable(req.origLoc, threadId, + curEpisode->getEpisodeId(), + req.storedValue, + curTick(), + 0); + + // the Write is now done + pendingLdStCount--; + } else { + panic("UnsupportedMemCmd response type: %s", + resp_cmd.toString().c_str()); + } + + delete pkt->senderState; + delete pkt; + + // record the last active cycle to check for deadlock + lastActiveCycle = curCycle(); + + // we may be able to issue an action. Let's check + if (!threadEvent.scheduled()) { + scheduleWakeup(); + } +} diff --git a/src/cpu/testers/gpu_ruby_test/dma_thread.hh b/src/cpu/testers/gpu_ruby_test/dma_thread.hh new file mode 100644 index 0000000000..1b6fd2b576 --- /dev/null +++ b/src/cpu/testers/gpu_ruby_test/dma_thread.hh @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef CPU_TESTERS_PROTOCOL_TESTER_DMATHREAD_HH_ +#define CPU_TESTERS_PROTOCOL_TESTER_DMATHREAD_HH_ + +#include "cpu/testers/gpu_ruby_test/tester_thread.hh" +#include "params/DmaThread.hh" + +class DmaThread : public TesterThread +{ + public: + typedef DmaThreadParams Params; + DmaThread(const Params& _params); + virtual ~DmaThread(); + + typedef AddressManager::Location Location; + typedef AddressManager::Value Value; + + void hitCallback(PacketPtr pkt); + + protected: + void issueLoadOps(); + void issueStoreOps(); + void issueAtomicOps(); + void issueAcquireOp(); + void issueReleaseOp(); +}; + +#endif /* CPU_TESTERS_PROTOCOL_TESTER_DMATHREAD_HH_ */ diff --git a/src/cpu/testers/gpu_ruby_test/protocol_tester.cc b/src/cpu/testers/gpu_ruby_test/protocol_tester.cc index dd50bb26e8..a8f84081c4 100644 --- a/src/cpu/testers/gpu_ruby_test/protocol_tester.cc +++ b/src/cpu/testers/gpu_ruby_test/protocol_tester.cc @@ -39,6 +39,7 @@ #include #include "cpu/testers/gpu_ruby_test/cpu_thread.hh" +#include "cpu/testers/gpu_ruby_test/dma_thread.hh" #include "cpu/testers/gpu_ruby_test/gpu_wavefront.hh" #include "cpu/testers/gpu_ruby_test/tester_thread.hh" #include "debug/ProtocolTest.hh" @@ -50,6 +51,7 @@ ProtocolTester::ProtocolTester(const Params &p) : ClockedObject(p), _requestorId(p.system->getRequestorId(this)), numCpuPorts(p.port_cpu_ports_connection_count), + numDmaPorts(p.port_dma_ports_connection_count), numVectorPorts(p.port_cu_vector_ports_connection_count), numSqcPorts(p.port_cu_sqc_ports_connection_count), numScalarPorts(p.port_cu_scalar_ports_connection_count), @@ -65,11 +67,13 @@ ProtocolTester::ProtocolTester(const Params &p) maxNumEpisodes(p.max_num_episodes), debugTester(p.debug_tester), cpuThreads(p.cpu_threads), + dmaThreads(p.dma_threads), wfs(p.wavefronts) { int idx = 0; // global port index numCpus = numCpuPorts; // 1 cpu port per CPU + numDmas = numDmaPorts; // 1 dma port per DMA numCus = numVectorPorts; // 1 vector port per CU // create all physical 
cpu's data ports @@ -81,6 +85,15 @@ ProtocolTester::ProtocolTester(const Params &p) idx++; } + // create all physical DMA data ports + for (int i = 0; i < numDmaPorts; ++i) { + DPRINTF(ProtocolTest, "Creating %s\n", + csprintf("%s-dmaPort%d", name(), i)); + dmaPorts.push_back(new SeqPort(csprintf("%s-dmaPort%d", name(), i), + this, i, idx)); + idx++; + } + // create all physical gpu's data ports for (int i = 0; i < numVectorPorts; ++i) { DPRINTF(ProtocolTest, "Creating %s\n", @@ -144,6 +157,7 @@ ProtocolTester::ProtocolTester(const Params &p) std::stringstream ss; ss << "GPU Ruby test's configurations" << std::endl << "\tNumber of CPUs: " << numCpus << std::endl + << "\tNumber of DMAs: " << numDmas << std::endl << "\tNumber of CUs: " << numCus << std::endl << "\tNumber of wavefronts per CU: " << numWfsPerCu << std::endl << "\tWavefront size: " << numWisPerWf << std::endl @@ -164,6 +178,8 @@ ProtocolTester::~ProtocolTester() { for (int i = 0; i < cpuPorts.size(); ++i) delete cpuPorts[i]; + for (int i = 0; i < dmaPorts.size(); ++i) + delete dmaPorts[i]; for (int i = 0; i < cuVectorPorts.size(); ++i) delete cuVectorPorts[i]; for (int i = 0; i < cuScalarPorts.size(); ++i) @@ -189,6 +205,14 @@ ProtocolTester::init() cpuThreads[cpu_id]->scheduleDeadlockCheckEvent(); } + // connect dma threads to dma's ports + for (int dma_id = 0; dma_id < numDmas; ++dma_id) { + dmaThreads[dma_id]->attachTesterThreadToPorts(this, + static_cast<SeqPort*>(dmaPorts[dma_id])); + dmaThreads[dma_id]->scheduleWakeup(); + dmaThreads[dma_id]->scheduleDeadlockCheckEvent(); + } + // connect gpu wavefronts to gpu's ports int wfId = 0; int vectorPortId = 0; @@ -216,9 +240,9 @@ ProtocolTester::init() Port& ProtocolTester::getPort(const std::string &if_name, PortID idx) { - if (if_name != "cpu_ports" && if_name != "cu_vector_ports" && - if_name != "cu_sqc_ports" && if_name != "cu_scalar_ports" && - if_name != "cu_token_ports") { + if (if_name != "cpu_ports" && if_name != "dma_ports" && + if_name != 
"cu_vector_ports" && if_name != "cu_sqc_ports" && + if_name != "cu_scalar_ports" && if_name != "cu_token_ports") { // pass along to super class return ClockedObject::getPort(if_name, idx); } else { @@ -226,6 +250,10 @@ ProtocolTester::getPort(const std::string &if_name, PortID idx) if (idx > numCpuPorts) panic("ProtocolTester: unknown cpu port %d\n", idx); return *cpuPorts[idx]; + } else if (if_name == "dma_ports") { + if (idx >= numDmaPorts) + panic("ProtocolTester: unknown dma port %d\n", idx); + return *dmaPorts[idx]; } else if (if_name == "cu_vector_ports") { if (idx > numVectorPorts) panic("ProtocolTester: unknown cu vect port %d\n", idx); @@ -279,6 +307,11 @@ ProtocolTester::checkDRF(Location atomic_loc, if (!th->checkDRF(atomic_loc, loc, isStore)) return false; } + + for (const TesterThread* th : dmaThreads) { + if (!th->checkDRF(atomic_loc, loc, isStore)) + return false; + } } return true; @@ -293,6 +326,10 @@ ProtocolTester::dumpErrorLog(std::stringstream& ss) t->printAllOutstandingReqs(ss); } + for (auto t : dmaThreads) { + t->printAllOutstandingReqs(ss); + } + for (auto t : wfs) { t->printAllOutstandingReqs(ss); } diff --git a/src/cpu/testers/gpu_ruby_test/protocol_tester.hh b/src/cpu/testers/gpu_ruby_test/protocol_tester.hh index 3ecc06780a..57e9d1887b 100644 --- a/src/cpu/testers/gpu_ruby_test/protocol_tester.hh +++ b/src/cpu/testers/gpu_ruby_test/protocol_tester.hh @@ -143,6 +143,7 @@ class ProtocolTester : public ClockedObject // list of parameters taken from python scripts int numCpuPorts; + int numDmaPorts; int numVectorPorts; int numSqcPorts; int numScalarPorts; @@ -164,13 +165,15 @@ class ProtocolTester : public ClockedObject // all available requestor ports connected to Ruby std::vector cpuPorts; // cpu data ports + std::vector<RequestPort*> dmaPorts; // DMA data ports std::vector cuVectorPorts; // ports to GPU vector cache std::vector cuSqcPorts; // ports to GPU inst cache std::vector cuScalarPorts; // ports to GPU scalar cache std::vector cuTokenManagers; 
std::vector cuTokenPorts; - // all CPU and GPU threads + // all CPU, DMA, and GPU threads std::vector cpuThreads; + std::vector<DmaThread*> dmaThreads; std::vector wfs; // address manager that (1) generates DRF sequences of requests, @@ -180,6 +183,7 @@ class ProtocolTester : public ClockedObject // number of CPUs and CUs int numCpus; + int numDmas; int numCus; // unique id of the next episode int nextEpisodeId; diff --git a/src/cpu/testers/gpu_ruby_test/tester_dma.hh b/src/cpu/testers/gpu_ruby_test/tester_dma.hh new file mode 100644 index 0000000000..772745b939 --- /dev/null +++ b/src/cpu/testers/gpu_ruby_test/tester_dma.hh @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This is a fake DMA device to pass to Ruby.py so it will create the DMA + * sequencers and controllers without being protocol specific. It otherwise + * does nothing. + */ + +#ifndef __CPU_TESTERS_GPU_RUBY_TEST_TESTER_DMA_HH__ +#define __CPU_TESTERS_GPU_RUBY_TEST_TESTER_DMA_HH__ + +#include "dev/dma_device.hh" +#include "params/TesterDma.hh" + +class TesterDma : public DmaDevice +{ + public: + typedef TesterDmaParams Params; + TesterDma(const Params &p) : DmaDevice(p) { } + virtual ~TesterDma() { } + + // The tester does not use a huge memory range. The range itself is + // chosen arbitrarily + AddrRangeList + getAddrRanges() const override + { + AddrRangeList ranges; + ranges.push_back(RangeSize(0, 0xc0000000)); + return ranges; + } + + // These latencies are not important. Return any integer. + Tick read(PacketPtr) override { return 10; } + Tick write(PacketPtr) override { return 10; } +}; + +#endif /* __CPU_TESTERS_GPU_RUBY_TEST_TESTER_DMA_HH__ */ diff --git a/src/cpu/testers/gpu_ruby_test/tester_thread.cc b/src/cpu/testers/gpu_ruby_test/tester_thread.cc index 9d12489729..0164b5e81f 100644 --- a/src/cpu/testers/gpu_ruby_test/tester_thread.cc +++ b/src/cpu/testers/gpu_ruby_test/tester_thread.cc @@ -159,6 +159,10 @@ TesterThread::isNextActionReady() } else { curAction = curEpisode->peekCurAction(); + // Only GPU wavefront threads have a token port. For all other types + // of threads evaluate to true. 
+ bool haveTokens = tokenPort ? tokenPort->haveTokens(numLanes) : true; + switch(curAction->getType()) { case Episode::Action::Type::ATOMIC: // an atomic action must wait for all previous requests @@ -166,7 +170,7 @@ TesterThread::isNextActionReady() if (pendingLdStCount == 0 && pendingFenceCount == 0 && pendingAtomicCount == 0 && - tokenPort->haveTokens(numLanes)) { + haveTokens) { return true; } @@ -201,8 +205,7 @@ TesterThread::isNextActionReady() assert(pendingAtomicCount == 0); // can't issue if there is a pending fence - if (pendingFenceCount > 0 || - !tokenPort->haveTokens(numLanes)) { + if (pendingFenceCount > 0 || !haveTokens) { return false; } @@ -245,7 +248,9 @@ TesterThread::issueNextAction() { switch(curAction->getType()) { case Episode::Action::Type::ATOMIC: - tokenPort->acquireTokens(numLanes); + if (tokenPort) { + tokenPort->acquireTokens(numLanes); + } issueAtomicOps(); break; case Episode::Action::Type::ACQUIRE: @@ -255,11 +260,15 @@ TesterThread::issueNextAction() issueReleaseOp(); break; case Episode::Action::Type::LOAD: - tokenPort->acquireTokens(numLanes); + if (tokenPort) { + tokenPort->acquireTokens(numLanes); + } issueLoadOps(); break; case Episode::Action::Type::STORE: - tokenPort->acquireTokens(numLanes); + if (tokenPort) { + tokenPort->acquireTokens(numLanes); + } issueStoreOps(); break; default: