diff --git a/src/cpu/testers/spatter_gen/SConscript b/src/cpu/testers/spatter_gen/SConscript new file mode 100644 index 0000000000..86231409dd --- /dev/null +++ b/src/cpu/testers/spatter_gen/SConscript @@ -0,0 +1,38 @@ +# Copyright (c) 2024 The Regents of The University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Import("*") + +SimObject( + "SpatterGen.py", + sim_objects=["SpatterGen"], + enums=["SpatterKernelType", "SpatterProcessingMode"], +) + +Source("spatter_gen.cc") + +DebugFlag("SpatterGen") +DebugFlag("SpatterKernel") diff --git a/src/cpu/testers/spatter_gen/SpatterGen.py b/src/cpu/testers/spatter_gen/SpatterGen.py new file mode 100644 index 0000000000..1c88f867ca --- /dev/null +++ b/src/cpu/testers/spatter_gen/SpatterGen.py @@ -0,0 +1,115 @@ +# Copyright (c) 2024 The Regents of The University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.citations import add_citation +from m5.objects.ClockedObject import ClockedObject +from m5.params import * +from m5.proxy import * +from m5.util.pybind import PyBindMethod + + +class SpatterKernelType(Enum): + vals = ["scatter", "gather"] + + +class SpatterProcessingMode(Enum): + vals = ["synchronous", "asynchronous"] + + +class SpatterGen(ClockedObject): + type = "SpatterGen" + cxx_header = "cpu/testers/spatter_gen/spatter_gen.hh" + cxx_class = "gem5::SpatterGen" + + system = Param.System(Parent.any, "System this SpatterGen is a part of.") + + processing_mode = Param.SpatterProcessingMode( + "How to process kernels accross multiple SpatterGen cores. " + "Whether to synchronize on kernel boundaries or not." + ) + + port = RequestPort("Port to send memory requests.") + + int_regfile_size = Param.Int("Size of the integer register file.") + fp_regfile_size = Param.Int("Size of the floating point register file.") + request_gen_latency = Param.Cycles( + "Number of cycles to spend for creating a request." + ) + request_gen_rate = Param.Int("Number of requests generate per cycle.") + request_buffer_entries = Param.Int("Size of the request buffer.") + send_rate = Param.Int( + "Number of requests to send in parallel." + "Emulates the number of dcache ports." 
+ ) + + cxx_exports = [ + PyBindMethod("addKernel"), + PyBindMethod("proceedPastSyncPoint"), + ] + + +add_citation( + SpatterGen, + """@inproceedings{10.1145/3422575.3422794, +author = {Lavin, Patrick and Young, Jeffrey and Vuduc, Richard and Riedy, +Jason and Vose, Aaron and Ernst, Daniel}, +title = {Evaluating Gather and Scatter Performance on CPUs and GPUs}, +year = {2021}, +isbn = {9781450388993}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3422575.3422794}, +doi = {10.1145/3422575.3422794}, +abstract = {This paper describes a new benchmark tool, +Spatter, for assessing memory system architectures in the context of a +specific category of indexed accesses known as gather and scatter. +These types of operations are increasingly used to express sparse and +irregular data access patterns, and they have widespread utility in many +modern HPC applications including scientific simulations, data mining and +analysis computations, and graph processing. However, many traditional +benchmarking tools like STREAM, STRIDE, and GUPS focus on characterizing +only uniform stride or fully random accesses despite evidence that modern +applications use varied sets of more complex access patterns. Spatter is an +open-source benchmark that provides a tunable and configurable framework to +benchmark a variety of indexed access patterns, including variations of gather +/ scatter that are seen in HPC mini-apps evaluated in this work. The design of +Spatter includes backends for OpenMP and CUDA, and experiments show how it can +be used to evaluate 1) uniform access patterns for CPU and GPU, 2) prefetching +regimes for gather / scatter, 3) compiler implementations of vectorization for +gather / scatter, and 4) trace-driven “proxy patterns” that reflect the +patterns found in multiple applications. 
The results from Spatter experiments +show, for instance, that GPUs typically outperform CPUs for these operations +in absolute bandwidth but not fraction of peak bandwidth, and that Spatter can +better represent the performance of some cache-dependent mini-apps than +traditional STREAM bandwidth measurements.}, +booktitle = {Proceedings of the International Symposium on Memory Systems}, +pages = {209–222}, +numpages = {14}, +location = {Washington, DC, USA}, +series = {MEMSYS '20} +} +""", +) diff --git a/src/cpu/testers/spatter_gen/spatter_gen.cc b/src/cpu/testers/spatter_gen/spatter_gen.cc new file mode 100644 index 0000000000..b57259911b --- /dev/null +++ b/src/cpu/testers/spatter_gen/spatter_gen.cc @@ -0,0 +1,582 @@ +/* +* Copyright (c) 2024 The Regents of The University of California +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are +* met: redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer; +* redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution; +* neither the name of the copyright holders nor the names of its +* contributors may be used to endorse or promote products derived from +* this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "cpu/testers/spatter_gen/spatter_gen.hh" + +#include "base/cprintf.hh" +#include "debug/SpatterGen.hh" +#include "debug/SpatterKernel.hh" +#include "enums/SpatterKernelType.hh" +#include "enums/SpatterProcessingMode.hh" +#include "mem/packet.hh" +#include "sim/sim_exit.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +using enums::SpatterKernelTypeStrings; +using enums::SpatterProcessingMode; + +SpatterGen::SpatterGen(const Params& params): + ClockedObject(params), + state(SpatterGenState::RUNNING), + requestorId(params.system->getRequestorId(this)), + numPendingMemRequests(0), + stats(this), + mode(params.processing_mode), + port(this, name() + ".port"), + intRegFileSize(params.int_regfile_size), intRegUsed(0), + fpRegFileSize(params.fp_regfile_size), fpRegUsed(0), + requestGenLatency(params.request_gen_latency), + requestGenRate(params.request_gen_rate), + firstGeneratorAvailableTime(0), + nextGenEvent([this](){ processNextGenEvent(); }, name() + ".GenEvent"), + requestBufferEntries(params.request_buffer_entries), + requestBuffer(clockPeriod()), + sendRate(params.send_rate), + firstPortAvailableTime(0), + nextSendEvent([this](){ processNextSendEvent(); }, name() + ".SendEvent"), + receiveBuffer(clockPeriod()) +{ + fatal_if(fpRegFileSize < requestBufferEntries, + "fp_regfile_size should be >= request_buffer_entries." 
+ "if request_buffer_entries is bigger than fp_regfile_size," + "it may result in inaccuracies in your simulation." + "Ideally: fp_regfile_size >> request_buffer_entries." + ); + generatorBusyUntil.resize(requestGenRate, 0); + portBusyUntil.resize(sendRate, 0); +} + +Port& +SpatterGen::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "port") { + return port; + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +SpatterGen::startup() +{ + scheduleNextGenEvent(curTick()); +} + +void +SpatterGen::SpatterGenPort::sendPacket(PacketPtr pkt) +{ + panic_if(blocked(), "Should never try to send if port is blocked."); + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + DPRINTF( + SpatterGen, + "%s: Port blocked when sending %s.\n", + __func__, pkt->print() + ); + } +} + +void +SpatterGen::SpatterGenPort::recvReqRetry() +{ + DPRINTF(SpatterGen, "%s: Port received a ReqRetry.\n", __func__); + panic_if( + blockedPacket == nullptr, + "Received reqRetry with no blocked packet." + ); + if (!sendTimingReq(blockedPacket)) { + DPRINTF( + SpatterGen, + "%s: Port blocked when sending %s.\n", + __func__, blockedPacket->print() + ); + } else { + blockedPacket = nullptr; + owner->recvReqRetry(); + } +} + +void +SpatterGen::recvReqRetry() +{ + if (nextSendEvent.pending()) { + nextSendEvent.wake(); + scheduleNextSendEvent(nextCycle()); + } +} + +bool +SpatterGen::SpatterGenPort::recvTimingResp(PacketPtr pkt) { + return owner->recvTimingResp(pkt); +} + +bool +SpatterGen::recvTimingResp(PacketPtr pkt) +{ + DPRINTF(SpatterGen, "%s: Received pkt: %s.\n", __func__, pkt->print()); + assert(pkt->isResponse()); + + // record trip time. 
+ SpatterAccess* spatter_access = pkt->findNextSenderState<SpatterAccess>(); + Tick trip_time = (curTick() - requestDepartureTime[pkt->req]); + requestDepartureTime.erase(pkt->req); + spatter_access->recordTripTime(trip_time); + + int trips_left = spatter_access->tripsLeft(); + assert(trips_left >= 0); + if (trips_left > 0) { + stats.numIndexReads++; + stats.indexBytesRead += pkt->getSize(); + stats.totalIndexReadLatency += trip_time; + + stats.indexAccessLatency.sample(trip_time); + receiveBuffer.push(spatter_access, curTick()); + } else { + stats.valueAccessLatency.sample(trip_time); + stats.totalIndirectAccessLatency.sample( + spatter_access->tripTimeSoFar() + ); + if (spatter_access->type() == SpatterKernelType::gather) { + stats.numValueReads++; + stats.valueBytesRead += pkt->getSize(); + stats.totalValueReadLatency += trip_time; + } else if (spatter_access->type() == SpatterKernelType::scatter) { + stats.numValueWrites++; + stats.valueBytesWritten += pkt->getSize(); + stats.totalValueWriteLatency += trip_time; + } else { + panic("Unknown kernel type."); + } + // CAUTION: We're going to decrement fpRegUsed here, + // it could cause inaccuracies if processNextGenEvent + // is called after recvTimingResp on the same tick. + // i.e. we might end up releasing a register on the same + // cycle that we are allocating it. + // it's probably not going to ever be an issue since + // fpRegFileSize is probably >> requestBufferEntries + // i.e. the chances of running out of fp registers is low because + // we do not simulate parts of the pipeline that back things up into + // fp registers, e.g. functional units of ALU. + fpRegUsed--; + delete spatter_access; + } + + // delete the pkt since we don't need it anymore. 
+ delete pkt; + + if (!nextGenEvent.pending()) { + scheduleNextGenEvent(nextCycle()); + } + + numPendingMemRequests--; + checkForSimExit(); + return true; +} + +void +SpatterGen::addKernel( + uint32_t id, uint32_t delta, uint32_t count, + SpatterKernelType type, + size_t index_size, Addr base_index_addr, + size_t value_size, Addr base_value_addr, + const std::vector<uint32_t>& indices +) +{ + DPRINTF( + SpatterGen, + "%s: Adding kernel with id: %d, delta: %d, count: %d, type: %s.\n", + __func__, id, delta, count, SpatterKernelTypeStrings[type] + ); + SpatterKernel new_kernel( + requestorId, + id, delta, count, type, + index_size, base_index_addr, + value_size, base_value_addr + ); + new_kernel.setIndices(indices); + kernels.push(new_kernel); +} + +void +SpatterGen::proceedPastSyncPoint() +{ + assert(mode == SpatterProcessingMode::synchronous); + assert(state == SpatterGenState::WAITING); + state = SpatterGenState::RUNNING; + scheduleNextGenEvent(nextCycle()); +} + +void +SpatterGen::checkForSimExit() +{ + bool no_pending = numPendingMemRequests == 0; + bool no_queued = requestBuffer.empty(); + int avail_int_regs = intRegFileSize - intRegUsed; + int avail_fp_regs = fpRegFileSize - fpRegUsed; + bool can_do_init = initAccessOk(avail_int_regs, avail_fp_regs, curTick()); + bool can_do_mid = interAccessOk(avail_int_regs, avail_fp_regs, curTick()); + bool can_do_ult = ultAccessOk(avail_int_regs, avail_fp_regs, curTick()); + if (!can_do_init && !can_do_mid && !can_do_ult && no_pending && no_queued) + { + assert(( + (mode == SpatterProcessingMode::synchronous) && + (state == SpatterGenState::DRAINING) + ) || + mode == SpatterProcessingMode::asynchronous + ); + state = SpatterGenState::WAITING; + exitSimLoop( + csprintf("%s received all expected responses.", name()), + 0, + nextCycle() + ); + } +} + +bool +SpatterGen::initAccessOk(int int_regs, int fp_regs, Tick when) const +{ + bool have_int_reg = int_regs > 0; + // for mode == SpatterProcessingMode::asynchronous state will always be 
+ // SpatterGenState::RUNNING. we don't have to do checks for mode. + // for mode == SpatterProcessingMode::synchronous, if state is + // SpatterGenState::DRAINING or SpatterGenState::WAITING + // we can't initiate any new indirect accesses. + bool have_kernel = !kernels.empty() && (state == SpatterGenState::RUNNING); + return have_kernel && have_int_reg; +} + +bool +SpatterGen::interAccessOk(int int_regs, int fp_regs, Tick when) const +{ + bool have_int_reg = int_regs > 0; + bool have_index = receiveBuffer.hasReady(when); + bool mid_idx = have_index && (receiveBuffer.front()->tripsLeft() > 1); + return mid_idx && have_int_reg; +} + +bool +SpatterGen::ultAccessOk(int int_regs, int fp_regs, Tick when) const +{ + bool have_fp_reg = fp_regs > 0; + bool have_index = receiveBuffer.hasReady(when); + bool val_idx = have_index && (receiveBuffer.front()->tripsLeft() == 1); + return val_idx && have_fp_reg; +} + +void +SpatterGen::scheduleNextGenEvent(Tick when) +{ + int avail_int_regs = intRegFileSize - intRegUsed; + int avail_fp_regs = fpRegFileSize - fpRegUsed; + bool have_work = initAccessOk(avail_int_regs, avail_fp_regs, curTick()) || + interAccessOk(avail_int_regs, avail_fp_regs, curTick()) || + ultAccessOk(avail_int_regs, avail_fp_regs, curTick()); + Tick schedule_tick = std::max(when, firstGeneratorAvailableTime); + if (have_work && (!nextGenEvent.scheduled())) { + schedule(nextGenEvent, schedule_tick); + firstGeneratorAvailableTime = MaxTick; + } +} + +void +SpatterGen::processNextGenEvent() +{ + assert(!nextGenEvent.pending()); + int req_buf_before = requestBuffer.size(); + // track changes to intRegUsed in this variable and apply it + // at the end of the for loop. This way if we free a register + // in the for loop, other iterations of the for loop won't + // observe this change. This matches what happens in real h/w. 
+ int int_used_now = 0; + // track this independently to prevent different iterations inside + // for loop observing change to h/w resources, i.e we can't rely + // intRegFileSize - intRegUsed to see if we have registers to allocate + // since they don't change until after the for loop + int int_regs_now = intRegFileSize - intRegUsed; + // same explanation as int_used_now + int fp_used_now = 0; + // same explanation as int_regs_now + int fp_regs_now = fpRegFileSize - fpRegUsed; + for (int i = 0; i < requestGenRate; i++) { + if (generatorBusyUntil[i] > curTick()) { + DPRINTF( + SpatterGen, + "%s: AGU[%d] is busy this cycle.\n", __func__, i + ); + continue; + } + if (!(requestBuffer.size() < requestBufferEntries)) { + // if no space left in the requestBuffer sleep + // whoever pops from requestBuffer wakes us up. + nextGenEvent.sleep(); + break; + } + // Now we know that AGU[i] is available and there is room + // in the requestBuffer to put the packet. + if (ultAccessOk(int_regs_now, fp_regs_now, curTick())) { + // occupy one fp register + fp_regs_now--; + fp_used_now++; + // make AGU busy for the next requestGenLatency cycles. + generatorBusyUntil[i] = clockEdge(Cycles(requestGenLatency)); + + // create a new packet to access + SpatterAccess* spatter_access = receiveBuffer.front(); + PacketPtr pkt = spatter_access->nextPacket(); + pkt->pushSenderState(spatter_access); + + // push to requestBuffer + requestBuffer.push(pkt, curTick()); + DPRINTF( + SpatterGen, + "%s: Pushed pkt: %s to requestBuffer.\n", + __func__, pkt->print() + ); + + // now deallocate resources for reading the index + int_used_now--; + receiveBuffer.pop(); + } else if (interAccessOk(int_regs_now, fp_regs_now, curTick())) { + // occupy one int register + int_regs_now--; + int_used_now++; + // make AGU busy for the next requestGenLatency cycles. 
+ generatorBusyUntil[i] = clockEdge(Cycles(requestGenLatency)); + + // create a new packet to access + SpatterAccess* spatter_access = receiveBuffer.front(); + PacketPtr pkt = spatter_access->nextPacket(); + pkt->pushSenderState(spatter_access); + + // push to requestBuffer + requestBuffer.push(pkt, curTick()); + DPRINTF( + SpatterGen, + "%s: Pushed pkt: %s to requestBuffer.\n", + __func__, pkt->print() + ); + + // now deallocate resources for reading the index + int_used_now--; + receiveBuffer.pop(); + } else if (initAccessOk(int_regs_now, fp_regs_now, curTick())) { + // occupy one int register + int_regs_now--; + int_used_now++; + generatorBusyUntil[i] = clockEdge(Cycles(requestGenLatency)); + + SpatterKernel& front = kernels.front(); + SpatterAccess* spatter_access = front.nextSpatterAccess(); + PacketPtr pkt = spatter_access->nextPacket(); + pkt->pushSenderState(spatter_access); + + requestBuffer.push(pkt, curTick()); + DPRINTF( + SpatterGen, + "%s: Pushed pkt: %s to requestBuffer.\n", + __func__, pkt->print() + ); + + if (front.done()) { + DPRINTF( + SpatterKernel, + "%s: Done with kernel %d type: %s.\n", + __func__, front.id(), + SpatterKernelTypeStrings[front.type()] + ); + kernels.pop(); + // If we're processing synchronously we now have to stop + // making initial accesses and wait for everyone to receive + // all expected responses. + if (mode == SpatterProcessingMode::synchronous) { + state = SpatterGenState::DRAINING; + } + } + } else { + // + DPRINTF( + SpatterGen, + "%s: Nothing more could be done this cycle.\n", __func__ + ); + DPRINTF(SpatterGen, "%s: Here is h/w status report: " + "{KERNELS_REMAIN: %d, INDEXES_REMAIN: %d, INT_REG_USED: %d, " + "FP_REG_USED: %d, REQ_BUFF_SIZE: %d}.\n", + __func__, kernels.size(), receiveBuffer.size(), + intRegUsed, fpRegUsed, requestBuffer.size()); + break; + } + } + + // update firstGeneratorAvailableTime after making all changes. 
+ for (int i = 0; i < requestGenRate; i++) { + generatorBusyUntil[i] = std::max(generatorBusyUntil[i], nextCycle()); + firstGeneratorAvailableTime = std::min( + firstGeneratorAvailableTime, + generatorBusyUntil[i] + ); + } + + // now that we have simulated all the work of this cycle, we can + // apply the deltas to the h/w resources. + intRegUsed += int_used_now; + fpRegUsed += fp_used_now; + + bool did_work = (requestBuffer.size() - req_buf_before) > 0; + if (did_work && (!nextSendEvent.pending())) { + scheduleNextSendEvent(nextCycle()); + } + + if (!nextGenEvent.pending()) { + scheduleNextGenEvent(firstGeneratorAvailableTime); + } +} + +void +SpatterGen::scheduleNextSendEvent(Tick when) +{ + bool have_work = !requestBuffer.empty(); + Tick schedule_tick = std::max(when, firstPortAvailableTime); + if (have_work && (!nextSendEvent.scheduled())) { + schedule(nextSendEvent, schedule_tick); + firstPortAvailableTime = MaxTick; + } +} + +void +SpatterGen::processNextSendEvent() +{ + int req_buf_before = requestBuffer.size(); + for (int i = 0; i < sendRate; i++) { + if (portBusyUntil[i] > curTick()) { + DPRINTF( + SpatterGen, + "%s: Port[%d] is busy this cycle.\n", __func__, i + ); + continue; + } + if (requestBuffer.empty()) { + DPRINTF( + SpatterGen, + "%s: No packets to send this cycle.\n", __func__ + ); + break; + } + if (!requestBuffer.hasReady(curTick())) { + DPRINTF( + SpatterGen, + "%s: Packet at front of requestBuffer not ready this cycle.\n", + __func__ + ); + break; + } + PacketPtr pkt = requestBuffer.front(); + DPRINTF( + SpatterGen, + "%s: Sending pkt: %s to port[%d].\n", + __func__, pkt->print(), i + ); + // NOTE: We assume the port will be busy for 1 cycle. 
+ portBusyUntil[i] = clockEdge(Cycles(1)); + port.sendPacket(pkt); + requestBuffer.pop(); + // increase numPendingMemRequests + numPendingMemRequests++; + // record packet departure time + requestDepartureTime[pkt->req] = curTick(); + // Now if we put the port in blocked state no point in continuing + // the loop. also no point in scheduling nextSendEvent. + if (port.blocked()) { + nextSendEvent.sleep(); + break; + } + } + // update firstPortAvailableTime after making all changes. + for (int i = 0; i < sendRate; i++) { + // if the port was not used this cycle, it's busy until nextCycle(). + portBusyUntil[i] = std::max(portBusyUntil[i], nextCycle()); + firstPortAvailableTime = std::min( + firstPortAvailableTime, + portBusyUntil[i] + ); + } + + bool did_work = (req_buf_before - requestBuffer.size()) > 0; + if (did_work && nextGenEvent.pending()) { + // since this event might open up space for output of nextGenEvent, + // it should wake it up if nextGenEvent is asleep. + nextGenEvent.wake(); + scheduleNextGenEvent(nextCycle()); + } + + if (!nextSendEvent.pending()) { + scheduleNextSendEvent(nextCycle()); + } +} + +SpatterGen::SpatterGenStats::SpatterGenStats(SpatterGen* spatter_gen): + statistics::Group(spatter_gen), spatterGen(spatter_gen), + ADD_STAT(numIndexReads, statistics::units::Count::get(), + "Number of reads from the indexer array."), + ADD_STAT(indexBytesRead, statistics::units::Byte::get(), + "Number of bytes read from the indexer array."), + ADD_STAT(totalIndexReadLatency, statistics::units::Tick::get(), + "Total latency for reading from the indexer array."), + ADD_STAT(numValueReads, statistics::units::Count::get(), + "Number of reads from the values array."), + ADD_STAT(numValueWrites, statistics::units::Count::get(), + "Number of writes to the values array."), + ADD_STAT(valueBytesRead, statistics::units::Byte::get(), + "Number of bytes read from the values array."), + ADD_STAT(valueBytesWritten, statistics::units::Byte::get(), + "Number of bytes 
written to the values array."), + ADD_STAT(totalValueReadLatency, statistics::units::Tick::get(), + "Total latency for reading from the values array."), + ADD_STAT(totalValueWriteLatency, statistics::units::Tick::get(), + "Total latency for writing to the values array."), + ADD_STAT(indexAccessLatency, statistics::units::Tick::get(), + "Distribution of latency for accessing the indexer array."), + ADD_STAT(valueAccessLatency, statistics::units::Tick::get(), + "Distribution of latency for accessing the values array."), + ADD_STAT(totalIndirectAccessLatency, statistics::units::Tick::get(), + "Distribution of total latency for indirect accesses.") +{} + +void +SpatterGen::SpatterGenStats::regStats() +{ + using namespace statistics; + indexAccessLatency.init(8); + valueAccessLatency.init(16); + totalIndirectAccessLatency.init(16); +} + +} // namespace gem5 diff --git a/src/cpu/testers/spatter_gen/spatter_gen.hh b/src/cpu/testers/spatter_gen/spatter_gen.hh new file mode 100644 index 0000000000..1b8a8dbb61 --- /dev/null +++ b/src/cpu/testers/spatter_gen/spatter_gen.hh @@ -0,0 +1,252 @@ +/* +* Copyright (c) 2024 The Regents of The University of California +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are +* met: redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer; +* redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution; +* neither the name of the copyright holders nor the names of its +* contributors may be used to endorse or promote products derived from +* this software without specific prior written permission. 
+* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef __CPU_TESTERS_SPATTER_GEN_SPATTER_GEN_HH__ +#define __CPU_TESTERS_SPATTER_GEN_SPATTER_GEN_HH__ + +#include <queue> +#include <unordered_map> +#include <vector> + +#include "base/statistics.hh" +#include "base/stats/group.hh" +#include "cpu/testers/spatter_gen/utility_structs.hh" +#include "enums/SpatterKernelType.hh" +#include "enums/SpatterProcessingMode.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "params/SpatterGen.hh" +#include "sim/clocked_object.hh" +#include "sim/eventq.hh" + +namespace gem5 +{ + + +/** + * @class SpatterGen + * @brief Spatter Kernel Player + * + * This class takes Spatter JSON traces and plays them back in gem5. + * Each trace includes a list of Spatter kernels, which are played in order. + * Kernels are either of type scatter or gather. + * At the time of writing, kernels represent accesses to the memory with + * one level of indirection. + * Initially, an access is made to an array which we call index from now on. + * The index array is streamed through with load accesses. + * In a high level programming language this access will be similar to below. + * "for (int i = 0; i < n; i++) { idx = index[i]; }". 
+ * The value at index[i] is then used to access another array which we will + * call value from now on. + * For scatter type kernels, a random value is stored in the location and + * for gather type kernels, the value is read from the location. + * In a high level programming language this access will be similar to below. + * Scatter + * "for (int i = 0; i < n; i++) { idx = index[i]; value[idx] = rand(); }". + * Gather + * "for (int i = 0; i < n; i++) { idx = index[i]; val = value[idx]; }". + * For more information you can take a look at + * https://github.com/hpcgarage/spatter/blob/main/README.md + * While the readme mentions MultiScatter and MultiGather kernels, the + * trace format is not finalized (at the time of writing). + */ +class SpatterGen: public ClockedObject +{ + private: + typedef enums::SpatterKernelType SpatterKernelType; + typedef enums::SpatterProcessingMode SpatterProcessingMode; + + class SpatterGenEvent : public EventFunctionWrapper + { + private: + // TODO: split pending into pendingInput and pendingOutput + enum class SleepState + { + AWAKE, + ASLEEP + }; + + SleepState _state; + + public: + SpatterGenEvent(const std::function<void(void)> &callback, + const std::string &name): + EventFunctionWrapper(callback, name), _state(SleepState::AWAKE) + {} + // a SpatterGenEvent will only be asleep if it is pending output + bool pending() const { return _state == SleepState::ASLEEP; } + void sleep() { _state = SleepState::ASLEEP; } + void wake() { _state = SleepState::AWAKE; } + }; + + class SpatterGenPort: public RequestPort + { + private: + SpatterGen* owner; + PacketPtr blockedPacket; + + public: + SpatterGenPort(SpatterGen* owner, const std::string& name): + RequestPort(name), owner(owner), blockedPacket(nullptr) {} + + void sendPacket(PacketPtr pkt); + bool blocked() const { return blockedPacket != nullptr; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt) override; + virtual void recvReqRetry() override; + }; + + struct SpatterGenStats: public 
statistics::Group + { + SpatterGen* spatterGen; + + // TODO: When we enable multiple levels of indirection, we should + // convert this to a vector with one stat for each level of index + statistics::Scalar numIndexReads; + // TODO: When we enable multiple levels of indirection, we should + // convert this to a vector with one stat for each level of index + statistics::Scalar indexBytesRead; + statistics::Scalar totalIndexReadLatency; + + statistics::Scalar numValueReads; + statistics::Scalar numValueWrites; + statistics::Scalar valueBytesRead; + statistics::Scalar valueBytesWritten; + statistics::Scalar totalValueReadLatency; + statistics::Scalar totalValueWriteLatency; + + // TODO: When we enable multiple levels of indirection, we should + // convert this to a vector with one stat for each level of index + statistics::Histogram indexAccessLatency; + statistics::Histogram valueAccessLatency; + statistics::Histogram totalIndirectAccessLatency; + + virtual void regStats() override; + + SpatterGenStats(SpatterGen* spatter_gen); + }; + + enum class SpatterGenState + { + // waiting for all other cores to get to WAITING state, no accesses + WAITING, + // only creating intermediate and ultimate accesses, i.e. wrapping up + DRAINING, + // creating all kinds of accesses, initial, intermediate, and ultimate + RUNNING + }; + + // non param related members + SpatterGenState state; + std::queue<SpatterKernel> kernels; + std::unordered_map<RequestPtr, Tick> requestDepartureTime; + + RequestorID requestorId; + int numPendingMemRequests; + + SpatterGenStats stats; + + void checkForSimExit(); + + bool initAccessOk(int int_regs, int fp_regs, Tick when) const; + bool interAccessOk(int int_regs, int fp_regs, Tick when) const; + bool ultAccessOk(int int_regs, int fp_regs, Tick when) const; + + // param related members (not necessarily one-to-one with params) + SpatterProcessingMode mode; + SpatterGenPort port; + // size of the register files, + // for every memory instruction we need to allocate one register. 
+ int intRegFileSize; + int intRegUsed; + int fpRegFileSize; + int fpRegUsed; + // laterncy to generate A request + int requestGenLatency; + // number of requests generated per event + int requestGenRate; + // tracking smallest tick when at least one "AGU" is available; + Tick firstGeneratorAvailableTime; + // tracking the busy state of our so called "AGU"s. + std::vector generatorBusyUntil; + SpatterGenEvent nextGenEvent; + void processNextGenEvent(); + // put requests to the cache in the request buffer. + int requestBufferEntries; + // store request packet along with their insertion time into this queue. + TimedQueue requestBuffer; + // if nextGenEvent has to be schedule at tick when then schedule it. + // this function should only be called when nextGenEvent is not pending. + void scheduleNextGenEvent(Tick when); + + // bandwidth to issue memory requests to cache, + // this is supposed to model the number of cache ports + // we will assume it takes 1 cycle to issue memory requests + int sendRate; + Tick firstPortAvailableTime; + std::vector portBusyUntil; + SpatterGenEvent nextSendEvent; + void processNextSendEvent(); + // if nextSendEvent has to be schedule at tick when then schedule it. + // this function should only be called when nextSendEvent is not pending. + void scheduleNextSendEvent(Tick when); + + // put the memory responses here. + // no need to limit the size of this buffer. + // it's a response buffer and it will automatically + // be limited by requestBufferEntries, intRegFileSize, fpRegFileSize + TimedQueue receiveBuffer; + + public: + PARAMS(SpatterGen); + SpatterGen(const Params& params); + + Port& + getPort(const std::string& if_name, PortID idx = InvalidPortID) override; + + virtual void startup() override; + + void recvReqRetry(); + bool recvTimingResp(PacketPtr pkt); + + // PyBindMethod to interface adding a kernel with python JSON frontend. 
+ void addKernel( + uint32_t id, uint32_t delta, uint32_t count, + SpatterKernelType type, + size_t index_size, Addr base_index_addr, + size_t value_size, Addr base_value_addr, + const std::vector& indices + ); + + void proceedPastSyncPoint(); +}; + +} // namespace gem5 + +#endif // __CPU_TESTERS_SPATTER_GEN_SPATTER_GEN_HH__ diff --git a/src/cpu/testers/spatter_gen/utility_structs.hh b/src/cpu/testers/spatter_gen/utility_structs.hh new file mode 100644 index 0000000000..21bff9e8ae --- /dev/null +++ b/src/cpu/testers/spatter_gen/utility_structs.hh @@ -0,0 +1,242 @@ +/* +* Copyright (c) 2024 The Regents of The University of California +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are +* met: redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer; +* redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution; +* neither the name of the copyright holders nor the names of its +* contributors may be used to endorse or promote products derived from +* this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +* A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef __CPU_TESTERS_SPATTER_GEN_UTILITY_STRUCTS_HH__ +#define __CPU_TESTERS_SPATTER_GEN_UTILITY_STRUCTS_HH__ + +#include +#include + +#include "base/random.hh" +#include "base/types.hh" +#include "enums/SpatterKernelType.hh" +#include "mem/packet.hh" + +namespace gem5 +{ + +template +class TimedQueue +{ + private: + Tick latency; + + std::queue items; + std::queue insertionTimes; + + public: + TimedQueue(Tick latency): latency(latency) {} + + void push(T item, Tick insertion_time) + { + items.push(item); + insertionTimes.push(insertion_time); + } + + void pop() + { + items.pop(); + insertionTimes.pop(); + } + + T front() const { return items.front(); } + + bool empty() const { return items.empty(); } + + size_t size() const { return items.size(); } + + bool hasReady(Tick current_time) const + { + if (empty()) { + return false; + } + return (current_time - insertionTimes.front()) >= latency; + } +}; + + + +// Represents a single access to a SpatterKernel. +// It supports multiple levels of indirection. +// However, the SpatterKernel class only works with one level of +// indirection (i.e. accessing value[index[i]]). 
+struct SpatterAccess : public Packet::SenderState +{ + typedef std::tuple AccessPair; + typedef enums::SpatterKernelType SpatterKernelType; + + RequestorID requestorId; + SpatterKernelType _kernelType; + Tick accTripTime; + std::queue accessPairs; + + SpatterAccess( + RequestorID requestor_id, + SpatterKernelType kernel_type, + const std::queue& access_pairs + ): + requestorId(requestor_id), _kernelType(kernel_type), + accTripTime(0), accessPairs(access_pairs) + {} + + SpatterKernelType type() const { return _kernelType; } + + int tripsLeft() const { return accessPairs.size(); } + + void recordTripTime(Tick trip_time) { accTripTime += trip_time; } + + Tick tripTimeSoFar() const { return accTripTime; } + + AccessPair nextAccessPair() + { + assert(tripsLeft() > 0); + AccessPair access_pair = accessPairs.front(); + accessPairs.pop(); + return access_pair; + } + + PacketPtr nextPacket() + { + Addr addr; + size_t size; + std::tie(addr, size) = nextAccessPair(); + MemCmd cmd; + if (tripsLeft() >= 1){ + cmd = MemCmd::ReadReq; + } else { + cmd = _kernelType == \ + SpatterKernelType::gather ? MemCmd::ReadReq : MemCmd::WriteReq; + } + return createPacket(addr, size, cmd); + } + + PacketPtr createPacket(Addr addr, size_t size, MemCmd cmd) const + { + RequestPtr req = std::make_shared(addr, size, 0, requestorId); + + // Dummy PC to have PC-based prefetchers latch on; + // get entropy into higher bits + // This piece of code is directly copied from + // gem5::TrafficGen:: + req->setPC(((Addr) requestorId) << 2); + PacketPtr pkt = new Packet(req, cmd); + uint8_t* pkt_data = new uint8_t[req->getSize()]; + // Randomly intialize pkt_data, for testing cache coherence. 
+ for (int i = 0; i < req->getSize(); i++) { + pkt_data[i] = random_mt.random(); + } + pkt->dataDynamic(pkt_data); + return pkt; + } +}; + +class SpatterKernel +{ + private: + typedef enums::SpatterKernelType SpatterKernelType; + typedef SpatterAccess::AccessPair AccessPair; + + RequestorID requestorId; + uint32_t _id; + uint32_t delta; + uint32_t count; + + SpatterKernelType _type; + + size_t indexSize; + Addr baseIndexAddr; + + size_t valueSize; + Addr baseValueAddr; + + // needed to iterate over indices multiple times. + uint32_t index; + // current iteration over indices + uint32_t iteration; + + // number of times we have left to roll indices to finish one iteration. + uint32_t remRolls; + std::deque indices; + + public: + + SpatterKernel( + RequestorID requestor_id, + uint32_t id, uint32_t delta, uint32_t count, + SpatterKernelType type, + size_t index_size, Addr base_index_addr, + size_t value_size, Addr base_value_addr + ): + requestorId(requestor_id), + _id(id), delta(delta), count(count), + _type(type), + indexSize(index_size), baseIndexAddr(base_index_addr), + valueSize(value_size), baseValueAddr(base_value_addr), + index(0), iteration(0), remRolls(0) + {} + + uint32_t id() const { return _id; } + + void setIndices(const std::vector& pattern) + { + indices.assign(pattern.begin(), pattern.end()); + remRolls = indices.size(); + } + + SpatterKernelType type() const { return _type; } + + bool done() const { return iteration == count; } + + SpatterAccess* nextSpatterAccess() + { + std::queue access_pairs; + Addr index_addr = baseIndexAddr + (index * indexSize); + access_pairs.emplace(index_addr, indexSize); + // update index in the index array + index++; + + uint32_t front = indices.front(); + uint32_t value_index = (delta * iteration) + front; + Addr value_addr = baseValueAddr + (value_index * valueSize); + access_pairs.emplace(value_addr, valueSize); + // roll indices + indices.pop_front(); + indices.push_back(front); + remRolls--; + if (remRolls == 0) { + 
remRolls = indices.size(); + iteration++; + } + + return new SpatterAccess(requestorId, _type, access_pairs); + } +}; + +} // namespace gem5 + +#endif // __CPU_TESTERS_SPATTER_GEN_UTILITY_STRUCTS_HH__