cpu: Adding SpatterGen

This change adds source code for SpatterGen ClockedObject.
The set of source code pushed includes code for SpatterKernel
that tracks whether information is being gathered or scattered
as well as the list of indices to be accessed. This model
exposes a PyBindMethod to add SpatterKernels from Python.
This way all the preparation of kernels can be done in Python.
SpatterGen has a few parameters that model limits on a few of
the hardware resources in the backend of a processor, e.g. the
number of functional units that calculate effective addresses,
the latency of calculating an effective address, and the number
of integer registers.

Change-Id: I451ffb385180a914e884cab220928c5f1944b2e3
This commit is contained in:
Mahyar Samani
2024-05-14 17:35:15 -07:00
parent 3cfc550fc0
commit 6695e5ef70
5 changed files with 1229 additions and 0 deletions

View File

@@ -0,0 +1,38 @@
# Copyright (c) 2024 The Regents of The University of California
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Import("*")
SimObject(
"SpatterGen.py",
sim_objects=["SpatterGen"],
enums=["SpatterKernelType", "SpatterProcessingMode"],
)
Source("spatter_gen.cc")
DebugFlag("SpatterGen")
DebugFlag("SpatterKernel")

View File

@@ -0,0 +1,115 @@
# Copyright (c) 2024 The Regents of The University of California
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from m5.citations import add_citation
from m5.objects.ClockedObject import ClockedObject
from m5.params import *
from m5.proxy import *
from m5.util.pybind import PyBindMethod
# Whether a kernel writes to (scatter) or reads from (gather) the
# values array.
class SpatterKernelType(Enum):
    vals = ["scatter", "gather"]
# synchronous: all SpatterGen cores synchronize on kernel boundaries;
# asynchronous: each core proceeds through its kernels independently.
class SpatterProcessingMode(Enum):
    vals = ["synchronous", "asynchronous"]
class SpatterGen(ClockedObject):
    """Plays back Spatter gather/scatter kernels.

    Models a handful of processor-backend limits: the number and
    latency of address-generation units (request_gen_rate,
    request_gen_latency), register file sizes, the request buffer
    depth, and the number of cache ports (send_rate).
    """

    type = "SpatterGen"
    cxx_header = "cpu/testers/spatter_gen/spatter_gen.hh"
    cxx_class = "gem5::SpatterGen"

    system = Param.System(Parent.any, "System this SpatterGen is a part of.")
    # NOTE: fixed typo "accross" -> "across" in the description.
    processing_mode = Param.SpatterProcessingMode(
        "How to process kernels across multiple SpatterGen cores. "
        "Whether to synchronize on kernel boundaries or not."
    )
    port = RequestPort("Port to send memory requests.")
    int_regfile_size = Param.Int("Size of the integer register file.")
    fp_regfile_size = Param.Int("Size of the floating point register file.")
    request_gen_latency = Param.Cycles(
        "Number of cycles to spend for creating a request."
    )
    # NOTE: fixed typo "generate" -> "generated" in the description.
    request_gen_rate = Param.Int("Number of requests generated per cycle.")
    request_buffer_entries = Param.Int("Size of the request buffer.")
    # NOTE: added the missing space at the string-literal join so the
    # rendered help text does not read "parallel.Emulates".
    send_rate = Param.Int(
        "Number of requests to send in parallel. "
        "Emulates the number of dcache ports."
    )

    # C++ methods callable from the Python frontend.
    cxx_exports = [
        PyBindMethod("addKernel"),
        PyBindMethod("proceedPastSyncPoint"),
    ]
# Register the Spatter paper as the citation for this model.
# NOTE: restored the page range "209--222"; the en dash was lost in
# extraction, leaving the bogus single number "209222".
add_citation(
    SpatterGen,
    """@inproceedings{10.1145/3422575.3422794,
author = {Lavin, Patrick and Young, Jeffrey and Vuduc, Richard and Riedy,
Jason and Vose, Aaron and Ernst, Daniel},
title = {Evaluating Gather and Scatter Performance on CPUs and GPUs},
year = {2021},
isbn = {9781450388993},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3422575.3422794},
doi = {10.1145/3422575.3422794},
abstract = {This paper describes a new benchmark tool,
Spatter, for assessing memory system architectures in the context of a
specific category of indexed accesses known as gather and scatter.
These types of operations are increasingly used to express sparse and
irregular data access patterns, and they have widespread utility in many
modern HPC applications including scientific simulations, data mining and
analysis computations, and graph processing. However, many traditional
benchmarking tools like STREAM, STRIDE, and GUPS focus on characterizing
only uniform stride or fully random accesses despite evidence that modern
applications use varied sets of more complex access patterns. Spatter is an
open-source benchmark that provides a tunable and configurable framework to
benchmark a variety of indexed access patterns, including variations of gather
/ scatter that are seen in HPC mini-apps evaluated in this work. The design of
Spatter includes backends for OpenMP and CUDA, and experiments show how it can
be used to evaluate 1) uniform access patterns for CPU and GPU, 2) prefetching
regimes for gather / scatter, 3) compiler implementations of vectorization for
gather / scatter, and 4) trace-driven “proxy patterns” that reflect the
patterns found in multiple applications. The results from Spatter experiments
show, for instance, that GPUs typically outperform CPUs for these operations
in absolute bandwidth but not fraction of peak bandwidth, and that Spatter can
better represent the performance of some cache-dependent mini-apps than
traditional STREAM bandwidth measurements.},
booktitle = {Proceedings of the International Symposium on Memory Systems},
pages = {209--222},
numpages = {14},
location = {Washington, DC, USA},
series = {MEMSYS '20}
}
""",
)

View File

@@ -0,0 +1,582 @@
/*
* Copyright (c) 2024 The Regents of The University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cpu/testers/spatter_gen/spatter_gen.hh"
#include "base/cprintf.hh"
#include "debug/SpatterGen.hh"
#include "debug/SpatterKernel.hh"
#include "enums/SpatterKernelType.hh"
#include "enums/SpatterProcessingMode.hh"
#include "mem/packet.hh"
#include "sim/sim_exit.hh"
#include "sim/system.hh"
namespace gem5
{
using enums::SpatterKernelTypeStrings;
using enums::SpatterProcessingMode;
// Construct a SpatterGen from its generated params. All hardware-limit
// knobs (register file sizes, AGU count/latency, buffer depth, cache
// ports) are captured here; the per-AGU and per-port busy trackers are
// sized to requestGenRate and sendRate respectively.
SpatterGen::SpatterGen(const Params& params):
    ClockedObject(params),
    state(SpatterGenState::RUNNING),
    requestorId(params.system->getRequestorId(this)),
    numPendingMemRequests(0),
    stats(this),
    mode(params.processing_mode),
    port(this, name() + ".port"),
    intRegFileSize(params.int_regfile_size), intRegUsed(0),
    fpRegFileSize(params.fp_regfile_size), fpRegUsed(0),
    requestGenLatency(params.request_gen_latency),
    requestGenRate(params.request_gen_rate),
    firstGeneratorAvailableTime(0),
    nextGenEvent([this](){ processNextGenEvent(); }, name() + ".GenEvent"),
    requestBufferEntries(params.request_buffer_entries),
    requestBuffer(clockPeriod()),
    sendRate(params.send_rate),
    firstPortAvailableTime(0),
    nextSendEvent([this](){ processNextSendEvent(); }, name() + ".SendEvent"),
    receiveBuffer(clockPeriod())
{
    // NOTE: added spaces at the string-literal joins; the original
    // message rendered as "entries.if ... size,it ... simulation.Ideally".
    fatal_if(fpRegFileSize < requestBufferEntries,
        "fp_regfile_size should be >= request_buffer_entries. "
        "If request_buffer_entries is bigger than fp_regfile_size, "
        "it may result in inaccuracies in your simulation. "
        "Ideally: fp_regfile_size >> request_buffer_entries."
    );
    // One busy-until tick per modeled AGU / cache port.
    generatorBusyUntil.resize(requestGenRate, 0);
    portBusyUntil.resize(sendRate, 0);
}
Port&
SpatterGen::getPort(const std::string& if_name, PortID idx)
{
    // Anything other than our single request port is the base class's
    // problem.
    if (if_name != "port") {
        return ClockedObject::getPort(if_name, idx);
    }
    return port;
}
void
SpatterGen::startup()
{
    // Kick off request generation as soon as simulation begins.
    scheduleNextGenEvent(curTick());
}
void
SpatterGen::SpatterGenPort::sendPacket(PacketPtr pkt)
{
    // Callers must check blocked() first: there is room for exactly one
    // stashed packet.
    panic_if(blocked(), "Should never try to send if port is blocked.");
    bool accepted = sendTimingReq(pkt);
    if (accepted) {
        return;
    }
    // The peer rejected the packet; stash it until recvReqRetry arrives.
    blockedPacket = pkt;
    DPRINTF(
        SpatterGen,
        "%s: Port blocked when sending %s.\n",
        __func__, pkt->print()
    );
}
void
SpatterGen::SpatterGenPort::recvReqRetry()
{
    DPRINTF(SpatterGen, "%s: Port received a ReqRetry.\n", __func__);
    // A retry only makes sense if a previous sendTimingReq was rejected.
    panic_if(
        blockedPacket == nullptr,
        "Received reqRetry with no blocked packet."
    );
    if (!sendTimingReq(blockedPacket)) {
        // Still rejected: keep the stashed packet and wait for the next
        // retry from the peer.
        DPRINTF(
            SpatterGen,
            "%s: Port blocked when sending %s.\n",
            __func__, blockedPacket->print()
        );
    } else {
        // The packet finally went through: clear the stash and let the
        // owner wake its send machinery.
        blockedPacket = nullptr;
        owner->recvReqRetry();
    }
}
void
SpatterGen::recvReqRetry()
{
    // The port just unblocked. Nothing to do unless the send event put
    // itself to sleep waiting for exactly this retry.
    if (!nextSendEvent.pending()) {
        return;
    }
    nextSendEvent.wake();
    scheduleNextSendEvent(nextCycle());
}
// Forward responses straight to the owning SpatterGen.
bool
SpatterGen::SpatterGenPort::recvTimingResp(PacketPtr pkt) {
    return owner->recvTimingResp(pkt);
}
bool
SpatterGen::recvTimingResp(PacketPtr pkt)
{
    DPRINTF(SpatterGen, "%s: Received pkt: %s.\n", __func__, pkt->print());
    assert(pkt->isResponse());
    // record trip time.
    SpatterAccess* spatter_access = pkt->findNextSenderState<SpatterAccess>();
    Tick trip_time = (curTick() - requestDepartureTime[pkt->req]);
    requestDepartureTime.erase(pkt->req);
    spatter_access->recordTripTime(trip_time);
    int trips_left = spatter_access->tripsLeft();
    assert(trips_left >= 0);
    if (trips_left > 0) {
        // This response carried an index (more indirection levels remain):
        // queue the access for another round of address generation.
        stats.numIndexReads++;
        stats.indexBytesRead += pkt->getSize();
        stats.totalIndexReadLatency += trip_time;
        stats.indexAccessLatency.sample(trip_time);
        receiveBuffer.push(spatter_access, curTick());
    } else {
        // Final (value) access completed: account per kernel type and
        // retire the SpatterAccess.
        stats.valueAccessLatency.sample(trip_time);
        stats.totalIndirectAccessLatency.sample(
            spatter_access->tripTimeSoFar()
        );
        if (spatter_access->type() == SpatterKernelType::gather) {
            stats.numValueReads++;
            stats.valueBytesRead += pkt->getSize();
            stats.totalValueReadLatency += trip_time;
        } else if (spatter_access->type() == SpatterKernelType::scatter) {
            stats.numValueWrites++;
            stats.valueBytesWritten += pkt->getSize();
            stats.totalValueWriteLatency += trip_time;
        } else {
            panic("Unknown kernel type.");
        }
        // CAUTION: We're going to decrement fpRegUsed here,
        // it could cause inaccuracies if processNextGenEvent
        // is called after recvTimingResp on the same tick.
        // i.e. we might end up releasing a register on the same
        // cycle that we are allocating it.
        // it's probably not going to ever be an issue since
        // fpRegFileSize is probably >> requestBufferEntries
        // i.e. the chances of running out of fp registers is low because
        // we do not simulate parts of the pipeline that back things up into
        // fp registers, e.g. functional units of ALU.
        fpRegUsed--;
        delete spatter_access;
    }
    // delete the pkt since we don't need it anymore.
    delete pkt;
    // A freed fp register or a newly queued index may unblock generation.
    if (!nextGenEvent.pending()) {
        scheduleNextGenEvent(nextCycle());
    }
    numPendingMemRequests--;
    checkForSimExit();
    return true;
}
void
SpatterGen::addKernel(
uint32_t id, uint32_t delta, uint32_t count,
SpatterKernelType type,
size_t index_size, Addr base_index_addr,
size_t value_size, Addr base_value_addr,
const std::vector<uint32_t>& indices
)
{
DPRINTF(
SpatterGen,
"%s: Adding kernel with id: %d, delta: %d, count: %d, type: %s.\n",
__func__, id, delta, count, SpatterKernelTypeStrings[type]
);
SpatterKernel new_kernel(
requestorId,
id, delta, count, type,
index_size, base_index_addr,
value_size, base_value_addr
);
new_kernel.setIndices(indices);
kernels.push(new_kernel);
}
void
SpatterGen::proceedPastSyncPoint()
{
    // Exported to Python (PyBindMethod): called once every core has
    // reached WAITING, releasing this core past the kernel barrier.
    // Only legal in synchronous processing mode.
    assert(mode == SpatterProcessingMode::synchronous);
    assert(state == SpatterGenState::WAITING);
    state = SpatterGenState::RUNNING;
    scheduleNextGenEvent(nextCycle());
}
void
SpatterGen::checkForSimExit()
{
    // Exit the sim loop when nothing is in flight, nothing is queued to
    // send, and no access of any level (initial/intermediate/ultimate)
    // could be started right now.
    bool no_pending = numPendingMemRequests == 0;
    bool no_queued = requestBuffer.empty();
    int avail_int_regs = intRegFileSize - intRegUsed;
    int avail_fp_regs = fpRegFileSize - fpRegUsed;
    bool can_do_init = initAccessOk(avail_int_regs, avail_fp_regs, curTick());
    bool can_do_mid = interAccessOk(avail_int_regs, avail_fp_regs, curTick());
    bool can_do_ult = ultAccessOk(avail_int_regs, avail_fp_regs, curTick());
    if (!can_do_init && !can_do_mid && !can_do_ult && no_pending && no_queued)
    {
        // Running dry is only legal while DRAINING in synchronous mode;
        // in asynchronous mode the state is always RUNNING.
        assert((
            (mode == SpatterProcessingMode::synchronous) &&
            (state == SpatterGenState::DRAINING)
        ) ||
        mode == SpatterProcessingMode::asynchronous
        );
        state = SpatterGenState::WAITING;
        exitSimLoop(
            csprintf("%s received all expected responses.", name()),
            0,
            nextCycle()
        );
    }
}
bool
SpatterGen::initAccessOk(int int_regs, int fp_regs, Tick when) const
{
    // An initial (first-level index) access needs a free int register
    // and a kernel left to play. In asynchronous mode the state is
    // always RUNNING; in synchronous mode a DRAINING/WAITING generator
    // must not start new indirect accesses, so the state check alone
    // gates kernel consumption.
    if (state != SpatterGenState::RUNNING) {
        return false;
    }
    return !kernels.empty() && (int_regs > 0);
}
bool
SpatterGen::interAccessOk(int int_regs, int fp_regs, Tick when) const
{
    // An intermediate access consumes a received index to fetch another
    // index: it needs a free int register and a ready response whose
    // SpatterAccess still has more than one trip left.
    if ((int_regs <= 0) || !receiveBuffer.hasReady(when)) {
        return false;
    }
    return receiveBuffer.front()->tripsLeft() > 1;
}
bool
SpatterGen::ultAccessOk(int int_regs, int fp_regs, Tick when) const
{
    // An ultimate (value) access needs a free fp register and a ready
    // response whose SpatterAccess has exactly one trip remaining.
    if ((fp_regs <= 0) || !receiveBuffer.hasReady(when)) {
        return false;
    }
    return receiveBuffer.front()->tripsLeft() == 1;
}
void
SpatterGen::scheduleNextGenEvent(Tick when)
{
    int avail_int_regs = intRegFileSize - intRegUsed;
    int avail_fp_regs = fpRegFileSize - fpRegUsed;
    // NOTE(review): readiness is evaluated at curTick() even though the
    // event may fire at the later schedule_tick — confirm this is
    // intentional (receiveBuffer entries only become ready over time).
    bool have_work = initAccessOk(avail_int_regs, avail_fp_regs, curTick()) ||
                    interAccessOk(avail_int_regs, avail_fp_regs, curTick()) ||
                    ultAccessOk(avail_int_regs, avail_fp_regs, curTick());
    // Never schedule earlier than the first tick an AGU frees up.
    Tick schedule_tick = std::max(when, firstGeneratorAvailableTime);
    if (have_work && (!nextGenEvent.scheduled())) {
        schedule(nextGenEvent, schedule_tick);
        // Reset the tracker; processNextGenEvent recomputes it.
        firstGeneratorAvailableTime = MaxTick;
    }
}
void
SpatterGen::processNextGenEvent()
{
    assert(!nextGenEvent.pending());
    // Remember the buffer depth so we can tell whether this event
    // produced anything (and therefore needs to wake the send side).
    int req_buf_before = requestBuffer.size();
    // track changes to intRegUsed in this variable and apply it
    // at the end of the for loop. This way if we free a register
    // in the for loop, other iterations of the for loop won't
    // observe this change. This matches what happens in real h/w.
    int int_used_now = 0;
    // track this independently to prevent different iterations inside
    // the for loop observing changes to h/w resources, i.e. we can't rely
    // on intRegFileSize - intRegUsed to see if we have registers to
    // allocate since they don't change until after the for loop.
    int int_regs_now = intRegFileSize - intRegUsed;
    // same explanation as int_used_now
    int fp_used_now = 0;
    // same explanation as int_regs_now
    int fp_regs_now = fpRegFileSize - fpRegUsed;
    // One iteration per modeled AGU.
    for (int i = 0; i < requestGenRate; i++) {
        if (generatorBusyUntil[i] > curTick()) {
            DPRINTF(
                SpatterGen,
                "%s: AGU[%d] is busy this cycle.\n", __func__, i
            );
            continue;
        }
        if (!(requestBuffer.size() < requestBufferEntries)) {
            // if no space left in the requestBuffer sleep;
            // whoever pops from requestBuffer wakes us up.
            nextGenEvent.sleep();
            break;
        }
        // Now we know that AGU[i] is available and there is room
        // in the requestBuffer to put the packet.
        // Priority: finish in-flight accesses (ultimate, then
        // intermediate) before starting brand-new initial accesses.
        if (ultAccessOk(int_regs_now, fp_regs_now, curTick())) {
            // occupy one fp register
            fp_regs_now--;
            fp_used_now++;
            // make AGU busy for the next requestGenLatency cycles.
            generatorBusyUntil[i] = clockEdge(Cycles(requestGenLatency));
            // create a new packet to access
            SpatterAccess* spatter_access = receiveBuffer.front();
            PacketPtr pkt = spatter_access->nextPacket();
            pkt->pushSenderState(spatter_access);
            // push to requestBuffer
            requestBuffer.push(pkt, curTick());
            DPRINTF(
                SpatterGen,
                "%s: Pushed pkt: %s to requestBuffer.\n",
                __func__, pkt->print()
            );
            // now deallocate resources for reading the index
            // (the int register that held the consumed index).
            int_used_now--;
            receiveBuffer.pop();
        } else if (interAccessOk(int_regs_now, fp_regs_now, curTick())) {
            // occupy one int register
            int_regs_now--;
            int_used_now++;
            // make AGU busy for the next requestGenLatency cycles.
            generatorBusyUntil[i] = clockEdge(Cycles(requestGenLatency));
            // create a new packet to access
            SpatterAccess* spatter_access = receiveBuffer.front();
            PacketPtr pkt = spatter_access->nextPacket();
            pkt->pushSenderState(spatter_access);
            // push to requestBuffer
            requestBuffer.push(pkt, curTick());
            DPRINTF(
                SpatterGen,
                "%s: Pushed pkt: %s to requestBuffer.\n",
                __func__, pkt->print()
            );
            // now deallocate resources for reading the index
            // (net int_used_now change is zero: one freed, one taken).
            int_used_now--;
            receiveBuffer.pop();
        } else if (initAccessOk(int_regs_now, fp_regs_now, curTick())) {
            // occupy one int register
            int_regs_now--;
            int_used_now++;
            generatorBusyUntil[i] = clockEdge(Cycles(requestGenLatency));
            SpatterKernel& front = kernels.front();
            SpatterAccess* spatter_access = front.nextSpatterAccess();
            PacketPtr pkt = spatter_access->nextPacket();
            pkt->pushSenderState(spatter_access);
            requestBuffer.push(pkt, curTick());
            DPRINTF(
                SpatterGen,
                "%s: Pushed pkt: %s to requestBuffer.\n",
                __func__, pkt->print()
            );
            if (front.done()) {
                DPRINTF(
                    SpatterKernel,
                    "%s: Done with kernel %d type: %s.\n",
                    __func__, front.id(),
                    SpatterKernelTypeStrings[front.type()]
                );
                kernels.pop();
                // If we're processing synchronously we now have to stop
                // making initial accesses and wait for everyone to
                // receive all expected responses.
                if (mode == SpatterProcessingMode::synchronous) {
                    state = SpatterGenState::DRAINING;
                }
            }
        } else {
            // No access of any level can be made: out of registers
            // and/or nothing ready in the receive buffer.
            DPRINTF(
                SpatterGen,
                "%s: Nothing more could be done this cycle.\n", __func__
            );
            DPRINTF(SpatterGen, "%s: Here is h/w status report: "
                "{KERNELS_REMAIN: %d, INDEXES_REMAIN: %d, INT_REG_USED: %d, "
                "FP_REG_USED: %d, REQ_BUFF_SIZE: %d}.\n",
                __func__, kernels.size(), receiveBuffer.size(),
                intRegUsed, fpRegUsed, requestBuffer.size());
            break;
        }
    }
    // update firstGeneratorAvailableTime after making all changes.
    for (int i = 0; i < requestGenRate; i++) {
        generatorBusyUntil[i] = std::max(generatorBusyUntil[i], nextCycle());
        firstGeneratorAvailableTime = std::min(
            firstGeneratorAvailableTime,
            generatorBusyUntil[i]
        );
    }
    // now that we have simulated all the work of this cycle, we can
    // apply the deltas to the h/w resources.
    intRegUsed += int_used_now;
    fpRegUsed += fp_used_now;
    bool did_work = (requestBuffer.size() - req_buf_before) > 0;
    if (did_work && (!nextSendEvent.pending())) {
        scheduleNextSendEvent(nextCycle());
    }
    if (!nextGenEvent.pending()) {
        scheduleNextGenEvent(firstGeneratorAvailableTime);
    }
}
void
SpatterGen::scheduleNextSendEvent(Tick when)
{
    // Nothing queued, or an event already on the calendar? Leave the
    // schedule alone; the existing event (or a later caller) handles it.
    if (requestBuffer.empty() || nextSendEvent.scheduled()) {
        return;
    }
    // Never schedule earlier than the first tick a cache port frees up.
    schedule(nextSendEvent, std::max(when, firstPortAvailableTime));
    // Reset the tracker; processNextSendEvent recomputes it.
    firstPortAvailableTime = MaxTick;
}
void
SpatterGen::processNextSendEvent()
{
    // Remember the buffer depth so we can tell whether this event
    // drained anything (and so may unblock a sleeping gen event).
    int req_buf_before = requestBuffer.size();
    // One iteration per modeled cache port.
    for (int i = 0; i < sendRate; i++) {
        if (portBusyUntil[i] > curTick()) {
            DPRINTF(
                SpatterGen,
                "%s: Port[%d] is busy this cycle.\n", __func__, i
            );
            continue;
        }
        if (requestBuffer.empty()) {
            DPRINTF(
                SpatterGen,
                "%s: No packets to send this cycle.\n", __func__
            );
            break;
        }
        if (!requestBuffer.hasReady(curTick())) {
            // Front packet was inserted too recently (TimedQueue delay).
            DPRINTF(
                SpatterGen,
                "%s: Packet at front of requestBuffer not ready this cycle.\n",
                __func__
            );
            break;
        }
        PacketPtr pkt = requestBuffer.front();
        DPRINTF(
            SpatterGen,
            "%s: Sending pkt: %s to port[%d].\n",
            __func__, pkt->print(), i
        );
        // NOTE: We assume the port will be busy for 1 cycle.
        portBusyUntil[i] = clockEdge(Cycles(1));
        port.sendPacket(pkt);
        requestBuffer.pop();
        // increase numPendingMemRequests
        numPendingMemRequests++;
        // record packet departure time
        requestDepartureTime[pkt->req] = curTick();
        // Now if we put the port in blocked state no point in continuing
        // the loop. also no point in scheduling nextSendEvent;
        // recvReqRetry will wake us when the port unblocks.
        if (port.blocked()) {
            nextSendEvent.sleep();
            break;
        }
    }
    // update firstPortAvailableTime after making all changes.
    for (int i = 0; i < sendRate; i++) {
        // if the port was not used this cycle, it's busy until nextCycle().
        portBusyUntil[i] = std::max(portBusyUntil[i], nextCycle());
        firstPortAvailableTime = std::min(
            firstPortAvailableTime,
            portBusyUntil[i]
        );
    }
    bool did_work = (req_buf_before - requestBuffer.size()) > 0;
    if (did_work && nextGenEvent.pending()) {
        // since this event might open up space for output of nextGenEvent,
        // it should wake it up if nextGenEvent is asleep.
        nextGenEvent.wake();
        scheduleNextGenEvent(nextCycle());
    }
    if (!nextSendEvent.pending()) {
        scheduleNextSendEvent(nextCycle());
    }
}
// Statistics group owned by a SpatterGen; "index" stats cover reads of
// the index array, "value" stats cover the final gather/scatter access.
SpatterGen::SpatterGenStats::SpatterGenStats(SpatterGen* spatter_gen):
    statistics::Group(spatter_gen), spatterGen(spatter_gen),
    ADD_STAT(numIndexReads, statistics::units::Count::get(),
             "Number of reads from the indexer array."),
    ADD_STAT(indexBytesRead, statistics::units::Byte::get(),
             "Number of bytes read from the indexer array."),
    ADD_STAT(totalIndexReadLatency, statistics::units::Tick::get(),
             "Total latency for reading from the indexer array."),
    ADD_STAT(numValueReads, statistics::units::Count::get(),
             "Number of reads from the values array."),
    ADD_STAT(numValueWrites, statistics::units::Count::get(),
             "Number of writes to the values array."),
    ADD_STAT(valueBytesRead, statistics::units::Byte::get(),
             "Number of bytes read from the values array."),
    ADD_STAT(valueBytesWritten, statistics::units::Byte::get(),
             "Number of bytes written to the values array."),
    ADD_STAT(totalValueReadLatency, statistics::units::Tick::get(),
             "Total latency for reading from the values array."),
    ADD_STAT(totalValueWriteLatency, statistics::units::Tick::get(),
             "Total latency for writing to the values array."),
    ADD_STAT(indexAccessLatency, statistics::units::Tick::get(),
             "Distribution of latency for accessing the indexer array."),
    ADD_STAT(valueAccessLatency, statistics::units::Tick::get(),
             "Distribution of latency for accessing the values array."),
    ADD_STAT(totalIndirectAccessLatency, statistics::units::Tick::get(),
             "Distribution of total latency for indirect accesses.")
{}
void
SpatterGen::SpatterGenStats::regStats()
{
using namespace statistics;
indexAccessLatency.init(8);
valueAccessLatency.init(16);
totalIndirectAccessLatency.init(16);
}
} // namespace gem5

View File

@@ -0,0 +1,252 @@
/*
* Copyright (c) 2024 The Regents of The University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __CPU_TESTERS_SPATTER_GEN_SPATTER_GEN_HH__
#define __CPU_TESTERS_SPATTER_GEN_SPATTER_GEN_HH__
#include <queue>
#include <unordered_map>
#include <vector>
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "cpu/testers/spatter_gen/utility_structs.hh"
#include "enums/SpatterKernelType.hh"
#include "enums/SpatterProcessingMode.hh"
#include "mem/packet.hh"
#include "mem/port.hh"
#include "params/SpatterGen.hh"
#include "sim/clocked_object.hh"
#include "sim/eventq.hh"
namespace gem5
{
/**
* @class SpatterGen
* @brief Spatter Kernel Player
*
* This class takes Spatter JSON traces and plays them back in gem5.
* Each trace includes a list of Spatter kernels, which are played in order.
* Kernels are either of type scatter or gather.
* At the time of writing, kernels represent accesses to the memory with
* one level of indirection.
* Initially, an access is made to an array which we call index from now on.
* The index array is streamed through with load accesses.
* In a high level programming language this access will be similar to below.
* "for (int i = 0; i < n; i++) { idx = index[i]; }".
* The value at index[i] is then used to access another array which we will
* call value from now on.
* For scatter type kernels, a random value is stored in the location and
* for gather type kernels, the value is read from the location.
* In a high level programming language this access will be similar to below.
* Scatter
* "for (int i = 0; i < n; i++) { idx = index[i]; value[idx] = rand(); }".
* Gather
* "for (int i = 0; i < n; i++) { idx = index[i]; val = value[idx]; }".
* For more information you can take a look at
* https://github.com/hpcgarage/spatter/blob/main/README.md
* While the readme mentions MultiScatter and MultiGather kernels, the
* trace format is not finalized (at the time of writing).
*/
class SpatterGen: public ClockedObject
{
private:
typedef enums::SpatterKernelType SpatterKernelType;
typedef enums::SpatterProcessingMode SpatterProcessingMode;
class SpatterGenEvent : public EventFunctionWrapper
{
private:
// TODO: split pending into pendingInput and pendingOutput
enum class SleepState
{
AWAKE,
ASLEEP
};
SleepState _state;
public:
SpatterGenEvent(const std::function<void(void)> &callback,
const std::string &name):
EventFunctionWrapper(callback, name), _state(SleepState::AWAKE)
{}
// a SpatterGenEvent will only be asleep if it is pending output
bool pending() const { return _state == SleepState::ASLEEP; }
void sleep() { _state = SleepState::ASLEEP; }
void wake() { _state = SleepState::AWAKE; }
};
class SpatterGenPort: public RequestPort
{
private:
SpatterGen* owner;
PacketPtr blockedPacket;
public:
SpatterGenPort(SpatterGen* owner, const std::string& name):
RequestPort(name), owner(owner), blockedPacket(nullptr) {}
void sendPacket(PacketPtr pkt);
bool blocked() const { return blockedPacket != nullptr; }
protected:
virtual bool recvTimingResp(PacketPtr pkt) override;
virtual void recvReqRetry() override;
};
struct SpatterGenStats: public statistics::Group
{
SpatterGen* spatterGen;
// TODO: When we enable multiple levels of indirection, we should
// convert this to a vector with one stat for each level of index
statistics::Scalar numIndexReads;
// TODO: When we enable multiple levels of indirection, we should
// convert this to a vector with one stat for each level of index
statistics::Scalar indexBytesRead;
statistics::Scalar totalIndexReadLatency;
statistics::Scalar numValueReads;
statistics::Scalar numValueWrites;
statistics::Scalar valueBytesRead;
statistics::Scalar valueBytesWritten;
statistics::Scalar totalValueReadLatency;
statistics::Scalar totalValueWriteLatency;
// TODO: When we enable multiple levels of indirection, we should
// convert this to a vector with one stat for each level of index
statistics::Histogram indexAccessLatency;
statistics::Histogram valueAccessLatency;
statistics::Histogram totalIndirectAccessLatency;
virtual void regStats() override;
SpatterGenStats(SpatterGen* spatter_gen);
};
enum class SpatterGenState
{
// waiting for all other cores to get to WAITING state, no accesses
WAITING,
// only creating intermediate and ultimate accesses, i.e. wrapping up
DRAINING,
// creating all kinds of accesses, initial, intermediate, and ultimate
RUNNING
};
// non param related members
// current synchronization state (one of SpatterGenState)
SpatterGenState state;
// FIFO of kernels to simulate; kernels run in insertion order
std::queue<SpatterKernel> kernels;
// tick at which each in-flight request was sent toward memory,
// used to compute the round-trip latency of its response
std::unordered_map<RequestPtr, Tick> requestDepartureTime;
RequestorID requestorId;
// number of requests issued that have not yet received a response
int numPendingMemRequests;
SpatterGenStats stats;
// NOTE(review): presumably signals simulation exit once all kernels
// are done -- confirm against the .cc implementation.
void checkForSimExit();
// whether an initial/intermediate/ultimate access can be generated at
// tick `when`, given the counts of free int/fp registers passed in
bool initAccessOk(int int_regs, int fp_regs, Tick when) const;
bool interAccessOk(int int_regs, int fp_regs, Tick when) const;
bool ultAccessOk(int int_regs, int fp_regs, Tick when) const;
// param related members (not necessarily one-to-one with params)
SpatterProcessingMode mode;
SpatterGenPort port;
// size of the register files,
// for every memory instruction we need to allocate one register.
int intRegFileSize;
int intRegUsed;
int fpRegFileSize;
int fpRegUsed;
// latency to generate a request
int requestGenLatency;
// number of requests generated per event
int requestGenRate;
// tracking smallest tick when at least one "AGU" is available;
Tick firstGeneratorAvailableTime;
// tracking the busy state of our so called "AGU"s.
std::vector<Tick> generatorBusyUntil;
SpatterGenEvent nextGenEvent;
void processNextGenEvent();
// put requests to the cache in the request buffer.
int requestBufferEntries;
// store request packets along with their insertion time into this queue.
TimedQueue<PacketPtr> requestBuffer;
// if nextGenEvent has to be scheduled at tick `when` then schedule it.
// this function should only be called when nextGenEvent is not pending.
void scheduleNextGenEvent(Tick when);
// bandwidth to issue memory requests to cache,
// this is supposed to model the number of cache ports
// we will assume it takes 1 cycle to issue memory requests
int sendRate;
Tick firstPortAvailableTime;
std::vector<Tick> portBusyUntil;
SpatterGenEvent nextSendEvent;
void processNextSendEvent();
// if nextSendEvent has to be scheduled at tick `when` then schedule it.
// this function should only be called when nextSendEvent is not pending.
void scheduleNextSendEvent(Tick when);
// put the memory responses here.
// no need to limit the size of this buffer.
// it's a response buffer and it will automatically
// be limited by requestBufferEntries, intRegFileSize, fpRegFileSize
TimedQueue<SpatterAccess*> receiveBuffer;
public:
PARAMS(SpatterGen);
SpatterGen(const Params& params);
// return the port named `if_name`
Port&
getPort(const std::string& if_name, PortID idx = InvalidPortID) override;
virtual void startup() override;
// port callbacks: retry sending after a rejected request, and
// receive a timing response from memory
void recvReqRetry();
bool recvTimingResp(PacketPtr pkt);
// PyBindMethod to interface adding a kernel with python JSON frontend.
void addKernel(
uint32_t id, uint32_t delta, uint32_t count,
SpatterKernelType type,
size_t index_size, Addr base_index_addr,
size_t value_size, Addr base_value_addr,
const std::vector<uint32_t>& indices
);
// NOTE(review): presumably called when all cores have reached a sync
// point, releasing this core from WAITING -- confirm in the .cc.
void proceedPastSyncPoint();
};
} // namespace gem5
#endif // __CPU_TESTERS_SPATTER_GEN_SPATTER_GEN_HH__

View File

@@ -0,0 +1,242 @@
/*
* Copyright (c) 2024 The Regents of The University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met: redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer;
* redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution;
* neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __CPU_TESTERS_SPATTER_GEN_UTILITY_STRUCTS_HH__
#define __CPU_TESTERS_SPATTER_GEN_UTILITY_STRUCTS_HH__
#include <deque>
#include <memory>
#include <queue>
#include <tuple>
#include <utility>
#include <vector>

#include "base/random.hh"
#include "base/types.hh"
#include "enums/SpatterKernelType.hh"
#include "mem/packet.hh"
namespace gem5
{
// A FIFO queue whose items only become "ready" after they have spent
// at least `latency` ticks in the queue. Used to model fixed-latency
// buffers (e.g. request/response buffers) in SpatterGen.
template<typename T>
class TimedQueue
{
  private:
    // minimum number of ticks an item must stay queued before it is
    // considered ready for consumption
    Tick latency;

    // items and their insertion ticks advance in lockstep
    std::queue<T> items;
    std::queue<Tick> insertionTimes;

  public:
    TimedQueue(Tick latency): latency(latency) {}

    // Enqueue `item`, remembering when it was inserted so hasReady()
    // can enforce `latency`. Move the by-value parameter instead of
    // copying it a second time.
    void push(T item, Tick insertion_time)
    {
        items.push(std::move(item));
        insertionTimes.push(insertion_time);
    }
    void pop()
    {
        items.pop();
        insertionTimes.pop();
    }
    // Return by const reference to avoid copying the front item on
    // every inspection (the original returned T by value).
    const T& front() const { return items.front(); }

    bool empty() const { return items.empty(); }
    size_t size() const { return items.size(); }

    // True iff the oldest item has been queued for at least `latency`
    // ticks as of `current_time`.
    bool hasReady(Tick current_time) const
    {
        if (empty()) {
            return false;
        }
        return (current_time - insertionTimes.front()) >= latency;
    }
};
// A single multi-trip access generated by a SpatterKernel.
// The struct supports arbitrarily many levels of indirection, although
// SpatterKernel itself only queues one level (i.e. value[index[i]]).
struct SpatterAccess : public Packet::SenderState
{
    typedef std::tuple<Addr, size_t> AccessPair;
    typedef enums::SpatterKernelType SpatterKernelType;

    // requestor id stamped on every Request created for this access
    RequestorID requestorId;
    // gather (load) or scatter (store)
    SpatterKernelType _kernelType;
    // accumulated round-trip time over all completed trips
    Tick accTripTime;
    // remaining (addr, size) pairs to visit, front first
    std::queue<AccessPair> accessPairs;

    SpatterAccess(
        RequestorID requestor_id,
        SpatterKernelType kernel_type,
        const std::queue<AccessPair>& access_pairs
    ):
        requestorId(requestor_id), _kernelType(kernel_type),
        accTripTime(0), accessPairs(access_pairs)
    {}

    SpatterKernelType type() const { return _kernelType; }
    int tripsLeft() const { return accessPairs.size(); }
    void recordTripTime(Tick trip_time) { accTripTime += trip_time; }
    Tick tripTimeSoFar() const { return accTripTime; }

    // Pop and return the next (addr, size) pair to access.
    AccessPair nextAccessPair()
    {
        assert(!accessPairs.empty());
        AccessPair pair = accessPairs.front();
        accessPairs.pop();
        return pair;
    }

    // Build the packet for the next trip of this access.
    PacketPtr nextPacket()
    {
        Addr pkt_addr;
        size_t pkt_size;
        std::tie(pkt_addr, pkt_size) = nextAccessPair();
        // Every trip except the last one reads an index; the last trip
        // reads the value for gathers and writes it for scatters.
        MemCmd pkt_cmd = MemCmd::ReadReq;
        if (tripsLeft() == 0 && _kernelType != SpatterKernelType::gather) {
            pkt_cmd = MemCmd::WriteReq;
        }
        return createPacket(pkt_addr, pkt_size, pkt_cmd);
    }

    // Allocate a Request/Packet pair for `size` bytes at `addr`.
    PacketPtr createPacket(Addr addr, size_t size, MemCmd cmd) const
    {
        RequestPtr req = std::make_shared<Request>(addr, size, 0, requestorId);
        // Dummy PC so PC-based prefetchers have something to latch on;
        // shift the requestor id up to get entropy into higher bits.
        // This is directly borrowed from gem5::TrafficGen.
        req->setPC(((Addr) requestorId) << 2);

        PacketPtr pkt = new Packet(req, cmd);
        uint8_t* pkt_data = new uint8_t[req->getSize()];
        // Randomly initialize packet data, for testing cache coherence.
        for (int i = 0; i < req->getSize(); i++) {
            pkt_data[i] = random_mt.random<uint8_t>();
        }
        pkt->dataDynamic(pkt_data);
        return pkt;
    }
};
// One Spatter kernel: `count` iterations over a fixed index pattern,
// gathering from or scattering to a value array one element at a time.
class SpatterKernel
{
  private:
    typedef enums::SpatterKernelType SpatterKernelType;
    typedef SpatterAccess::AccessPair AccessPair;

    // requestor id stamped on every access this kernel generates
    RequestorID requestorId;
    uint32_t _id;
    // stride added to value indices on each new iteration
    uint32_t delta;
    // total number of iterations over the index pattern
    uint32_t count;
    SpatterKernelType _type;
    size_t indexSize;
    Addr baseIndexAddr;
    size_t valueSize;
    Addr baseValueAddr;
    // running position in the index array; it is never reset, so each
    // iteration reads the next chunk of the index array in memory.
    uint32_t index;
    // current iteration over indices
    uint32_t iteration;
    // rotations of `indices` left before this iteration completes
    uint32_t remRolls;
    // the index pattern, rotated once per generated access
    std::deque<uint32_t> indices;

  public:
    SpatterKernel(
        RequestorID requestor_id,
        uint32_t id, uint32_t delta, uint32_t count,
        SpatterKernelType type,
        size_t index_size, Addr base_index_addr,
        size_t value_size, Addr base_value_addr
    ):
        requestorId(requestor_id),
        _id(id), delta(delta), count(count),
        _type(type),
        indexSize(index_size), baseIndexAddr(base_index_addr),
        valueSize(value_size), baseValueAddr(base_value_addr),
        index(0), iteration(0), remRolls(0)
    {}

    uint32_t id() const { return _id; }

    SpatterKernelType type() const { return _type; }

    // The kernel is done once every iteration has been generated.
    bool done() const { return iteration == count; }

    // Load the index pattern; one full rotation of it is one iteration.
    void setIndices(const std::vector<uint32_t>& pattern)
    {
        indices.assign(pattern.begin(), pattern.end());
        remRolls = indices.size();
    }

    // Generate the next access: an index read followed by an access to
    // value[(delta * iteration) + pattern_front].
    SpatterAccess* nextSpatterAccess()
    {
        std::queue<AccessPair> access_pairs;
        // trip 1: read the next element of the index array
        access_pairs.emplace(
            baseIndexAddr + (index * indexSize), indexSize
        );
        index++;
        // trip 2: access the value element the index points at
        uint32_t pattern_front = indices.front();
        uint32_t value_index = (delta * iteration) + pattern_front;
        access_pairs.emplace(
            baseValueAddr + (value_index * valueSize), valueSize
        );
        // rotate the pattern; a full rotation completes one iteration
        indices.pop_front();
        indices.push_back(pattern_front);
        if (--remRolls == 0) {
            remRolls = indices.size();
            iteration++;
        }
        return new SpatterAccess(requestorId, _type, access_pairs);
    }
};
} // namespace gem5
#endif // __CPU_TESTERS_SPATTER_GEN_UTILITY_STRUCTS_HH__