mem: Fixed implementation of Best Offset Prefetcher (#1403)
This PR fixes the issues with the implementation of the Best Offset Prefetcher described in issue #1402 On branch bop Changes to be committed: modified: src/mem/cache/prefetch/bop.cc modified: src/mem/cache/prefetch/bop.hh --------- Co-authored-by: Setu Gupta <setu.gupta.2020@gamil.com> Co-authored-by: Abhishek Shailendra Singh <abs218@leigh.edu> Co-authored-by: Setu Gupta <setu.gupta@partner.samsung.com>
This commit is contained in:
10
src/mem/cache/prefetch/Prefetcher.py
vendored
10
src/mem/cache/prefetch/Prefetcher.py
vendored
@@ -587,6 +587,16 @@ class BOPPrefetcher(QueuedPrefetcher):
|
||||
queue",
|
||||
)
|
||||
|
||||
# BOP is a degree one prefetcher
|
||||
degree = Param.Int(1, "Number of prefetches to generate")
|
||||
|
||||
queue_squash = True
|
||||
queue_filter = True
|
||||
cache_snoop = True
|
||||
prefetch_on_pf_hit = True
|
||||
on_miss = True
|
||||
on_inst = False
|
||||
|
||||
|
||||
class SBOOEPrefetcher(QueuedPrefetcher):
|
||||
type = "SBOOEPrefetcher"
|
||||
|
||||
152
src/mem/cache/prefetch/bop.cc
vendored
152
src/mem/cache/prefetch/bop.cc
vendored
@@ -1,5 +1,6 @@
|
||||
/**
|
||||
* Copyright (c) 2018 Metempsy Technology Consulting
|
||||
* Copyright (c) 2024 Samsung Electronics
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
@@ -36,7 +37,6 @@ namespace gem5
|
||||
|
||||
namespace prefetch
|
||||
{
|
||||
|
||||
BOP::BOP(const BOPPrefetcherParams &p)
|
||||
: Queued(p),
|
||||
scoreMax(p.score_max), roundMax(p.round_max),
|
||||
@@ -47,7 +47,7 @@ BOP::BOP(const BOPPrefetcherParams &p)
|
||||
delayTicks(cyclesToTicks(p.delay_queue_cycles)),
|
||||
delayQueueEvent([this]{ delayQueueEventWrapper(); }, name()),
|
||||
issuePrefetchRequests(false), bestOffset(1), phaseBestOffset(0),
|
||||
bestScore(0), round(0)
|
||||
bestScore(0), round(0), degree(p.degree)
|
||||
{
|
||||
if (!isPowerOf2(rrEntries)) {
|
||||
fatal("%s: number of RR entries is not power of 2\n", name());
|
||||
@@ -55,16 +55,22 @@ BOP::BOP(const BOPPrefetcherParams &p)
|
||||
if (!isPowerOf2(blkSize)) {
|
||||
fatal("%s: cache line size is not power of 2\n", name());
|
||||
}
|
||||
if (!(p.negative_offsets_enable && (p.offset_list_size % 2 == 0))) {
|
||||
if (p.negative_offsets_enable && (p.offset_list_size % 2 != 0)) {
|
||||
fatal("%s: negative offsets enabled with odd offset list size\n",
|
||||
name());
|
||||
}
|
||||
if (p.degree <= 0) {
|
||||
fatal("%s: prefetch degree must be strictly greater than zero\n",
|
||||
name());
|
||||
}
|
||||
|
||||
rrLeft.resize(rrEntries);
|
||||
rrRight.resize(rrEntries);
|
||||
|
||||
// Following the paper implementation, a list with the specified number
|
||||
// of offsets which are of the form 2^i * 3^j * 5^k with i,j,k >= 0
|
||||
/*
|
||||
* Following the paper implementation, a list with the specified number
|
||||
* of offsets which are of the form 2^i * 3^j * 5^k with i,j,k >= 0
|
||||
*/
|
||||
const int factors[] = { 2, 3, 5 };
|
||||
unsigned int i = 0;
|
||||
int64_t offset_i = 1;
|
||||
@@ -82,8 +88,10 @@ BOP::BOP(const BOPPrefetcherParams &p)
|
||||
if (offset == 1) {
|
||||
offsetsList.push_back(OffsetListEntry(offset_i, 0));
|
||||
i++;
|
||||
// If we want to use negative offsets, add also the negative value
|
||||
// of the offset just calculated
|
||||
/*
|
||||
* If we want to use negative offsets, add also the negative value
|
||||
* of the offset just calculated
|
||||
*/
|
||||
if (p.negative_offsets_enable) {
|
||||
offsetsList.push_back(OffsetListEntry(-offset_i, 0));
|
||||
i++;
|
||||
@@ -103,7 +111,8 @@ BOP::delayQueueEventWrapper()
|
||||
delayQueue.front().processTick <= curTick())
|
||||
{
|
||||
Addr addr_x = delayQueue.front().baseAddr;
|
||||
insertIntoRR(addr_x, RRWay::Left);
|
||||
Addr addr_tag = tag(addr_x);
|
||||
insertIntoRR(addr_x, addr_tag, RRWay::Left);
|
||||
delayQueue.pop_front();
|
||||
}
|
||||
|
||||
@@ -114,22 +123,49 @@ BOP::delayQueueEventWrapper()
|
||||
}
|
||||
|
||||
unsigned int
|
||||
BOP::hash(Addr addr, unsigned int way) const
|
||||
BOP::index(Addr addr, unsigned int way) const
|
||||
{
|
||||
Addr hash1 = addr >> way;
|
||||
Addr hash2 = hash1 >> floorLog2(rrEntries);
|
||||
return (hash1 ^ hash2) & (Addr)(rrEntries - 1);
|
||||
/*
|
||||
* The second parameter, way, is set to 0 for indexing the left side of the
|
||||
* RR Table and, it is set to 1 for indexing the right side of the RR
|
||||
* Table. This is because we always pass the enum RRWay as the way argument
|
||||
* while calling index. This enum is defined in the bop.hh file.
|
||||
*
|
||||
* The indexing function in the author's ChampSim code, which can be found
|
||||
* here: https://comparch-conf.gatech.edu/dpc2/final_program.html, computes
|
||||
* the hash as follows:
|
||||
*
|
||||
* 1. For indexing the left side of the RR Table (way = 0), the cache line
|
||||
* address is XORed with itself after right shifting it by the log base
|
||||
* 2 of the number of entries in the RR Table.
|
||||
* 2. For indexing the right side of the RR Table (way = 1), the cache
|
||||
* line address is XORed with itself after right shifting it by the log
|
||||
* base 2 of the number of entries in the RR Table, multiplied by two.
|
||||
*
|
||||
* Therefore, we if just left shift the log base 2 of the number of RR
|
||||
* entries (log_rr_entries) with the parameter way, then if we are indexing
|
||||
* the left side, we'll leave log_rr_entries as it is, but if we are
|
||||
* indexing the right side, we'll multiply it with 2. Now once we have the
|
||||
* result of this operation, we can right shift the cache line address
|
||||
* (line_addr) by this value to get the first operand of the final XOR
|
||||
* operation. The second operand of the XOR operation is line_addr itself
|
||||
*/
|
||||
Addr log_rr_entries = floorLog2(rrEntries);
|
||||
Addr line_addr = addr >> lBlkSize;
|
||||
Addr hash = line_addr ^ (line_addr >> (log_rr_entries << way));
|
||||
hash &= ((1ULL << log_rr_entries) - 1);
|
||||
return hash % rrEntries;
|
||||
}
|
||||
|
||||
void
|
||||
BOP::insertIntoRR(Addr addr, unsigned int way)
|
||||
BOP::insertIntoRR(Addr addr, Addr tag, unsigned int way)
|
||||
{
|
||||
switch (way) {
|
||||
case RRWay::Left:
|
||||
rrLeft[hash(addr, RRWay::Left)] = addr;
|
||||
rrLeft[index(addr, RRWay::Left)] = tag;
|
||||
break;
|
||||
case RRWay::Right:
|
||||
rrRight[hash(addr, RRWay::Right)] = addr;
|
||||
rrRight[index(addr, RRWay::Right)] = tag;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -141,8 +177,10 @@ BOP::insertIntoDelayQueue(Addr x)
|
||||
return;
|
||||
}
|
||||
|
||||
// Add the address to the delay queue and schedule an event to process
|
||||
// it after the specified delay cycles
|
||||
/*
|
||||
* Add the address to the delay queue and schedule an event to process
|
||||
* it after the specified delay cycles
|
||||
*/
|
||||
Tick process_tick = curTick() + delayTicks;
|
||||
|
||||
delayQueue.push_back(DelayQueueEntry(x, process_tick));
|
||||
@@ -163,7 +201,7 @@ BOP::resetScores()
|
||||
inline Addr
|
||||
BOP::tag(Addr addr) const
|
||||
{
|
||||
return (addr >> blkSize) & tagMask;
|
||||
return (addr >> lBlkSize) & tagMask;
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -185,14 +223,23 @@ BOP::testRR(Addr addr) const
|
||||
}
|
||||
|
||||
void
|
||||
BOP::bestOffsetLearning(Addr x)
|
||||
BOP::bestOffsetLearning(Addr addr_tag)
|
||||
{
|
||||
Addr offset_addr = (*offsetsListIterator).first;
|
||||
Addr lookup_addr = x - offset_addr;
|
||||
Addr offset_tag = (*offsetsListIterator).first;
|
||||
|
||||
/*
|
||||
* Compute the lookup tag for the RR table. Since addr_tag is a tag value,
|
||||
* and not an address, subtracting the offset from addr_tag may result in
|
||||
* integer underflow. Therefore, we first convert the tag back to address
|
||||
* by right shifting it, and then subtract the offset. This gives us a
|
||||
* new lookup address which we use to compute the lookup tag
|
||||
*/
|
||||
Addr lookup_tag = tag((addr_tag << lBlkSize) - (offset_tag << lBlkSize));
|
||||
|
||||
// There was a hit in the RR table, increment the score for this offset
|
||||
if (testRR(lookup_addr)) {
|
||||
DPRINTF(HWPrefetch, "Address %#lx found in the RR table\n", x);
|
||||
if (testRR(lookup_tag)) {
|
||||
DPRINTF(HWPrefetch, "Address %#lx found in the RR table\n",
|
||||
lookup_tag);
|
||||
(*offsetsListIterator).second++;
|
||||
if ((*offsetsListIterator).second > bestScore) {
|
||||
bestScore = (*offsetsListIterator).second;
|
||||
@@ -201,27 +248,37 @@ BOP::bestOffsetLearning(Addr x)
|
||||
}
|
||||
}
|
||||
|
||||
// Move the offset iterator forward to prepare for the next time
|
||||
offsetsListIterator++;
|
||||
|
||||
// All the offsets in the list were visited meaning that a learning
|
||||
// phase finished. Check if
|
||||
/*
|
||||
* All the offsets in the list were visited meaning that a learning
|
||||
* phase finished. Check if
|
||||
*/
|
||||
if (offsetsListIterator == offsetsList.end()) {
|
||||
offsetsListIterator = offsetsList.begin();
|
||||
round++;
|
||||
}
|
||||
|
||||
// Check if the best offset must be updated if:
|
||||
// (1) One of the scores equals SCORE_MAX
|
||||
// (2) The number of rounds equals ROUND_MAX
|
||||
if ((bestScore >= scoreMax) || (round == roundMax)) {
|
||||
// Check if its time to re-calculate the best offset
|
||||
if ((bestScore >= scoreMax) || (round >= roundMax)) {
|
||||
round = 0;
|
||||
|
||||
/*
|
||||
* If the current best score (bestScore) has exceed the threshold to
|
||||
* enable prefetching (badScore), reset the learning structures and
|
||||
* enable prefetch generation
|
||||
*/
|
||||
if (bestScore > badScore) {
|
||||
bestOffset = phaseBestOffset;
|
||||
round = 0;
|
||||
bestScore = 0;
|
||||
phaseBestOffset = 0;
|
||||
resetScores();
|
||||
issuePrefetchRequests = true;
|
||||
} else if (bestScore <= badScore) {
|
||||
} else {
|
||||
issuePrefetchRequests = false;
|
||||
}
|
||||
resetScores();
|
||||
bestScore = 0;
|
||||
phaseBestOffset = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -234,21 +291,23 @@ BOP::calculatePrefetch(const PrefetchInfo &pfi,
|
||||
Addr tag_x = tag(addr);
|
||||
|
||||
if (delayQueueEnabled) {
|
||||
insertIntoDelayQueue(tag_x);
|
||||
insertIntoDelayQueue(addr);
|
||||
} else {
|
||||
insertIntoRR(tag_x, RRWay::Left);
|
||||
insertIntoRR(addr, tag_x, RRWay::Left);
|
||||
}
|
||||
|
||||
// Go through the nth offset and update the score, the best score and the
|
||||
// current best offset if a better one is found
|
||||
bestOffsetLearning(tag_x);
|
||||
/*
|
||||
* Go through the nth offset and update the score, the best score and the
|
||||
* current best offset if a better one is found
|
||||
*/
|
||||
bestOffsetLearning(addr);
|
||||
|
||||
// This prefetcher is a degree 1 prefetch, so it will only generate one
|
||||
// prefetch at most per access
|
||||
if (issuePrefetchRequests) {
|
||||
Addr prefetch_addr = addr + (bestOffset << lBlkSize);
|
||||
addresses.push_back(AddrPriority(prefetch_addr, 0));
|
||||
DPRINTF(HWPrefetch, "Generated prefetch %#lx\n", prefetch_addr);
|
||||
for (int i = 1; i <= degree; i++) {
|
||||
Addr prefetch_addr = addr + ((i * bestOffset) << lBlkSize);
|
||||
addresses.push_back(AddrPriority(prefetch_addr, 0));
|
||||
DPRINTF(HWPrefetch, "Generated prefetch %#lx\n", prefetch_addr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -257,13 +316,14 @@ BOP::notifyFill(const CacheAccessProbeArg &arg)
|
||||
{
|
||||
const PacketPtr& pkt = arg.pkt;
|
||||
|
||||
// Only insert into the RR right way if it's the pkt is a HWP
|
||||
// Only insert into the RR right way if it's the pkt is a hardware prefetch
|
||||
if (!pkt->cmd.isHWPrefetch()) return;
|
||||
|
||||
Addr tag_y = tag(pkt->getAddr());
|
||||
Addr addr = pkt->getAddr();
|
||||
Addr tag_y = tag(addr);
|
||||
|
||||
if (issuePrefetchRequests) {
|
||||
insertIntoRR(tag_y - bestOffset, RRWay::Right);
|
||||
insertIntoRR(addr, tag_y - bestOffset, RRWay::Right);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
8
src/mem/cache/prefetch/bop.hh
vendored
8
src/mem/cache/prefetch/bop.hh
vendored
@@ -1,5 +1,6 @@
|
||||
/**
|
||||
* Copyright (c) 2018 Metempsy Technology Consulting
|
||||
* Copyright (c) 2024 Samsung Electronics
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
@@ -116,13 +117,14 @@ class BOP : public Queued
|
||||
* @param addr: address to hash
|
||||
* @param way: RR table to which is addressed (left/right)
|
||||
*/
|
||||
unsigned int hash(Addr addr, unsigned int way) const;
|
||||
unsigned int index(Addr addr, unsigned int way) const;
|
||||
|
||||
/** Insert the specified address into the RR table
|
||||
* @param addr: address to insert
|
||||
* @param addr_tag: The tag to insert in the RR table
|
||||
* @param way: RR table to which the address will be inserted
|
||||
*/
|
||||
void insertIntoRR(Addr addr, unsigned int way);
|
||||
void insertIntoRR(Addr addr, Addr addr_tag, unsigned int way);
|
||||
|
||||
/** Insert the specified address into the delay queue. This will
|
||||
* trigger an event after the delay cycles pass
|
||||
@@ -151,6 +153,8 @@ class BOP : public Queued
|
||||
void notifyFill(const CacheAccessProbeArg &arg) override;
|
||||
|
||||
public:
|
||||
/** The prefetch degree, i.e. the number of prefetches to generate */
|
||||
unsigned int degree;
|
||||
|
||||
BOP(const BOPPrefetcherParams &p);
|
||||
~BOP() = default;
|
||||
|
||||
Reference in New Issue
Block a user