mem: Fixed implementation of Best Offset Prefetcher (#1403)

This PR fixes the issues with the implementation of the Best Offset
Prefetcher described in issue #1402

On branch bop
Changes to be committed:
	modified:   src/mem/cache/prefetch/bop.cc
	modified:   src/mem/cache/prefetch/bop.hh

---------

Co-authored-by: Setu Gupta <setu.gupta.2020@gamil.com>
Co-authored-by: Abhishek Shailendra Singh <abs218@leigh.edu>
Co-authored-by: Setu Gupta <setu.gupta@partner.samsung.com>
This commit is contained in:
Setu
2024-08-21 11:54:20 -05:00
committed by GitHub
parent e7442036a5
commit f6010439fe
3 changed files with 122 additions and 48 deletions

View File

@@ -587,6 +587,16 @@ class BOPPrefetcher(QueuedPrefetcher):
queue",
)
# BOP is a degree one prefetcher
degree = Param.Int(1, "Number of prefetches to generate")
queue_squash = True
queue_filter = True
cache_snoop = True
prefetch_on_pf_hit = True
on_miss = True
on_inst = False
class SBOOEPrefetcher(QueuedPrefetcher):
type = "SBOOEPrefetcher"

View File

@@ -1,5 +1,6 @@
/**
* Copyright (c) 2018 Metempsy Technology Consulting
* Copyright (c) 2024 Samsung Electronics
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -36,7 +37,6 @@ namespace gem5
namespace prefetch
{
BOP::BOP(const BOPPrefetcherParams &p)
: Queued(p),
scoreMax(p.score_max), roundMax(p.round_max),
@@ -47,7 +47,7 @@ BOP::BOP(const BOPPrefetcherParams &p)
delayTicks(cyclesToTicks(p.delay_queue_cycles)),
delayQueueEvent([this]{ delayQueueEventWrapper(); }, name()),
issuePrefetchRequests(false), bestOffset(1), phaseBestOffset(0),
bestScore(0), round(0)
bestScore(0), round(0), degree(p.degree)
{
if (!isPowerOf2(rrEntries)) {
fatal("%s: number of RR entries is not power of 2\n", name());
@@ -55,16 +55,22 @@ BOP::BOP(const BOPPrefetcherParams &p)
if (!isPowerOf2(blkSize)) {
fatal("%s: cache line size is not power of 2\n", name());
}
if (!(p.negative_offsets_enable && (p.offset_list_size % 2 == 0))) {
if (p.negative_offsets_enable && (p.offset_list_size % 2 != 0)) {
fatal("%s: negative offsets enabled with odd offset list size\n",
name());
}
if (p.degree <= 0) {
fatal("%s: prefetch degree must be strictly greater than zero\n",
name());
}
rrLeft.resize(rrEntries);
rrRight.resize(rrEntries);
// Following the paper implementation, a list with the specified number
// of offsets which are of the form 2^i * 3^j * 5^k with i,j,k >= 0
/*
* Following the paper implementation, a list with the specified number
* of offsets which are of the form 2^i * 3^j * 5^k with i,j,k >= 0
*/
const int factors[] = { 2, 3, 5 };
unsigned int i = 0;
int64_t offset_i = 1;
@@ -82,8 +88,10 @@ BOP::BOP(const BOPPrefetcherParams &p)
if (offset == 1) {
offsetsList.push_back(OffsetListEntry(offset_i, 0));
i++;
// If we want to use negative offsets, add also the negative value
// of the offset just calculated
/*
* If we want to use negative offsets, add also the negative value
* of the offset just calculated
*/
if (p.negative_offsets_enable) {
offsetsList.push_back(OffsetListEntry(-offset_i, 0));
i++;
@@ -103,7 +111,8 @@ BOP::delayQueueEventWrapper()
delayQueue.front().processTick <= curTick())
{
Addr addr_x = delayQueue.front().baseAddr;
insertIntoRR(addr_x, RRWay::Left);
Addr addr_tag = tag(addr_x);
insertIntoRR(addr_x, addr_tag, RRWay::Left);
delayQueue.pop_front();
}
@@ -114,22 +123,49 @@ BOP::delayQueueEventWrapper()
}
unsigned int
BOP::hash(Addr addr, unsigned int way) const
BOP::index(Addr addr, unsigned int way) const
{
Addr hash1 = addr >> way;
Addr hash2 = hash1 >> floorLog2(rrEntries);
return (hash1 ^ hash2) & (Addr)(rrEntries - 1);
/*
* The second parameter, way, is set to 0 for indexing the left side of the
* RR Table and, it is set to 1 for indexing the right side of the RR
* Table. This is because we always pass the enum RRWay as the way argument
* while calling index. This enum is defined in the bop.hh file.
*
* The indexing function in the author's ChampSim code, which can be found
* here: https://comparch-conf.gatech.edu/dpc2/final_program.html, computes
* the hash as follows:
*
* 1. For indexing the left side of the RR Table (way = 0), the cache line
* address is XORed with itself after right shifting it by the log base
* 2 of the number of entries in the RR Table.
* 2. For indexing the right side of the RR Table (way = 1), the cache
* line address is XORed with itself after right shifting it by the log
* base 2 of the number of entries in the RR Table, multiplied by two.
*
* Therefore, we if just left shift the log base 2 of the number of RR
* entries (log_rr_entries) with the parameter way, then if we are indexing
* the left side, we'll leave log_rr_entries as it is, but if we are
* indexing the right side, we'll multiply it with 2. Now once we have the
* result of this operation, we can right shift the cache line address
* (line_addr) by this value to get the first operand of the final XOR
* operation. The second operand of the XOR operation is line_addr itself
*/
Addr log_rr_entries = floorLog2(rrEntries);
Addr line_addr = addr >> lBlkSize;
Addr hash = line_addr ^ (line_addr >> (log_rr_entries << way));
hash &= ((1ULL << log_rr_entries) - 1);
return hash % rrEntries;
}
void
BOP::insertIntoRR(Addr addr, unsigned int way)
BOP::insertIntoRR(Addr addr, Addr tag, unsigned int way)
{
switch (way) {
case RRWay::Left:
rrLeft[hash(addr, RRWay::Left)] = addr;
rrLeft[index(addr, RRWay::Left)] = tag;
break;
case RRWay::Right:
rrRight[hash(addr, RRWay::Right)] = addr;
rrRight[index(addr, RRWay::Right)] = tag;
break;
}
}
@@ -141,8 +177,10 @@ BOP::insertIntoDelayQueue(Addr x)
return;
}
// Add the address to the delay queue and schedule an event to process
// it after the specified delay cycles
/*
* Add the address to the delay queue and schedule an event to process
* it after the specified delay cycles
*/
Tick process_tick = curTick() + delayTicks;
delayQueue.push_back(DelayQueueEntry(x, process_tick));
@@ -163,7 +201,7 @@ BOP::resetScores()
inline Addr
BOP::tag(Addr addr) const
{
return (addr >> blkSize) & tagMask;
return (addr >> lBlkSize) & tagMask;
}
bool
@@ -185,14 +223,23 @@ BOP::testRR(Addr addr) const
}
void
BOP::bestOffsetLearning(Addr x)
BOP::bestOffsetLearning(Addr addr_tag)
{
Addr offset_addr = (*offsetsListIterator).first;
Addr lookup_addr = x - offset_addr;
Addr offset_tag = (*offsetsListIterator).first;
/*
* Compute the lookup tag for the RR table. Since addr_tag is a tag value,
* and not an address, subtracting the offset from addr_tag may result in
* integer underflow. Therefore, we first convert the tag back to address
* by right shifting it, and then subtract the offset. This gives us a
* new lookup address which we use to compute the lookup tag
*/
Addr lookup_tag = tag((addr_tag << lBlkSize) - (offset_tag << lBlkSize));
// There was a hit in the RR table, increment the score for this offset
if (testRR(lookup_addr)) {
DPRINTF(HWPrefetch, "Address %#lx found in the RR table\n", x);
if (testRR(lookup_tag)) {
DPRINTF(HWPrefetch, "Address %#lx found in the RR table\n",
lookup_tag);
(*offsetsListIterator).second++;
if ((*offsetsListIterator).second > bestScore) {
bestScore = (*offsetsListIterator).second;
@@ -201,27 +248,37 @@ BOP::bestOffsetLearning(Addr x)
}
}
// Move the offset iterator forward to prepare for the next time
offsetsListIterator++;
// All the offsets in the list were visited meaning that a learning
// phase finished. Check if
/*
* All the offsets in the list were visited meaning that a learning
* phase finished. Check if
*/
if (offsetsListIterator == offsetsList.end()) {
offsetsListIterator = offsetsList.begin();
round++;
}
// Check if the best offset must be updated if:
// (1) One of the scores equals SCORE_MAX
// (2) The number of rounds equals ROUND_MAX
if ((bestScore >= scoreMax) || (round == roundMax)) {
// Check if its time to re-calculate the best offset
if ((bestScore >= scoreMax) || (round >= roundMax)) {
round = 0;
/*
* If the current best score (bestScore) has exceed the threshold to
* enable prefetching (badScore), reset the learning structures and
* enable prefetch generation
*/
if (bestScore > badScore) {
bestOffset = phaseBestOffset;
round = 0;
bestScore = 0;
phaseBestOffset = 0;
resetScores();
issuePrefetchRequests = true;
} else if (bestScore <= badScore) {
} else {
issuePrefetchRequests = false;
}
resetScores();
bestScore = 0;
phaseBestOffset = 0;
}
}
@@ -234,21 +291,23 @@ BOP::calculatePrefetch(const PrefetchInfo &pfi,
Addr tag_x = tag(addr);
if (delayQueueEnabled) {
insertIntoDelayQueue(tag_x);
insertIntoDelayQueue(addr);
} else {
insertIntoRR(tag_x, RRWay::Left);
insertIntoRR(addr, tag_x, RRWay::Left);
}
// Go through the nth offset and update the score, the best score and the
// current best offset if a better one is found
bestOffsetLearning(tag_x);
/*
* Go through the nth offset and update the score, the best score and the
* current best offset if a better one is found
*/
bestOffsetLearning(addr);
// This prefetcher is a degree 1 prefetch, so it will only generate one
// prefetch at most per access
if (issuePrefetchRequests) {
Addr prefetch_addr = addr + (bestOffset << lBlkSize);
addresses.push_back(AddrPriority(prefetch_addr, 0));
DPRINTF(HWPrefetch, "Generated prefetch %#lx\n", prefetch_addr);
for (int i = 1; i <= degree; i++) {
Addr prefetch_addr = addr + ((i * bestOffset) << lBlkSize);
addresses.push_back(AddrPriority(prefetch_addr, 0));
DPRINTF(HWPrefetch, "Generated prefetch %#lx\n", prefetch_addr);
}
}
}
@@ -257,13 +316,14 @@ BOP::notifyFill(const CacheAccessProbeArg &arg)
{
const PacketPtr& pkt = arg.pkt;
// Only insert into the RR right way if it's the pkt is a HWP
// Only insert into the RR right way if it's the pkt is a hardware prefetch
if (!pkt->cmd.isHWPrefetch()) return;
Addr tag_y = tag(pkt->getAddr());
Addr addr = pkt->getAddr();
Addr tag_y = tag(addr);
if (issuePrefetchRequests) {
insertIntoRR(tag_y - bestOffset, RRWay::Right);
insertIntoRR(addr, tag_y - bestOffset, RRWay::Right);
}
}

View File

@@ -1,5 +1,6 @@
/**
* Copyright (c) 2018 Metempsy Technology Consulting
* Copyright (c) 2024 Samsung Electronics
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -116,13 +117,14 @@ class BOP : public Queued
* @param addr: address to hash
* @param way: RR table to which is addressed (left/right)
*/
unsigned int hash(Addr addr, unsigned int way) const;
unsigned int index(Addr addr, unsigned int way) const;
/** Insert the specified address into the RR table
* @param addr: address to insert
* @param addr_tag: The tag to insert in the RR table
* @param way: RR table to which the address will be inserted
*/
void insertIntoRR(Addr addr, unsigned int way);
void insertIntoRR(Addr addr, Addr addr_tag, unsigned int way);
/** Insert the specified address into the delay queue. This will
* trigger an event after the delay cycles pass
@@ -151,6 +153,8 @@ class BOP : public Queued
void notifyFill(const CacheAccessProbeArg &arg) override;
public:
/** The prefetch degree, i.e. the number of prefetches to generate */
unsigned int degree;
BOP(const BOPPrefetcherParams &p);
~BOP() = default;