arch-gcn3,gpu-compute: Move TLB to common folder in amdgpu

This TLB is more of an "APU" TLB than anything GCN3 specific. It can be
used with either GCN3 or Vega. With this change, VEGA_X86 builds and one
can run binaries with Vega ISA code using the same steps as GCN3 but
building the Vega ISA instead.

Change-Id: I0c92bcd0379a18628dc05cb5af070bdc7e692c7c
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/53803
Maintainer: Bobby Bruce <bbruce@ucdavis.edu>
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Matthew Poremba
2021-12-08 12:23:23 -06:00
parent 7f2079f662
commit c028af111a
11 changed files with 52 additions and 10 deletions

View File

@@ -0,0 +1,43 @@
# Copyright (c) 2021 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import sys

Import('*')

# GPU models are optional; skip this directory entirely when not built.
if not env['BUILD_GPU']:
    Return()

# This "common" amdgpu TLB models an X86-style GPU TLB and is shared by
# both GPU ISAs that can target it.
if env['TARGET_GPU_ISA'] in ('gcn3', 'vega'):
    SimObject('X86GPUTLB.py', sim_objects=['X86GPUTLB', 'TLBCoalescer'])
    Source('tlb.cc')
    Source('tlb_coalescer.cc')

View File

@@ -0,0 +1,76 @@
# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from m5.defines import buildEnv
from m5.params import *
from m5.proxy import *
from m5.objects.ClockedObject import ClockedObject
from m5.SimObject import SimObject
class X86GPUTLB(ClockedObject):
    # Python-side SimObject wrapper for the GPU TLB implemented in
    # arch/amdgpu/common/tlb.{hh,cc} (C++ class gem5::X86ISA::GpuTLB).
    type = 'X86GPUTLB'
    cxx_class = 'gem5::X86ISA::GpuTLB'
    cxx_header = 'arch/amdgpu/common/tlb.hh'

    # Capacity and organization of the TLB; with assoc == size the TLB
    # is effectively fully associative.
    size = Param.Int(64, "TLB size (number of entries)")
    assoc = Param.Int(64, "TLB associativity")

    # A page table walker is only instantiated for full-system builds.
    if buildEnv.get('FULL_SYSTEM', False):
        walker = Param.X86PagetableWalker(X86PagetableWalker(),
                                          "page table walker")

    # Latencies (in cycles) charged for a hit and the two miss phases.
    hitLatency = Param.Int(2, "Latency of a TLB hit")
    missLatency1 = Param.Int(5, "Latency #1 of a TLB miss")
    missLatency2 = Param.Int(100, "Latency #2 of a TLB miss")
    maxOutstandingReqs = Param.Int(64, "# of maximum outstanding requests")

    # Vector ports; the old slave/master names are kept as deprecated
    # aliases for backwards compatibility with existing configs.
    cpu_side_ports = VectorResponsePort("Ports on side closer to CPU/CU")
    slave = DeprecatedParam(cpu_side_ports,
                            '`slave` is now called `cpu_side_ports`')
    mem_side_ports = VectorRequestPort("Ports on side closer to memory")
    master = DeprecatedParam(mem_side_ports,
                             '`master` is now called `mem_side_ports`')

    allocationPolicy = Param.Bool(True, "Allocate on an access")
    accessDistance = Param.Bool(False, "print accessDistance stats")
class TLBCoalescer(ClockedObject):
    # Python-side SimObject wrapper for the TLB coalescer implemented in
    # arch/amdgpu/common/tlb_coalescer.{hh,cc} (C++ class
    # gem5::TLBCoalescer). It merges translation requests to the same
    # virtual page before probing the TLB below.
    type = 'TLBCoalescer'
    cxx_class = 'gem5::TLBCoalescer'
    cxx_header = 'arch/amdgpu/common/tlb_coalescer.hh'

    probesPerCycle = Param.Int(2, "Number of TLB probes per cycle")
    coalescingWindow = Param.Int(1, "Permit coalescing across that many ticks")

    # Vector ports; the old slave/master names are kept as deprecated
    # aliases for backwards compatibility with existing configs.
    cpu_side_ports = VectorResponsePort("Port on side closer to CPU/CU")
    slave = DeprecatedParam(cpu_side_ports,
                            '`slave` is now called `cpu_side_ports`')
    mem_side_ports = VectorRequestPort("Port on side closer to memory")
    master = DeprecatedParam(mem_side_ports,
                             '`master` is now called `mem_side_ports`')

    # Fixed typo in the user-visible description ("Dispable Coalescing").
    disableCoalescing = Param.Bool(False, "Disable coalescing")

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,445 @@
/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __GPU_TLB_HH__
#define __GPU_TLB_HH__
#include <fstream>
#include <list>
#include <queue>
#include <string>
#include <vector>
#include "arch/generic/tlb.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/pagetable_walker.hh"
#include "arch/x86/regs/segment.hh"
#include "base/callback.hh"
#include "base/logging.hh"
#include "base/statistics.hh"
#include "base/stats/group.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/X86GPUTLB.hh"
#include "sim/clocked_object.hh"
#include "sim/sim_object.hh"
namespace gem5
{
class BaseTLB;
class Packet;
class ThreadContext;
namespace X86ISA
{
/**
 * GPU TLB modeled after the X86 page-table format. It is a ClockedObject
 * with vector cpu-side/mem-side ports so instances can be chained into a
 * multi-level TLB hierarchy; the declarations below are implemented in
 * tlb.cc.
 */
class GpuTLB : public ClockedObject
{
  protected:
    friend class Walker;

    typedef std::list<TlbEntry*> EntryList;

    uint32_t configAddress;

  public:
    typedef X86GPUTLBParams Params;
    GpuTLB(const Params &p);
    ~GpuTLB();

    typedef enum BaseMMU::Mode Mode;

    /**
     * Callback interface handed to the TLB by clients that need to be
     * notified when a (possibly delayed) translation completes.
     */
    class Translation
    {
      public:
        virtual ~Translation() { }

        /**
         * Signal that the translation has been delayed due to a hw page
         * table walk.
         */
        virtual void markDelayed() = 0;

        /**
         * The memory for this object may be dynamically allocated, and it
         * may be responsible for cleaning itself up which will happen in
         * this function. Once it's called the object is no longer valid.
         */
        virtual void finish(Fault fault, const RequestPtr &req,
                            ThreadContext *tc, Mode mode) = 0;
    };

    void dumpAll();
    TlbEntry *lookup(Addr va, bool update_lru=true);
    void setConfigAddress(uint32_t addr);

  protected:
    EntryList::iterator lookupIt(Addr va, bool update_lru=true);
    Walker *walker;

  public:
    Walker *getWalker();
    void invalidateAll();
    void invalidateNonGlobal();
    void demapPage(Addr va, uint64_t asn);

  protected:
    // Geometry of the TLB (entries, ways, sets).
    int size;
    int assoc;
    int numSets;

    /**
     * true if this is a fully-associative TLB
     */
    bool FA;
    Addr setMask;

    /**
     * Allocation Policy: true if we always allocate on a hit, false
     * otherwise. Default is true.
     */
    bool allocationPolicy;

    /**
     * if true, then this is not the last level TLB
     */
    bool hasMemSidePort;

    /**
     * Print out accessDistance stats. One stat file
     * per TLB.
     */
    bool accessDistance;

    std::vector<TlbEntry> tlb;

    /*
     * It's a per-set list. As long as we have not reached
     * the full capacity of the given set, grab an entry from
     * the freeList.
     */
    std::vector<EntryList> freeList;

    /**
     * An entryList per set is the equivalent of an LRU stack;
     * it's used to guide replacement decisions. The head of the list
     * contains the MRU TLB entry of the given set. If the freeList
     * for this set is empty, the last element of the list
     * is evicted (i.e., dropped on the floor).
     */
    std::vector<EntryList> entryList;

    Fault translateInt(bool read, const RequestPtr &req,
                       ThreadContext *tc);

    Fault translate(const RequestPtr &req, ThreadContext *tc,
            Translation *translation, Mode mode, bool &delayedResponse,
            bool timing, int &latency);

  public:
    // latencies for a TLB hit, miss and page fault
    int hitLatency;
    int missLatency1;
    int missLatency2;

    void updatePageFootprint(Addr virt_page_addr);
    void printAccessPattern();

    Fault translateAtomic(const RequestPtr &req, ThreadContext *tc,
                          Mode mode, int &latency);
    void translateTiming(const RequestPtr &req, ThreadContext *tc,
                         Translation *translation, Mode mode,
                         int &latency);

    Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
    Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);

    TlbEntry *insert(Addr vpn, TlbEntry &entry);

    // Checkpointing
    virtual void serialize(CheckpointOut& cp) const override;
    virtual void unserialize(CheckpointIn& cp) override;

    void issueTranslation();
    // Possible results of a lookup; MISS_RETURN marks a miss on its way
    // back up the hierarchy.
    enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
    bool tlbLookup(const RequestPtr &req,
                   ThreadContext *tc, bool update_stats);

    void handleTranslationReturn(Addr addr, tlbOutcome outcome,
                                 PacketPtr pkt);

    void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);

    void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
                                TlbEntry *tlb_entry, Mode mode);

    void updatePhysAddresses(Addr virt_page_addr, TlbEntry *tlb_entry,
                             Addr phys_page_addr);

    void issueTLBLookup(PacketPtr pkt);

    // CpuSidePort is the TLB Port closer to the CPU/CU side
    class CpuSidePort : public ResponsePort
    {
      public:
        CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB,
                    PortID _index)
            : ResponsePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }

      protected:
        GpuTLB *tlb;
        int index;

        virtual bool recvTimingReq(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt);
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
        virtual void recvRespRetry() { panic("recvRespRetry called"); }
        virtual AddrRangeList getAddrRanges() const;
    };

    /**
     * MemSidePort is the TLB Port closer to the memory side
     * If this is a last level TLB then this port will not be connected.
     *
     * Future action item: if we ever do real page walks, then this port
     * should be connected to a RubyPort.
     */
    class MemSidePort : public RequestPort
    {
      public:
        MemSidePort(const std::string &_name, GpuTLB * gpu_TLB,
                    PortID _index)
            : RequestPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }

        std::deque<PacketPtr> retries;

      protected:
        GpuTLB *tlb;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    // TLB ports on the cpu Side
    std::vector<CpuSidePort*> cpuSidePort;
    // TLB ports on the memory side
    std::vector<MemSidePort*> memSidePort;

    Port &getPort(const std::string &if_name,
                  PortID idx=InvalidPortID) override;

    /**
     * TLB TranslationState: this currently is a somewhat bastardization of
     * the usage of SenderState, whereby the receiver of a packet is not
     * usually supposed to need to look at the contents of the senderState,
     * you're really only supposed to look at what you pushed on, pop it
     * off, and send it back.
     *
     * However, since there is state that we want to pass to the TLBs using
     * the send/recv Timing/Functional/etc. APIs, which don't allow for new
     * arguments, we need a common TLB senderState to pass between TLBs,
     * both "forwards" and "backwards."
     *
     * So, basically, the rule is that any packet received by a TLB port
     * (cpuside OR memside) must be safely castable to a TranslationState.
     */
    struct TranslationState : public Packet::SenderState
    {
        // TLB mode, read or write
        Mode tlbMode;
        // Thread context associated with this req
        ThreadContext *tc;

        /*
         * TLB entry to be populated and passed back and filled in
         * previous TLBs. Equivalent to the data cache concept of
         * "data return."
         */
        TlbEntry *tlbEntry;
        // Is this a TLB prefetch request?
        bool isPrefetch;
        // When was the req for this translation issued
        uint64_t issueTime;
        // Remember where this came from
        std::vector<ResponsePort*>ports;

        // keep track of #uncoalesced reqs per packet per TLB level;
        // reqCnt per level >= reqCnt higher level
        std::vector<int> reqCnt;
        // TLB level this packet hit in; 0 if it hit in the page table
        int hitLevel;
        Packet::SenderState *saved;

        TranslationState(Mode tlb_mode, ThreadContext *_tc,
                         bool is_prefetch=false,
                         Packet::SenderState *_saved=nullptr)
            : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
              isPrefetch(is_prefetch), issueTime(0),
              hitLevel(0),saved(_saved) { }
    };

    // maximum number of permitted coalesced requests per cycle
    int maxCoalescedReqs;

    // Current number of outstanding coalesced requests.
    // Should be <= maxCoalescedReqs
    int outstandingReqs;

    /**
     * A TLBEvent is scheduled after the TLB lookup and helps us take the
     * appropriate actions:
     * (e.g., update TLB on a hit,
     * send request to lower level TLB on a miss,
     * or start a page walk if this was the last-level TLB).
     */
    void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
                           PacketPtr pkt);

    class TLBEvent : public Event
    {
      private:
        GpuTLB *tlb;
        Addr virtPageAddr;
        /**
         * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
         */
        tlbOutcome outcome;
        PacketPtr pkt;

      public:
        TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
                 PacketPtr _pkt);

        void process();
        const char *description() const;

        // updateOutcome updates the tlbOutcome of a TLBEvent
        void updateOutcome(tlbOutcome _outcome);
        Addr getTLBEventVaddr();
    };

    // One in-flight TLBEvent per virtual page address.
    std::unordered_map<Addr, TLBEvent*> translationReturnEvent;

    // this FIFO queue keeps track of the virt. page addresses
    // that are pending cleanup
    std::queue<Addr> cleanupQueue;

    // the cleanupEvent is scheduled after a TLBEvent triggers in order to
    // free memory and do the required clean-up
    void cleanup();

    EventFunctionWrapper cleanupEvent;

    /**
     * This hash map will use the virtual page address as a key
     * and will keep track of total number of accesses per page
     */
    struct AccessInfo
    {
        unsigned int lastTimeAccessed; // last access to this page
        unsigned int accessesPerPage;
        // need to divide it by accessesPerPage at the end
        unsigned int totalReuseDistance;

        /**
         * The field below will help us compute the access distance,
         * that is the number of (coalesced) TLB accesses that
         * happened in between each access to this page
         *
         * localTLBAccesses[x] is the value of localTLBNumAccesses
         * when the page <Addr> was accessed for the <x>th time
         */
        std::vector<unsigned int> localTLBAccesses;
        unsigned int sumDistance;
        unsigned int meanDistance;
    };

    typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
    AccessPatternTable TLBFootprint;

    // Called at the end of simulation to dump page access stats.
    void exitCallback();

    EventFunctionWrapper exitEvent;

  protected:
    struct GpuTLBStats : public statistics::Group
    {
        GpuTLBStats(statistics::Group *parent);

        // local_stats are as seen from the TLB
        // without taking into account coalescing
        statistics::Scalar localNumTLBAccesses;
        statistics::Scalar localNumTLBHits;
        statistics::Scalar localNumTLBMisses;
        statistics::Formula localTLBMissRate;

        // global_stats are as seen from the
        // CU's perspective taking into account
        // all coalesced requests.
        statistics::Scalar globalNumTLBAccesses;
        statistics::Scalar globalNumTLBHits;
        statistics::Scalar globalNumTLBMisses;
        statistics::Formula globalTLBMissRate;

        // from the CU perspective (global)
        statistics::Scalar accessCycles;
        // from the CU perspective (global)
        statistics::Scalar pageTableCycles;
        statistics::Scalar numUniquePages;
        // from the perspective of this TLB
        statistics::Scalar localCycles;
        // from the perspective of this TLB
        statistics::Formula localLatency;

        // I take the avg. per page and then
        // the avg. over all pages.
        statistics::Scalar avgReuseDistance;
    } stats;
};
}
using GpuTranslationState = X86ISA::GpuTLB::TranslationState;
} // namespace gem5
#endif // __GPU_TLB_HH__

View File

@@ -0,0 +1,539 @@
/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "arch/amdgpu/common/tlb_coalescer.hh"
#include <cstring>
#include "arch/x86/page_size.hh"
#include "base/logging.hh"
#include "debug/GPUTLB.hh"
#include "sim/process.hh"
namespace gem5
{
// Construct the coalescer: wire up the two self-scheduled events (probe
// and cleanup) and instantiate one port object per connected peer on
// each side.
TLBCoalescer::TLBCoalescer(const Params &p)
    : ClockedObject(p),
      TLBProbesPerCycle(p.probesPerCycle),
      coalescingWindow(p.coalescingWindow),
      disableCoalescing(p.disableCoalescing),
      // Probe event runs at normal CPU tick priority...
      probeTLBEvent([this]{ processProbeTLBEvent(); },
                    "Probe the TLB below",
                    false, Event::CPU_Tick_Pri),
      // ...while cleanup runs at maximum priority so the outstanding
      // table is purged in the same cycle a translation completes.
      cleanupEvent([this]{ processCleanupEvent(); },
                   "Cleanup issuedTranslationsTable hashmap",
                   false, Event::Maximum_Pri),
      stats(this)
{
    // create the response ports based on the number of connected ports
    for (size_t i = 0; i < p.port_cpu_side_ports_connection_count; ++i) {
        cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
                                              this, i));
    }

    // create the request ports based on the number of connected ports
    for (size_t i = 0; i < p.port_mem_side_ports_connection_count; ++i) {
        memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
                                              this, i));
    }
}
// Resolve a named vector-port reference to the concrete port object.
// Unknown names or out-of-range indices are fatal.
Port &
TLBCoalescer::getPort(const std::string &if_name, PortID idx)
{
    const bool is_cpu_side = (if_name == "cpu_side_ports");
    const bool is_mem_side = (if_name == "mem_side_ports");

    if (!is_cpu_side && !is_mem_side)
        panic("TLBCoalescer::getPort: unknown port %s\n", if_name);

    if (is_cpu_side) {
        if (idx >= static_cast<PortID>(cpuSidePort.size()))
            panic("TLBCoalescer::getPort: unknown index %d\n", idx);
        return *cpuSidePort[idx];
    }

    if (idx >= static_cast<PortID>(memSidePort.size()))
        panic("TLBCoalescer::getPort: unknown index %d\n", idx);
    return *memSidePort[idx];
}
/*
 * Decide whether <incoming_pkt> may be merged into the already-formed
 * coalesced request headed by <coalesced_pkt>. Returns true and folds
 * the incoming request count into the coalesced packet on success.
 * The rules can potentially be modified based on the TLB level.
 */
bool
TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
{
    // Coalescing may be switched off entirely via the config param.
    if (disableCoalescing)
        return false;

    GpuTranslationState *in_state =
        safe_cast<GpuTranslationState*>(incoming_pkt->senderState);
    GpuTranslationState *co_state =
        safe_cast<GpuTranslationState*>(coalesced_pkt->senderState);

    auto virt_page = [](PacketPtr p) {
        return roundDown(p->req->getVaddr(), X86ISA::PageBytes);
    };

    // Rule 1: both requests must target the same virtual page.
    // Rule 2: both requests must share a TLB mode (read vs. write).
    if (virt_page(incoming_pkt) != virt_page(coalesced_pkt))
        return false;

    if (in_state->tlbMode != co_state->tlbMode)
        return false;

    // Coalescing succeeded: fold the number of uncoalesced requests the
    // incoming packet represents into the coalesced packet's count.
    if (!in_state->isPrefetch)
        co_state->reqCnt.back() += in_state->reqCnt.back();

    return true;
}
/*
 * We need to update the physical addresses of all the translation requests
 * that were coalesced into the one that just returned.
 */
void
TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
{
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), X86ISA::PageBytes);

    DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
            issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);

    GpuTranslationState *sender_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);

    // Snapshot the translation result carried by the first (translated)
    // packet before iterating over the coalesced siblings.
    TheISA::TlbEntry *tlb_entry = sender_state->tlbEntry;
    assert(tlb_entry);
    Addr first_entry_vaddr = tlb_entry->vaddr;
    Addr first_entry_paddr = tlb_entry->paddr;
    int page_size = tlb_entry->size();
    bool uncacheable = tlb_entry->uncacheable;
    int first_hit_level = sender_state->hitLevel;

    // Get the physical page address of the translated request
    // Using the page_size specified in the TLBEntry allows us
    // to support different page sizes.
    Addr phys_page_paddr = pkt->req->getPaddr();
    phys_page_paddr &= ~(page_size - 1);

    for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
        PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
        // NOTE: this inner sender_state intentionally shadows the outer
        // one; from here on it refers to the current coalesced sibling.
        GpuTranslationState *sender_state =
            safe_cast<GpuTranslationState*>(
                    local_pkt->senderState);

        // we are sending the packet back, so pop the reqCnt associated
        // with this level in the TLB hierarchy
        if (!sender_state->isPrefetch)
            sender_state->reqCnt.pop_back();

        /*
         * Only the first packet from this coalesced request has been
         * translated. Grab the translated phys. page addr and update the
         * physical addresses of the remaining packets with the appropriate
         * page offsets.
         */
        if (i) {
            Addr paddr = phys_page_paddr;
            paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
            local_pkt->req->setPaddr(paddr);

            if (uncacheable)
                local_pkt->req->setFlags(Request::UNCACHEABLE);

            // update senderState->tlbEntry, so we can insert
            // the correct TLBEntry in the TLBs above.
            auto p = sender_state->tc->getProcessPtr();
            sender_state->tlbEntry =
                new TheISA::TlbEntry(p->pid(), first_entry_vaddr,
                                     first_entry_paddr, false, false);

            // update the hitLevel for all uncoalesced reqs
            // so that each packet knows where it hit
            // (used for statistics in the CUs)
            sender_state->hitLevel = first_hit_level;
        }

        // The return path was recorded on the way down; pop it to find
        // the port this sibling must be answered on.
        ResponsePort *return_port = sender_state->ports.back();
        sender_state->ports.pop_back();

        // Translation is done - Convert to a response pkt if necessary and
        // send the translation back
        if (local_pkt->isRequest()) {
            local_pkt->makeTimingResponse();
        }

        return_port->sendTimingResp(local_pkt);
    }

    // schedule clean up for end of this cycle
    // This is a maximum priority event and must be on
    // the same cycle as GPUTLB cleanup event to prevent
    // race conditions with an IssueProbeEvent caused by
    // MemSidePort::recvReqRetry
    cleanupQueue.push(virt_page_addr);

    if (!cleanupEvent.scheduled())
        schedule(cleanupEvent, curTick());
}
// Receive translation requests, create a coalesced request,
// and send them to the TLB (TLBProbesPerCycle)
bool
TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
{
    // first packet of a coalesced request
    PacketPtr first_packet = nullptr;
    // true if we are able to do coalescing
    bool didCoalesce = false;
    // number of coalesced reqs for a given window
    int coalescedReq_cnt = 0;

    GpuTranslationState *sender_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);

    // push back the port to remember the path back
    sender_state->ports.push_back(this);

    bool update_stats = !sender_state->isPrefetch;

    if (update_stats) {
        // if reqCnt is empty then this packet does not represent
        // multiple uncoalesced reqs(pkts) but just a single pkt.
        // If it does though then the reqCnt for each level in the
        // hierarchy accumulates the total number of reqs this packet
        // represents
        int req_cnt = 1;

        if (!sender_state->reqCnt.empty())
            req_cnt = sender_state->reqCnt.back();

        sender_state->reqCnt.push_back(req_cnt);

        // update statistics
        coalescer->stats.uncoalescedAccesses++;
        req_cnt = sender_state->reqCnt.back();
        DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
        // Queuing cycles are accumulated as (completion - issue); the
        // issue side is subtracted here and added back when the request
        // is finally sent below.
        coalescer->stats.queuingCycles -= (curTick() * req_cnt);
        coalescer->stats.localqueuingCycles -= curTick();
    }

    // FIXME if you want to coalesce not based on the issueTime
    // of the packets (i.e., from the compute unit's perspective)
    // but based on when they reached this coalescer then
    // remove the following if statement and use curTick() or
    // coalescingWindow for the tick_index.
    if (!sender_state->issueTime)
        sender_state->issueTime = curTick();

    // The tick index is used as a key to the coalescerFIFO hashmap.
    // It is shared by all candidates that fall within the
    // given coalescingWindow.
    int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;

    if (coalescer->coalescerFIFO.count(tick_index)) {
        coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
    }

    // see if we can coalesce the incoming pkt with another
    // coalesced request with the same tick_index
    for (int i = 0; i < coalescedReq_cnt; ++i) {
        // The first packet of each coalesced group is its representative.
        first_packet = coalescer->coalescerFIFO[tick_index][i][0];

        if (coalescer->canCoalesce(pkt, first_packet)) {
            coalescer->coalescerFIFO[tick_index][i].push_back(pkt);

            DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
                    i, tick_index,
                    coalescer->coalescerFIFO[tick_index][i].size());

            didCoalesce = true;
            break;
        }
    }

    // if this is the first request for this tick_index
    // or we did not manage to coalesce, update stats
    // and make necessary allocations.
    if (!coalescedReq_cnt || !didCoalesce) {
        if (update_stats)
            coalescer->stats.coalescedAccesses++;

        std::vector<PacketPtr> new_array;
        new_array.push_back(pkt);
        coalescer->coalescerFIFO[tick_index].push_back(new_array);

        DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
                "push\n", tick_index,
                coalescer->coalescerFIFO[tick_index].size());
    }

    //schedule probeTLBEvent next cycle to send the
    //coalesced requests to the TLB
    if (!coalescer->probeTLBEvent.scheduled()) {
        coalescer->schedule(coalescer->probeTLBEvent,
                curTick() + coalescer->clockPeriod());
    }

    // The coalescer never rejects a cpu-side request.
    return true;
}
// The coalescer always accepts cpu-side requests (recvTimingReq never
// returns false), so a retry from this side should be impossible.
void
TLBCoalescer::CpuSidePort::recvReqRetry()
{
    panic("recvReqRetry called");
}
// Functional (debug) path: bypass coalescing entirely and forward the
// request straight to the TLB below through mem-side port 0.
void
TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
{
    GpuTranslationState *sender_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);

    bool update_stats = !sender_state->isPrefetch;

    if (update_stats)
        coalescer->stats.uncoalescedAccesses++;

    // If there is a pending timing request for this virtual address
    // print a warning message. This is a temporary caveat of
    // the current simulator where atomic and timing requests can
    // coexist. FIXME remove this check/warning in the future.
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), X86ISA::PageBytes);
    int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);

    if (map_count) {
        DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
                "req. pending\n", virt_page_addr);
    }

    coalescer->memSidePort[0]->sendFunctional(pkt);
}
// The coalescer claims no address ranges of its own; requestors do not
// currently query this port for them.
AddrRangeList
TLBCoalescer::CpuSidePort::getAddrRanges() const
{
    return AddrRangeList();
}
// A translation completed in the TLB below; fan the result out to every
// request that was coalesced into it.
bool
TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
{
    // a translation completed and returned
    coalescer->updatePhysAddresses(pkt);

    return true;
}
void
TLBCoalescer::MemSidePort::recvReqRetry()
{
    // we've received a retry. Schedule a probeTLBEvent so the buffered
    // coalesced requests are re-issued to the TLB below.
    if (!coalescer->probeTLBEvent.scheduled())
        coalescer->schedule(coalescer->probeTLBEvent,
                curTick() + coalescer->clockPeriod());
}
// Functional traffic only ever flows downward (cpu side -> TLB below);
// receiving it on the memory side is unsupported.
void
TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
{
    fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
}
/*
 * Here we scan the coalescer FIFO and issue the max
 * number of permitted probes to the TLB below. We
 * permit bypassing of coalesced requests for the same
 * tick_index.
 *
 * We do not access the next tick_index unless we've
 * drained the previous one. The coalesced requests
 * that are successfully sent are moved to the
 * issuedTranslationsTable table (the table which keeps
 * track of the outstanding reqs)
 */
void
TLBCoalescer::processProbeTLBEvent()
{
    // number of TLB probes sent so far
    int sent_probes = 0;
    // rejected denotes a blocking event
    bool rejected = false;

    // It is set to true either when the recvTiming of the TLB below
    // returns false or when there is another outstanding request for the
    // same virt. page.

    DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__);

    // Walk the FIFO in tick_index order; stop early if anything blocks.
    for (auto iter = coalescerFIFO.begin();
         iter != coalescerFIFO.end() && !rejected; ) {
        int coalescedReq_cnt = iter->second.size();
        int i = 0;
        // vector_index only advances past entries we skip; successfully
        // sent entries are erased, so the next one slides into place.
        int vector_index = 0;

        DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
                coalescedReq_cnt, iter->first);

        while (i < coalescedReq_cnt) {
            ++i;
            // The first packet of the group carries the translation.
            PacketPtr first_packet = iter->second[vector_index][0];

            // compute virtual page address for this request
            Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
                    X86ISA::PageBytes);

            // is there another outstanding request for the same page addr?
            int pending_reqs =
                issuedTranslationsTable.count(virt_page_addr);

            if (pending_reqs) {
                DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
                        "page %#x\n", virt_page_addr);

                ++vector_index;
                rejected = true;

                continue;
            }

            // send the coalesced request for virt_page_addr
            if (!memSidePort[0]->sendTimingReq(first_packet)) {
                DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
                        virt_page_addr);

                // No need for a retries queue since we are already buffering
                // the coalesced request in coalescerFIFO.
                rejected = true;
                ++vector_index;
            } else {
                GpuTranslationState *tmp_sender_state =
                    safe_cast<GpuTranslationState*>
                    (first_packet->senderState);

                bool update_stats = !tmp_sender_state->isPrefetch;

                if (update_stats) {
                    // req_cnt is total number of packets represented
                    // by the one we just sent counting all the way from
                    // the top of TLB hierarchy (i.e., from the CU)
                    int req_cnt = tmp_sender_state->reqCnt.back();
                    // Add back the send tick; combined with the subtraction
                    // done at receive time this accumulates queuing delay.
                    stats.queuingCycles += (curTick() * req_cnt);

                    DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
                            name(), req_cnt);

                    // pkt_cnt is number of packets we coalesced into the one
                    // we just sent but only at this coalescer level
                    int pkt_cnt = iter->second[vector_index].size();
                    stats.localqueuingCycles += (curTick() * pkt_cnt);
                }

                DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x",
                        virt_page_addr);

                //copy coalescedReq to issuedTranslationsTable
                issuedTranslationsTable[virt_page_addr]
                    = iter->second[vector_index];

                //erase the entry of this coalesced req
                iter->second.erase(iter->second.begin() + vector_index);

                if (iter->second.empty())
                    assert(i == coalescedReq_cnt);

                sent_probes++;
                // Respect the per-cycle probe budget.
                if (sent_probes == TLBProbesPerCycle)
                    return;
            }
        }

        //if there are no more coalesced reqs for this tick_index
        //erase the hash_map with the first iterator
        if (iter->second.empty()) {
            coalescerFIFO.erase(iter++);
        } else {
            ++iter;
        }
    }
}
void
TLBCoalescer::processCleanupEvent()
{
while (!cleanupQueue.empty()) {
Addr cleanup_addr = cleanupQueue.front();
cleanupQueue.pop();
issuedTranslationsTable.erase(cleanup_addr);
DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
cleanup_addr);
}
}
// Register the coalescer's statistics with the parent stats group.
// The ADD_STAT entries must stay in declaration order of the members.
TLBCoalescer::TLBCoalescerStats::TLBCoalescerStats(statistics::Group *parent)
    : statistics::Group(parent),
      ADD_STAT(uncoalescedAccesses, "Number of uncoalesced TLB accesses"),
      ADD_STAT(coalescedAccesses, "Number of coalesced TLB accesses"),
      ADD_STAT(queuingCycles, "Number of cycles spent in queue"),
      ADD_STAT(localqueuingCycles,
               "Number of cycles spent in queue for all incoming reqs"),
      ADD_STAT(localLatency, "Avg. latency over all incoming pkts")
{
    // localLatency is a derived formula stat: average local queueing
    // delay per received packet, evaluated at stats-dump time.
    localLatency = localqueuingCycles / uncoalescedAccesses;
}
} // namespace gem5

View File

@@ -0,0 +1,226 @@
/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __TLB_COALESCER_HH__
#define __TLB_COALESCER_HH__
#include <list>
#include <queue>
#include <string>
#include <vector>
#include "arch/amdgpu/common/tlb.hh"
#include "arch/generic/tlb.hh"
#include "arch/x86/isa.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/regs/segment.hh"
#include "base/logging.hh"
#include "base/statistics.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/TLBCoalescer.hh"
#include "sim/clocked_object.hh"
namespace gem5
{
class BaseTLB;
class Packet;
class ThreadContext;
/**
 * The TLBCoalescer is a ClockedObject sitting on the front side (CPUSide) of
 * each TLB. It receives packets and issues coalesced requests to the
 * TLB below it. It controls how requests are coalesced (the rules)
 * and the permitted number of TLB probes per cycle (i.e., how many
 * coalesced requests it feeds the TLB per cycle).
 */
class TLBCoalescer : public ClockedObject
{
  public:
    typedef TLBCoalescerParams Params;
    TLBCoalescer(const Params &p);
    ~TLBCoalescer() { }

    // Number of TLB probes per cycle. Parameterizable - default 2.
    int TLBProbesPerCycle;

    // Consider coalescing across that many ticks.
    // Parameterizable - default 1.
    int coalescingWindow;

    // Each coalesced request consists of multiple packets
    // that all fall within the same virtual page
    typedef std::vector<PacketPtr> coalescedReq;

    // disables coalescing when true (requests are then issued one-to-one)
    bool disableCoalescing;

    /*
     * This is an ordered map (std::map) with <tick_index> as a key.
     * It contains a vector of coalescedReqs per <tick_index>.
     * Requests are buffered here until they can be issued to
     * the TLB, at which point they are copied to the
     * issuedTranslationsTable hash map.
     *
     * In terms of coalescing, we coalesce requests in a given
     * window of x cycles by using tick_index = issueTime/x as a
     * key, where x = coalescingWindow. issueTime is the issueTime
     * of the pkt from the ComputeUnit's perspective, but another
     * option is to change it to curTick(), so we coalesce based
     * on the receive time.
     */
    typedef std::map<int64_t, std::vector<coalescedReq>>
        CoalescingFIFO;

    CoalescingFIFO coalescerFIFO;

    /*
     * issuedTranslationsTable: a hash_map indexed by virtual page
     * address. Each hash_map entry has a vector of PacketPtr associated
     * with it denoting the different packets that share an outstanding
     * coalesced translation request for the same virtual page.
     *
     * The rules that determine which requests we can coalesce are
     * specified in the canCoalesce() method.
     */
    typedef std::unordered_map<Addr, coalescedReq> CoalescingTable;

    CoalescingTable issuedTranslationsTable;

    // Returns true if the two packets may share one coalesced request.
    bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2);
    // Propagates a completed translation to every packet coalesced
    // under the same virtual page as pkt.
    void updatePhysAddresses(PacketPtr pkt);

    // Response port facing the CU/TLB hierarchy above this coalescer.
    class CpuSidePort : public ResponsePort
    {
      public:
        CpuSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer,
                    PortID _index)
            : ResponsePort(_name, tlb_coalescer), coalescer(tlb_coalescer),
              index(_index) { }

      protected:
        // owning coalescer (non-owning back-pointer)
        TLBCoalescer *coalescer;
        int index;

        virtual bool recvTimingReq(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt);
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        recvRespRetry()
        {
            fatal("recvRespRetry() is not implemented in the TLB "
                  "coalescer.\n");
        }

        virtual AddrRangeList getAddrRanges() const;
    };

    // Request port facing the TLB below this coalescer.
    class MemSidePort : public RequestPort
    {
      public:
        MemSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer,
                    PortID _index)
            : RequestPort(_name, tlb_coalescer), coalescer(tlb_coalescer),
              index(_index) { }

        // NOTE(review): retries appears unused by the event-driven retry
        // path (coalescerFIFO already buffers rejected requests) — verify
        // against the .cc before relying on it.
        std::deque<PacketPtr> retries;

      protected:
        // owning coalescer (non-owning back-pointer)
        TLBCoalescer *coalescer;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt);
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        recvRespRetry()
        {
            fatal("recvRespRetry() not implemented in TLB coalescer");
        }
    };

    // Coalescer response ports on the cpu Side
    std::vector<CpuSidePort*> cpuSidePort;
    // Coalescer request ports on the memory side
    std::vector<MemSidePort*> memSidePort;

    Port &getPort(const std::string &if_name,
                  PortID idx=InvalidPortID) override;

    void processProbeTLBEvent();
    /// This event issues the TLB probes
    EventFunctionWrapper probeTLBEvent;

    void processCleanupEvent();
    /// The cleanupEvent is scheduled after a TLBEvent triggers
    /// in order to free memory and do the required clean-up
    EventFunctionWrapper cleanupEvent;

    // this FIFO queue keeps track of the virt. page
    // addresses that are pending cleanup
    std::queue<Addr> cleanupQueue;

  protected:
    struct TLBCoalescerStats : public statistics::Group
    {
        TLBCoalescerStats(statistics::Group *parent);

        // number of packets the coalescer receives
        statistics::Scalar uncoalescedAccesses;
        // number packets the coalescer send to the TLB
        statistics::Scalar coalescedAccesses;

        // Number of cycles the coalesced requests spend waiting in
        // coalescerFIFO. For each packet the coalescer receives we take into
        // account the number of all uncoalesced requests this pkt "represents"
        statistics::Scalar queuingCycles;

        // On average how much time a request from the
        // uncoalescedAccesses that reaches the TLB
        // spends waiting?
        statistics::Scalar localqueuingCycles;
        // localqueuingCycles/uncoalescedAccesses
        statistics::Formula localLatency;
    } stats;
};
} // namespace gem5
#endif // __TLB_COALESCER_HH__