Files
gem5/src/gpu-compute/gpu_tlb.cc
Matthew Poremba 64134b6e66 base,arch-hsail: Fix GPU build
The GPU build is currently broken due to recent changes. This fixes
the build after changes to local access, removal of getSyscallArg,
and creating of AMO header in base.

Change-Id: I43506f6fb0a92a61a50ecb9efa7ee279ecb21d98
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/27136
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Reviewed-by: Bradford Beckmann <brad.beckmann@amd.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
Tested-by: Gem5 Cloud Project GCB service account <345032938727@cloudbuild.gserviceaccount.com>
2020-04-03 21:51:57 +00:00

1518 lines
51 KiB
C++

/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Lisa Hsu
*/
#include "gpu-compute/gpu_tlb.hh"
#include <cmath>
#include <cstring>
#include "arch/x86/faults.hh"
#include "arch/x86/insts/microldstop.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/pagetable_walker.hh"
#include "arch/x86/regs/misc.hh"
#include "arch/x86/regs/msr.hh"
#include "arch/x86/x86_traits.hh"
#include "base/bitfield.hh"
#include "base/logging.hh"
#include "base/output.hh"
#include "base/trace.hh"
#include "cpu/base.hh"
#include "cpu/thread_context.hh"
#include "debug/GPUPrefetch.hh"
#include "debug/GPUTLB.hh"
#include "mem/packet_access.hh"
#include "mem/page_table.hh"
#include "mem/request.hh"
#include "sim/process.hh"
#include "sim/pseudo_inst.hh"
namespace X86ISA
{
GpuTLB::GpuTLB(const Params *p)
: ClockedObject(p), configAddress(0), size(p->size),
cleanupEvent([this]{ cleanup(); }, name(), false,
Event::Maximum_Pri),
exitEvent([this]{ exitCallback(); }, name())
{
assoc = p->assoc;
assert(assoc <= size);
numSets = size/assoc;
allocationPolicy = p->allocationPolicy;
hasMemSidePort = false;
accessDistance = p->accessDistance;
clock = p->clk_domain->clockPeriod();
tlb.assign(size, TlbEntry());
freeList.resize(numSets);
entryList.resize(numSets);
for (int set = 0; set < numSets; ++set) {
for (int way = 0; way < assoc; ++way) {
int x = set * assoc + way;
freeList[set].push_back(&tlb.at(x));
}
}
FA = (size == assoc);
/**
* @warning: the set-associative version assumes you have a
* fixed page size of 4KB.
* If the page size is greather than 4KB (as defined in the
* TheISA::PageBytes), then there are various issues w/ the current
* implementation (you'd have the same 8KB page being replicated in
* different sets etc)
*/
setMask = numSets - 1;
maxCoalescedReqs = p->maxOutstandingReqs;
// Do not allow maxCoalescedReqs to be more than the TLB associativity
if (maxCoalescedReqs > assoc) {
maxCoalescedReqs = assoc;
cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc);
}
outstandingReqs = 0;
hitLatency = p->hitLatency;
missLatency1 = p->missLatency1;
missLatency2 = p->missLatency2;
// create the slave ports based on the number of connected ports
for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d",
name(), i), this, i));
}
// create the master ports based on the number of connected ports
for (size_t i = 0; i < p->port_master_connection_count; ++i) {
memSidePort.push_back(new MemSidePort(csprintf("%s-port%d",
name(), i), this, i));
}
}
// fixme: this is never called?
GpuTLB::~GpuTLB()
{
// make sure all the hash-maps are empty
assert(translationReturnEvent.empty());
}
Port &
GpuTLB::getPort(const std::string &if_name, PortID idx)
{
if (if_name == "slave") {
if (idx >= static_cast<PortID>(cpuSidePort.size())) {
panic("TLBCoalescer::getPort: unknown index %d\n", idx);
}
return *cpuSidePort[idx];
} else if (if_name == "master") {
if (idx >= static_cast<PortID>(memSidePort.size())) {
panic("TLBCoalescer::getPort: unknown index %d\n", idx);
}
hasMemSidePort = true;
return *memSidePort[idx];
} else {
panic("TLBCoalescer::getPort: unknown port %s\n", if_name);
}
}
TlbEntry*
GpuTLB::insert(Addr vpn, TlbEntry &entry)
{
TlbEntry *newEntry = nullptr;
/**
* vpn holds the virtual page address
* The least significant bits are simply masked
*/
int set = (vpn >> TheISA::PageShift) & setMask;
if (!freeList[set].empty()) {
newEntry = freeList[set].front();
freeList[set].pop_front();
} else {
newEntry = entryList[set].back();
entryList[set].pop_back();
}
*newEntry = entry;
newEntry->vaddr = vpn;
entryList[set].push_front(newEntry);
return newEntry;
}
GpuTLB::EntryList::iterator
GpuTLB::lookupIt(Addr va, bool update_lru)
{
int set = (va >> TheISA::PageShift) & setMask;
if (FA) {
assert(!set);
}
auto entry = entryList[set].begin();
for (; entry != entryList[set].end(); ++entry) {
int page_size = (*entry)->size();
if ((*entry)->vaddr <= va && (*entry)->vaddr + page_size > va) {
DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x "
"with size %#x.\n", va, (*entry)->vaddr, page_size);
if (update_lru) {
entryList[set].push_front(*entry);
entryList[set].erase(entry);
entry = entryList[set].begin();
}
break;
}
}
return entry;
}
TlbEntry*
GpuTLB::lookup(Addr va, bool update_lru)
{
int set = (va >> TheISA::PageShift) & setMask;
auto entry = lookupIt(va, update_lru);
if (entry == entryList[set].end())
return nullptr;
else
return *entry;
}
void
GpuTLB::invalidateAll()
{
DPRINTF(GPUTLB, "Invalidating all entries.\n");
for (int i = 0; i < numSets; ++i) {
while (!entryList[i].empty()) {
TlbEntry *entry = entryList[i].front();
entryList[i].pop_front();
freeList[i].push_back(entry);
}
}
}
void
GpuTLB::setConfigAddress(uint32_t addr)
{
configAddress = addr;
}
void
GpuTLB::invalidateNonGlobal()
{
DPRINTF(GPUTLB, "Invalidating all non global entries.\n");
for (int i = 0; i < numSets; ++i) {
for (auto entryIt = entryList[i].begin();
entryIt != entryList[i].end();) {
if (!(*entryIt)->global) {
freeList[i].push_back(*entryIt);
entryList[i].erase(entryIt++);
} else {
++entryIt;
}
}
}
}
void
GpuTLB::demapPage(Addr va, uint64_t asn)
{
int set = (va >> TheISA::PageShift) & setMask;
auto entry = lookupIt(va, false);
if (entry != entryList[set].end()) {
freeList[set].push_back(*entry);
entryList[set].erase(entry);
}
}
namespace
{
Cycles
localMiscRegAccess(bool read, MiscRegIndex regNum,
ThreadContext *tc, PacketPtr pkt)
{
if (read) {
RegVal data = htole(tc->readMiscReg(regNum));
// Make sure we don't trot off the end of data.
pkt->setData((uint8_t *)&data);
} else {
RegVal data = htole(tc->readMiscRegNoEffect(regNum));
tc->setMiscReg(regNum, letoh(data));
}
return Cycles(1);
}
} // anonymous namespace
Fault
GpuTLB::translateInt(bool read, const RequestPtr &req, ThreadContext *tc)
{
DPRINTF(GPUTLB, "Addresses references internal memory.\n");
Addr vaddr = req->getVaddr();
Addr prefix = (vaddr >> 3) & IntAddrPrefixMask;
if (prefix == IntAddrPrefixCPUID) {
panic("CPUID memory space not yet implemented!\n");
} else if (prefix == IntAddrPrefixMSR) {
vaddr = (vaddr >> 3) & ~IntAddrPrefixMask;
MiscRegIndex regNum;
if (!msrAddrToIndex(regNum, vaddr))
return std::make_shared<GeneralProtection>(0);
req->setLocalAccessor(
[read,regNum,vaddr](ThreadContext *tc, PacketPtr pkt)
{
return localMiscRegAccess(read, regNum, tc, pkt);
}
);
return NoFault;
} else if (prefix == IntAddrPrefixIO) {
// TODO If CPL > IOPL or in virtual mode, check the I/O permission
// bitmap in the TSS.
Addr IOPort = vaddr & ~IntAddrPrefixMask;
// Make sure the address fits in the expected 16 bit IO address
// space.
assert(!(IOPort & ~0xFFFF));
if (IOPort == 0xCF8 && req->getSize() == 4) {
req->setLocalAccessor(
[read](ThreadContext *tc, PacketPtr pkt)
{
return localMiscRegAccess(
read, MISCREG_PCI_CONFIG_ADDRESS, tc, pkt);
}
);
} else if ((IOPort & ~mask(2)) == 0xCFC) {
req->setFlags(Request::UNCACHEABLE | Request::STRICT_ORDER);
Addr configAddress =
tc->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS);
if (bits(configAddress, 31, 31)) {
req->setPaddr(PhysAddrPrefixPciConfig |
mbits(configAddress, 30, 2) |
(IOPort & mask(2)));
} else {
req->setPaddr(PhysAddrPrefixIO | IOPort);
}
} else {
req->setFlags(Request::UNCACHEABLE | Request::STRICT_ORDER);
req->setPaddr(PhysAddrPrefixIO | IOPort);
}
return NoFault;
} else {
panic("Access to unrecognized internal address space %#x.\n",
prefix);
}
}
/**
* TLB_lookup will only perform a TLB lookup returning true on a TLB hit
* and false on a TLB miss.
* Many of the checks about different modes have been converted to
* assertions, since these parts of the code are not really used.
* On a hit it will update the LRU stack.
*/
bool
GpuTLB::tlbLookup(const RequestPtr &req,
ThreadContext *tc, bool update_stats)
{
bool tlb_hit = false;
#ifndef NDEBUG
uint32_t flags = req->getFlags();
int seg = flags & SegmentFlagMask;
#endif
assert(seg != SEGMENT_REG_MS);
Addr vaddr = req->getVaddr();
DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr);
HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
if (m5Reg.prot) {
DPRINTF(GPUTLB, "In protected mode.\n");
// make sure we are in 64-bit mode
assert(m5Reg.mode == LongMode);
// If paging is enabled, do the translation.
if (m5Reg.paging) {
DPRINTF(GPUTLB, "Paging enabled.\n");
//update LRU stack on a hit
TlbEntry *entry = lookup(vaddr, true);
if (entry)
tlb_hit = true;
if (!update_stats) {
// functional tlb access for memory initialization
// i.e., memory seeding or instr. seeding -> don't update
// TLB and stats
return tlb_hit;
}
localNumTLBAccesses++;
if (!entry) {
localNumTLBMisses++;
} else {
localNumTLBHits++;
}
}
}
return tlb_hit;
}
Fault
GpuTLB::translate(const RequestPtr &req, ThreadContext *tc,
Translation *translation, Mode mode,
bool &delayedResponse, bool timing, int &latency)
{
uint32_t flags = req->getFlags();
int seg = flags & SegmentFlagMask;
bool storeCheck = flags & (StoreCheck << FlagShift);
// If this is true, we're dealing with a request
// to a non-memory address space.
if (seg == SEGMENT_REG_MS) {
return translateInt(mode == Mode::Read, req, tc);
}
delayedResponse = false;
Addr vaddr = req->getVaddr();
DPRINTF(GPUTLB, "Translating vaddr %#x.\n", vaddr);
HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
// If protected mode has been enabled...
if (m5Reg.prot) {
DPRINTF(GPUTLB, "In protected mode.\n");
// If we're not in 64-bit mode, do protection/limit checks
if (m5Reg.mode != LongMode) {
DPRINTF(GPUTLB, "Not in long mode. Checking segment "
"protection.\n");
// Check for a null segment selector.
if (!(seg == SEGMENT_REG_TSG || seg == SYS_SEGMENT_REG_IDTR ||
seg == SEGMENT_REG_HS || seg == SEGMENT_REG_LS)
&& !tc->readMiscRegNoEffect(MISCREG_SEG_SEL(seg))) {
return std::make_shared<GeneralProtection>(0);
}
bool expandDown = false;
SegAttr attr = tc->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg));
if (seg >= SEGMENT_REG_ES && seg <= SEGMENT_REG_HS) {
if (!attr.writable && (mode == BaseTLB::Write ||
storeCheck))
return std::make_shared<GeneralProtection>(0);
if (!attr.readable && mode == BaseTLB::Read)
return std::make_shared<GeneralProtection>(0);
expandDown = attr.expandDown;
}
Addr base = tc->readMiscRegNoEffect(MISCREG_SEG_BASE(seg));
Addr limit = tc->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg));
// This assumes we're not in 64 bit mode. If we were, the
// default address size is 64 bits, overridable to 32.
int size = 32;
bool sizeOverride = (flags & (AddrSizeFlagBit << FlagShift));
SegAttr csAttr = tc->readMiscRegNoEffect(MISCREG_CS_ATTR);
if ((csAttr.defaultSize && sizeOverride) ||
(!csAttr.defaultSize && !sizeOverride)) {
size = 16;
}
Addr offset = bits(vaddr - base, size - 1, 0);
Addr endOffset = offset + req->getSize() - 1;
if (expandDown) {
DPRINTF(GPUTLB, "Checking an expand down segment.\n");
warn_once("Expand down segments are untested.\n");
if (offset <= limit || endOffset <= limit)
return std::make_shared<GeneralProtection>(0);
} else {
if (offset > limit || endOffset > limit)
return std::make_shared<GeneralProtection>(0);
}
}
// If paging is enabled, do the translation.
if (m5Reg.paging) {
DPRINTF(GPUTLB, "Paging enabled.\n");
// The vaddr already has the segment base applied.
TlbEntry *entry = lookup(vaddr);
localNumTLBAccesses++;
if (!entry) {
localNumTLBMisses++;
if (timing) {
latency = missLatency1;
}
if (FullSystem) {
fatal("GpuTLB doesn't support full-system mode\n");
} else {
DPRINTF(GPUTLB, "Handling a TLB miss for address %#x "
"at pc %#x.\n", vaddr, tc->instAddr());
Process *p = tc->getProcessPtr();
const EmulationPageTable::Entry *pte =
p->pTable->lookup(vaddr);
if (!pte && mode != BaseTLB::Execute) {
// penalize a "page fault" more
if (timing)
latency += missLatency2;
if (p->fixupFault(vaddr))
pte = p->pTable->lookup(vaddr);
}
if (!pte) {
return std::make_shared<PageFault>(vaddr, true,
mode, true,
false);
} else {
Addr alignedVaddr = p->pTable->pageAlign(vaddr);
DPRINTF(GPUTLB, "Mapping %#x to %#x\n",
alignedVaddr, pte->paddr);
TlbEntry gpuEntry(p->pid(), alignedVaddr,
pte->paddr, false, false);
entry = insert(alignedVaddr, gpuEntry);
}
DPRINTF(GPUTLB, "Miss was serviced.\n");
}
} else {
localNumTLBHits++;
if (timing) {
latency = hitLatency;
}
}
// Do paging protection checks.
bool inUser = (m5Reg.cpl == 3 &&
!(flags & (CPL0FlagBit << FlagShift)));
CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
bool badWrite = (!entry->writable && (inUser || cr0.wp));
if ((inUser && !entry->user) || (mode == BaseTLB::Write &&
badWrite)) {
// The page must have been present to get into the TLB in
// the first place. We'll assume the reserved bits are
// fine even though we're not checking them.
return std::make_shared<PageFault>(vaddr, true, mode,
inUser, false);
}
if (storeCheck && badWrite) {
// This would fault if this were a write, so return a page
// fault that reflects that happening.
return std::make_shared<PageFault>(vaddr, true,
BaseTLB::Write,
inUser, false);
}
DPRINTF(GPUTLB, "Entry found with paddr %#x, doing protection "
"checks.\n", entry->paddr);
int page_size = entry->size();
Addr paddr = entry->paddr | (vaddr & (page_size - 1));
DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
req->setPaddr(paddr);
if (entry->uncacheable)
req->setFlags(Request::UNCACHEABLE);
} else {
//Use the address which already has segmentation applied.
DPRINTF(GPUTLB, "Paging disabled.\n");
DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
req->setPaddr(vaddr);
}
} else {
// Real mode
DPRINTF(GPUTLB, "In real mode.\n");
DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
req->setPaddr(vaddr);
}
// Check for an access to the local APIC
if (FullSystem) {
LocalApicBase localApicBase =
tc->readMiscRegNoEffect(MISCREG_APIC_BASE);
Addr baseAddr = localApicBase.base * PageBytes;
Addr paddr = req->getPaddr();
if (baseAddr <= paddr && baseAddr + PageBytes > paddr) {
// Force the access to be uncacheable.
req->setFlags(Request::UNCACHEABLE);
req->setPaddr(x86LocalAPICAddress(tc->contextId(),
paddr - baseAddr));
}
}
return NoFault;
};
Fault
GpuTLB::translateAtomic(const RequestPtr &req, ThreadContext *tc,
Mode mode, int &latency)
{
bool delayedResponse;
return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false,
latency);
}
void
GpuTLB::translateTiming(const RequestPtr &req, ThreadContext *tc,
Translation *translation, Mode mode, int &latency)
{
bool delayedResponse;
assert(translation);
Fault fault = GpuTLB::translate(req, tc, translation, mode,
delayedResponse, true, latency);
if (!delayedResponse)
translation->finish(fault, req, tc, mode);
}
Walker*
GpuTLB::getWalker()
{
return walker;
}
void
GpuTLB::serialize(CheckpointOut &cp) const
{
}
void
GpuTLB::unserialize(CheckpointIn &cp)
{
}
void
GpuTLB::regStats()
{
ClockedObject::regStats();
localNumTLBAccesses
.name(name() + ".local_TLB_accesses")
.desc("Number of TLB accesses")
;
localNumTLBHits
.name(name() + ".local_TLB_hits")
.desc("Number of TLB hits")
;
localNumTLBMisses
.name(name() + ".local_TLB_misses")
.desc("Number of TLB misses")
;
localTLBMissRate
.name(name() + ".local_TLB_miss_rate")
.desc("TLB miss rate")
;
accessCycles
.name(name() + ".access_cycles")
.desc("Cycles spent accessing this TLB level")
;
pageTableCycles
.name(name() + ".page_table_cycles")
.desc("Cycles spent accessing the page table")
;
localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
numUniquePages
.name(name() + ".unique_pages")
.desc("Number of unique pages touched")
;
localCycles
.name(name() + ".local_cycles")
.desc("Number of cycles spent in queue for all incoming reqs")
;
localLatency
.name(name() + ".local_latency")
.desc("Avg. latency over incoming coalesced reqs")
;
localLatency = localCycles / localNumTLBAccesses;
globalNumTLBAccesses
.name(name() + ".global_TLB_accesses")
.desc("Number of TLB accesses")
;
globalNumTLBHits
.name(name() + ".global_TLB_hits")
.desc("Number of TLB hits")
;
globalNumTLBMisses
.name(name() + ".global_TLB_misses")
.desc("Number of TLB misses")
;
globalTLBMissRate
.name(name() + ".global_TLB_miss_rate")
.desc("TLB miss rate")
;
globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
avgReuseDistance
.name(name() + ".avg_reuse_distance")
.desc("avg. reuse distance over all pages (in ticks)")
;
}
/**
* Do the TLB lookup for this coalesced request and schedule
* another event <TLB access latency> cycles later.
*/
void
GpuTLB::issueTLBLookup(PacketPtr pkt)
{
assert(pkt);
assert(pkt->senderState);
Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
TheISA::PageBytes);
TranslationState *sender_state =
safe_cast<TranslationState*>(pkt->senderState);
bool update_stats = !sender_state->prefetch;
ThreadContext * tmp_tc = sender_state->tc;
DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n",
virt_page_addr);
int req_cnt = sender_state->reqCnt.back();
if (update_stats) {
accessCycles -= (curTick() * req_cnt);
localCycles -= curTick();
updatePageFootprint(virt_page_addr);
globalNumTLBAccesses += req_cnt;
}
tlbOutcome lookup_outcome = TLB_MISS;
const RequestPtr &tmp_req = pkt->req;
// Access the TLB and figure out if it's a hit or a miss.
bool success = tlbLookup(tmp_req, tmp_tc, update_stats);
if (success) {
lookup_outcome = TLB_HIT;
// Put the entry in SenderState
TlbEntry *entry = lookup(tmp_req->getVaddr(), false);
assert(entry);
auto p = sender_state->tc->getProcessPtr();
sender_state->tlbEntry =
new TlbEntry(p->pid(), entry->vaddr, entry->paddr,
false, false);
if (update_stats) {
// the reqCnt has an entry per level, so its size tells us
// which level we are in
sender_state->hitLevel = sender_state->reqCnt.size();
globalNumTLBHits += req_cnt;
}
} else {
if (update_stats)
globalNumTLBMisses += req_cnt;
}
/*
* We now know the TLB lookup outcome (if it's a hit or a miss), as well
* as the TLB access latency.
*
* We create and schedule a new TLBEvent which will help us take the
* appropriate actions (e.g., update TLB on a hit, send request to lower
* level TLB on a miss, or start a page walk if this was the last-level
* TLB)
*/
TLBEvent *tlb_event =
new TLBEvent(this, virt_page_addr, lookup_outcome, pkt);
if (translationReturnEvent.count(virt_page_addr)) {
panic("Virtual Page Address %#x already has a return event\n",
virt_page_addr);
}
translationReturnEvent[virt_page_addr] = tlb_event;
assert(tlb_event);
DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n",
curTick() + this->ticks(hitLatency));
schedule(tlb_event, curTick() + this->ticks(hitLatency));
}
GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome,
PacketPtr _pkt)
: Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
outcome(tlb_outcome), pkt(_pkt)
{
}
/**
* Do Paging protection checks. If we encounter a page fault, then
* an assertion is fired.
*/
void
GpuTLB::pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
TlbEntry * tlb_entry, Mode mode)
{
HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
uint32_t flags = pkt->req->getFlags();
bool storeCheck = flags & (StoreCheck << FlagShift);
// Do paging protection checks.
bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp));
if ((inUser && !tlb_entry->user) ||
(mode == BaseTLB::Write && badWrite)) {
// The page must have been present to get into the TLB in
// the first place. We'll assume the reserved bits are
// fine even though we're not checking them.
panic("Page fault detected");
}
if (storeCheck && badWrite) {
// This would fault if this were a write, so return a page
// fault that reflects that happening.
panic("Page fault detected");
}
}
/**
* handleTranslationReturn is called on a TLB hit,
* when a TLB miss returns or when a page fault returns.
* The latter calls handelHit with TLB miss as tlbOutcome.
*/
void
GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome,
PacketPtr pkt)
{
assert(pkt);
Addr vaddr = pkt->req->getVaddr();
TranslationState *sender_state =
safe_cast<TranslationState*>(pkt->senderState);
ThreadContext *tc = sender_state->tc;
Mode mode = sender_state->tlbMode;
TlbEntry *local_entry, *new_entry;
if (tlb_outcome == TLB_HIT) {
DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr);
local_entry = sender_state->tlbEntry;
} else {
DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n",
vaddr);
// We are returning either from a page walk or from a hit at a lower
// TLB level. The senderState should be "carrying" a pointer to the
// correct TLBEntry.
new_entry = sender_state->tlbEntry;
assert(new_entry);
local_entry = new_entry;
if (allocationPolicy) {
DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
virt_page_addr);
local_entry = insert(virt_page_addr, *new_entry);
}
assert(local_entry);
}
/**
* At this point the packet carries an up-to-date tlbEntry pointer
* in its senderState.
* Next step is to do the paging protection checks.
*/
DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
"while paddr was %#x.\n", local_entry->vaddr,
local_entry->paddr);
pagingProtectionChecks(tc, pkt, local_entry, mode);
int page_size = local_entry->size();
Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
// Since this packet will be sent through the cpu side slave port,
// it must be converted to a response pkt if it is not one already
if (pkt->isRequest()) {
pkt->makeTimingResponse();
}
pkt->req->setPaddr(paddr);
if (local_entry->uncacheable) {
pkt->req->setFlags(Request::UNCACHEABLE);
}
//send packet back to coalescer
cpuSidePort[0]->sendTimingResp(pkt);
//schedule cleanup event
cleanupQueue.push(virt_page_addr);
// schedule this only once per cycle.
// The check is required because we might have multiple translations
// returning the same cycle
// this is a maximum priority event and must be on the same cycle
// as the cleanup event in TLBCoalescer to avoid a race with
// IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry
if (!cleanupEvent.scheduled())
schedule(cleanupEvent, curTick());
}
/**
* Here we take the appropriate actions based on the result of the
* TLB lookup.
*/
void
GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome,
PacketPtr pkt)
{
DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr);
assert(translationReturnEvent[virtPageAddr]);
assert(pkt);
TranslationState *tmp_sender_state =
safe_cast<TranslationState*>(pkt->senderState);
int req_cnt = tmp_sender_state->reqCnt.back();
bool update_stats = !tmp_sender_state->prefetch;
if (outcome == TLB_HIT) {
handleTranslationReturn(virtPageAddr, TLB_HIT, pkt);
if (update_stats) {
accessCycles += (req_cnt * curTick());
localCycles += curTick();
}
} else if (outcome == TLB_MISS) {
DPRINTF(GPUTLB, "This is a TLB miss\n");
if (update_stats) {
accessCycles += (req_cnt*curTick());
localCycles += curTick();
}
if (hasMemSidePort) {
// the one cyle added here represent the delay from when we get
// the reply back till when we propagate it to the coalescer
// above.
if (update_stats) {
accessCycles += (req_cnt * 1);
localCycles += 1;
}
/**
* There is a TLB below. Send the coalesced request.
* We actually send the very first packet of all the
* pending packets for this virtual page address.
*/
if (!memSidePort[0]->sendTimingReq(pkt)) {
DPRINTF(GPUTLB, "Failed sending translation request to "
"lower level TLB for addr %#x\n", virtPageAddr);
memSidePort[0]->retries.push_back(pkt);
} else {
DPRINTF(GPUTLB, "Sent translation request to lower level "
"TLB for addr %#x\n", virtPageAddr);
}
} else {
//this is the last level TLB. Start a page walk
DPRINTF(GPUTLB, "Last level TLB - start a page walk for "
"addr %#x\n", virtPageAddr);
if (update_stats)
pageTableCycles -= (req_cnt*curTick());
TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
assert(tlb_event);
tlb_event->updateOutcome(PAGE_WALK);
schedule(tlb_event, curTick() + ticks(missLatency2));
}
} else if (outcome == PAGE_WALK) {
if (update_stats)
pageTableCycles += (req_cnt*curTick());
// Need to access the page table and update the TLB
DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
virtPageAddr);
TranslationState *sender_state =
safe_cast<TranslationState*>(pkt->senderState);
Process *p = sender_state->tc->getProcessPtr();
Addr vaddr = pkt->req->getVaddr();
#ifndef NDEBUG
Addr alignedVaddr = p->pTable->pageAlign(vaddr);
assert(alignedVaddr == virtPageAddr);
#endif
const EmulationPageTable::Entry *pte = p->pTable->lookup(vaddr);
if (!pte && sender_state->tlbMode != BaseTLB::Execute &&
p->fixupFault(vaddr)) {
pte = p->pTable->lookup(vaddr);
}
if (pte) {
DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
pte->paddr);
sender_state->tlbEntry =
new TlbEntry(p->pid(), virtPageAddr, pte->paddr, false,
false);
} else {
sender_state->tlbEntry = nullptr;
}
handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
} else if (outcome == MISS_RETURN) {
/** we add an extra cycle in the return path of the translation
* requests in between the various TLB levels.
*/
handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
} else {
panic("Unexpected TLB outcome %d", outcome);
}
}
void
GpuTLB::TLBEvent::process()
{
tlb->translationReturn(virtPageAddr, outcome, pkt);
}
const char*
GpuTLB::TLBEvent::description() const
{
return "trigger translationDoneEvent";
}
void
GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome)
{
outcome = _outcome;
}
Addr
GpuTLB::TLBEvent::getTLBEventVaddr()
{
return virtPageAddr;
}
/*
* recvTiming receives a coalesced timing request from a TLBCoalescer
* and it calls issueTLBLookup()
* It only rejects the packet if we have exceeded the max
* outstanding number of requests for the TLB
*/
bool
GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt)
{
if (tlb->outstandingReqs < tlb->maxCoalescedReqs) {
tlb->issueTLBLookup(pkt);
// update number of outstanding translation requests
tlb->outstandingReqs++;
return true;
} else {
DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n",
tlb->outstandingReqs);
return false;
}
}
/**
* handleFuncTranslationReturn is called on a TLB hit,
* when a TLB miss returns or when a page fault returns.
* It updates LRU, inserts the TLB entry on a miss
* depending on the allocation policy and does the required
* protection checks. It does NOT create a new packet to
* update the packet's addr; this is done in hsail-gpu code.
*/
void
GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome)
{
TranslationState *sender_state =
safe_cast<TranslationState*>(pkt->senderState);
ThreadContext *tc = sender_state->tc;
Mode mode = sender_state->tlbMode;
Addr vaddr = pkt->req->getVaddr();
TlbEntry *local_entry, *new_entry;
if (tlb_outcome == TLB_HIT) {
DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr "
"%#x\n", vaddr);
local_entry = sender_state->tlbEntry;
} else {
DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr "
"%#x\n", vaddr);
// We are returning either from a page walk or from a hit at a lower
// TLB level. The senderState should be "carrying" a pointer to the
// correct TLBEntry.
new_entry = sender_state->tlbEntry;
assert(new_entry);
local_entry = new_entry;
if (allocationPolicy) {
Addr virt_page_addr = roundDown(vaddr, TheISA::PageBytes);
DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
virt_page_addr);
local_entry = insert(virt_page_addr, *new_entry);
}
assert(local_entry);
}
DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
"while paddr was %#x.\n", local_entry->vaddr,
local_entry->paddr);
/**
* Do paging checks if it's a normal functional access. If it's for a
* prefetch, then sometimes you can try to prefetch something that
* won't pass protection. We don't actually want to fault becuase there
* is no demand access to deem this a violation. Just put it in the
* TLB and it will fault if indeed a future demand access touches it in
* violation.
*
* This feature could be used to explore security issues around
* speculative memory accesses.
*/
if (!sender_state->prefetch && sender_state->tlbEntry)
pagingProtectionChecks(tc, pkt, local_entry, mode);
int page_size = local_entry->size();
Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
pkt->req->setPaddr(paddr);
if (local_entry->uncacheable)
pkt->req->setFlags(Request::UNCACHEABLE);
}
// This is used for atomic translations. Need to
// make it all happen during the same cycle.
void
GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt)
{
TranslationState *sender_state =
safe_cast<TranslationState*>(pkt->senderState);
ThreadContext *tc = sender_state->tc;
bool update_stats = !sender_state->prefetch;
Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
TheISA::PageBytes);
if (update_stats)
tlb->updatePageFootprint(virt_page_addr);
// do the TLB lookup without updating the stats
bool success = tlb->tlbLookup(pkt->req, tc, update_stats);
tlbOutcome tlb_outcome = success ? TLB_HIT : TLB_MISS;
// functional mode means no coalescing
// global metrics are the same as the local metrics
if (update_stats) {
tlb->globalNumTLBAccesses++;
if (success) {
sender_state->hitLevel = sender_state->reqCnt.size();
tlb->globalNumTLBHits++;
}
}
if (!success) {
if (update_stats)
tlb->globalNumTLBMisses++;
if (tlb->hasMemSidePort) {
// there is a TLB below -> propagate down the TLB hierarchy
tlb->memSidePort[0]->sendFunctional(pkt);
// If no valid translation from a prefetch, then just return
if (sender_state->prefetch && !pkt->req->hasPaddr())
return;
} else {
// Need to access the page table and update the TLB
DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
virt_page_addr);
Process *p = tc->getProcessPtr();
Addr vaddr = pkt->req->getVaddr();
#ifndef NDEBUG
Addr alignedVaddr = p->pTable->pageAlign(vaddr);
assert(alignedVaddr == virt_page_addr);
#endif
const EmulationPageTable::Entry *pte =
p->pTable->lookup(vaddr);
if (!pte && sender_state->tlbMode != BaseTLB::Execute &&
p->fixupFault(vaddr)) {
pte = p->pTable->lookup(vaddr);
}
if (!sender_state->prefetch) {
// no PageFaults are permitted after
// the second page table lookup
assert(pte);
DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
pte->paddr);
sender_state->tlbEntry =
new TlbEntry(p->pid(), virt_page_addr,
pte->paddr, false, false);
} else {
// If this was a prefetch, then do the normal thing if it
// was a successful translation. Otherwise, send an empty
// TLB entry back so that it can be figured out as empty and
// handled accordingly.
if (pte) {
DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
pte->paddr);
sender_state->tlbEntry =
new TlbEntry(p->pid(), virt_page_addr,
pte->paddr, false, false);
} else {
DPRINTF(GPUPrefetch, "Prefetch failed %#x\n",
alignedVaddr);
sender_state->tlbEntry = nullptr;
return;
}
}
}
} else {
DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n",
tlb->lookup(pkt->req->getVaddr()));
TlbEntry *entry = tlb->lookup(pkt->req->getVaddr(),
update_stats);
assert(entry);
auto p = sender_state->tc->getProcessPtr();
sender_state->tlbEntry =
new TlbEntry(p->pid(), entry->vaddr, entry->paddr,
false, false);
}
// This is the function that would populate pkt->req with the paddr of
// the translation. But if no translation happens (i.e Prefetch fails)
// then the early returns in the above code wiill keep this function
// from executing.
tlb->handleFuncTranslationReturn(pkt, tlb_outcome);
}
void
GpuTLB::CpuSidePort::recvReqRetry()
{
// The CPUSidePort never sends anything but replies. No retries
// expected.
panic("recvReqRetry called");
}
AddrRangeList
GpuTLB::CpuSidePort::getAddrRanges() const
{
// currently not checked by the master
AddrRangeList ranges;
return ranges;
}
/**
* MemSidePort receives the packet back.
* We need to call the handleTranslationReturn
* and propagate up the hierarchy.
*/
bool
GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt)
{
Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
TheISA::PageBytes);
DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n",
virt_page_addr);
TLBEvent *tlb_event = tlb->translationReturnEvent[virt_page_addr];
assert(tlb_event);
assert(virt_page_addr == tlb_event->getTLBEventVaddr());
tlb_event->updateOutcome(MISS_RETURN);
tlb->schedule(tlb_event, curTick()+tlb->ticks(1));
return true;
}
void
GpuTLB::MemSidePort::recvReqRetry()
{
// No retries should reach the TLB. The retries
// should only reach the TLBCoalescer.
panic("recvReqRetry called");
}
void
GpuTLB::cleanup()
{
while (!cleanupQueue.empty()) {
Addr cleanup_addr = cleanupQueue.front();
cleanupQueue.pop();
// delete TLBEvent
TLBEvent * old_tlb_event = translationReturnEvent[cleanup_addr];
delete old_tlb_event;
translationReturnEvent.erase(cleanup_addr);
// update number of outstanding requests
outstandingReqs--;
}
/** the higher level coalescer should retry if it has
* any pending requests.
*/
for (int i = 0; i < cpuSidePort.size(); ++i) {
cpuSidePort[i]->sendRetryReq();
}
}
void
GpuTLB::updatePageFootprint(Addr virt_page_addr)
{
std::pair<AccessPatternTable::iterator, bool> ret;
AccessInfo tmp_access_info;
tmp_access_info.lastTimeAccessed = 0;
tmp_access_info.accessesPerPage = 0;
tmp_access_info.totalReuseDistance = 0;
tmp_access_info.sumDistance = 0;
tmp_access_info.meanDistance = 0;
ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr,
tmp_access_info));
bool first_page_access = ret.second;
if (first_page_access) {
numUniquePages++;
} else {
int accessed_before;
accessed_before = curTick() - ret.first->second.lastTimeAccessed;
ret.first->second.totalReuseDistance += accessed_before;
}
ret.first->second.accessesPerPage++;
ret.first->second.lastTimeAccessed = curTick();
if (accessDistance) {
ret.first->second.localTLBAccesses
.push_back(localNumTLBAccesses.value());
}
}
void
GpuTLB::exitCallback()
{
std::ostream *page_stat_file = nullptr;
if (accessDistance) {
// print per page statistics to a separate file (.csv format)
// simout is the gem5 output directory (default is m5out or the one
// specified with -d
page_stat_file = simout.create(name().c_str())->stream();
// print header
*page_stat_file << "page,max_access_distance,mean_access_distance, "
<< "stddev_distance" << std::endl;
}
// update avg. reuse distance footprint
AccessPatternTable::iterator iter, iter_begin, iter_end;
unsigned int sum_avg_reuse_distance_per_page = 0;
// iterate through all pages seen by this TLB
for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) {
sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance /
iter->second.accessesPerPage;
if (accessDistance) {
unsigned int tmp = iter->second.localTLBAccesses[0];
unsigned int prev = tmp;
for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
if (i) {
tmp = prev + 1;
}
prev = iter->second.localTLBAccesses[i];
// update the localTLBAccesses value
// with the actual differece
iter->second.localTLBAccesses[i] -= tmp;
// compute the sum of AccessDistance per page
// used later for mean
iter->second.sumDistance +=
iter->second.localTLBAccesses[i];
}
iter->second.meanDistance =
iter->second.sumDistance / iter->second.accessesPerPage;
// compute std_dev and max (we need a second round because we
// need to know the mean value
unsigned int max_distance = 0;
unsigned int stddev_distance = 0;
for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
unsigned int tmp_access_distance =
iter->second.localTLBAccesses[i];
if (tmp_access_distance > max_distance) {
max_distance = tmp_access_distance;
}
unsigned int diff =
tmp_access_distance - iter->second.meanDistance;
stddev_distance += pow(diff, 2);
}
stddev_distance =
sqrt(stddev_distance/iter->second.accessesPerPage);
if (page_stat_file) {
*page_stat_file << std::hex << iter->first << ",";
*page_stat_file << std::dec << max_distance << ",";
*page_stat_file << std::dec << iter->second.meanDistance
<< ",";
*page_stat_file << std::dec << stddev_distance;
*page_stat_file << std::endl;
}
// erase the localTLBAccesses array
iter->second.localTLBAccesses.clear();
}
}
if (!TLBFootprint.empty()) {
avgReuseDistance =
sum_avg_reuse_distance_per_page / TLBFootprint.size();
}
//clear the TLBFootprint map
TLBFootprint.clear();
}
} // namespace X86ISA
X86ISA::GpuTLB*
X86GPUTLBParams::create()
{
return new X86ISA::GpuTLB(this);
}