arch-vega: Add VEGA page tables and TLB

Add the page table walker, page table format, TLB, TLB coalescer, and
associated support in the AMDGPUDevice. This page table format uses the
hardware format for dGPU and is very different from APU/GCN3, which use
the X86 page table format.

In order to support either format for the GPU model, a common
TranslationState called GpuTranslationState is created which holds the
combined fields of both the APU and Vega translation state. Similarly
the TlbEntry is cast at runtime by the corresponding arch files as they
are the only files which touch the internals of the TlbEntry. The GPU
model only checks if a TlbEntry is non-null and thus does not need to
cast to peek inside the data structure.

Change-Id: I4484c66239b48df5224d61caa6e968e56eea38a5
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/51848
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
Matthew Poremba
2021-09-01 18:25:19 -05:00
parent 7cfe88df74
commit 539a2e2bcd
22 changed files with 3599 additions and 69 deletions

View File

@@ -0,0 +1,106 @@
/*
* Copyright (c) 2022 Advanced Micro Devices, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __ARCH_AMDGPU_COMMON_GPU_TRANSLATION_STATE_HH__
#define __ARCH_AMDGPU_COMMON_GPU_TRANSLATION_STATE_HH__
#include "arch/generic/mmu.hh"
namespace gem5
{
class ResponsePort;
/**
 * GPU TranslationState: this is currently somewhat of a bastardization
 * of the usage of SenderState, whereby the receiver of a packet is not
 * usually supposed to need to look at the contents of the senderState;
 * you're really only supposed to look at what you pushed on, pop it
 * off, and send it back.
 *
 * However, since there is state that we want to pass to the TLBs using
 * the send/recv Timing/Functional/etc. APIs, which don't allow for new
 * arguments, we need a common TLB senderState to pass between TLBs,
 * both "forwards" and "backwards."
 *
 * So, basically, the rule is that any packet received by a TLB port
 * (cpuside OR memside) must be safely castable to a GpuTranslationState.
 */
struct GpuTranslationState : public Packet::SenderState
{
    /** TLB access mode for this request: read or write. */
    BaseMMU::Mode tlbMode;
    /** SE-mode thread context associated with this request (null in FS mode). */
    ThreadContext *tc = nullptr;
    /** FS-mode device ID of the requesting device. */
    int deviceId = 0;
    /** FS-mode Process Address Space ID. */
    int pasId = 0;
    /**
     * TLB entry to be populated and passed back and filled in
     * previous TLBs. Equivalent to the data cache concept of
     * "data return." Stored as Serializable*; the arch-specific
     * code casts it to its own TlbEntry type.
     */
    Serializable *tlbEntry = nullptr;
    /** True if this is a TLB prefetch request. */
    bool isPrefetch = false;
    /** Time at which the request for this translation was issued. */
    uint64_t issueTime = 0;
    /** Ports this request came through, so the response can be routed back. */
    std::vector<ResponsePort *> ports;
    /**
     * Number of uncoalesced requests per packet per TLB level;
     * reqCnt at a level is >= reqCnt at a higher level.
     */
    std::vector<int> reqCnt;
    /** TLB level this packet hit in; 0 if it hit in the page table. */
    int hitLevel = 0;
    /** SenderState pushed by the original sender, to be restored on return. */
    Packet::SenderState *saved = nullptr;

    /** Construct with an explicit (SE-mode) thread context. */
    GpuTranslationState(BaseMMU::Mode tlb_mode, ThreadContext *_tc,
                        bool _prefetch=false,
                        Packet::SenderState *_saved=nullptr)
        : tlbMode(tlb_mode), tc(_tc), isPrefetch(_prefetch), saved(_saved)
    { }

    /** Construct without a thread context (FS mode); delegates above. */
    GpuTranslationState(BaseMMU::Mode tlb_mode,
                        bool _prefetch=false,
                        Packet::SenderState *_saved=nullptr)
        : GpuTranslationState(tlb_mode, nullptr, _prefetch, _saved)
    { }
};
} // namespace gem5
#endif // __ARCH_AMDGPU_COMMON_GPU_TRANSLATION_STATE_HH__

View File

@@ -35,6 +35,7 @@
#include <cmath>
#include <cstring>
#include "arch/amdgpu/common/gpu_translation_state.hh"
#include "arch/x86/faults.hh"
#include "arch/x86/insts/microldstop.hh"
#include "arch/x86/page_size.hh"
@@ -664,8 +665,8 @@ namespace X86ISA
Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
X86ISA::PageBytes);
TranslationState *sender_state =
safe_cast<TranslationState*>(pkt->senderState);
GpuTranslationState *sender_state =
safe_cast<GpuTranslationState*>(pkt->senderState);
bool update_stats = !sender_state->isPrefetch;
ThreadContext * tmp_tc = sender_state->tc;
@@ -788,8 +789,8 @@ namespace X86ISA
assert(pkt);
Addr vaddr = pkt->req->getVaddr();
TranslationState *sender_state =
safe_cast<TranslationState*>(pkt->senderState);
GpuTranslationState *sender_state =
safe_cast<GpuTranslationState*>(pkt->senderState);
ThreadContext *tc = sender_state->tc;
Mode mode = sender_state->tlbMode;
@@ -799,7 +800,7 @@ namespace X86ISA
if (tlb_outcome == TLB_HIT) {
DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n",
vaddr);
local_entry = sender_state->tlbEntry;
local_entry = safe_cast<TlbEntry *>(sender_state->tlbEntry);
} else {
DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n",
vaddr);
@@ -809,7 +810,7 @@ namespace X86ISA
* lower TLB level. The senderState should be "carrying" a pointer
* to the correct TLBEntry.
*/
new_entry = sender_state->tlbEntry;
new_entry = safe_cast<TlbEntry *>(sender_state->tlbEntry);
assert(new_entry);
local_entry = new_entry;
@@ -877,8 +878,8 @@ namespace X86ISA
assert(translationReturnEvent[virtPageAddr]);
assert(pkt);
TranslationState *tmp_sender_state =
safe_cast<TranslationState*>(pkt->senderState);
GpuTranslationState *tmp_sender_state =
safe_cast<GpuTranslationState*>(pkt->senderState);
int req_cnt = tmp_sender_state->reqCnt.back();
bool update_stats = !tmp_sender_state->isPrefetch;
@@ -945,8 +946,8 @@ namespace X86ISA
DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
virtPageAddr);
TranslationState *sender_state =
safe_cast<TranslationState*>(pkt->senderState);
GpuTranslationState *sender_state =
safe_cast<GpuTranslationState*>(pkt->senderState);
Process *p = sender_state->tc->getProcessPtr();
Addr vaddr = pkt->req->getVaddr();
@@ -1038,8 +1039,8 @@ namespace X86ISA
void
GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome)
{
TranslationState *sender_state =
safe_cast<TranslationState*>(pkt->senderState);
GpuTranslationState *sender_state =
safe_cast<GpuTranslationState*>(pkt->senderState);
ThreadContext *tc = sender_state->tc;
Mode mode = sender_state->tlbMode;
@@ -1051,7 +1052,7 @@ namespace X86ISA
DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr "
"%#x\n", vaddr);
local_entry = sender_state->tlbEntry;
local_entry = safe_cast<TlbEntry *>(sender_state->tlbEntry);
} else {
DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr "
"%#x\n", vaddr);
@@ -1061,7 +1062,7 @@ namespace X86ISA
* lower TLB level. The senderState should be "carrying" a pointer
* to the correct TLBEntry.
*/
new_entry = sender_state->tlbEntry;
new_entry = safe_cast<TlbEntry *>(sender_state->tlbEntry);
assert(new_entry);
local_entry = new_entry;
@@ -1110,8 +1111,8 @@ namespace X86ISA
void
GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt)
{
TranslationState *sender_state =
safe_cast<TranslationState*>(pkt->senderState);
GpuTranslationState *sender_state =
safe_cast<GpuTranslationState*>(pkt->senderState);
ThreadContext *tc = sender_state->tc;
bool update_stats = !sender_state->isPrefetch;

View File

@@ -264,56 +264,6 @@ namespace X86ISA
Port &getPort(const std::string &if_name,
PortID idx=InvalidPortID) override;
/**
* TLB TranslationState: this currently is a somewhat bastardization of
* the usage of SenderState, whereby the receiver of a packet is not
* usually supposed to need to look at the contents of the senderState,
* you're really only supposed to look at what you pushed on, pop it
* off, and send it back.
*
* However, since there is state that we want to pass to the TLBs using
* the send/recv Timing/Functional/etc. APIs, which don't allow for new
* arguments, we need a common TLB senderState to pass between TLBs,
* both "forwards" and "backwards."
*
* So, basically, the rule is that any packet received by a TLB port
* (cpuside OR memside) must be safely castable to a TranslationState.
*/
struct TranslationState : public Packet::SenderState
{
// TLB mode, read or write
Mode tlbMode;
// Thread context associated with this req
ThreadContext *tc;
/*
* TLB entry to be populated and passed back and filled in
* previous TLBs. Equivalent to the data cache concept of
* "data return."
*/
TlbEntry *tlbEntry;
// Is this a TLB prefetch request?
bool isPrefetch;
// When was the req for this translation issued
uint64_t issueTime;
// Remember where this came from
std::vector<ResponsePort*>ports;
// keep track of #uncoalesced reqs per packet per TLB level;
// reqCnt per level >= reqCnt higher level
std::vector<int> reqCnt;
// TLB level this packet hit in; 0 if it hit in the page table
int hitLevel;
Packet::SenderState *saved;
TranslationState(Mode tlb_mode, ThreadContext *_tc,
bool is_prefetch=false,
Packet::SenderState *_saved=nullptr)
: tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
isPrefetch(is_prefetch), issueTime(0),
hitLevel(0),saved(_saved) { }
};
// maximum number of permitted coalesced requests per cycle
int maxCoalescedReqs;
@@ -436,8 +386,6 @@ namespace X86ISA
};
}
using GpuTranslationState = X86ISA::GpuTLB::TranslationState;
} // namespace gem5
#endif // __GPU_TLB_HH__

View File

@@ -33,6 +33,7 @@
#include <cstring>
#include "arch/amdgpu/common/gpu_translation_state.hh"
#include "arch/x86/page_size.hh"
#include "base/logging.hh"
#include "debug/GPUTLB.hh"
@@ -149,7 +150,8 @@ TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
GpuTranslationState *sender_state =
safe_cast<GpuTranslationState*>(pkt->senderState);
TheISA::TlbEntry *tlb_entry = sender_state->tlbEntry;
X86ISA::TlbEntry *tlb_entry =
safe_cast<X86ISA::TlbEntry *>(sender_state->tlbEntry);
assert(tlb_entry);
Addr first_entry_vaddr = tlb_entry->vaddr;
Addr first_entry_paddr = tlb_entry->paddr;