gem5/src/dev/amdgpu/amdgpu_vm.cc
Matthew Poremba 823b5a6eb8 dev-amdgpu: Support multiple CPs and MMIO AddrRanges
Currently gem5 assumes there is only one command processor (CP), which
contains the PM4 packet processor. Some GPU devices have multiple CPs,
which the driver probes individually during POST to determine whether
each is in use. These additional CPs therefore need to be supported.

This commit allows for multiple PM4 packet processors, each representing
a CP with its own independent MMIO address range. To support these
ranges more easily, MMIO addresses now use AddrRange objects to index a
PM4 packet processor instead of the hard-coded constexpr MMIO start and
size pairs (sketched after the commit metadata below).

By default only one PM4 packet processor is created, so simulation
behavior is unchanged for the devices gem5 currently supports.

Change-Id: I977f4fd3a169ef4a78671a4fb58c8ea0e19bf52c
2024-03-21 10:13:55 -05:00
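The routing change the message describes is small enough to sketch. Below is a minimal, self-contained approximation in C++, using a stripped-down AddrRange stand-in and made-up aperture addresses; the authoritative implementation is setMMIOAperture/getMMIOAperture in the file that follows.

#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for gem5's AddrRange, reduced to the one query this sketch needs.
struct AddrRange
{
    uint64_t start = 0;
    uint64_t end = 0;   // exclusive

    bool contains(uint64_t addr) const { return addr >= start && addr < end; }
};

int main()
{
    // One MMIO aperture per PM4 packet processor (addresses are made up).
    // Indexing by range replaces the old hard-coded start/size pairs.
    std::vector<AddrRange> pm4Ranges = {
        {0xC000, 0xD000},   // CP 0
        {0xE000, 0xF000},   // CP 1
    };

    // Route an incoming MMIO offset to the processor whose range contains it.
    uint64_t offset = 0xE010;
    for (size_t cp = 0; cp < pm4Ranges.size(); ++cp) {
        if (pm4Ranges[cp].contains(offset)) {
            std::cout << "offset 0x" << std::hex << offset
                      << " -> PM4 processor " << std::dec << cp << "\n";
        }
    }
    return 0;
}

Because each processor owns an independent range, the driver's POST probe of one CP never aliases onto another, which is the behavior the commit is after.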


/*
 * Copyright (c) 2021 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include "dev/amdgpu/amdgpu_vm.hh"
#include "arch/amdgpu/vega/pagetable_walker.hh"
#include "arch/amdgpu/vega/tlb.hh"
#include "arch/generic/mmu.hh"
#include "base/trace.hh"
#include "debug/AMDGPUDevice.hh"
#include "dev/amdgpu/amdgpu_defines.hh"
#include "dev/amdgpu/amdgpu_device.hh"
#include "mem/packet_access.hh"

namespace gem5
{

AMDGPUVM::AMDGPUVM()
{
    // Zero out contexts
    memset(&vmContext0, 0, sizeof(AMDGPUSysVMContext));

    vmContexts.resize(AMDGPU_VM_COUNT);
    for (int i = 0; i < AMDGPU_VM_COUNT; ++i) {
        memset(&vmContexts[i], 0, sizeof(AMDGPUVMContext));
    }

    for (int i = 0; i < NUM_MMIO_RANGES; ++i) {
        mmioRanges[i] = AddrRange();
    }
}

void
AMDGPUVM::setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range)
{
    mmioRanges[mmio_aperture] = range;
}

AddrRange
AMDGPUVM::getMMIORange(mmio_range_t mmio_aperture)
{
    return mmioRanges[mmio_aperture];
}

const AddrRange&
AMDGPUVM::getMMIOAperture(Addr offset)
{
    for (int i = 0; i < NUM_MMIO_RANGES; ++i) {
        if (mmioRanges[i].contains(offset)) {
            return mmioRanges[i];
        }
    }

    // Default to NBIO
    return mmioRanges[NBIO_MMIO_RANGE];
}

Addr
AMDGPUVM::gartBase()
{
    return vmContext0.ptBase;
}

Addr
AMDGPUVM::gartSize()
{
    return vmContext0.ptEnd - vmContext0.ptStart;
}

void
AMDGPUVM::readMMIO(PacketPtr pkt, Addr offset)
{
    uint32_t value = pkt->getLE<uint32_t>();

    switch (offset) {
      // MMHUB MMIOs
      case mmMMHUB_VM_INVALIDATE_ENG17_SEM:
        DPRINTF(AMDGPUDevice, "Marking invalidate ENG17 SEM acquired\n");
        pkt->setLE<uint32_t>(1);
        break;
      case mmMMHUB_VM_INVALIDATE_ENG17_ACK:
        // This is only used by driver initialization and only expects an ACK
        // for VMID 0, which is the first bit in the response.
        DPRINTF(AMDGPUDevice, "Telling driver invalidate ENG17 is complete\n");
        pkt->setLE<uint32_t>(1);
        break;
      case mmMMHUB_VM_FB_LOCATION_BASE:
        mmhubBase = ((Addr)bits(value, 23, 0) << 24);
        DPRINTF(AMDGPUDevice, "MMHUB FB base set to %#x\n", mmhubBase);
        break;
      case mmMMHUB_VM_FB_LOCATION_TOP:
        mmhubTop = ((Addr)bits(value, 23, 0) << 24) | 0xFFFFFFULL;
        DPRINTF(AMDGPUDevice, "MMHUB FB top set to %#x\n", mmhubTop);
        break;
      // GRBM MMIOs
      case mmVM_INVALIDATE_ENG17_ACK:
        DPRINTF(AMDGPUDevice, "Overwriting invalidation ENG17 ACK\n");
        pkt->setLE<uint32_t>(1);
        break;
      default:
        DPRINTF(AMDGPUDevice, "GPUVM read of unknown MMIO %#x\n", offset);
        break;
    }
}

void
AMDGPUVM::writeMMIO(PacketPtr pkt, Addr offset)
{
    switch (offset) {
      // VMID0 MMIOs
      case mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_LO32:
        vmContext0.ptBaseL = pkt->getLE<uint32_t>();
        // Clear extra bits not part of address
        vmContext0.ptBaseL = insertBits(vmContext0.ptBaseL, 0, 0, 0);
        break;
      case mmVM_CONTEXT0_PAGE_TABLE_BASE_ADDR_HI32:
        vmContext0.ptBaseH = pkt->getLE<uint32_t>();
        break;
      case mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_LO32:
        vmContext0.ptStartL = pkt->getLE<uint32_t>();
        break;
      case mmVM_CONTEXT0_PAGE_TABLE_START_ADDR_HI32:
        vmContext0.ptStartH = pkt->getLE<uint32_t>();
        break;
      case mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_LO32:
        vmContext0.ptEndL = pkt->getLE<uint32_t>();
        break;
      case mmVM_CONTEXT0_PAGE_TABLE_END_ADDR_HI32:
        vmContext0.ptEndH = pkt->getLE<uint32_t>();
        break;
      case mmMC_VM_AGP_TOP: {
        uint32_t val = pkt->getLE<uint32_t>();
        vmContext0.agpTop = (((Addr)bits(val, 23, 0)) << 24) | 0xffffff;
      } break;
      case mmMC_VM_AGP_BOT: {
        uint32_t val = pkt->getLE<uint32_t>();
        vmContext0.agpBot = ((Addr)bits(val, 23, 0)) << 24;
      } break;
      case mmMC_VM_AGP_BASE: {
        uint32_t val = pkt->getLE<uint32_t>();
        vmContext0.agpBase = ((Addr)bits(val, 23, 0)) << 24;
      } break;
      case mmMC_VM_FB_LOCATION_TOP: {
        uint32_t val = pkt->getLE<uint32_t>();
        vmContext0.fbTop = (((Addr)bits(val, 23, 0)) << 24) | 0xffffff;
      } break;
      case mmMC_VM_FB_LOCATION_BASE: {
        uint32_t val = pkt->getLE<uint32_t>();
        vmContext0.fbBase = ((Addr)bits(val, 23, 0)) << 24;
      } break;
      case mmMC_VM_FB_OFFSET: {
        uint32_t val = pkt->getLE<uint32_t>();
        vmContext0.fbOffset = ((Addr)bits(val, 23, 0)) << 24;
      } break;
      case mmMC_VM_SYSTEM_APERTURE_LOW_ADDR: {
        uint32_t val = pkt->getLE<uint32_t>();
        vmContext0.sysAddrL = ((Addr)bits(val, 29, 0)) << 18;
      } break;
      case mmMC_VM_SYSTEM_APERTURE_HIGH_ADDR: {
        uint32_t val = pkt->getLE<uint32_t>();
        vmContext0.sysAddrH = ((Addr)bits(val, 29, 0)) << 18;
      } break;
      default:
        break;
    }
}

void
AMDGPUVM::registerTLB(VegaISA::GpuTLB *tlb)
{
    DPRINTF(AMDGPUDevice, "Registered a TLB with device\n");
    gpu_tlbs.push_back(tlb);
}

void
AMDGPUVM::invalidateTLBs()
{
    DPRINTF(AMDGPUDevice, "Invalidating all TLBs\n");
    for (auto &tlb : gpu_tlbs) {
        tlb->invalidateAll();
        DPRINTF(AMDGPUDevice, " ... TLB invalidated\n");
    }
}

void
AMDGPUVM::serialize(CheckpointOut &cp) const
{
    Addr vm0PTBase = vmContext0.ptBase;
    Addr vm0PTStart = vmContext0.ptStart;
    Addr vm0PTEnd = vmContext0.ptEnd;
    uint64_t gartTableSize;

    SERIALIZE_SCALAR(vm0PTBase);
    SERIALIZE_SCALAR(vm0PTStart);
    SERIALIZE_SCALAR(vm0PTEnd);

    SERIALIZE_SCALAR(vmContext0.agpBase);
    SERIALIZE_SCALAR(vmContext0.agpTop);
    SERIALIZE_SCALAR(vmContext0.agpBot);
    SERIALIZE_SCALAR(vmContext0.fbBase);
    SERIALIZE_SCALAR(vmContext0.fbTop);
    SERIALIZE_SCALAR(vmContext0.fbOffset);
    SERIALIZE_SCALAR(vmContext0.sysAddrL);
    SERIALIZE_SCALAR(vmContext0.sysAddrH);
    SERIALIZE_SCALAR(mmhubBase);
    SERIALIZE_SCALAR(mmhubTop);

    Addr ptBase[AMDGPU_VM_COUNT];
    Addr ptStart[AMDGPU_VM_COUNT];
    Addr ptEnd[AMDGPU_VM_COUNT];
    for (int i = 0; i < AMDGPU_VM_COUNT; i++) {
        ptBase[i] = vmContexts[i].ptBase;
        ptStart[i] = vmContexts[i].ptStart;
        ptEnd[i] = vmContexts[i].ptEnd;
    }
    SERIALIZE_ARRAY(ptBase, AMDGPU_VM_COUNT);
    SERIALIZE_ARRAY(ptStart, AMDGPU_VM_COUNT);
    SERIALIZE_ARRAY(ptEnd, AMDGPU_VM_COUNT);

    gartTableSize = gartTable.size();
    uint64_t* gartTableKey = new uint64_t[gartTableSize];
    uint64_t* gartTableValue = new uint64_t[gartTableSize];
    SERIALIZE_SCALAR(gartTableSize);

    int i = 0;
    for (auto it = gartTable.begin(); it != gartTable.end(); ++it) {
        gartTableKey[i] = it->first;
        gartTableValue[i] = it->second;
        i++;
    }
    SERIALIZE_ARRAY(gartTableKey, gartTableSize);
    SERIALIZE_ARRAY(gartTableValue, gartTableSize);

    delete[] gartTableKey;
    delete[] gartTableValue;
}

void
AMDGPUVM::unserialize(CheckpointIn &cp)
{
    // Unserialize requires fields not be packed
    Addr vm0PTBase;
    Addr vm0PTStart;
    Addr vm0PTEnd;
    uint64_t gartTableSize, *gartTableKey, *gartTableValue;

    UNSERIALIZE_SCALAR(vm0PTBase);
    UNSERIALIZE_SCALAR(vm0PTStart);
    UNSERIALIZE_SCALAR(vm0PTEnd);
    vmContext0.ptBase = vm0PTBase;
    vmContext0.ptStart = vm0PTStart;
    vmContext0.ptEnd = vm0PTEnd;

    UNSERIALIZE_SCALAR(vmContext0.agpBase);
    UNSERIALIZE_SCALAR(vmContext0.agpTop);
    UNSERIALIZE_SCALAR(vmContext0.agpBot);
    UNSERIALIZE_SCALAR(vmContext0.fbBase);
    UNSERIALIZE_SCALAR(vmContext0.fbTop);
    UNSERIALIZE_SCALAR(vmContext0.fbOffset);
    UNSERIALIZE_SCALAR(vmContext0.sysAddrL);
    UNSERIALIZE_SCALAR(vmContext0.sysAddrH);
    UNSERIALIZE_SCALAR(mmhubBase);
    UNSERIALIZE_SCALAR(mmhubTop);

    Addr ptBase[AMDGPU_VM_COUNT];
    Addr ptStart[AMDGPU_VM_COUNT];
    Addr ptEnd[AMDGPU_VM_COUNT];
    UNSERIALIZE_ARRAY(ptBase, AMDGPU_VM_COUNT);
    UNSERIALIZE_ARRAY(ptStart, AMDGPU_VM_COUNT);
    UNSERIALIZE_ARRAY(ptEnd, AMDGPU_VM_COUNT);
    for (int i = 0; i < AMDGPU_VM_COUNT; i++) {
        vmContexts[i].ptBase = ptBase[i];
        vmContexts[i].ptStart = ptStart[i];
        vmContexts[i].ptEnd = ptEnd[i];
    }

    UNSERIALIZE_SCALAR(gartTableSize);
    gartTableKey = new uint64_t[gartTableSize];
    gartTableValue = new uint64_t[gartTableSize];
    UNSERIALIZE_ARRAY(gartTableKey, gartTableSize);
    UNSERIALIZE_ARRAY(gartTableValue, gartTableSize);
    for (uint64_t i = 0; i < gartTableSize; i++) {
        gartTable[gartTableKey[i]] = gartTableValue[i];
    }
    delete[] gartTableKey;
    delete[] gartTableValue;
}

void
AMDGPUVM::AGPTranslationGen::translate(Range &range) const
{
    assert(vm->inAGP(range.vaddr));

    // Clamp this chunk at the next AGP page boundary so a single translated
    // range never crosses a page.
    Addr next = roundUp(range.vaddr, AMDGPU_AGP_PAGE_SIZE);
    if (next == range.vaddr)
        next += AMDGPU_AGP_PAGE_SIZE;

    range.size = std::min(range.size, next - range.vaddr);
    range.paddr = range.vaddr - vm->getAGPBot() + vm->getAGPBase();
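    // Worked example (hypothetical aperture values): with agpBot 0x1000000
    // and agpBase 0x8000000, vaddr 0x1234000 maps to
    // 0x1234000 - 0x1000000 + 0x8000000 = 0x8234000.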
DPRINTF(AMDGPUDevice, "AMDGPUVM: AGP translation %#lx -> %#lx\n",
range.vaddr, range.paddr);
}

void
AMDGPUVM::GARTTranslationGen::translate(Range &range) const
{
    Addr next = roundUp(range.vaddr, AMDGPU_GART_PAGE_SIZE);
    if (next == range.vaddr)
        next += AMDGPU_GART_PAGE_SIZE;

    range.size = std::min(range.size, next - range.vaddr);

    Addr gart_addr = bits(range.vaddr, 63, 12);

    // This table is a bit hard to iterate over. If we cross a page, the
    // next PTE is not necessarily the next entry in the table but actually
    // 7 entries away.
    Addr lsb = bits(gart_addr, 2, 0);
    gart_addr += lsb * 7;

    // GART is a single-level translation, so the value at the "virtual"
    // address is the PTE containing the physical address.
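    // Worked example (hypothetical values): for vaddr 0x5abc the page
    // number is 0x5, lsb is 5, so the lookup key is 0x5 + 5 * 7 = 0x28.
    // If the PTE found there has bits 47:12 equal to 0x9abcd, the result
    // below is paddr = (0x9abcd << 12) | 0xabc = 0x9abcdabc.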
    auto result = vm->gartTable.find(gart_addr);
    if (result == vm->gartTable.end()) {
        // There is no recovery mechanism for invalid GART entries, so there
        // is no reason to fault. Some PM4 packets also contain register
        // addresses which we ignore. In either case, warn and return the
        // vaddr unchanged rather than faulting.
        warn("GART translation for %p not found", range.vaddr);

        range.paddr = range.vaddr;
    } else {
        Addr pte = result->second;
        Addr lower_bits = bits(range.vaddr, 11, 0);
        range.paddr = (bits(pte, 47, 12) << 12) | lower_bits;
    }

    DPRINTF(AMDGPUDevice, "AMDGPUVM: GART translation %#lx -> %#lx\n",
            range.vaddr, range.paddr);
}

void
AMDGPUVM::MMHUBTranslationGen::translate(Range &range) const
{
    assert(vm->inMMHUB(range.vaddr));

    Addr next = roundUp(range.vaddr, AMDGPU_MMHUB_PAGE_SIZE);
    if (next == range.vaddr)
        next += AMDGPU_MMHUB_PAGE_SIZE;

    range.size = std::min(range.size, next - range.vaddr);
    range.paddr = range.vaddr - vm->getMMHUBBase();

    DPRINTF(AMDGPUDevice, "AMDGPUVM: MMHUB translation %#lx -> %#lx\n",
            range.vaddr, range.paddr);
}

void
AMDGPUVM::UserTranslationGen::translate(Range &range) const
{
    // Get base address of the page table for this vmid
    Addr base = vm->getPageTableBase(vmid);
    Addr start = vm->getPageTableStart(vmid);
    DPRINTF(AMDGPUDevice, "User tl base %#lx start %#lx walker %p\n",
            base, start, walker);

    bool system_bit;
    unsigned logBytes;
    Addr paddr = range.vaddr;
    Fault fault = walker->startFunctional(base, paddr, logBytes,
                                          BaseMMU::Mode::Read, system_bit);
    if (fault != NoFault) {
        fatal("User translation fault");
    }

    // GPU page size is variable. Use logBytes to determine size.
    const Addr page_size = 1 << logBytes;
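    // For example, logBytes == 12 selects a 4 KiB page (1 << 12) and
    // logBytes == 21 selects a 2 MiB page (1 << 21).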
    Addr next = roundUp(range.vaddr, page_size);
    if (next == range.vaddr) {
        // We don't know the size of the next page, use default.
        next += AMDGPU_USER_PAGE_SIZE;
    }

    // If we are not in system/host memory, change the address to the MMHUB
    // aperture. This is mapped to the same backing memory as device memory.
    if (!system_bit) {
        paddr += vm->getMMHUBBase();
        assert(vm->inMMHUB(paddr));
    }

    range.size = std::min(range.size, next - range.vaddr);
    range.paddr = paddr;
}
} // namespace gem5