dev-amdgpu: Support for ROCm 6.0 (#926)

Implement several features new in ROCm 6.0 and features required for
future devices. Includes the following:

- Support for multiple command processors
- Improve handling of unknown register addresses
- Use AddrRange for MMIO address regions
- Handle GART writes through SDMA copy
- Implement PCIe indirect reads and writes
- Improve PM4 write to check dword count
- Implement common MI300X instruction
This commit is contained in:
Matthew Poremba
2024-03-21 21:12:09 -07:00
committed by GitHub
23 changed files with 565 additions and 261 deletions

View File

@@ -188,9 +188,15 @@ def makeGpuFSSystem(args):
system.pc.south_bridge.gpu.sdmas = sdma_engines
# Setup PM4 packet processor
pm4_pkt_proc = PM4PacketProcessor()
system.pc.south_bridge.gpu.pm4_pkt_proc = pm4_pkt_proc
# Setup PM4 packet processors
pm4_procs = []
pm4_procs.append(
PM4PacketProcessor(
ip_id=0, mmio_range=AddrRange(start=0xC000, end=0xD000)
)
)
system.pc.south_bridge.gpu.pm4_pkt_procs = pm4_procs
# GPU data path
gpu_mem_mgr = AMDGPUMemoryManager()
@@ -207,7 +213,8 @@ def makeGpuFSSystem(args):
for sdma in sdma_engines:
system._dma_ports.append(sdma)
system._dma_ports.append(device_ih)
system._dma_ports.append(pm4_pkt_proc)
for pm4_proc in pm4_procs:
system._dma_ports.append(pm4_proc)
system._dma_ports.append(system_hub)
system._dma_ports.append(gpu_mem_mgr)
system._dma_ports.append(hsapp_pt_walker)
@@ -221,7 +228,8 @@ def makeGpuFSSystem(args):
for sdma in sdma_engines:
sdma.pio = system.iobus.mem_side_ports
device_ih.pio = system.iobus.mem_side_ports
pm4_pkt_proc.pio = system.iobus.mem_side_ports
for pm4_proc in pm4_procs:
pm4_proc.pio = system.iobus.mem_side_ports
system_hub.pio = system.iobus.mem_side_ports
# Full system needs special TLBs for SQC, Scalar, and vector data ports

View File

@@ -52,7 +52,7 @@ if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
/sbin/m5 exit
fi
modprobe -v amdgpu ip_block_mask=0xff ppfeaturemask=0 dpm=0 audio=0
modprobe -v amdgpu ip_block_mask=0xdf ppfeaturemask=0 dpm=0 audio=0
echo "Running {} {}"
echo "{}" | base64 -d > myapp
chmod +x myapp

View File

@@ -500,10 +500,10 @@ namespace VegaISA
&Decoder::subDecode_OP_FLAT,
&Decoder::subDecode_OP_FLAT,
&Decoder::subDecode_OP_FLAT,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::subDecode_OP_FLAT,
&Decoder::subDecode_OP_FLAT,
&Decoder::subDecode_OP_FLAT,
&Decoder::subDecode_OP_FLAT,
&Decoder::subDecode_OP_MUBUF,
&Decoder::subDecode_OP_MUBUF,
&Decoder::subDecode_OP_MUBUF,
@@ -1091,7 +1091,7 @@ namespace VegaISA
&Decoder::decode_OPU_VOP3__V_MAD_I16,
&Decoder::decode_OPU_VOP3__V_FMA_F16,
&Decoder::decode_OPU_VOP3__V_DIV_FIXUP_F16,
&Decoder::decode_invalid,
&Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
@@ -7053,6 +7053,12 @@ namespace VegaISA
return new Inst_VOP3__V_DIV_FIXUP_F16(&iFmt->iFmt_VOP3A);
}
GPUStaticInst*
Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst iFmt)
{
return new Inst_VOP3__V_LSHL_ADD_U64(&iFmt->iFmt_VOP3A);
}
GPUStaticInst*
Decoder::decode_OPU_VOP3__V_INTERP_P1_F32(MachInst iFmt)
{

View File

@@ -470,6 +470,7 @@ namespace VegaISA
GPUStaticInst* decode_OPU_VOP3__V_MAD_I16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_FMA_F16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_DIV_FIXUP_F16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_INTERP_P1_F32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_INTERP_P2_F32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_INTERP_MOV_F32(MachInst);

View File

@@ -30192,6 +30192,42 @@ namespace VegaISA
void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_DIV_FIXUP_F16
// VOP3 instruction v_lshl_add_u64 (shift-left then 64-bit add), added for
// MI300-class GPUs. Per the .arch description: D.u = (S0.u << S1.u[4:0]) + S2.u.
// src0, src2, and vdst are 64-bit operands; src1 (the shift amount) is 32-bit.
class Inst_VOP3__V_LSHL_ADD_U64 : public Inst_VOP3A
{
public:
Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A*);
~Inst_VOP3__V_LSHL_ADD_U64();
// Total operand count: one destination plus three sources.
int
getNumOperands() override
{
return numDstRegOperands() + numSrcRegOperands();
} // getNumOperands
int numDstRegOperands() override { return 1; }
int numSrcRegOperands() override { return 3; }
// Operand sizes in bytes, indexed in src0/src1/src2/vdst order.
// Note the asymmetry: only src1 (the shift amount) is 4 bytes.
int
getOperandSize(int opIdx) override
{
switch (opIdx) {
case 0: //src_0
return 8;
case 1: //src_1
return 4;
case 2: //src_2
return 8;
case 3: //vdst
return 8;
default:
fatal("op idx %i out of bounds\n", opIdx);
return -1;
}
} // getOperandSize
void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_LSHL_ADD_U64
class Inst_VOP3__V_CVT_PKACCUM_U8_F32 : public Inst_VOP3A
{
public:

View File

@@ -7630,6 +7630,54 @@ namespace VegaISA
{
panicUnimplemented();
} // execute
// --- Inst_VOP3__V_LSHL_ADD_U64 class methods ---
Inst_VOP3__V_LSHL_ADD_U64::Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A *iFmt)
: Inst_VOP3A(iFmt, "v_lshl_add_u64", false)
{
setFlag(ALU);
} // Inst_VOP3__V_LSHL_ADD_U64
Inst_VOP3__V_LSHL_ADD_U64::~Inst_VOP3__V_LSHL_ADD_U64()
{
} // ~Inst_VOP3__V_LSHL_ADD_U64
// --- description from .arch file ---
// D.u = (S0.u << S1.u[4:0]) + S2.u.
void
Inst_VOP3__V_LSHL_ADD_U64::execute(GPUDynInstPtr gpuDynInst)
{
Wavefront *wf = gpuDynInst->wavefront();
// 64-bit sources/destination; the shift amount (src1) is 32-bit.
ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
VecOperandU64 vdst(gpuDynInst, instData.VDST);
src0.readSrc();
src1.readSrc();
src2.readSrc();
/**
* input modifiers are supported by FP operations only
*/
assert(!(instData.ABS & 0x1));
assert(!(instData.ABS & 0x2));
assert(!(instData.ABS & 0x4));
assert(!(extData.NEG & 0x1));
assert(!(extData.NEG & 0x2));
assert(!(extData.NEG & 0x4));
// Per-lane: shift src0 left and add src2, for lanes enabled by EXEC.
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
// Only S1[2:0] is consumed and shift amounts above 4 act as 0,
// which is narrower than the S1.u[4:0] in the description above.
// NOTE(review): looks deliberate for this device family --
// confirm against the MI300 ISA manual.
int shift_amount = bits(src1[lane], 2, 0);
shift_amount = shift_amount > 4 ? 0 : shift_amount;
vdst[lane] = (src0[lane] << shift_amount)
+ src2[lane];
}
}
vdst.write();
} // execute
// --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods ---
Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32(

View File

@@ -95,7 +95,7 @@ class AMDGPUDevice(PciDevice):
# The config script should not create a new cp here but rather assign the
# same cp that is assigned to the Shader SimObject.
cp = Param.GPUCommandProcessor(NULL, "Command Processor")
pm4_pkt_proc = Param.PM4PacketProcessor("PM4 Packet Processor")
pm4_pkt_procs = VectorParam.PM4PacketProcessor("PM4 Packet Processor")
memory_manager = Param.AMDGPUMemoryManager("GPU Memory Manager")
memories = VectorParam.AbstractMemory([], "All memories in the device")
device_ih = Param.AMDGPUInterruptHandler("GPU Interrupt handler")
@@ -118,6 +118,10 @@ class PM4PacketProcessor(DmaVirtDevice):
cxx_header = "dev/amdgpu/pm4_packet_processor.hh"
cxx_class = "gem5::PM4PacketProcessor"
# Default to 0 as the common case is one PM4 packet processor
ip_id = Param.Int(0, "Instance ID of this PM4 processor")
mmio_range = Param.AddrRange("Range of MMIO addresses")
class AMDGPUMemoryManager(ClockedObject):
type = "AMDGPUMemoryManager"

View File

@@ -49,6 +49,16 @@ enum QueueType
RLC
};
/*
* Hold information about doorbells including queue type and the IP
* block ID if the IP can have multiple instances.
*/
typedef struct
{
QueueType qtype; // Which queue type this doorbell rings (Compute, Gfx, ...)
int ip_id; // IP instance ID used to select among multiple PM4 processors
} DoorbellInfo;
// AMD GPUs support 16 different virtual address spaces
static constexpr int AMDGPU_VM_COUNT = 16;
@@ -61,36 +71,11 @@ constexpr int MMIO_BAR = 5;
constexpr uint32_t VGA_ROM_DEFAULT = 0xc0000;
constexpr uint32_t ROM_SIZE = 0x20000; // 128kB
/* SDMA base, size, mmio offset shift. */
static constexpr uint32_t SDMA0_BASE = 0x4980;
static constexpr uint32_t SDMA1_BASE = 0x5180;
static constexpr uint32_t SDMA_SIZE = 0x800;
static constexpr uint32_t SDMA_OFFSET_SHIFT = 2;
/* Interrupt handler base, size, mmio offset shift. */
static constexpr uint32_t IH_BASE = 0x4280;
static constexpr uint32_t IH_SIZE = 0x700;
/* Most MMIOs use DWORD addresses and thus need to be shifted. */
static constexpr uint32_t IH_OFFSET_SHIFT = 2;
/* Graphics register bus manager base, size, mmio offset shift. */
static constexpr uint32_t GRBM_BASE = 0x8000;
static constexpr uint32_t GRBM_SIZE = 0x5000;
static constexpr uint32_t GRBM_OFFSET_SHIFT = 2;
/* GFX base, size, mmio offset shift. */
static constexpr uint32_t GFX_BASE = 0x28000;
static constexpr uint32_t GFX_SIZE = 0x17000;
static constexpr uint32_t GFX_OFFSET_SHIFT = 2;
/* MMHUB base, size, mmio offset shift. */
static constexpr uint32_t MMHUB_BASE = 0x68000;
static constexpr uint32_t MMHUB_SIZE = 0x2120;
static constexpr uint32_t MMHUB_OFFSET_SHIFT = 2;
/* NBIO base and size. */
static constexpr uint32_t NBIO_BASE = 0x0;
static constexpr uint32_t NBIO_SIZE = 0x4280;
} // namespace gem5
#endif // __DEV_AMDGPU_AMDGPU_DEFINES_HH__

View File

@@ -54,8 +54,7 @@ namespace gem5
AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
: PciDevice(p), gpuMemMgr(p.memory_manager), deviceIH(p.device_ih),
pm4PktProc(p.pm4_pkt_proc), cp(p.cp),
checkpoint_before_mmios(p.checkpoint_before_mmios),
cp(p.cp), checkpoint_before_mmios(p.checkpoint_before_mmios),
init_interrupt_count(0), _lastVMID(0),
deviceMem(name() + ".deviceMem", p.memories, false, "", false)
{
@@ -81,6 +80,16 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
romRange = RangeSize(VGA_ROM_DEFAULT, ROM_SIZE);
}
if (p.device_name == "Vega10") {
gfx_version = GfxVersion::gfx900;
} else if (p.device_name == "MI100") {
gfx_version = GfxVersion::gfx908;
} else if (p.device_name == "MI200") {
gfx_version = GfxVersion::gfx90a;
} else {
panic("Unknown GPU device %s\n", p.device_name);
}
if (p.trace_file != "") {
mmioReader.readMMIOTrace(p.trace_file);
}
@@ -126,15 +135,47 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
panic("Unknown GPU device %s\n", p.device_name);
}
// Setup PM4 packet processors and sanity check IDs
std::set<int> pm4_ids;
for (auto& pm4 : p.pm4_pkt_procs) {
pm4->setGPUDevice(this);
fatal_if(pm4_ids.count(pm4->getIpId()),
"Two PM4s with same IP IDs is not allowed");
pm4_ids.insert(pm4->getIpId());
pm4PktProcs.insert({pm4->getIpId(), pm4});
pm4Ranges.insert({pm4->getMMIORange(), pm4});
}
// There should be at least one PM4 packet processor with ID 0
fatal_if(!pm4PktProcs.count(0), "No default PM4 processor found");
deviceIH->setGPUDevice(this);
pm4PktProc->setGPUDevice(this);
cp->hsaPacketProc().setGPUDevice(this);
cp->setGPUDevice(this);
nbio.setGPUDevice(this);
// Address aperture for device memory. We tell this to the driver and
// could possibly be anything, but these are the values used by hardware.
uint64_t mmhubBase = 0x8000ULL << 24;
uint64_t mmhubTop = 0x83ffULL << 24;
uint64_t mem_size = 0x3ff0; // 16 GB of memory
gpuvm.setMMHUBBase(mmhubBase);
gpuvm.setMMHUBTop(mmhubTop);
// Map other MMIO apertures based on gfx version. This must be done before
// any calls to get/setRegVal.
// NBIO 0x0 - 0x4280
// IH 0x4280 - 0x4980
// GRBM 0x8000 - 0xC000
// GFX 0x28000 - 0x3F000
// MMHUB 0x68000 - 0x6a120
gpuvm.setMMIOAperture(NBIO_MMIO_RANGE, AddrRange(0x0, 0x4280));
gpuvm.setMMIOAperture(IH_MMIO_RANGE, AddrRange(0x4280, 0x4980));
gpuvm.setMMIOAperture(GRBM_MMIO_RANGE, AddrRange(0x8000, 0xC000));
gpuvm.setMMIOAperture(GFX_MMIO_RANGE, AddrRange(0x28000, 0x3F000));
gpuvm.setMMIOAperture(MMHUB_MMIO_RANGE, AddrRange(0x68000, 0x6A120));
// These are hardcoded register values to return what the driver expects
setRegVal(AMDGPU_MP0_SMN_C2PMSG_33, 0x80000000);
@@ -144,27 +185,19 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
if (p.device_name == "Vega10") {
setRegVal(VEGA10_FB_LOCATION_BASE, mmhubBase >> 24);
setRegVal(VEGA10_FB_LOCATION_TOP, mmhubTop >> 24);
gfx_version = GfxVersion::gfx900;
} else if (p.device_name == "MI100") {
setRegVal(MI100_FB_LOCATION_BASE, mmhubBase >> 24);
setRegVal(MI100_FB_LOCATION_TOP, mmhubTop >> 24);
setRegVal(MI100_MEM_SIZE_REG, 0x3ff0); // 16GB of memory
gfx_version = GfxVersion::gfx908;
setRegVal(MI100_MEM_SIZE_REG, mem_size);
} else if (p.device_name == "MI200") {
// This device can have either 64GB or 128GB of device memory.
// This limits it to 16GB for simulation.
setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
setRegVal(MI200_MEM_SIZE_REG, 0x3ff0);
gfx_version = GfxVersion::gfx90a;
setRegVal(MI200_MEM_SIZE_REG, mem_size);
} else {
panic("Unknown GPU device %s\n", p.device_name);
}
gpuvm.setMMHUBBase(mmhubBase);
gpuvm.setMMHUBTop(mmhubTop);
nbio.setGPUDevice(this);
}
void
@@ -357,36 +390,28 @@ AMDGPUDevice::readDoorbell(PacketPtr pkt, Addr offset)
void
AMDGPUDevice::readMMIO(PacketPtr pkt, Addr offset)
{
Addr aperture = gpuvm.getMmioAperture(offset);
Addr aperture_offset = offset - aperture;
AddrRange aperture = gpuvm.getMMIOAperture(offset);
Addr aperture_offset = offset - aperture.start();
// By default read from MMIO trace. Overwrite the packet for a select
// few more dynamic MMIOs.
DPRINTF(AMDGPUDevice, "Read MMIO %#lx\n", offset);
mmioReader.readFromTrace(pkt, MMIO_BAR, offset);
if (regs.find(offset) != regs.end()) {
uint64_t value = regs[offset];
DPRINTF(AMDGPUDevice, "Reading what kernel wrote before: %#x\n",
value);
pkt->setUintX(value, ByteOrder::little);
}
switch (aperture) {
case NBIO_BASE:
if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "NBIO base\n");
nbio.readMMIO(pkt, aperture_offset);
break;
case GRBM_BASE:
} else if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "GRBM base\n");
gpuvm.readMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
break;
case GFX_BASE:
} else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "GFX base\n");
gfx.readMMIO(pkt, aperture_offset);
break;
case MMHUB_BASE:
} else if (aperture == gpuvm.getMMIORange(MMHUB_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "MMHUB base\n");
gpuvm.readMMIO(pkt, aperture_offset >> MMHUB_OFFSET_SHIFT);
break;
default:
break;
} else {
DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for read %#x\n", offset);
}
}
@@ -430,17 +455,22 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
DPRINTF(AMDGPUDevice, "Wrote doorbell %#lx\n", offset);
if (doorbells.find(offset) != doorbells.end()) {
QueueType q_type = doorbells[offset];
QueueType q_type = doorbells[offset].qtype;
int ip_id = doorbells[offset].ip_id;
DPRINTF(AMDGPUDevice, "Doorbell offset %p queue: %d\n",
offset, q_type);
switch (q_type) {
case Compute:
pm4PktProc->process(pm4PktProc->getQueue(offset),
pkt->getLE<uint64_t>());
assert(pm4PktProcs.count(ip_id));
pm4PktProcs[ip_id]->process(
pm4PktProcs[ip_id]->getQueue(offset),
pkt->getLE<uint64_t>());
break;
case Gfx:
pm4PktProc->process(pm4PktProc->getQueue(offset, true),
pkt->getLE<uint64_t>());
assert(pm4PktProcs.count(ip_id));
pm4PktProcs[ip_id]->process(
pm4PktProcs[ip_id]->getQueue(offset, true),
pkt->getLE<uint64_t>());
break;
case SDMAGfx: {
SDMAEngine *sdmaEng = getSDMAEngine(offset);
@@ -451,9 +481,11 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
sdmaEng->processPage(pkt->getLE<uint64_t>());
} break;
case ComputeAQL: {
assert(pm4PktProcs.count(ip_id));
cp->hsaPacketProc().hwScheduler()->write(offset,
pkt->getLE<uint64_t>() + 1);
pm4PktProc->updateReadIndex(offset, pkt->getLE<uint64_t>() + 1);
pm4PktProcs[ip_id]->updateReadIndex(offset,
pkt->getLE<uint64_t>() + 1);
} break;
case InterruptHandler:
deviceIH->updateRptr(pkt->getLE<uint32_t>());
@@ -483,12 +515,12 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
void
AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset)
{
Addr aperture = gpuvm.getMmioAperture(offset);
Addr aperture_offset = offset - aperture;
AddrRange aperture = gpuvm.getMMIOAperture(offset);
Addr aperture_offset = offset - aperture.start();
DPRINTF(AMDGPUDevice, "Wrote MMIO %#lx\n", offset);
// Check SDMA functions first, then fallback to switch statement
// Check SDMA functions first, then fallback to MMIO ranges.
for (int idx = 0; idx < sdmaIds.size(); ++idx) {
if (sdmaMmios[idx].contains(offset)) {
Addr sdma_offset = (offset - sdmaMmios[idx].start()) >> 2;
@@ -506,26 +538,31 @@ AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset)
}
}
switch (aperture) {
/* Write a general register to the graphics register bus manager. */
case GRBM_BASE:
// Check PM4s next, returning to avoid duplicate writes.
for (auto& [range, pm4_proc] : pm4Ranges) {
if (range.contains(offset)) {
// PM4 MMIOs are offset based on the MMIO range start
Addr ip_offset = offset - range.start();
pm4_proc->writeMMIO(pkt, ip_offset >> GRBM_OFFSET_SHIFT);
return;
}
}
if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "GRBM base\n");
gpuvm.writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
pm4PktProc->writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
break;
/* Write a register to the interrupt handler. */
case IH_BASE:
} else if (aperture == gpuvm.getMMIORange(IH_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "IH base\n");
deviceIH->writeMMIO(pkt, aperture_offset >> IH_OFFSET_SHIFT);
break;
/* Write an IO space register */
case NBIO_BASE:
} else if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "NBIO base\n");
nbio.writeMMIO(pkt, aperture_offset);
break;
case GFX_BASE:
} else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) {
DPRINTF(AMDGPUDevice, "GFX base\n");
gfx.writeMMIO(pkt, aperture_offset);
break;
default:
DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for %#x\n", offset);
break;
} else {
DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for write %#x\n", offset);
}
}
@@ -610,33 +647,47 @@ AMDGPUDevice::processPendingDoorbells(uint32_t offset)
}
}
bool
AMDGPUDevice::haveRegVal(uint32_t addr)
{
return regs.count(addr);
}
uint32_t
AMDGPUDevice::getRegVal(uint32_t addr)
AMDGPUDevice::getRegVal(uint64_t addr)
{
// This is somewhat of a guess based on amdgpu_device_mm_access
// in amdgpu_device.c in the ROCk driver. If bit 32 is 1 then
// assume VRAM and use full address, otherwise assume register
// address and only use the lower 31 bits.
Addr fixup_addr = bits(addr, 31, 31) ? addr : addr & 0x7fffffff;
uint32_t pkt_data = 0;
RequestPtr request = std::make_shared<Request>(fixup_addr,
sizeof(uint32_t), 0 /* flags */, vramRequestorId());
PacketPtr pkt = Packet::createRead(request);
pkt->dataStatic((uint8_t *)&pkt_data);
readMMIO(pkt, addr);
DPRINTF(AMDGPUDevice, "Getting register 0x%lx = %x\n",
addr, regs[addr]);
return regs[addr];
fixup_addr, pkt->getLE<uint32_t>());
return pkt->getLE<uint32_t>();
}
void
AMDGPUDevice::setRegVal(uint32_t addr, uint32_t value)
AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value)
{
DPRINTF(AMDGPUDevice, "Setting register 0x%lx to %x\n",
addr, value);
regs[addr] = value;
uint32_t pkt_data = value;
RequestPtr request = std::make_shared<Request>(addr,
sizeof(uint32_t), 0 /* flags */, vramRequestorId());
PacketPtr pkt = Packet::createWrite(request);
pkt->dataStatic((uint8_t *)&pkt_data);
writeMMIO(pkt, addr);
}
void
AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt)
AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt, int ip_id)
{
DPRINTF(AMDGPUDevice, "Setting doorbell type for %x\n", offset);
doorbells[offset] = qt;
doorbells[offset].qtype = qt;
doorbells[offset].ip_id = ip_id;
}
void
@@ -675,22 +726,19 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
// Serialize the PciDevice base class
PciDevice::serialize(cp);
uint64_t regs_size = regs.size();
uint64_t doorbells_size = doorbells.size();
uint64_t sdma_engs_size = sdmaEngs.size();
uint64_t used_vmid_map_size = usedVMIDs.size();
SERIALIZE_SCALAR(regs_size);
SERIALIZE_SCALAR(doorbells_size);
SERIALIZE_SCALAR(sdma_engs_size);
// Save the number of vmids used
SERIALIZE_SCALAR(used_vmid_map_size);
// Make a c-style array of the regs to serialize
uint32_t reg_addrs[regs_size];
uint64_t reg_values[regs_size];
uint32_t doorbells_offset[doorbells_size];
QueueType doorbells_queues[doorbells_size];
int doorbells_ip_ids[doorbells_size];
uint32_t sdma_engs_offset[sdma_engs_size];
int sdma_engs[sdma_engs_size];
int used_vmids[used_vmid_map_size];
@@ -698,16 +746,10 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
std::vector<int> used_vmid_sets;
int idx = 0;
for (auto & it : regs) {
reg_addrs[idx] = it.first;
reg_values[idx] = it.second;
++idx;
}
idx = 0;
for (auto & it : doorbells) {
doorbells_offset[idx] = it.first;
doorbells_queues[idx] = it.second;
doorbells_queues[idx] = it.second.qtype;
doorbells_ip_ids[idx] = it.second.ip_id;
++idx;
}
@@ -732,12 +774,12 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
int* vmid_array = new int[num_queue_id];
std::copy(used_vmid_sets.begin(), used_vmid_sets.end(), vmid_array);
SERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0]));
SERIALIZE_ARRAY(reg_values, sizeof(reg_values)/sizeof(reg_values[0]));
SERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/
sizeof(doorbells_offset[0]));
SERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/
sizeof(doorbells_queues[0]));
SERIALIZE_ARRAY(doorbells_ip_ids, sizeof(doorbells_ip_ids)/
sizeof(doorbells_ip_ids[0]));
SERIALIZE_ARRAY(sdma_engs_offset, sizeof(sdma_engs_offset)/
sizeof(sdma_engs_offset[0]));
SERIALIZE_ARRAY(sdma_engs, sizeof(sdma_engs)/sizeof(sdma_engs[0]));
@@ -764,43 +806,30 @@ AMDGPUDevice::unserialize(CheckpointIn &cp)
// Unserialize the PciDevice base class
PciDevice::unserialize(cp);
uint64_t regs_size = 0;
uint64_t doorbells_size = 0;
uint64_t sdma_engs_size = 0;
uint64_t used_vmid_map_size = 0;
UNSERIALIZE_SCALAR(regs_size);
UNSERIALIZE_SCALAR(doorbells_size);
UNSERIALIZE_SCALAR(sdma_engs_size);
UNSERIALIZE_SCALAR(used_vmid_map_size);
if (regs_size > 0) {
uint32_t reg_addrs[regs_size];
uint64_t reg_values[regs_size];
UNSERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0]));
UNSERIALIZE_ARRAY(reg_values,
sizeof(reg_values)/sizeof(reg_values[0]));
for (int idx = 0; idx < regs_size; ++idx) {
regs.insert(std::make_pair(reg_addrs[idx], reg_values[idx]));
}
}
if (doorbells_size > 0) {
uint32_t doorbells_offset[doorbells_size];
QueueType doorbells_queues[doorbells_size];
int doorbells_ip_ids[doorbells_size];
UNSERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/
sizeof(doorbells_offset[0]));
UNSERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/
sizeof(doorbells_queues[0]));
UNSERIALIZE_ARRAY(doorbells_ip_ids, sizeof(doorbells_ip_ids)/
sizeof(doorbells_ip_ids[0]));
for (int idx = 0; idx < doorbells_size; ++idx) {
regs.insert(std::make_pair(doorbells_offset[idx],
doorbells_queues[idx]));
doorbells[doorbells_offset[idx]] = doorbells_queues[idx];
doorbells[doorbells_offset[idx]].qtype = doorbells_queues[idx];
doorbells[doorbells_offset[idx]].ip_id = doorbells_ip_ids[idx];
}
}

View File

@@ -87,9 +87,7 @@ class AMDGPUDevice : public PciDevice
/**
* Structures to hold registers, doorbells, and some frame memory
*/
using GPURegMap = std::unordered_map<uint32_t, uint64_t>;
GPURegMap regs;
std::unordered_map<uint32_t, QueueType> doorbells;
std::unordered_map<uint32_t, DoorbellInfo> doorbells;
std::unordered_map<uint32_t, PacketPtr> pendingDoorbellPkts;
/**
@@ -115,9 +113,19 @@ class AMDGPUDevice : public PciDevice
AMDGPUMemoryManager *gpuMemMgr;
AMDGPUInterruptHandler *deviceIH;
AMDGPUVM gpuvm;
PM4PacketProcessor *pm4PktProc;
GPUCommandProcessor *cp;
// Hasher so AddrRange can key an unordered_map. Hashes only the range's
// start address -- presumably the stored ranges are disjoint, so starts
// are unique; verify if overlapping ranges are ever inserted.
struct AddrRangeHasher
{
std::size_t operator()(const AddrRange& k) const
{
return k.start();
}
};
std::unordered_map<int, PM4PacketProcessor *> pm4PktProcs;
std::unordered_map<AddrRange, PM4PacketProcessor *,
AddrRangeHasher> pm4Ranges;
// SDMAs mapped by doorbell offset
std::unordered_map<uint32_t, SDMAEngine *> sdmaEngs;
// SDMAs mapped by ID
@@ -187,7 +195,7 @@ class AMDGPUDevice : public PciDevice
/**
* Set handles to GPU blocks.
*/
void setDoorbellType(uint32_t offset, QueueType qt);
void setDoorbellType(uint32_t offset, QueueType qt, int ip_id = 0);
void processPendingDoorbells(uint32_t offset);
void setSDMAEngine(Addr offset, SDMAEngine *eng);
@@ -195,9 +203,8 @@ class AMDGPUDevice : public PciDevice
* Register value getter/setter. Used by other GPU blocks to change
* values from incoming driver/user packets.
*/
bool haveRegVal(uint32_t addr);
uint32_t getRegVal(uint32_t addr);
void setRegVal(uint32_t addr, uint32_t value);
uint32_t getRegVal(uint64_t addr);
void setRegVal(uint64_t addr, uint32_t value);
/**
* Methods related to translations and system/device memory.

View File

@@ -37,6 +37,13 @@
namespace gem5
{
AMDGPUGfx::AMDGPUGfx()
{
// Zero-initialize the scratch registers (used for GPU post) so reads
// before any write return a defined value.
for (int i = 0; i < SCRATCH_REGS; ++i) {
scratchRegs[i] = 0;
}
}
void
AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset)
{
@@ -47,6 +54,9 @@ AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset)
case AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB:
pkt->setLE<uint32_t>(captured_clock_count >> 32);
break;
case AMDGPU_MM_SCRATCH_REG0:
pkt->setLE<uint32_t>(scratchRegs[0]);
break;
default:
break;
}
@@ -65,6 +75,9 @@ AMDGPUGfx::writeMMIO(PacketPtr pkt, Addr offset)
captured_clock_count = curTick() / sim_clock::as_int::ns;
}
break;
case AMDGPU_MM_SCRATCH_REG0:
scratchRegs[0] = pkt->getLE<uint32_t>();
break;
default:
break;
}

View File

@@ -52,13 +52,16 @@
#define AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB 0x13094
#define AMDGPU_MM_RLC_CAPTURE_GPU_CLOCK_COUNT 0x13098
// Scratch registers used for GPU post
#define AMDGPU_MM_SCRATCH_REG0 0x08100
namespace gem5
{
class AMDGPUGfx
{
public:
AMDGPUGfx() { }
AMDGPUGfx();
void readMMIO(PacketPtr pkt, Addr offset);
void writeMMIO(PacketPtr pkt, Addr offset);
@@ -68,6 +71,12 @@ class AMDGPUGfx
* GPU clock count at the time capture MMIO is received.
*/
uint64_t captured_clock_count = 1;
/*
* Scratch registers.
*/
static constexpr int SCRATCH_REGS = 8;
std::array<uint32_t, SCRATCH_REGS> scratchRegs;
};
} // namespace gem5

View File

@@ -53,22 +53,44 @@ AMDGPUNbio::setGPUDevice(AMDGPUDevice *gpu_device)
void
AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset)
{
// For Vega10 we rely on the golden values in an MMIO trace. Return
// immediately as to not clobber those values.
if (gpuDevice->getGfxVersion() == GfxVersion::gfx900) {
if (offset == AMDGPU_PCIE_DATA || offset == AMDGPU_PCIE_DATA2) {
return;
}
}
switch (offset) {
// This is a PCIe status register. At some point during driver init
// the driver checks that interrupts are enabled. This is only
// checked once, so if the MMIO trace does not exactly line up with
// what the driver is doing in gem5, this may still have the first
// bit zero causing driver to fail. Therefore, we always set this
// bit to one as there is no harm to do so.
case AMDGPU_PCIE_DATA_REG:
// PCIE_DATA, PCIE_DATA2, PCIE_INDEX, and PCIE_INDEX2 handle "indirect
// register" reads/writes from the driver. This provides a way to read
// any register by providing a 32-bit address to one of the two INDEX
// registers and then reading the corresponding DATA register. See:
// https://github.com/ROCm/ROCK-Kernel-Driver/blob/roc-6.0.x/drivers/
// gpu/drm/amd/amdgpu/amdgpu_device.c#L459
case AMDGPU_PCIE_DATA:
{
uint32_t value = pkt->getLE<uint32_t>() | 0x1;
DPRINTF(AMDGPUDevice, "Marking interrupts enabled: %#lx\n", value);
uint32_t value = gpuDevice->getRegVal(pcie_index_reg);
DPRINTF(AMDGPUDevice, "Read PCIe index %lx data %x\n",
pcie_index_reg, value);
pkt->setLE<uint32_t>(value);
}
break;
case AMDGPU_PCIE_DATA2:
{
uint32_t value = gpuDevice->getRegVal(pcie_index2_reg);
DPRINTF(AMDGPUDevice, "Read PCIe index2 %lx data2 %x\n",
pcie_index2_reg, value);
pkt->setLE<uint32_t>(value);
}
break;
case AMDGPU_PCIE_INDEX:
pkt->setLE<uint32_t>(pcie_index_reg);
break;
case AMDGPU_PCIE_INDEX2:
pkt->setLE<uint32_t>(pcie_index2_reg);
break;
case AMDGPU_MM_DATA:
//pkt->setLE<uint32_t>(regs[mm_index_reg]);
pkt->setLE<uint32_t>(gpuDevice->getRegVal(mm_index_reg));
break;
case VEGA10_INV_ENG17_ACK1:
@@ -89,17 +111,17 @@ AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset)
case AMDGPU_MP0_SMN_C2PMSG_35:
pkt->setLE<uint32_t>(0x80000000);
break;
case AMDGPU_MP1_SMN_C2PMSG_90:
pkt->setLE<uint32_t>(0x1);
break;
default:
if (triggered_reads.count(offset)) {
DPRINTF(AMDGPUDevice, "Found triggered read for %#x\n", offset);
pkt->setLE<uint32_t>(triggered_reads[offset]);
} else if (gpuDevice->haveRegVal(offset)) {
uint32_t reg_val = gpuDevice->getRegVal(offset);
DPRINTF(AMDGPUDevice, "Reading value of %#lx from regs: %#lx\n",
offset, reg_val);
pkt->setLE<uint32_t>(reg_val);
} else if (regs.count(offset)) {
DPRINTF(AMDGPUDevice, "Returning value of unknown MMIO offset "
"%x: %x\n", offset, regs[offset]);
pkt->setLE<uint32_t>(regs[offset]);
} else {
DPRINTF(AMDGPUDevice, "NBIO Unknown MMIO %#x (%#x)\n", offset,
pkt->getAddr());
@@ -123,6 +145,24 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset)
DPRINTF(AMDGPUDevice, "MM write to reg %#lx data %#lx\n",
mm_index_reg, pkt->getLE<uint32_t>());
gpuDevice->setRegVal(AMDGPU_MM_DATA, pkt->getLE<uint32_t>());
// PCIE_DATA, PCIE_DATA2, PCIE_INDEX, and PCIE_INDEX2 handle "indirect
// register" reads/writes from the driver. This provides a way to read
// any register by providing a 32-bit address to one of the two INDEX
// registers and then reading the corresponding DATA register. See:
// https://github.com/ROCm/ROCK-Kernel-Driver/blob/roc-6.0.x/drivers/
// gpu/drm/amd/amdgpu/amdgpu_device.c#L459
} else if (offset == AMDGPU_PCIE_INDEX) {
assert(pkt->getSize() == 4);
pcie_index_reg = pkt->getLE<uint32_t>();
} else if (offset == AMDGPU_PCIE_DATA) {
assert(pkt->getSize() == 4);
gpuDevice->setRegVal(pcie_index_reg, pkt->getLE<uint32_t>());
} else if (offset == AMDGPU_PCIE_INDEX2) {
assert(pkt->getSize() == 4);
pcie_index2_reg = pkt->getLE<uint32_t>();
} else if (offset == AMDGPU_PCIE_DATA2) {
assert(pkt->getSize() == 4);
gpuDevice->setRegVal(pcie_index2_reg, pkt->getLE<uint32_t>());
} else if (offset == AMDGPU_MP0_SMN_C2PMSG_35) {
// See psp_v3_1_bootloader_load_sos in amdgpu driver code.
if (pkt->getLE<uint32_t>() == 0x10000) {
@@ -144,6 +184,14 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset)
} else if (offset == AMDGPU_MP0_SMN_C2PMSG_71) {
// PSP ring size
psp_ring_size = pkt->getLE<uint32_t>();
} else {
// Fallback to a map of register values. This was previously in the
// AMDGPUDevice, however that short-circuited some reads from other
// IP blocks. Since this is an end point IP block it is safer to use
// here.
regs[offset] = pkt->getLE<uint32_t>();
DPRINTF(AMDGPUDevice, "Writing value of unknown MMIO offset "
"%x: %x\n", offset, regs[offset]);
}
}

View File

@@ -56,7 +56,11 @@ class AMDGPUDevice;
#define AMDGPU_MM_INDEX 0x00000
#define AMDGPU_MM_INDEX_HI 0x00018
#define AMDGPU_MM_DATA 0x00004
#define AMDGPU_PCIE_DATA_REG 0x0003c
#define AMDGPU_PCIE_INDEX 0x00030
#define AMDGPU_PCIE_INDEX2 0x00038
#define AMDGPU_PCIE_DATA 0x00034
#define AMDGPU_PCIE_DATA2 0x0003c
// Message bus related to psp
#define AMDGPU_MP0_SMN_C2PMSG_33 0x58184
@@ -66,6 +70,7 @@ class AMDGPUDevice;
#define AMDGPU_MP0_SMN_C2PMSG_70 0x58218
#define AMDGPU_MP0_SMN_C2PMSG_71 0x5821c
#define AMDGPU_MP0_SMN_C2PMSG_81 0x58244
#define AMDGPU_MP1_SMN_C2PMSG_90 0x58a68
// Device specific invalidation engines used during initialization
#define VEGA10_INV_ENG17_ACK1 0x0a318
@@ -105,6 +110,8 @@ class AMDGPUNbio
* Driver initialization sequence helper variables.
*/
uint64_t mm_index_reg = 0;
uint32_t pcie_index_reg = 0;
uint32_t pcie_index2_reg = 0;
std::unordered_map<uint32_t, uint32_t> triggered_reads;
/*
@@ -115,6 +122,12 @@ class AMDGPUNbio
Addr psp_ring_listen_addr = 0;
int psp_ring_size = 0;
int psp_ring_value = 0;
/*
* Hold values of other registers not explicitly modelled by other blocks.
*/
using GPURegMap = std::unordered_map<uint64_t, uint32_t>;
GPURegMap regs;
};
} // namespace gem5

View File

@@ -37,6 +37,7 @@
#include "base/trace.hh"
#include "debug/AMDGPUDevice.hh"
#include "dev/amdgpu/amdgpu_defines.hh"
#include "dev/amdgpu/amdgpu_device.hh"
#include "mem/packet_access.hh"
namespace gem5
@@ -51,6 +52,35 @@ AMDGPUVM::AMDGPUVM()
for (int i = 0; i < AMDGPU_VM_COUNT; ++i) {
memset(&vmContexts[0], 0, sizeof(AMDGPUVMContext));
}
for (int i = 0; i < NUM_MMIO_RANGES; ++i) {
mmioRanges[i] = AddrRange();
}
}
void
AMDGPUVM::setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range)
{
mmioRanges[mmio_aperture] = range;
}
AddrRange
AMDGPUVM::getMMIORange(mmio_range_t mmio_aperture)
{
return mmioRanges[mmio_aperture];
}
const AddrRange&
AMDGPUVM::getMMIOAperture(Addr offset)
{
    // Scan every registered aperture and return the first one that
    // contains the requested offset.
    for (const auto &range : mmioRanges) {
        if (range.contains(offset)) {
            return range;
        }
    }

    // No aperture claimed the offset; fall back to the NBIO range, which
    // acts as the catch-all aperture.
    return mmioRanges[NBIO_MMIO_RANGE];
}
Addr

View File

@@ -99,9 +99,23 @@ static constexpr int AMDGPU_USER_PAGE_SIZE = 4096;
namespace gem5
{
typedef enum : int
{
NBIO_MMIO_RANGE,
MMHUB_MMIO_RANGE,
GFX_MMIO_RANGE,
GRBM_MMIO_RANGE,
IH_MMIO_RANGE,
NUM_MMIO_RANGES
} mmio_range_t;
class AMDGPUDevice;
class AMDGPUVM : public Serializable
{
private:
AMDGPUDevice *gpuDevice;
typedef struct GEM5_PACKED
{
// Page table addresses: from (Base + Start) to (End)
@@ -160,9 +174,13 @@ class AMDGPUVM : public Serializable
*/
std::vector<VegaISA::GpuTLB *> gpu_tlbs;
std::array<AddrRange, NUM_MMIO_RANGES> mmioRanges;
public:
AMDGPUVM();
void setGPUDevice(AMDGPUDevice *gpu_device) { gpuDevice = gpu_device; }
/**
* Return base address of GART table in framebuffer.
*/
@@ -172,6 +190,12 @@ class AMDGPUVM : public Serializable
*/
Addr gartSize();
bool
inGARTRange(Addr paddr)
{
return ((paddr >= gartBase()) && (paddr <= (gartBase() + gartSize())));
}
/**
* Copy of GART table. Typically resides in device memory, however we use
* a copy in gem5 to simplify the interface.
@@ -226,38 +250,11 @@ class AMDGPUVM : public Serializable
Addr getSysAddrRangeLow () { return vmContext0.sysAddrL; }
Addr getSysAddrRangeHigh () { return vmContext0.sysAddrH; }
Addr
getMmioAperture(Addr addr)
{
// Aperture ranges:
// NBIO 0x0 - 0x4280
// IH 0x4280 - 0x4980
// SDMA0 0x4980 - 0x5180
// SDMA1 0x5180 - 0x5980
// GRBM 0x8000 - 0xD000
// GFX 0x28000 - 0x3F000
// MMHUB 0x68000 - 0x6a120
void setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range);
const AddrRange& getMMIOAperture(Addr addr);
AddrRange getMMIORange(mmio_range_t mmio_aperture);
if (IH_BASE <= addr && addr < IH_BASE + IH_SIZE)
return IH_BASE;
else if (SDMA0_BASE <= addr && addr < SDMA0_BASE + SDMA_SIZE)
return SDMA0_BASE;
else if (SDMA1_BASE <= addr && addr < SDMA1_BASE + SDMA_SIZE)
return SDMA1_BASE;
else if (GRBM_BASE <= addr && addr < GRBM_BASE + GRBM_SIZE)
return GRBM_BASE;
else if (GFX_BASE <= addr && addr < GFX_BASE + GFX_SIZE)
return GFX_BASE;
else if (MMHUB_BASE <= addr && addr < MMHUB_BASE + MMHUB_SIZE)
return MMHUB_BASE;
else {
warn_once("Accessing unsupported MMIO aperture! Assuming NBIO\n");
return NBIO_BASE;
}
}
// Gettig mapped aperture base addresses
// Getting mapped aperture base addresses
Addr
getFrameAperture(Addr addr)
{

View File

@@ -75,7 +75,8 @@ void
AMDGPUInterruptHandler::prepareInterruptCookie(ContextID cntxt_id,
uint32_t ring_id,
uint32_t client_id,
uint32_t source_id)
uint32_t source_id,
unsigned node_id)
{
assert(client_id == SOC15_IH_CLIENTID_RLC ||
client_id == SOC15_IH_CLIENTID_SDMA0 ||
@@ -112,6 +113,7 @@ AMDGPUInterruptHandler::prepareInterruptCookie(ContextID cntxt_id,
cookie->clientId = client_id;
cookie->sourceId = source_id;
cookie->ringId = ring_id;
cookie->nodeId = node_id;
cookie->source_data_dw1 = cntxt_id;
interruptQueue.push(cookie);
}

View File

@@ -101,7 +101,8 @@ typedef struct
uint32_t reserved2 : 15;
uint32_t timestamp_src : 1;
uint32_t pasid : 16;
uint32_t reserved3 : 15;
uint32_t nodeId : 8;
uint32_t reserved3 : 7;
uint32_t pasid_src : 1;
uint32_t source_data_dw1;
uint32_t source_data_dw2;
@@ -171,7 +172,7 @@ class AMDGPUInterruptHandler : public DmaDevice
void setGPUDevice(AMDGPUDevice *gpu_device) { gpuDevice = gpu_device; }
void prepareInterruptCookie(ContextID cntxtId, uint32_t ring_id,
uint32_t client_id, uint32_t source_id);
uint32_t client_id, uint32_t source_id, unsigned node_id);
void submitInterruptCookie();
void submitWritePointer();
void intrPost();

View File

@@ -36,34 +36,34 @@
namespace gem5
{
#define mmCP_RB0_BASE 0x1040
#define mmCP_RB0_CNTL 0x1041
#define mmCP_RB_WPTR_POLL_ADDR_LO 0x1046
#define mmCP_RB_WPTR_POLL_ADDR_HI 0x1047
#define mmCP_RB_VMID 0x1051
#define mmCP_RB0_RPTR_ADDR 0x1043
#define mmCP_RB0_RPTR_ADDR_HI 0x1044
#define mmCP_RB0_WPTR 0x1054
#define mmCP_RB0_WPTR_HI 0x1055
#define mmCP_RB_DOORBELL_CONTROL 0x1059
#define mmCP_RB_DOORBELL_RANGE_LOWER 0x105a
#define mmCP_RB_DOORBELL_RANGE_UPPER 0x105b
#define mmCP_RB0_BASE_HI 0x10b1
#define mmCP_RB0_BASE 0x040
#define mmCP_RB0_CNTL 0x041
#define mmCP_RB_WPTR_POLL_ADDR_LO 0x046
#define mmCP_RB_WPTR_POLL_ADDR_HI 0x047
#define mmCP_RB_VMID 0x051
#define mmCP_RB0_RPTR_ADDR 0x043
#define mmCP_RB0_RPTR_ADDR_HI 0x044
#define mmCP_RB0_WPTR 0x054
#define mmCP_RB0_WPTR_HI 0x055
#define mmCP_RB_DOORBELL_CONTROL 0x059
#define mmCP_RB_DOORBELL_RANGE_LOWER 0x05a
#define mmCP_RB_DOORBELL_RANGE_UPPER 0x05b
#define mmCP_RB0_BASE_HI 0x0b1
#define mmCP_HQD_ACTIVE 0x1247
#define mmCP_HQD_VMID 0x1248
#define mmCP_HQD_PQ_BASE 0x124d
#define mmCP_HQD_PQ_BASE_HI 0x124e
#define mmCP_HQD_PQ_DOORBELL_CONTROL 0x1254
#define mmCP_HQD_PQ_RPTR 0x124f
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR 0x1250
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI 0x1251
#define mmCP_HQD_PQ_WPTR_POLL_ADDR 0x1252
#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI 0x1253
#define mmCP_HQD_PQ_CONTROL 0x1256
#define mmCP_HQD_IB_CONTROL 0x125a
#define mmCP_HQD_PQ_WPTR_LO 0x127b
#define mmCP_HQD_PQ_WPTR_HI 0x127c
#define mmCP_HQD_ACTIVE 0x247
#define mmCP_HQD_VMID 0x248
#define mmCP_HQD_PQ_BASE 0x24d
#define mmCP_HQD_PQ_BASE_HI 0x24e
#define mmCP_HQD_PQ_DOORBELL_CONTROL 0x254
#define mmCP_HQD_PQ_RPTR 0x24f
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR 0x250
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI 0x251
#define mmCP_HQD_PQ_WPTR_POLL_ADDR 0x252
#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI 0x253
#define mmCP_HQD_PQ_CONTROL 0x256
#define mmCP_HQD_IB_CONTROL 0x25a
#define mmCP_HQD_PQ_WPTR_LO 0x27b
#define mmCP_HQD_PQ_WPTR_HI 0x27c
} // namespace gem5

View File

@@ -49,7 +49,7 @@ namespace gem5
{
PM4PacketProcessor::PM4PacketProcessor(const PM4PacketProcessorParams &p)
: DmaVirtDevice(p)
: DmaVirtDevice(p), _ipId(p.ip_id), _mmioRange(p.mmio_range)
{
memset(&kiq, 0, sizeof(QueueDesc));
memset(&pq, 0, sizeof(QueueDesc));
@@ -144,7 +144,7 @@ PM4PacketProcessor::newQueue(QueueDesc *mqd, Addr offset,
QueueType qt;
qt = mqd->aql ? QueueType::ComputeAQL
: QueueType::Compute;
gpuDevice->setDoorbellType(offset, qt);
gpuDevice->setDoorbellType(offset, qt, getIpId());
DPRINTF(PM4PacketProcessor, "New PM4 queue %d, base: %p offset: %p, me: "
"%d, pipe %d queue: %d size: %d\n", id, q->base(), q->offset(),
@@ -227,9 +227,11 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header)
} break;
case IT_WRITE_DATA: {
dmaBuffer = new PM4WriteData();
DPRINTF(PM4PacketProcessor, "PM4 writeData header: %x, count: %d\n",
header.ordinal, header.count);
cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint64_t &)
{ writeData(q, (PM4WriteData *)dmaBuffer); });
{ writeData(q, (PM4WriteData *)dmaBuffer, header); });
dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4WriteData), cb,
dmaBuffer);
} break;
@@ -350,21 +352,46 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header)
}
void
PM4PacketProcessor::writeData(PM4Queue *q, PM4WriteData *pkt)
PM4PacketProcessor::writeData(PM4Queue *q, PM4WriteData *pkt, PM4Header header)
{
q->incRptr(sizeof(PM4WriteData));
Addr addr = getGARTAddr(pkt->destAddr);
DPRINTF(PM4PacketProcessor, "PM4 write addr: %p data: %p.\n", addr,
pkt->data);
auto cb = new DmaVirtCallback<uint32_t>(
[ = ](const uint32_t &) { writeDataDone(q, pkt, addr); });
//TODO: the specs indicate that pkt->data holds the number of dword that
//need to be written.
dmaWriteVirt(addr, sizeof(uint32_t), cb, &pkt->data);
DPRINTF(PM4PacketProcessor, "PM4 write addr: %p data: %p destSel: %d "
"addrIncr: %d resume: %d writeConfirm: %d cachePolicy: %d\n",
pkt->destAddr, pkt->data, pkt->destSel, pkt->addrIncr,
pkt->resume, pkt->writeConfirm, pkt->cachePolicy);
if (!pkt->writeConfirm)
if (pkt->destSel == 5) {
// Memory address destination
Addr addr = getGARTAddr(pkt->destAddr);
// This is a variable length packet. The size of the packet is in
// the header.count field and is set as Number Of Dwords - 1. This
// packet is 4 bytes minimum meaning the count is minimum 3. To
// get the number of dwords of data subtract two from the count.
unsigned size = (header.count - 2) * sizeof(uint32_t);
DPRINTF(PM4PacketProcessor, "Writing %d bytes to %p\n", size, addr);
auto cb = new DmaVirtCallback<uint32_t>(
[ = ](const uint32_t &) { writeDataDone(q, pkt, addr); });
dmaWriteVirt(addr, size, cb, &pkt->data);
if (!pkt->writeConfirm) {
decodeNext(q);
}
} else if (pkt->destSel == 0) {
// Register dword address destination
Addr byte_addr = pkt->destAddr << 2;
gpuDevice->setRegVal(byte_addr, pkt->data);
// setRegVal is instant on the simulated device so we ignore write
// confirm.
delete pkt;
decodeNext(q);
} else {
fatal("Unknown PM4 writeData destination %d\n", pkt->destSel);
}
}
void
@@ -373,8 +400,9 @@ PM4PacketProcessor::writeDataDone(PM4Queue *q, PM4WriteData *pkt, Addr addr)
DPRINTF(PM4PacketProcessor, "PM4 write completed to %p, %p.\n", addr,
pkt->data);
if (pkt->writeConfirm)
if (pkt->writeConfirm) {
decodeNext(q);
}
delete pkt;
}
@@ -493,7 +521,7 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
// Register doorbell with GPU device
gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng);
gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC);
gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC, getIpId());
gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2);
}
@@ -537,7 +565,8 @@ PM4PacketProcessor::releaseMemDone(PM4Queue *q, PM4ReleaseMem *pkt, Addr addr)
ringId = (q->queue() << 4) | (q->me() << 2) | q->pipe();
}
gpuDevice->getIH()->prepareInterruptCookie(pkt->intCtxId, ringId,
SOC15_IH_CLIENTID_GRBM_CP, CP_EOP);
SOC15_IH_CLIENTID_GRBM_CP, CP_EOP,
0);
gpuDevice->getIH()->submitInterruptCookie();
}
@@ -745,9 +774,14 @@ PM4PacketProcessor::setUconfigReg(PM4Queue *q, PM4SetUconfigReg *pkt)
{
q->incRptr(sizeof(PM4SetUconfigReg));
DPRINTF(PM4PacketProcessor, "SetUconfig offset %x data %x\n",
pkt->offset, pkt->data);
// SET_UCONFIG_REG_START and pkt->offset are dword addresses
uint32_t reg_addr = (PACKET3_SET_UCONFIG_REG_START + pkt->offset) * 4;
// Additional CPs respond to addresses 0x40000 apart.
reg_addr += 0x40000 * getIpId();
gpuDevice->setRegVal(reg_addr, pkt->data);
decodeNext(q);
@@ -822,7 +856,7 @@ PM4PacketProcessor::writeMMIO(PacketPtr pkt, Addr mmio_offset)
break;
case mmCP_HQD_PQ_DOORBELL_CONTROL:
setHqdPqDoorbellCtrl(pkt->getLE<uint32_t>());
gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute);
gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute, getIpId());
break;
case mmCP_HQD_PQ_RPTR:
setHqdPqPtr(pkt->getLE<uint32_t>());
@@ -884,7 +918,7 @@ PM4PacketProcessor::writeMMIO(PacketPtr pkt, Addr mmio_offset)
break;
case mmCP_RB_DOORBELL_CONTROL:
setRbDoorbellCntrl(pkt->getLE<uint32_t>());
gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx);
gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx, getIpId());
break;
case mmCP_RB_DOORBELL_RANGE_LOWER:
setRbDoorbellRangeLo(pkt->getLE<uint32_t>());

View File

@@ -63,6 +63,10 @@ class PM4PacketProcessor : public DmaVirtDevice
std::unordered_map<uint16_t, PM4Queue *> queues;
/* A map of PM4 queues based on doorbell offset */
std::unordered_map<uint32_t, PM4Queue *> queuesMap;
int _ipId;
AddrRange _mmioRange;
public:
PM4PacketProcessor(const PM4PacketProcessorParams &p);
@@ -136,7 +140,7 @@ class PM4PacketProcessor : public DmaVirtDevice
void decodeHeader(PM4Queue *q, PM4Header header);
/* Methods that implement PM4 packets */
void writeData(PM4Queue *q, PM4WriteData *pkt);
void writeData(PM4Queue *q, PM4WriteData *pkt, PM4Header header);
void writeDataDone(PM4Queue *q, PM4WriteData *pkt, Addr addr);
void mapQueues(PM4Queue *q, PM4MapQueues *pkt);
void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt);
@@ -188,6 +192,9 @@ class PM4PacketProcessor : public DmaVirtDevice
void setRbDoorbellCntrl(uint32_t data);
void setRbDoorbellRangeLo(uint32_t data);
void setRbDoorbellRangeHi(uint32_t data);
int getIpId() const { return _ipId; }
AddrRange getMMIORange() const { return _mmioRange; }
};
} // namespace gem5

View File

@@ -81,9 +81,9 @@ SDMAEngine::setGPUDevice(AMDGPUDevice *gpu_device)
}
int
SDMAEngine::getIHClientId()
SDMAEngine::getIHClientId(int _id)
{
switch (id) {
switch (_id) {
case 0:
return SOC15_IH_CLIENTID_SDMA0;
case 1:
@@ -627,10 +627,14 @@ SDMAEngine::writeReadData(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer)
// lastly we write read data to the destination address
if (gpuDevice->getVM().inMMHUB(pkt->dest)) {
Addr mmhubAddr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
Addr mmhub_addr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
fatal_if(gpuDevice->getVM().inGARTRange(mmhub_addr),
"SDMA write to GART not implemented");
auto cb = new EventFunctionWrapper(
[ = ]{ writeDone(q, pkt, dmaBuffer); }, name());
gpuDevice->getMemMgr()->writeRequest(mmhubAddr, (uint8_t *)dmaBuffer,
gpuDevice->getMemMgr()->writeRequest(mmhub_addr, (uint8_t *)dmaBuffer,
bufferSize, 0, cb);
} else {
if (q->priv()) {
@@ -663,9 +667,11 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt)
// count represents the number of bytes - 1 to be copied
pkt->count++;
if (q->priv()) {
DPRINTF(SDMAEngine, "Getting GART addr for %lx\n", pkt->source);
pkt->source = getGARTAddr(pkt->source);
DPRINTF(SDMAEngine, "GART addr %lx\n", pkt->source);
if (!gpuDevice->getVM().inMMHUB(pkt->source)) {
DPRINTF(SDMAEngine, "Getting GART addr for %lx\n", pkt->source);
pkt->source = getGARTAddr(pkt->source);
DPRINTF(SDMAEngine, "GART addr %lx\n", pkt->source);
}
}
// Read data from the source first, then call the copyReadData method
@@ -742,6 +748,19 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
[ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); });
dmaWriteVirt(pkt->dest, pkt->count, cb, (void *)dmaBuffer);
}
// For destinations in the GART table, gem5 uses a mapping tables instead
// of functionally going to device memory, so we need to update that copy.
if (gpuDevice->getVM().inGARTRange(device_addr)) {
// GART entries are always 8 bytes.
assert((pkt->count % 8) == 0);
for (int i = 0; i < pkt->count/8; ++i) {
Addr gart_addr = device_addr + i*8 - gpuDevice->getVM().gartBase();
DPRINTF(SDMAEngine, "Shadow copying to GART table %lx -> %lx\n",
gart_addr, dmaBuffer64[i]);
gpuDevice->getVM().gartTable[gart_addr] = dmaBuffer64[i];
}
}
}
/* Completion of a copy packet. */
@@ -809,8 +828,12 @@ SDMAEngine::trap(SDMAQueue *q, sdmaTrap *pkt)
uint32_t ring_id = (q->queueType() == SDMAPage) ? 3 : 0;
int node_id = 0;
int local_id = getId();
gpuDevice->getIH()->prepareInterruptCookie(pkt->intrContext, ring_id,
getIHClientId(), TRAP_ID);
getIHClientId(local_id),
TRAP_ID, 2*node_id);
gpuDevice->getIH()->submitInterruptCookie();
delete pkt;
@@ -836,8 +859,7 @@ SDMAEngine::srbmWrite(SDMAQueue *q, sdmaSRBMWriteHeader *header,
DPRINTF(SDMAEngine, "SRBM write to %#x with data %#x\n",
reg_addr, pkt->data);
warn_once("SRBM write not performed, no SRBM model. This needs to be fixed"
" if correct system simulation is relying on SRBM registers.");
gpuDevice->setRegVal(reg_addr, pkt->data);
delete header;
delete pkt;
@@ -967,10 +989,14 @@ SDMAEngine::ptePde(SDMAQueue *q, sdmaPtePde *pkt)
// Writing generated data to the destination address.
if (gpuDevice->getVM().inMMHUB(pkt->dest)) {
Addr mmhubAddr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
Addr mmhub_addr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
fatal_if(gpuDevice->getVM().inGARTRange(mmhub_addr),
"SDMA write to GART not implemented");
auto cb = new EventFunctionWrapper(
[ = ]{ ptePdeDone(q, pkt, dmaBuffer); }, name());
gpuDevice->getMemMgr()->writeRequest(mmhubAddr, (uint8_t *)dmaBuffer,
gpuDevice->getMemMgr()->writeRequest(mmhub_addr, (uint8_t *)dmaBuffer,
sizeof(uint64_t) * pkt->count, 0,
cb);
} else {

View File

@@ -172,7 +172,7 @@ class SDMAEngine : public DmaVirtDevice
/**
* Returns the client id for the Interrupt Handler.
*/
int getIHClientId();
int getIHClientId(int _id);
/**
* Methods for translation.