dev-amdgpu: Support for ROCm 6.0 (#926)
Implement several features new in ROCm 6.0 and features required for future devices. Includes the following: - Support for multiple command processors - Improve handling of unknown register addresses - Use AddrRange for MMIO address regions - Handle GART writes through SDMA copy - Implement PCIe indirect reads and writes - Improve PM4 write to check dword count - Implement common MI300X instruction
This commit is contained in:
@@ -188,9 +188,15 @@ def makeGpuFSSystem(args):
|
||||
|
||||
system.pc.south_bridge.gpu.sdmas = sdma_engines
|
||||
|
||||
# Setup PM4 packet processor
|
||||
pm4_pkt_proc = PM4PacketProcessor()
|
||||
system.pc.south_bridge.gpu.pm4_pkt_proc = pm4_pkt_proc
|
||||
# Setup PM4 packet processors
|
||||
pm4_procs = []
|
||||
pm4_procs.append(
|
||||
PM4PacketProcessor(
|
||||
ip_id=0, mmio_range=AddrRange(start=0xC000, end=0xD000)
|
||||
)
|
||||
)
|
||||
|
||||
system.pc.south_bridge.gpu.pm4_pkt_procs = pm4_procs
|
||||
|
||||
# GPU data path
|
||||
gpu_mem_mgr = AMDGPUMemoryManager()
|
||||
@@ -207,7 +213,8 @@ def makeGpuFSSystem(args):
|
||||
for sdma in sdma_engines:
|
||||
system._dma_ports.append(sdma)
|
||||
system._dma_ports.append(device_ih)
|
||||
system._dma_ports.append(pm4_pkt_proc)
|
||||
for pm4_proc in pm4_procs:
|
||||
system._dma_ports.append(pm4_proc)
|
||||
system._dma_ports.append(system_hub)
|
||||
system._dma_ports.append(gpu_mem_mgr)
|
||||
system._dma_ports.append(hsapp_pt_walker)
|
||||
@@ -221,7 +228,8 @@ def makeGpuFSSystem(args):
|
||||
for sdma in sdma_engines:
|
||||
sdma.pio = system.iobus.mem_side_ports
|
||||
device_ih.pio = system.iobus.mem_side_ports
|
||||
pm4_pkt_proc.pio = system.iobus.mem_side_ports
|
||||
for pm4_proc in pm4_procs:
|
||||
pm4_proc.pio = system.iobus.mem_side_ports
|
||||
system_hub.pio = system.iobus.mem_side_ports
|
||||
|
||||
# Full system needs special TLBs for SQC, Scalar, and vector data ports
|
||||
|
||||
@@ -52,7 +52,7 @@ if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then
|
||||
echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5."
|
||||
/sbin/m5 exit
|
||||
fi
|
||||
modprobe -v amdgpu ip_block_mask=0xff ppfeaturemask=0 dpm=0 audio=0
|
||||
modprobe -v amdgpu ip_block_mask=0xdf ppfeaturemask=0 dpm=0 audio=0
|
||||
echo "Running {} {}"
|
||||
echo "{}" | base64 -d > myapp
|
||||
chmod +x myapp
|
||||
|
||||
@@ -500,10 +500,10 @@ namespace VegaISA
|
||||
&Decoder::subDecode_OP_FLAT,
|
||||
&Decoder::subDecode_OP_FLAT,
|
||||
&Decoder::subDecode_OP_FLAT,
|
||||
&Decoder::decode_invalid,
|
||||
&Decoder::decode_invalid,
|
||||
&Decoder::decode_invalid,
|
||||
&Decoder::decode_invalid,
|
||||
&Decoder::subDecode_OP_FLAT,
|
||||
&Decoder::subDecode_OP_FLAT,
|
||||
&Decoder::subDecode_OP_FLAT,
|
||||
&Decoder::subDecode_OP_FLAT,
|
||||
&Decoder::subDecode_OP_MUBUF,
|
||||
&Decoder::subDecode_OP_MUBUF,
|
||||
&Decoder::subDecode_OP_MUBUF,
|
||||
@@ -1091,7 +1091,7 @@ namespace VegaISA
|
||||
&Decoder::decode_OPU_VOP3__V_MAD_I16,
|
||||
&Decoder::decode_OPU_VOP3__V_FMA_F16,
|
||||
&Decoder::decode_OPU_VOP3__V_DIV_FIXUP_F16,
|
||||
&Decoder::decode_invalid,
|
||||
&Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64,
|
||||
&Decoder::decode_invalid,
|
||||
&Decoder::decode_invalid,
|
||||
&Decoder::decode_invalid,
|
||||
@@ -7053,6 +7053,12 @@ namespace VegaISA
|
||||
return new Inst_VOP3__V_DIV_FIXUP_F16(&iFmt->iFmt_VOP3A);
|
||||
}
|
||||
|
||||
// Decode hook for V_LSHL_ADD_U64: allocates the instruction object from the
// VOP3A view (iFmt_VOP3A) of the machine instruction and returns it.
GPUStaticInst*
|
||||
Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst iFmt)
|
||||
{
|
||||
return new Inst_VOP3__V_LSHL_ADD_U64(&iFmt->iFmt_VOP3A);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OPU_VOP3__V_INTERP_P1_F32(MachInst iFmt)
|
||||
{
|
||||
|
||||
@@ -470,6 +470,7 @@ namespace VegaISA
|
||||
GPUStaticInst* decode_OPU_VOP3__V_MAD_I16(MachInst);
|
||||
GPUStaticInst* decode_OPU_VOP3__V_FMA_F16(MachInst);
|
||||
GPUStaticInst* decode_OPU_VOP3__V_DIV_FIXUP_F16(MachInst);
|
||||
GPUStaticInst* decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst);
|
||||
GPUStaticInst* decode_OPU_VOP3__V_INTERP_P1_F32(MachInst);
|
||||
GPUStaticInst* decode_OPU_VOP3__V_INTERP_P2_F32(MachInst);
|
||||
GPUStaticInst* decode_OPU_VOP3__V_INTERP_MOV_F32(MachInst);
|
||||
|
||||
@@ -30192,6 +30192,42 @@ namespace VegaISA
|
||||
void execute(GPUDynInstPtr) override;
|
||||
}; // Inst_VOP3__V_DIV_FIXUP_F16
|
||||
|
||||
// VOP3A-encoded instruction class for V_LSHL_ADD_U64. It reports one
// destination operand and three source operands; the operation itself is
// declared here as execute() and defined out of line.
class Inst_VOP3__V_LSHL_ADD_U64 : public Inst_VOP3A
|
||||
{
|
||||
public:
|
||||
Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A*);
|
||||
~Inst_VOP3__V_LSHL_ADD_U64();
|
||||
|
||||
// Total operand count: destinations plus sources.
int
|
||||
getNumOperands() override
|
||||
{
|
||||
return numDstRegOperands() + numSrcRegOperands();
|
||||
} // getNumOperands
|
||||
|
||||
int numDstRegOperands() override { return 1; }
|
||||
int numSrcRegOperands() override { return 3; }
|
||||
|
||||
// Size of operand opIdx: indices 0-2 are the sources and index 3 is the
// destination; any other index is a fatal error.
// NOTE(review): the values look like byte sizes (8 for the 64-bit
// operands, 4 for the 32-bit shift operand) -- confirm against callers.
int
|
||||
getOperandSize(int opIdx) override
|
||||
{
|
||||
switch (opIdx) {
|
||||
case 0: //src_0
|
||||
return 8;
|
||||
case 1: //src_1
|
||||
return 4;
|
||||
case 2: //src_2
|
||||
return 8;
|
||||
case 3: //vdst
|
||||
return 8;
|
||||
default:
|
||||
fatal("op idx %i out of bounds\n", opIdx);
|
||||
return -1;
|
||||
}
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
}; // Inst_VOP3__V_LSHL_ADD_U64
|
||||
|
||||
class Inst_VOP3__V_CVT_PKACCUM_U8_F32 : public Inst_VOP3A
|
||||
{
|
||||
public:
|
||||
|
||||
@@ -7630,6 +7630,54 @@ namespace VegaISA
|
||||
{
|
||||
panicUnimplemented();
|
||||
} // execute
|
||||
// --- Inst_VOP3__V_LSHL_ADD_U64 class methods ---
|
||||
|
||||
// Construct from the VOP3A encoding, registering the mnemonic
// "v_lshl_add_u64" with the base class and flagging the instruction as ALU.
// NOTE(review): the meaning of the trailing `false` passed to Inst_VOP3A is
// not visible here -- confirm against the base-class constructor.
Inst_VOP3__V_LSHL_ADD_U64::Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A *iFmt)
|
||||
: Inst_VOP3A(iFmt, "v_lshl_add_u64", false)
|
||||
{
|
||||
setFlag(ALU);
|
||||
} // Inst_VOP3__V_LSHL_ADD_U64
|
||||
|
||||
// Destructor with an empty body.
Inst_VOP3__V_LSHL_ADD_U64::~Inst_VOP3__V_LSHL_ADD_U64()
|
||||
{
|
||||
} // ~Inst_VOP3__V_LSHL_ADD_U64
|
||||
|
||||
// --- description from .arch file ---
|
||||
// D.u64 = (S0.u64 << S1.u[2:0]) + S2.u64; shift amounts above 4 act as 0.
|
||||
// Execute V_LSHL_ADD_U64 for every lane enabled in the wavefront's exec mask:
// vdst = (src0 << shift) + src2, where src0, src2 and vdst are 64-bit
// operands and shift is derived from the 32-bit src1 operand.
void
|
||||
Inst_VOP3__V_LSHL_ADD_U64::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
|
||||
ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
|
||||
ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
|
||||
VecOperandU64 vdst(gpuDynInst, instData.VDST);
|
||||
|
||||
// Read all three source operands before computing.
src0.readSrc();
|
||||
src1.readSrc();
|
||||
src2.readSrc();
|
||||
|
||||
/**
|
||||
* input modifiers are supported by FP operations only
|
||||
*/
|
||||
assert(!(instData.ABS & 0x1));
|
||||
assert(!(instData.ABS & 0x2));
|
||||
assert(!(instData.ABS & 0x4));
|
||||
assert(!(extData.NEG & 0x1));
|
||||
assert(!(extData.NEG & 0x2));
|
||||
assert(!(extData.NEG & 0x4));
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (wf->execMask(lane)) {
|
||||
// Only bits [2:0] of src1 are used as the shift amount.
int shift_amount = bits(src1[lane], 2, 0);
|
||||
// Shift amounts greater than 4 are replaced by a shift of 0.
shift_amount = shift_amount > 4 ? 0 : shift_amount;
|
||||
vdst[lane] = (src0[lane] << shift_amount)
|
||||
+ src2[lane];
|
||||
}
|
||||
}
|
||||
|
||||
// Write the destination operand back once all lanes have been processed.
vdst.write();
|
||||
} // execute
|
||||
// --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods ---
|
||||
|
||||
Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32(
|
||||
|
||||
@@ -95,7 +95,7 @@ class AMDGPUDevice(PciDevice):
|
||||
# The config script should not create a new cp here but rather assign the
|
||||
# same cp that is assigned to the Shader SimObject.
|
||||
cp = Param.GPUCommandProcessor(NULL, "Command Processor")
|
||||
pm4_pkt_proc = Param.PM4PacketProcessor("PM4 Packet Processor")
|
||||
pm4_pkt_procs = VectorParam.PM4PacketProcessor("PM4 Packet Processor")
|
||||
memory_manager = Param.AMDGPUMemoryManager("GPU Memory Manager")
|
||||
memories = VectorParam.AbstractMemory([], "All memories in the device")
|
||||
device_ih = Param.AMDGPUInterruptHandler("GPU Interrupt handler")
|
||||
@@ -118,6 +118,10 @@ class PM4PacketProcessor(DmaVirtDevice):
|
||||
cxx_header = "dev/amdgpu/pm4_packet_processor.hh"
|
||||
cxx_class = "gem5::PM4PacketProcessor"
|
||||
|
||||
# Default to 0 as the common case is one PM4 packet processor
|
||||
ip_id = Param.Int(0, "Instance ID of this PM4 processor")
|
||||
mmio_range = Param.AddrRange("Range of MMIO addresses")
|
||||
|
||||
|
||||
class AMDGPUMemoryManager(ClockedObject):
|
||||
type = "AMDGPUMemoryManager"
|
||||
|
||||
@@ -49,6 +49,16 @@ enum QueueType
|
||||
RLC
|
||||
};
|
||||
|
||||
/*
|
||||
* Hold information about doorbells including queue type and the IP
|
||||
* block ID if the IP can have multiple instances.
|
||||
*/
|
||||
// Per-doorbell record stored in the device's doorbell map.
typedef struct
|
||||
{
|
||||
// Type of queue the doorbell belongs to.
QueueType qtype;
|
||||
// ID of the IP block instance associated with the doorbell; the device
// uses it to pick the matching PM4 packet processor on doorbell writes.
int ip_id;
|
||||
} DoorbellInfo;
|
||||
|
||||
// AMD GPUs support 16 different virtual address spaces
|
||||
static constexpr int AMDGPU_VM_COUNT = 16;
|
||||
|
||||
@@ -61,36 +71,11 @@ constexpr int MMIO_BAR = 5;
|
||||
constexpr uint32_t VGA_ROM_DEFAULT = 0xc0000;
|
||||
constexpr uint32_t ROM_SIZE = 0x20000; // 128kB
|
||||
|
||||
/* SDMA base, size, mmio offset shift. */
|
||||
static constexpr uint32_t SDMA0_BASE = 0x4980;
|
||||
static constexpr uint32_t SDMA1_BASE = 0x5180;
|
||||
static constexpr uint32_t SDMA_SIZE = 0x800;
|
||||
static constexpr uint32_t SDMA_OFFSET_SHIFT = 2;
|
||||
|
||||
/* Interrupt handler base, size, mmio offset shift. */
|
||||
static constexpr uint32_t IH_BASE = 0x4280;
|
||||
static constexpr uint32_t IH_SIZE = 0x700;
|
||||
/* Most MMIOs use DWORD addresses and thus need to be shifted. */
|
||||
static constexpr uint32_t IH_OFFSET_SHIFT = 2;
|
||||
|
||||
/* Graphics register bus manager base, size, mmio offset shift. */
|
||||
static constexpr uint32_t GRBM_BASE = 0x8000;
|
||||
static constexpr uint32_t GRBM_SIZE = 0x5000;
|
||||
static constexpr uint32_t GRBM_OFFSET_SHIFT = 2;
|
||||
|
||||
/* GFX base, size, mmio offset shift. */
|
||||
static constexpr uint32_t GFX_BASE = 0x28000;
|
||||
static constexpr uint32_t GFX_SIZE = 0x17000;
|
||||
static constexpr uint32_t GFX_OFFSET_SHIFT = 2;
|
||||
|
||||
/* MMHUB base, size, mmio offset shift. */
|
||||
static constexpr uint32_t MMHUB_BASE = 0x68000;
|
||||
static constexpr uint32_t MMHUB_SIZE = 0x2120;
|
||||
static constexpr uint32_t MMHUB_OFFSET_SHIFT = 2;
|
||||
|
||||
/* NBIO base and size. */
|
||||
static constexpr uint32_t NBIO_BASE = 0x0;
|
||||
static constexpr uint32_t NBIO_SIZE = 0x4280;
|
||||
|
||||
} // namespace gem5
|
||||
|
||||
#endif // __DEV_AMDGPU_AMDGPU_DEFINES_HH__
|
||||
|
||||
@@ -54,8 +54,7 @@ namespace gem5
|
||||
|
||||
AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
|
||||
: PciDevice(p), gpuMemMgr(p.memory_manager), deviceIH(p.device_ih),
|
||||
pm4PktProc(p.pm4_pkt_proc), cp(p.cp),
|
||||
checkpoint_before_mmios(p.checkpoint_before_mmios),
|
||||
cp(p.cp), checkpoint_before_mmios(p.checkpoint_before_mmios),
|
||||
init_interrupt_count(0), _lastVMID(0),
|
||||
deviceMem(name() + ".deviceMem", p.memories, false, "", false)
|
||||
{
|
||||
@@ -81,6 +80,16 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
|
||||
romRange = RangeSize(VGA_ROM_DEFAULT, ROM_SIZE);
|
||||
}
|
||||
|
||||
if (p.device_name == "Vega10") {
|
||||
gfx_version = GfxVersion::gfx900;
|
||||
} else if (p.device_name == "MI100") {
|
||||
gfx_version = GfxVersion::gfx908;
|
||||
} else if (p.device_name == "MI200") {
|
||||
gfx_version = GfxVersion::gfx90a;
|
||||
} else {
|
||||
panic("Unknown GPU device %s\n", p.device_name);
|
||||
}
|
||||
|
||||
if (p.trace_file != "") {
|
||||
mmioReader.readMMIOTrace(p.trace_file);
|
||||
}
|
||||
@@ -126,15 +135,47 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
|
||||
panic("Unknown GPU device %s\n", p.device_name);
|
||||
}
|
||||
|
||||
// Setup PM4 packet processors and sanity check IDs
|
||||
std::set<int> pm4_ids;
|
||||
for (auto& pm4 : p.pm4_pkt_procs) {
|
||||
pm4->setGPUDevice(this);
|
||||
fatal_if(pm4_ids.count(pm4->getIpId()),
|
||||
"Two PM4s with same IP IDs is not allowed");
|
||||
pm4_ids.insert(pm4->getIpId());
|
||||
pm4PktProcs.insert({pm4->getIpId(), pm4});
|
||||
|
||||
pm4Ranges.insert({pm4->getMMIORange(), pm4});
|
||||
}
|
||||
|
||||
// There should be at least one PM4 packet processor with ID 0
|
||||
fatal_if(!pm4PktProcs.count(0), "No default PM4 processor found");
|
||||
|
||||
deviceIH->setGPUDevice(this);
|
||||
pm4PktProc->setGPUDevice(this);
|
||||
cp->hsaPacketProc().setGPUDevice(this);
|
||||
cp->setGPUDevice(this);
|
||||
nbio.setGPUDevice(this);
|
||||
|
||||
// Address aperture for device memory. We tell this to the driver and
|
||||
// could possibly be anything, but these are the values used by hardware.
|
||||
uint64_t mmhubBase = 0x8000ULL << 24;
|
||||
uint64_t mmhubTop = 0x83ffULL << 24;
|
||||
uint64_t mem_size = 0x3ff0; // 16 GB of memory
|
||||
|
||||
gpuvm.setMMHUBBase(mmhubBase);
|
||||
gpuvm.setMMHUBTop(mmhubTop);
|
||||
|
||||
// Map other MMIO apertures based on gfx version. This must be done before
|
||||
// any calls to get/setRegVal.
|
||||
// NBIO 0x0 - 0x4280
|
||||
// IH 0x4280 - 0x4980
|
||||
// GRBM 0x8000 - 0xC000
|
||||
// GFX 0x28000 - 0x3F000
|
||||
// MMHUB 0x68000 - 0x6a120
|
||||
gpuvm.setMMIOAperture(NBIO_MMIO_RANGE, AddrRange(0x0, 0x4280));
|
||||
gpuvm.setMMIOAperture(IH_MMIO_RANGE, AddrRange(0x4280, 0x4980));
|
||||
gpuvm.setMMIOAperture(GRBM_MMIO_RANGE, AddrRange(0x8000, 0xC000));
|
||||
gpuvm.setMMIOAperture(GFX_MMIO_RANGE, AddrRange(0x28000, 0x3F000));
|
||||
gpuvm.setMMIOAperture(MMHUB_MMIO_RANGE, AddrRange(0x68000, 0x6A120));
|
||||
|
||||
// These are hardcoded register values to return what the driver expects
|
||||
setRegVal(AMDGPU_MP0_SMN_C2PMSG_33, 0x80000000);
|
||||
@@ -144,27 +185,19 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p)
|
||||
if (p.device_name == "Vega10") {
|
||||
setRegVal(VEGA10_FB_LOCATION_BASE, mmhubBase >> 24);
|
||||
setRegVal(VEGA10_FB_LOCATION_TOP, mmhubTop >> 24);
|
||||
gfx_version = GfxVersion::gfx900;
|
||||
} else if (p.device_name == "MI100") {
|
||||
setRegVal(MI100_FB_LOCATION_BASE, mmhubBase >> 24);
|
||||
setRegVal(MI100_FB_LOCATION_TOP, mmhubTop >> 24);
|
||||
setRegVal(MI100_MEM_SIZE_REG, 0x3ff0); // 16GB of memory
|
||||
gfx_version = GfxVersion::gfx908;
|
||||
setRegVal(MI100_MEM_SIZE_REG, mem_size);
|
||||
} else if (p.device_name == "MI200") {
|
||||
// This device can have either 64GB or 128GB of device memory.
|
||||
// This limits to 16GB for simulation.
|
||||
setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24);
|
||||
setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24);
|
||||
setRegVal(MI200_MEM_SIZE_REG, 0x3ff0);
|
||||
gfx_version = GfxVersion::gfx90a;
|
||||
setRegVal(MI200_MEM_SIZE_REG, mem_size);
|
||||
} else {
|
||||
panic("Unknown GPU device %s\n", p.device_name);
|
||||
}
|
||||
|
||||
gpuvm.setMMHUBBase(mmhubBase);
|
||||
gpuvm.setMMHUBTop(mmhubTop);
|
||||
|
||||
nbio.setGPUDevice(this);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -357,36 +390,28 @@ AMDGPUDevice::readDoorbell(PacketPtr pkt, Addr offset)
|
||||
void
|
||||
AMDGPUDevice::readMMIO(PacketPtr pkt, Addr offset)
|
||||
{
|
||||
Addr aperture = gpuvm.getMmioAperture(offset);
|
||||
Addr aperture_offset = offset - aperture;
|
||||
AddrRange aperture = gpuvm.getMMIOAperture(offset);
|
||||
Addr aperture_offset = offset - aperture.start();
|
||||
|
||||
// By default read from MMIO trace. Overwrite the packet for a select
|
||||
// few more dynamic MMIOs.
|
||||
DPRINTF(AMDGPUDevice, "Read MMIO %#lx\n", offset);
|
||||
mmioReader.readFromTrace(pkt, MMIO_BAR, offset);
|
||||
|
||||
if (regs.find(offset) != regs.end()) {
|
||||
uint64_t value = regs[offset];
|
||||
DPRINTF(AMDGPUDevice, "Reading what kernel wrote before: %#x\n",
|
||||
value);
|
||||
pkt->setUintX(value, ByteOrder::little);
|
||||
}
|
||||
|
||||
switch (aperture) {
|
||||
case NBIO_BASE:
|
||||
if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) {
|
||||
DPRINTF(AMDGPUDevice, "NBIO base\n");
|
||||
nbio.readMMIO(pkt, aperture_offset);
|
||||
break;
|
||||
case GRBM_BASE:
|
||||
} else if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) {
|
||||
DPRINTF(AMDGPUDevice, "GRBM base\n");
|
||||
gpuvm.readMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
|
||||
break;
|
||||
case GFX_BASE:
|
||||
} else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) {
|
||||
DPRINTF(AMDGPUDevice, "GFX base\n");
|
||||
gfx.readMMIO(pkt, aperture_offset);
|
||||
break;
|
||||
case MMHUB_BASE:
|
||||
} else if (aperture == gpuvm.getMMIORange(MMHUB_MMIO_RANGE)) {
|
||||
DPRINTF(AMDGPUDevice, "MMHUB base\n");
|
||||
gpuvm.readMMIO(pkt, aperture_offset >> MMHUB_OFFSET_SHIFT);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
} else {
|
||||
DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for read %#x\n", offset);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -430,17 +455,22 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
|
||||
DPRINTF(AMDGPUDevice, "Wrote doorbell %#lx\n", offset);
|
||||
|
||||
if (doorbells.find(offset) != doorbells.end()) {
|
||||
QueueType q_type = doorbells[offset];
|
||||
QueueType q_type = doorbells[offset].qtype;
|
||||
int ip_id = doorbells[offset].ip_id;
|
||||
DPRINTF(AMDGPUDevice, "Doorbell offset %p queue: %d\n",
|
||||
offset, q_type);
|
||||
switch (q_type) {
|
||||
case Compute:
|
||||
pm4PktProc->process(pm4PktProc->getQueue(offset),
|
||||
pkt->getLE<uint64_t>());
|
||||
assert(pm4PktProcs.count(ip_id));
|
||||
pm4PktProcs[ip_id]->process(
|
||||
pm4PktProcs[ip_id]->getQueue(offset),
|
||||
pkt->getLE<uint64_t>());
|
||||
break;
|
||||
case Gfx:
|
||||
pm4PktProc->process(pm4PktProc->getQueue(offset, true),
|
||||
pkt->getLE<uint64_t>());
|
||||
assert(pm4PktProcs.count(ip_id));
|
||||
pm4PktProcs[ip_id]->process(
|
||||
pm4PktProcs[ip_id]->getQueue(offset, true),
|
||||
pkt->getLE<uint64_t>());
|
||||
break;
|
||||
case SDMAGfx: {
|
||||
SDMAEngine *sdmaEng = getSDMAEngine(offset);
|
||||
@@ -451,9 +481,11 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
|
||||
sdmaEng->processPage(pkt->getLE<uint64_t>());
|
||||
} break;
|
||||
case ComputeAQL: {
|
||||
assert(pm4PktProcs.count(ip_id));
|
||||
cp->hsaPacketProc().hwScheduler()->write(offset,
|
||||
pkt->getLE<uint64_t>() + 1);
|
||||
pm4PktProc->updateReadIndex(offset, pkt->getLE<uint64_t>() + 1);
|
||||
pm4PktProcs[ip_id]->updateReadIndex(offset,
|
||||
pkt->getLE<uint64_t>() + 1);
|
||||
} break;
|
||||
case InterruptHandler:
|
||||
deviceIH->updateRptr(pkt->getLE<uint32_t>());
|
||||
@@ -483,12 +515,12 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset)
|
||||
void
|
||||
AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset)
|
||||
{
|
||||
Addr aperture = gpuvm.getMmioAperture(offset);
|
||||
Addr aperture_offset = offset - aperture;
|
||||
AddrRange aperture = gpuvm.getMMIOAperture(offset);
|
||||
Addr aperture_offset = offset - aperture.start();
|
||||
|
||||
DPRINTF(AMDGPUDevice, "Wrote MMIO %#lx\n", offset);
|
||||
|
||||
// Check SDMA functions first, then fallback to switch statement
|
||||
// Check SDMA functions first, then fallback to MMIO ranges.
|
||||
for (int idx = 0; idx < sdmaIds.size(); ++idx) {
|
||||
if (sdmaMmios[idx].contains(offset)) {
|
||||
Addr sdma_offset = (offset - sdmaMmios[idx].start()) >> 2;
|
||||
@@ -506,26 +538,31 @@ AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset)
|
||||
}
|
||||
}
|
||||
|
||||
switch (aperture) {
|
||||
/* Write a general register to the graphics register bus manager. */
|
||||
case GRBM_BASE:
|
||||
// Check PM4s next, returning to avoid duplicate writes.
|
||||
for (auto& [range, pm4_proc] : pm4Ranges) {
|
||||
if (range.contains(offset)) {
|
||||
// PM4 MMIOs are offset based on the MMIO range start
|
||||
Addr ip_offset = offset - range.start();
|
||||
pm4_proc->writeMMIO(pkt, ip_offset >> GRBM_OFFSET_SHIFT);
|
||||
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) {
|
||||
DPRINTF(AMDGPUDevice, "GRBM base\n");
|
||||
gpuvm.writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
|
||||
pm4PktProc->writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT);
|
||||
break;
|
||||
/* Write a register to the interrupt handler. */
|
||||
case IH_BASE:
|
||||
} else if (aperture == gpuvm.getMMIORange(IH_MMIO_RANGE)) {
|
||||
DPRINTF(AMDGPUDevice, "IH base\n");
|
||||
deviceIH->writeMMIO(pkt, aperture_offset >> IH_OFFSET_SHIFT);
|
||||
break;
|
||||
/* Write an IO space register */
|
||||
case NBIO_BASE:
|
||||
} else if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) {
|
||||
DPRINTF(AMDGPUDevice, "NBIO base\n");
|
||||
nbio.writeMMIO(pkt, aperture_offset);
|
||||
break;
|
||||
case GFX_BASE:
|
||||
} else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) {
|
||||
DPRINTF(AMDGPUDevice, "GFX base\n");
|
||||
gfx.writeMMIO(pkt, aperture_offset);
|
||||
break;
|
||||
default:
|
||||
DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for %#x\n", offset);
|
||||
break;
|
||||
} else {
|
||||
DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for write %#x\n", offset);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -610,33 +647,47 @@ AMDGPUDevice::processPendingDoorbells(uint32_t offset)
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
AMDGPUDevice::haveRegVal(uint32_t addr)
|
||||
{
|
||||
return regs.count(addr);
|
||||
}
|
||||
|
||||
uint32_t
|
||||
AMDGPUDevice::getRegVal(uint32_t addr)
|
||||
AMDGPUDevice::getRegVal(uint64_t addr)
|
||||
{
|
||||
// This is somewhat of a guess based on amdgpu_device_mm_access
|
||||
// in amdgpu_device.c in the ROCk driver. If bit 31 (zero-indexed) is 1 then
|
||||
// assume VRAM and use full address, otherwise assume register
|
||||
// address and only use lower 31 bits.
|
||||
Addr fixup_addr = bits(addr, 31, 31) ? addr : addr & 0x7fffffff;
|
||||
|
||||
uint32_t pkt_data = 0;
|
||||
RequestPtr request = std::make_shared<Request>(fixup_addr,
|
||||
sizeof(uint32_t), 0 /* flags */, vramRequestorId());
|
||||
PacketPtr pkt = Packet::createRead(request);
|
||||
pkt->dataStatic((uint8_t *)&pkt_data);
|
||||
readMMIO(pkt, addr);
|
||||
DPRINTF(AMDGPUDevice, "Getting register 0x%lx = %x\n",
|
||||
addr, regs[addr]);
|
||||
return regs[addr];
|
||||
fixup_addr, pkt->getLE<uint32_t>());
|
||||
|
||||
return pkt->getLE<uint32_t>();
|
||||
}
|
||||
|
||||
void
|
||||
AMDGPUDevice::setRegVal(uint32_t addr, uint32_t value)
|
||||
AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value)
|
||||
{
|
||||
DPRINTF(AMDGPUDevice, "Setting register 0x%lx to %x\n",
|
||||
addr, value);
|
||||
regs[addr] = value;
|
||||
|
||||
uint32_t pkt_data = value;
|
||||
RequestPtr request = std::make_shared<Request>(addr,
|
||||
sizeof(uint32_t), 0 /* flags */, vramRequestorId());
|
||||
PacketPtr pkt = Packet::createWrite(request);
|
||||
pkt->dataStatic((uint8_t *)&pkt_data);
|
||||
writeMMIO(pkt, addr);
|
||||
}
|
||||
|
||||
void
|
||||
AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt)
|
||||
AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt, int ip_id)
|
||||
{
|
||||
DPRINTF(AMDGPUDevice, "Setting doorbell type for %x\n", offset);
|
||||
doorbells[offset] = qt;
|
||||
doorbells[offset].qtype = qt;
|
||||
doorbells[offset].ip_id = ip_id;
|
||||
}
|
||||
|
||||
void
|
||||
@@ -675,22 +726,19 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
|
||||
// Serialize the PciDevice base class
|
||||
PciDevice::serialize(cp);
|
||||
|
||||
uint64_t regs_size = regs.size();
|
||||
uint64_t doorbells_size = doorbells.size();
|
||||
uint64_t sdma_engs_size = sdmaEngs.size();
|
||||
uint64_t used_vmid_map_size = usedVMIDs.size();
|
||||
|
||||
SERIALIZE_SCALAR(regs_size);
|
||||
SERIALIZE_SCALAR(doorbells_size);
|
||||
SERIALIZE_SCALAR(sdma_engs_size);
|
||||
// Save the number of vmids used
|
||||
SERIALIZE_SCALAR(used_vmid_map_size);
|
||||
|
||||
// Make a c-style array of the regs to serialize
|
||||
uint32_t reg_addrs[regs_size];
|
||||
uint64_t reg_values[regs_size];
|
||||
uint32_t doorbells_offset[doorbells_size];
|
||||
QueueType doorbells_queues[doorbells_size];
|
||||
int doorbells_ip_ids[doorbells_size];
|
||||
uint32_t sdma_engs_offset[sdma_engs_size];
|
||||
int sdma_engs[sdma_engs_size];
|
||||
int used_vmids[used_vmid_map_size];
|
||||
@@ -698,16 +746,10 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
|
||||
std::vector<int> used_vmid_sets;
|
||||
|
||||
int idx = 0;
|
||||
for (auto & it : regs) {
|
||||
reg_addrs[idx] = it.first;
|
||||
reg_values[idx] = it.second;
|
||||
++idx;
|
||||
}
|
||||
|
||||
idx = 0;
|
||||
for (auto & it : doorbells) {
|
||||
doorbells_offset[idx] = it.first;
|
||||
doorbells_queues[idx] = it.second;
|
||||
doorbells_queues[idx] = it.second.qtype;
|
||||
doorbells_ip_ids[idx] = it.second.ip_id;
|
||||
++idx;
|
||||
}
|
||||
|
||||
@@ -732,12 +774,12 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const
|
||||
int* vmid_array = new int[num_queue_id];
|
||||
std::copy(used_vmid_sets.begin(), used_vmid_sets.end(), vmid_array);
|
||||
|
||||
SERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0]));
|
||||
SERIALIZE_ARRAY(reg_values, sizeof(reg_values)/sizeof(reg_values[0]));
|
||||
SERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/
|
||||
sizeof(doorbells_offset[0]));
|
||||
SERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/
|
||||
sizeof(doorbells_queues[0]));
|
||||
SERIALIZE_ARRAY(doorbells_ip_ids, sizeof(doorbells_ip_ids)/
|
||||
sizeof(doorbells_ip_ids[0]));
|
||||
SERIALIZE_ARRAY(sdma_engs_offset, sizeof(sdma_engs_offset)/
|
||||
sizeof(sdma_engs_offset[0]));
|
||||
SERIALIZE_ARRAY(sdma_engs, sizeof(sdma_engs)/sizeof(sdma_engs[0]));
|
||||
@@ -764,43 +806,30 @@ AMDGPUDevice::unserialize(CheckpointIn &cp)
|
||||
// Unserialize the PciDevice base class
|
||||
PciDevice::unserialize(cp);
|
||||
|
||||
uint64_t regs_size = 0;
|
||||
uint64_t doorbells_size = 0;
|
||||
uint64_t sdma_engs_size = 0;
|
||||
uint64_t used_vmid_map_size = 0;
|
||||
|
||||
UNSERIALIZE_SCALAR(regs_size);
|
||||
UNSERIALIZE_SCALAR(doorbells_size);
|
||||
UNSERIALIZE_SCALAR(sdma_engs_size);
|
||||
UNSERIALIZE_SCALAR(used_vmid_map_size);
|
||||
|
||||
|
||||
if (regs_size > 0) {
|
||||
uint32_t reg_addrs[regs_size];
|
||||
uint64_t reg_values[regs_size];
|
||||
|
||||
UNSERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0]));
|
||||
UNSERIALIZE_ARRAY(reg_values,
|
||||
sizeof(reg_values)/sizeof(reg_values[0]));
|
||||
|
||||
for (int idx = 0; idx < regs_size; ++idx) {
|
||||
regs.insert(std::make_pair(reg_addrs[idx], reg_values[idx]));
|
||||
}
|
||||
}
|
||||
|
||||
if (doorbells_size > 0) {
|
||||
uint32_t doorbells_offset[doorbells_size];
|
||||
QueueType doorbells_queues[doorbells_size];
|
||||
int doorbells_ip_ids[doorbells_size];
|
||||
|
||||
UNSERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/
|
||||
sizeof(doorbells_offset[0]));
|
||||
UNSERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/
|
||||
sizeof(doorbells_queues[0]));
|
||||
UNSERIALIZE_ARRAY(doorbells_ip_ids, sizeof(doorbells_ip_ids)/
|
||||
sizeof(doorbells_ip_ids[0]));
|
||||
|
||||
for (int idx = 0; idx < doorbells_size; ++idx) {
|
||||
regs.insert(std::make_pair(doorbells_offset[idx],
|
||||
doorbells_queues[idx]));
|
||||
doorbells[doorbells_offset[idx]] = doorbells_queues[idx];
|
||||
doorbells[doorbells_offset[idx]].qtype = doorbells_queues[idx];
|
||||
doorbells[doorbells_offset[idx]].ip_id = doorbells_ip_ids[idx];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -87,9 +87,7 @@ class AMDGPUDevice : public PciDevice
|
||||
/**
|
||||
* Structures to hold registers, doorbells, and some frame memory
|
||||
*/
|
||||
using GPURegMap = std::unordered_map<uint32_t, uint64_t>;
|
||||
GPURegMap regs;
|
||||
std::unordered_map<uint32_t, QueueType> doorbells;
|
||||
std::unordered_map<uint32_t, DoorbellInfo> doorbells;
|
||||
std::unordered_map<uint32_t, PacketPtr> pendingDoorbellPkts;
|
||||
|
||||
/**
|
||||
@@ -115,9 +113,19 @@ class AMDGPUDevice : public PciDevice
|
||||
AMDGPUMemoryManager *gpuMemMgr;
|
||||
AMDGPUInterruptHandler *deviceIH;
|
||||
AMDGPUVM gpuvm;
|
||||
PM4PacketProcessor *pm4PktProc;
|
||||
GPUCommandProcessor *cp;
|
||||
|
||||
// Hash functor that allows AddrRange to be used as an unordered_map key
// (see pm4Ranges). The hash is just the range's start address, so two ranges
// with the same start hash to the same value.
struct AddrRangeHasher
|
||||
{
|
||||
std::size_t operator()(const AddrRange& k) const
|
||||
{
|
||||
return k.start();
|
||||
}
|
||||
};
|
||||
std::unordered_map<int, PM4PacketProcessor *> pm4PktProcs;
|
||||
std::unordered_map<AddrRange, PM4PacketProcessor *,
|
||||
AddrRangeHasher> pm4Ranges;
|
||||
|
||||
// SDMAs mapped by doorbell offset
|
||||
std::unordered_map<uint32_t, SDMAEngine *> sdmaEngs;
|
||||
// SDMAs mapped by ID
|
||||
@@ -187,7 +195,7 @@ class AMDGPUDevice : public PciDevice
|
||||
/**
|
||||
* Set handles to GPU blocks.
|
||||
*/
|
||||
void setDoorbellType(uint32_t offset, QueueType qt);
|
||||
void setDoorbellType(uint32_t offset, QueueType qt, int ip_id = 0);
|
||||
void processPendingDoorbells(uint32_t offset);
|
||||
void setSDMAEngine(Addr offset, SDMAEngine *eng);
|
||||
|
||||
@@ -195,9 +203,8 @@ class AMDGPUDevice : public PciDevice
|
||||
* Register value getter/setter. Used by other GPU blocks to change
|
||||
* values from incoming driver/user packets.
|
||||
*/
|
||||
bool haveRegVal(uint32_t addr);
|
||||
uint32_t getRegVal(uint32_t addr);
|
||||
void setRegVal(uint32_t addr, uint32_t value);
|
||||
uint32_t getRegVal(uint64_t addr);
|
||||
void setRegVal(uint64_t addr, uint32_t value);
|
||||
|
||||
/**
|
||||
* Methods related to translations and system/device memory.
|
||||
|
||||
@@ -37,6 +37,13 @@
|
||||
namespace gem5
|
||||
{
|
||||
|
||||
// Initialize every scratch register to zero.
AMDGPUGfx::AMDGPUGfx()
|
||||
{
|
||||
for (int i = 0; i < SCRATCH_REGS; ++i) {
|
||||
scratchRegs[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset)
|
||||
{
|
||||
@@ -47,6 +54,9 @@ AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset)
|
||||
case AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB:
|
||||
pkt->setLE<uint32_t>(captured_clock_count >> 32);
|
||||
break;
|
||||
case AMDGPU_MM_SCRATCH_REG0:
|
||||
pkt->setLE<uint32_t>(scratchRegs[0]);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -65,6 +75,9 @@ AMDGPUGfx::writeMMIO(PacketPtr pkt, Addr offset)
|
||||
captured_clock_count = curTick() / sim_clock::as_int::ns;
|
||||
}
|
||||
break;
|
||||
case AMDGPU_MM_SCRATCH_REG0:
|
||||
scratchRegs[0] = pkt->getLE<uint32_t>();
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -52,13 +52,16 @@
|
||||
#define AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB 0x13094
|
||||
#define AMDGPU_MM_RLC_CAPTURE_GPU_CLOCK_COUNT 0x13098
|
||||
|
||||
// Scratch registers used for GPU post
|
||||
#define AMDGPU_MM_SCRATCH_REG0 0x08100
|
||||
|
||||
namespace gem5
|
||||
{
|
||||
|
||||
class AMDGPUGfx
|
||||
{
|
||||
public:
|
||||
AMDGPUGfx() { }
|
||||
AMDGPUGfx();
|
||||
|
||||
void readMMIO(PacketPtr pkt, Addr offset);
|
||||
void writeMMIO(PacketPtr pkt, Addr offset);
|
||||
@@ -68,6 +71,12 @@ class AMDGPUGfx
|
||||
* GPU clock count at the time capture MMIO is received.
|
||||
*/
|
||||
uint64_t captured_clock_count = 1;
|
||||
|
||||
/*
|
||||
* Scratch registers.
|
||||
*/
|
||||
static constexpr int SCRATCH_REGS = 8;
|
||||
std::array<uint32_t, SCRATCH_REGS> scratchRegs;
|
||||
};
|
||||
|
||||
} // namespace gem5
|
||||
|
||||
@@ -53,22 +53,44 @@ AMDGPUNbio::setGPUDevice(AMDGPUDevice *gpu_device)
|
||||
void
|
||||
AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset)
|
||||
{
|
||||
// For Vega10 we rely on the golden values in an MMIO trace. Return
|
||||
// immediately as to not clobber those values.
|
||||
if (gpuDevice->getGfxVersion() == GfxVersion::gfx900) {
|
||||
if (offset == AMDGPU_PCIE_DATA || offset == AMDGPU_PCIE_DATA2) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
switch (offset) {
|
||||
// This is a PCIe status register. At some point during driver init
|
||||
// the driver checks that interrupts are enabled. This is only
|
||||
// checked once, so if the MMIO trace does not exactly line up with
|
||||
// what the driver is doing in gem5, this may still have the first
|
||||
// bit zero causing driver to fail. Therefore, we always set this
|
||||
// bit to one as there is no harm to do so.
|
||||
case AMDGPU_PCIE_DATA_REG:
|
||||
// PCIE_DATA, PCIE_DATA2, PCIE_INDEX, and PCIE_INDEX2 handle "indirect"
|
||||
// register reads/writes from the driver. This provides a way to read
|
||||
// any register by providing a 32-bit address to one of the two INDEX
|
||||
// registers and then reading the corresponding DATA register. See:
|
||||
// https://github.com/ROCm/ROCK-Kernel-Driver/blob/roc-6.0.x/drivers/
|
||||
// gpu/drm/amd/amdgpu/amdgpu_device.c#L459
|
||||
case AMDGPU_PCIE_DATA:
|
||||
{
|
||||
uint32_t value = pkt->getLE<uint32_t>() | 0x1;
|
||||
DPRINTF(AMDGPUDevice, "Marking interrupts enabled: %#lx\n", value);
|
||||
uint32_t value = gpuDevice->getRegVal(pcie_index_reg);
|
||||
DPRINTF(AMDGPUDevice, "Read PCIe index %lx data %x\n",
|
||||
pcie_index_reg, value);
|
||||
pkt->setLE<uint32_t>(value);
|
||||
}
|
||||
break;
|
||||
case AMDGPU_PCIE_DATA2:
|
||||
{
|
||||
uint32_t value = gpuDevice->getRegVal(pcie_index2_reg);
|
||||
DPRINTF(AMDGPUDevice, "Read PCIe index2 %lx data2 %x\n",
|
||||
pcie_index2_reg, value);
|
||||
pkt->setLE<uint32_t>(value);
|
||||
}
|
||||
break;
|
||||
case AMDGPU_PCIE_INDEX:
|
||||
pkt->setLE<uint32_t>(pcie_index_reg);
|
||||
break;
|
||||
case AMDGPU_PCIE_INDEX2:
|
||||
pkt->setLE<uint32_t>(pcie_index2_reg);
|
||||
break;
|
||||
case AMDGPU_MM_DATA:
|
||||
//pkt->setLE<uint32_t>(regs[mm_index_reg]);
|
||||
pkt->setLE<uint32_t>(gpuDevice->getRegVal(mm_index_reg));
|
||||
break;
|
||||
case VEGA10_INV_ENG17_ACK1:
|
||||
@@ -89,17 +111,17 @@ AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset)
|
||||
case AMDGPU_MP0_SMN_C2PMSG_35:
|
||||
pkt->setLE<uint32_t>(0x80000000);
|
||||
break;
|
||||
case AMDGPU_MP1_SMN_C2PMSG_90:
|
||||
pkt->setLE<uint32_t>(0x1);
|
||||
break;
|
||||
default:
|
||||
if (triggered_reads.count(offset)) {
|
||||
DPRINTF(AMDGPUDevice, "Found triggered read for %#x\n", offset);
|
||||
pkt->setLE<uint32_t>(triggered_reads[offset]);
|
||||
} else if (gpuDevice->haveRegVal(offset)) {
|
||||
uint32_t reg_val = gpuDevice->getRegVal(offset);
|
||||
|
||||
DPRINTF(AMDGPUDevice, "Reading value of %#lx from regs: %#lx\n",
|
||||
offset, reg_val);
|
||||
|
||||
pkt->setLE<uint32_t>(reg_val);
|
||||
} else if (regs.count(offset)) {
|
||||
DPRINTF(AMDGPUDevice, "Returning value of unknown MMIO offset "
|
||||
"%x: %x\n", offset, regs[offset]);
|
||||
pkt->setLE<uint32_t>(regs[offset]);
|
||||
} else {
|
||||
DPRINTF(AMDGPUDevice, "NBIO Unknown MMIO %#x (%#x)\n", offset,
|
||||
pkt->getAddr());
|
||||
@@ -123,6 +145,24 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset)
|
||||
DPRINTF(AMDGPUDevice, "MM write to reg %#lx data %#lx\n",
|
||||
mm_index_reg, pkt->getLE<uint32_t>());
|
||||
gpuDevice->setRegVal(AMDGPU_MM_DATA, pkt->getLE<uint32_t>());
|
||||
// PCIE_DATA, PCIE_DATA2, PCIE_INDEX, and PCIE_INDEX2 handle "indirect"
|
||||
// register reads/writes from the driver. This provides a way to read
|
||||
// any register by providing a 32-bit address to one of the two INDEX
|
||||
// registers and then reading the corresponding DATA register. See:
|
||||
// https://github.com/ROCm/ROCK-Kernel-Driver/blob/roc-6.0.x/drivers/
|
||||
// gpu/drm/amd/amdgpu/amdgpu_device.c#L459
|
||||
} else if (offset == AMDGPU_PCIE_INDEX) {
|
||||
assert(pkt->getSize() == 4);
|
||||
pcie_index_reg = pkt->getLE<uint32_t>();
|
||||
} else if (offset == AMDGPU_PCIE_DATA) {
|
||||
assert(pkt->getSize() == 4);
|
||||
gpuDevice->setRegVal(pcie_index_reg, pkt->getLE<uint32_t>());
|
||||
} else if (offset == AMDGPU_PCIE_INDEX2) {
|
||||
assert(pkt->getSize() == 4);
|
||||
pcie_index2_reg = pkt->getLE<uint32_t>();
|
||||
} else if (offset == AMDGPU_PCIE_DATA2) {
|
||||
assert(pkt->getSize() == 4);
|
||||
gpuDevice->setRegVal(pcie_index2_reg, pkt->getLE<uint32_t>());
|
||||
} else if (offset == AMDGPU_MP0_SMN_C2PMSG_35) {
|
||||
// See psp_v3_1_bootloader_load_sos in amdgpu driver code.
|
||||
if (pkt->getLE<uint32_t>() == 0x10000) {
|
||||
@@ -144,6 +184,14 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset)
|
||||
} else if (offset == AMDGPU_MP0_SMN_C2PMSG_71) {
|
||||
// PSP ring size
|
||||
psp_ring_size = pkt->getLE<uint32_t>();
|
||||
} else {
|
||||
// Fallback to a map of register values. This was previously in the
|
||||
// AMDGPUDevice, however that short-circuited some reads from other
|
||||
// IP blocks. Since this is an end point IP block it is safer to use
|
||||
// here.
|
||||
regs[offset] = pkt->getLE<uint32_t>();
|
||||
DPRINTF(AMDGPUDevice, "Writing value of unknown MMIO offset "
|
||||
"%x: %x\n", offset, regs[offset]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -56,7 +56,11 @@ class AMDGPUDevice;
|
||||
#define AMDGPU_MM_INDEX 0x00000
|
||||
#define AMDGPU_MM_INDEX_HI 0x00018
|
||||
#define AMDGPU_MM_DATA 0x00004
|
||||
#define AMDGPU_PCIE_DATA_REG 0x0003c
|
||||
|
||||
#define AMDGPU_PCIE_INDEX 0x00030
|
||||
#define AMDGPU_PCIE_INDEX2 0x00038
|
||||
#define AMDGPU_PCIE_DATA 0x00034
|
||||
#define AMDGPU_PCIE_DATA2 0x0003c
|
||||
|
||||
// Message bus related to psp
|
||||
#define AMDGPU_MP0_SMN_C2PMSG_33 0x58184
|
||||
@@ -66,6 +70,7 @@ class AMDGPUDevice;
|
||||
#define AMDGPU_MP0_SMN_C2PMSG_70 0x58218
|
||||
#define AMDGPU_MP0_SMN_C2PMSG_71 0x5821c
|
||||
#define AMDGPU_MP0_SMN_C2PMSG_81 0x58244
|
||||
#define AMDGPU_MP1_SMN_C2PMSG_90 0x58a68
|
||||
|
||||
// Device specific invalidation engines used during initialization
|
||||
#define VEGA10_INV_ENG17_ACK1 0x0a318
|
||||
@@ -105,6 +110,8 @@ class AMDGPUNbio
|
||||
* Driver initialization sequence helper variables.
|
||||
*/
|
||||
uint64_t mm_index_reg = 0;
|
||||
uint32_t pcie_index_reg = 0;
|
||||
uint32_t pcie_index2_reg = 0;
|
||||
std::unordered_map<uint32_t, uint32_t> triggered_reads;
|
||||
|
||||
/*
|
||||
@@ -115,6 +122,12 @@ class AMDGPUNbio
|
||||
Addr psp_ring_listen_addr = 0;
|
||||
int psp_ring_size = 0;
|
||||
int psp_ring_value = 0;
|
||||
|
||||
/*
|
||||
* Hold values of other registers not explicitly modelled by other blocks.
|
||||
*/
|
||||
using GPURegMap = std::unordered_map<uint64_t, uint32_t>;
|
||||
GPURegMap regs;
|
||||
};
|
||||
|
||||
} // namespace gem5
|
||||
|
||||
@@ -37,6 +37,7 @@
|
||||
#include "base/trace.hh"
|
||||
#include "debug/AMDGPUDevice.hh"
|
||||
#include "dev/amdgpu/amdgpu_defines.hh"
|
||||
#include "dev/amdgpu/amdgpu_device.hh"
|
||||
#include "mem/packet_access.hh"
|
||||
|
||||
namespace gem5
|
||||
@@ -51,6 +52,35 @@ AMDGPUVM::AMDGPUVM()
|
||||
for (int i = 0; i < AMDGPU_VM_COUNT; ++i) {
|
||||
memset(&vmContexts[0], 0, sizeof(AMDGPUVMContext));
|
||||
}
|
||||
|
||||
for (int i = 0; i < NUM_MMIO_RANGES; ++i) {
|
||||
mmioRanges[i] = AddrRange();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
AMDGPUVM::setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range)
|
||||
{
|
||||
mmioRanges[mmio_aperture] = range;
|
||||
}
|
||||
|
||||
/**
 * Look up the address range stored for one MMIO aperture.
 *
 * @param mmio_aperture Index of the aperture slot in mmioRanges to read.
 * @return A copy of the range currently stored in that slot.
 */
AddrRange
AMDGPUVM::getMMIORange(mmio_range_t mmio_aperture)
{
    const AddrRange &stored = mmioRanges[mmio_aperture];
    return stored;
}
|
||||
|
||||
/**
 * Find the MMIO aperture whose range contains the given offset.
 *
 * The stored ranges are scanned in slot order and the first one that
 * contains the offset is returned. If no stored range contains it, the
 * NBIO range is returned as the default.
 *
 * @param offset MMIO offset to classify.
 * @return Reference to the matching entry of mmioRanges, or to the NBIO
 *         entry when nothing matches.
 */
const AddrRange&
AMDGPUVM::getMMIOAperture(Addr offset)
{
    for (auto &aperture : mmioRanges) {
        if (aperture.contains(offset)) {
            return aperture;
        }
    }

    // Default to NBIO
    return mmioRanges[NBIO_MMIO_RANGE];
}
|
||||
|
||||
Addr
|
||||
|
||||
@@ -99,9 +99,23 @@ static constexpr int AMDGPU_USER_PAGE_SIZE = 4096;
|
||||
namespace gem5
|
||||
{
|
||||
|
||||
/**
 * Identifiers for the MMIO apertures tracked by AMDGPUVM. The values are
 * used as indices into the per-aperture range array, so they must stay
 * contiguous starting at zero. NUM_MMIO_RANGES is not an aperture itself:
 * it is the number of apertures and must remain the last enumerator.
 */
enum mmio_range_t : int
{
    NBIO_MMIO_RANGE,
    MMHUB_MMIO_RANGE,
    GFX_MMIO_RANGE,
    GRBM_MMIO_RANGE,
    IH_MMIO_RANGE,
    NUM_MMIO_RANGES
};
|
||||
|
||||
class AMDGPUDevice;
|
||||
|
||||
class AMDGPUVM : public Serializable
|
||||
{
|
||||
private:
|
||||
AMDGPUDevice *gpuDevice;
|
||||
|
||||
typedef struct GEM5_PACKED
|
||||
{
|
||||
// Page table addresses: from (Base + Start) to (End)
|
||||
@@ -160,9 +174,13 @@ class AMDGPUVM : public Serializable
|
||||
*/
|
||||
std::vector<VegaISA::GpuTLB *> gpu_tlbs;
|
||||
|
||||
std::array<AddrRange, NUM_MMIO_RANGES> mmioRanges;
|
||||
|
||||
public:
|
||||
AMDGPUVM();
|
||||
|
||||
/** Store the pointer to the owning GPU device in gpuDevice. */
void
setGPUDevice(AMDGPUDevice *gpu_device)
{
    gpuDevice = gpu_device;
}
|
||||
|
||||
/**
|
||||
* Return base address of GART table in framebuffer.
|
||||
*/
|
||||
@@ -172,6 +190,12 @@ class AMDGPUVM : public Serializable
|
||||
*/
|
||||
Addr gartSize();
|
||||
|
||||
bool
|
||||
inGARTRange(Addr paddr)
|
||||
{
|
||||
return ((paddr >= gartBase()) && (paddr <= (gartBase() + gartSize())));
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy of GART table. Typically resides in device memory, however we use
|
||||
* a copy in gem5 to simplify the interface.
|
||||
@@ -226,38 +250,11 @@ class AMDGPUVM : public Serializable
|
||||
/** Return the sysAddrL field of VM context 0. */
Addr
getSysAddrRangeLow()
{
    return vmContext0.sysAddrL;
}
|
||||
/** Return the sysAddrH field of VM context 0. */
Addr
getSysAddrRangeHigh()
{
    return vmContext0.sysAddrH;
}
|
||||
|
||||
Addr
|
||||
getMmioAperture(Addr addr)
|
||||
{
|
||||
// Aperture ranges:
|
||||
// NBIO 0x0 - 0x4280
|
||||
// IH 0x4280 - 0x4980
|
||||
// SDMA0 0x4980 - 0x5180
|
||||
// SDMA1 0x5180 - 0x5980
|
||||
// GRBM 0x8000 - 0xD000
|
||||
// GFX 0x28000 - 0x3F000
|
||||
// MMHUB 0x68000 - 0x6a120
|
||||
void setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range);
|
||||
const AddrRange& getMMIOAperture(Addr addr);
|
||||
AddrRange getMMIORange(mmio_range_t mmio_aperture);
|
||||
|
||||
if (IH_BASE <= addr && addr < IH_BASE + IH_SIZE)
|
||||
return IH_BASE;
|
||||
else if (SDMA0_BASE <= addr && addr < SDMA0_BASE + SDMA_SIZE)
|
||||
return SDMA0_BASE;
|
||||
else if (SDMA1_BASE <= addr && addr < SDMA1_BASE + SDMA_SIZE)
|
||||
return SDMA1_BASE;
|
||||
else if (GRBM_BASE <= addr && addr < GRBM_BASE + GRBM_SIZE)
|
||||
return GRBM_BASE;
|
||||
else if (GFX_BASE <= addr && addr < GFX_BASE + GFX_SIZE)
|
||||
return GFX_BASE;
|
||||
else if (MMHUB_BASE <= addr && addr < MMHUB_BASE + MMHUB_SIZE)
|
||||
return MMHUB_BASE;
|
||||
else {
|
||||
warn_once("Accessing unsupported MMIO aperture! Assuming NBIO\n");
|
||||
return NBIO_BASE;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Gettig mapped aperture base addresses
|
||||
// Getting mapped aperture base addresses
|
||||
Addr
|
||||
getFrameAperture(Addr addr)
|
||||
{
|
||||
|
||||
@@ -75,7 +75,8 @@ void
|
||||
AMDGPUInterruptHandler::prepareInterruptCookie(ContextID cntxt_id,
|
||||
uint32_t ring_id,
|
||||
uint32_t client_id,
|
||||
uint32_t source_id)
|
||||
uint32_t source_id,
|
||||
unsigned node_id)
|
||||
{
|
||||
assert(client_id == SOC15_IH_CLIENTID_RLC ||
|
||||
client_id == SOC15_IH_CLIENTID_SDMA0 ||
|
||||
@@ -112,6 +113,7 @@ AMDGPUInterruptHandler::prepareInterruptCookie(ContextID cntxt_id,
|
||||
cookie->clientId = client_id;
|
||||
cookie->sourceId = source_id;
|
||||
cookie->ringId = ring_id;
|
||||
cookie->nodeId = node_id;
|
||||
cookie->source_data_dw1 = cntxt_id;
|
||||
interruptQueue.push(cookie);
|
||||
}
|
||||
|
||||
@@ -101,7 +101,8 @@ typedef struct
|
||||
uint32_t reserved2 : 15;
|
||||
uint32_t timestamp_src : 1;
|
||||
uint32_t pasid : 16;
|
||||
uint32_t reserved3 : 15;
|
||||
uint32_t nodeId : 8;
|
||||
uint32_t reserved3 : 7;
|
||||
uint32_t pasid_src : 1;
|
||||
uint32_t source_data_dw1;
|
||||
uint32_t source_data_dw2;
|
||||
@@ -171,7 +172,7 @@ class AMDGPUInterruptHandler : public DmaDevice
|
||||
|
||||
/** Store the pointer to the owning GPU device in gpuDevice. */
void
setGPUDevice(AMDGPUDevice *gpu_device)
{
    gpuDevice = gpu_device;
}
|
||||
void prepareInterruptCookie(ContextID cntxtId, uint32_t ring_id,
|
||||
uint32_t client_id, uint32_t source_id);
|
||||
uint32_t client_id, uint32_t source_id, unsigned node_id);
|
||||
void submitInterruptCookie();
|
||||
void submitWritePointer();
|
||||
void intrPost();
|
||||
|
||||
@@ -36,34 +36,34 @@
|
||||
namespace gem5
|
||||
{
|
||||
|
||||
#define mmCP_RB0_BASE 0x1040
|
||||
#define mmCP_RB0_CNTL 0x1041
|
||||
#define mmCP_RB_WPTR_POLL_ADDR_LO 0x1046
|
||||
#define mmCP_RB_WPTR_POLL_ADDR_HI 0x1047
|
||||
#define mmCP_RB_VMID 0x1051
|
||||
#define mmCP_RB0_RPTR_ADDR 0x1043
|
||||
#define mmCP_RB0_RPTR_ADDR_HI 0x1044
|
||||
#define mmCP_RB0_WPTR 0x1054
|
||||
#define mmCP_RB0_WPTR_HI 0x1055
|
||||
#define mmCP_RB_DOORBELL_CONTROL 0x1059
|
||||
#define mmCP_RB_DOORBELL_RANGE_LOWER 0x105a
|
||||
#define mmCP_RB_DOORBELL_RANGE_UPPER 0x105b
|
||||
#define mmCP_RB0_BASE_HI 0x10b1
|
||||
#define mmCP_RB0_BASE 0x040
|
||||
#define mmCP_RB0_CNTL 0x041
|
||||
#define mmCP_RB_WPTR_POLL_ADDR_LO 0x046
|
||||
#define mmCP_RB_WPTR_POLL_ADDR_HI 0x047
|
||||
#define mmCP_RB_VMID 0x051
|
||||
#define mmCP_RB0_RPTR_ADDR 0x043
|
||||
#define mmCP_RB0_RPTR_ADDR_HI 0x044
|
||||
#define mmCP_RB0_WPTR 0x054
|
||||
#define mmCP_RB0_WPTR_HI 0x055
|
||||
#define mmCP_RB_DOORBELL_CONTROL 0x059
|
||||
#define mmCP_RB_DOORBELL_RANGE_LOWER 0x05a
|
||||
#define mmCP_RB_DOORBELL_RANGE_UPPER 0x05b
|
||||
#define mmCP_RB0_BASE_HI 0x0b1
|
||||
|
||||
#define mmCP_HQD_ACTIVE 0x1247
|
||||
#define mmCP_HQD_VMID 0x1248
|
||||
#define mmCP_HQD_PQ_BASE 0x124d
|
||||
#define mmCP_HQD_PQ_BASE_HI 0x124e
|
||||
#define mmCP_HQD_PQ_DOORBELL_CONTROL 0x1254
|
||||
#define mmCP_HQD_PQ_RPTR 0x124f
|
||||
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR 0x1250
|
||||
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI 0x1251
|
||||
#define mmCP_HQD_PQ_WPTR_POLL_ADDR 0x1252
|
||||
#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI 0x1253
|
||||
#define mmCP_HQD_PQ_CONTROL 0x1256
|
||||
#define mmCP_HQD_IB_CONTROL 0x125a
|
||||
#define mmCP_HQD_PQ_WPTR_LO 0x127b
|
||||
#define mmCP_HQD_PQ_WPTR_HI 0x127c
|
||||
#define mmCP_HQD_ACTIVE 0x247
|
||||
#define mmCP_HQD_VMID 0x248
|
||||
#define mmCP_HQD_PQ_BASE 0x24d
|
||||
#define mmCP_HQD_PQ_BASE_HI 0x24e
|
||||
#define mmCP_HQD_PQ_DOORBELL_CONTROL 0x254
|
||||
#define mmCP_HQD_PQ_RPTR 0x24f
|
||||
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR 0x250
|
||||
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI 0x251
|
||||
#define mmCP_HQD_PQ_WPTR_POLL_ADDR 0x252
|
||||
#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI 0x253
|
||||
#define mmCP_HQD_PQ_CONTROL 0x256
|
||||
#define mmCP_HQD_IB_CONTROL 0x25a
|
||||
#define mmCP_HQD_PQ_WPTR_LO 0x27b
|
||||
#define mmCP_HQD_PQ_WPTR_HI 0x27c
|
||||
|
||||
} // namespace gem5
|
||||
|
||||
|
||||
@@ -49,7 +49,7 @@ namespace gem5
|
||||
{
|
||||
|
||||
PM4PacketProcessor::PM4PacketProcessor(const PM4PacketProcessorParams &p)
|
||||
: DmaVirtDevice(p)
|
||||
: DmaVirtDevice(p), _ipId(p.ip_id), _mmioRange(p.mmio_range)
|
||||
{
|
||||
memset(&kiq, 0, sizeof(QueueDesc));
|
||||
memset(&pq, 0, sizeof(QueueDesc));
|
||||
@@ -144,7 +144,7 @@ PM4PacketProcessor::newQueue(QueueDesc *mqd, Addr offset,
|
||||
QueueType qt;
|
||||
qt = mqd->aql ? QueueType::ComputeAQL
|
||||
: QueueType::Compute;
|
||||
gpuDevice->setDoorbellType(offset, qt);
|
||||
gpuDevice->setDoorbellType(offset, qt, getIpId());
|
||||
|
||||
DPRINTF(PM4PacketProcessor, "New PM4 queue %d, base: %p offset: %p, me: "
|
||||
"%d, pipe %d queue: %d size: %d\n", id, q->base(), q->offset(),
|
||||
@@ -227,9 +227,11 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header)
|
||||
} break;
|
||||
case IT_WRITE_DATA: {
|
||||
dmaBuffer = new PM4WriteData();
|
||||
DPRINTF(PM4PacketProcessor, "PM4 writeData header: %x, count: %d\n",
|
||||
header.ordinal, header.count);
|
||||
cb = new DmaVirtCallback<uint64_t>(
|
||||
[ = ] (const uint64_t &)
|
||||
{ writeData(q, (PM4WriteData *)dmaBuffer); });
|
||||
{ writeData(q, (PM4WriteData *)dmaBuffer, header); });
|
||||
dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4WriteData), cb,
|
||||
dmaBuffer);
|
||||
} break;
|
||||
@@ -350,21 +352,46 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header)
|
||||
}
|
||||
|
||||
void
|
||||
PM4PacketProcessor::writeData(PM4Queue *q, PM4WriteData *pkt)
|
||||
PM4PacketProcessor::writeData(PM4Queue *q, PM4WriteData *pkt, PM4Header header)
|
||||
{
|
||||
q->incRptr(sizeof(PM4WriteData));
|
||||
|
||||
Addr addr = getGARTAddr(pkt->destAddr);
|
||||
DPRINTF(PM4PacketProcessor, "PM4 write addr: %p data: %p.\n", addr,
|
||||
pkt->data);
|
||||
auto cb = new DmaVirtCallback<uint32_t>(
|
||||
[ = ](const uint32_t &) { writeDataDone(q, pkt, addr); });
|
||||
//TODO: the specs indicate that pkt->data holds the number of dword that
|
||||
//need to be written.
|
||||
dmaWriteVirt(addr, sizeof(uint32_t), cb, &pkt->data);
|
||||
DPRINTF(PM4PacketProcessor, "PM4 write addr: %p data: %p destSel: %d "
|
||||
"addrIncr: %d resume: %d writeConfirm: %d cachePolicy: %d\n",
|
||||
pkt->destAddr, pkt->data, pkt->destSel, pkt->addrIncr,
|
||||
pkt->resume, pkt->writeConfirm, pkt->cachePolicy);
|
||||
|
||||
if (!pkt->writeConfirm)
|
||||
if (pkt->destSel == 5) {
|
||||
// Memory address destination
|
||||
Addr addr = getGARTAddr(pkt->destAddr);
|
||||
|
||||
// This is a variable length packet. The size of the packet is in
|
||||
// the header.count field and is set as Number Of Dwords - 1. This
|
||||
// packet is 4 dwords minimum, meaning the count is minimum 3. To
|
||||
// get the number of dwords of data subtract two from the count.
|
||||
unsigned size = (header.count - 2) * sizeof(uint32_t);
|
||||
|
||||
DPRINTF(PM4PacketProcessor, "Writing %d bytes to %p\n", size, addr);
|
||||
auto cb = new DmaVirtCallback<uint32_t>(
|
||||
[ = ](const uint32_t &) { writeDataDone(q, pkt, addr); });
|
||||
dmaWriteVirt(addr, size, cb, &pkt->data);
|
||||
|
||||
if (!pkt->writeConfirm) {
|
||||
decodeNext(q);
|
||||
}
|
||||
} else if (pkt->destSel == 0) {
|
||||
// Register dword address destination
|
||||
Addr byte_addr = pkt->destAddr << 2;
|
||||
|
||||
gpuDevice->setRegVal(byte_addr, pkt->data);
|
||||
|
||||
// setRegVal is instant on the simulated device so we ignore write
|
||||
// confirm.
|
||||
delete pkt;
|
||||
decodeNext(q);
|
||||
} else {
|
||||
fatal("Unknown PM4 writeData destination %d\n", pkt->destSel);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
@@ -373,8 +400,9 @@ PM4PacketProcessor::writeDataDone(PM4Queue *q, PM4WriteData *pkt, Addr addr)
|
||||
DPRINTF(PM4PacketProcessor, "PM4 write completed to %p, %p.\n", addr,
|
||||
pkt->data);
|
||||
|
||||
if (pkt->writeConfirm)
|
||||
if (pkt->writeConfirm) {
|
||||
decodeNext(q);
|
||||
}
|
||||
|
||||
delete pkt;
|
||||
}
|
||||
@@ -493,7 +521,7 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
|
||||
|
||||
// Register doorbell with GPU device
|
||||
gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng);
|
||||
gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC);
|
||||
gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC, getIpId());
|
||||
|
||||
gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2);
|
||||
}
|
||||
@@ -537,7 +565,8 @@ PM4PacketProcessor::releaseMemDone(PM4Queue *q, PM4ReleaseMem *pkt, Addr addr)
|
||||
ringId = (q->queue() << 4) | (q->me() << 2) | q->pipe();
|
||||
}
|
||||
gpuDevice->getIH()->prepareInterruptCookie(pkt->intCtxId, ringId,
|
||||
SOC15_IH_CLIENTID_GRBM_CP, CP_EOP);
|
||||
SOC15_IH_CLIENTID_GRBM_CP, CP_EOP,
|
||||
0);
|
||||
gpuDevice->getIH()->submitInterruptCookie();
|
||||
}
|
||||
|
||||
@@ -745,9 +774,14 @@ PM4PacketProcessor::setUconfigReg(PM4Queue *q, PM4SetUconfigReg *pkt)
|
||||
{
|
||||
q->incRptr(sizeof(PM4SetUconfigReg));
|
||||
|
||||
DPRINTF(PM4PacketProcessor, "SetUconfig offset %x data %x\n",
|
||||
pkt->offset, pkt->data);
|
||||
|
||||
// SET_UCONFIG_REG_START and pkt->offset are dword addresses
|
||||
uint32_t reg_addr = (PACKET3_SET_UCONFIG_REG_START + pkt->offset) * 4;
|
||||
|
||||
// Additional CPs respond to addresses 0x40000 apart.
|
||||
reg_addr += 0x40000 * getIpId();
|
||||
gpuDevice->setRegVal(reg_addr, pkt->data);
|
||||
|
||||
decodeNext(q);
|
||||
@@ -822,7 +856,7 @@ PM4PacketProcessor::writeMMIO(PacketPtr pkt, Addr mmio_offset)
|
||||
break;
|
||||
case mmCP_HQD_PQ_DOORBELL_CONTROL:
|
||||
setHqdPqDoorbellCtrl(pkt->getLE<uint32_t>());
|
||||
gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute);
|
||||
gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute, getIpId());
|
||||
break;
|
||||
case mmCP_HQD_PQ_RPTR:
|
||||
setHqdPqPtr(pkt->getLE<uint32_t>());
|
||||
@@ -884,7 +918,7 @@ PM4PacketProcessor::writeMMIO(PacketPtr pkt, Addr mmio_offset)
|
||||
break;
|
||||
case mmCP_RB_DOORBELL_CONTROL:
|
||||
setRbDoorbellCntrl(pkt->getLE<uint32_t>());
|
||||
gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx);
|
||||
gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx, getIpId());
|
||||
break;
|
||||
case mmCP_RB_DOORBELL_RANGE_LOWER:
|
||||
setRbDoorbellRangeLo(pkt->getLE<uint32_t>());
|
||||
|
||||
@@ -63,6 +63,10 @@ class PM4PacketProcessor : public DmaVirtDevice
|
||||
std::unordered_map<uint16_t, PM4Queue *> queues;
|
||||
/* A map of PM4 queues based on doorbell offset */
|
||||
std::unordered_map<uint32_t, PM4Queue *> queuesMap;
|
||||
|
||||
int _ipId;
|
||||
AddrRange _mmioRange;
|
||||
|
||||
public:
|
||||
PM4PacketProcessor(const PM4PacketProcessorParams &p);
|
||||
|
||||
@@ -136,7 +140,7 @@ class PM4PacketProcessor : public DmaVirtDevice
|
||||
void decodeHeader(PM4Queue *q, PM4Header header);
|
||||
|
||||
/* Methods that implement PM4 packets */
|
||||
void writeData(PM4Queue *q, PM4WriteData *pkt);
|
||||
void writeData(PM4Queue *q, PM4WriteData *pkt, PM4Header header);
|
||||
void writeDataDone(PM4Queue *q, PM4WriteData *pkt, Addr addr);
|
||||
void mapQueues(PM4Queue *q, PM4MapQueues *pkt);
|
||||
void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt);
|
||||
@@ -188,6 +192,9 @@ class PM4PacketProcessor : public DmaVirtDevice
|
||||
void setRbDoorbellCntrl(uint32_t data);
|
||||
void setRbDoorbellRangeLo(uint32_t data);
|
||||
void setRbDoorbellRangeHi(uint32_t data);
|
||||
|
||||
/** Return the IP id stored in _ipId for this packet processor. */
int
getIpId() const
{
    return _ipId;
}
|
||||
/** Return a copy of the MMIO address range stored in _mmioRange. */
AddrRange
getMMIORange() const
{
    return _mmioRange;
}
|
||||
};
|
||||
|
||||
} // namespace gem5
|
||||
|
||||
@@ -81,9 +81,9 @@ SDMAEngine::setGPUDevice(AMDGPUDevice *gpu_device)
|
||||
}
|
||||
|
||||
int
|
||||
SDMAEngine::getIHClientId()
|
||||
SDMAEngine::getIHClientId(int _id)
|
||||
{
|
||||
switch (id) {
|
||||
switch (_id) {
|
||||
case 0:
|
||||
return SOC15_IH_CLIENTID_SDMA0;
|
||||
case 1:
|
||||
@@ -627,10 +627,14 @@ SDMAEngine::writeReadData(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer)
|
||||
|
||||
// lastly we write read data to the destination address
|
||||
if (gpuDevice->getVM().inMMHUB(pkt->dest)) {
|
||||
Addr mmhubAddr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
|
||||
Addr mmhub_addr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
|
||||
|
||||
fatal_if(gpuDevice->getVM().inGARTRange(mmhub_addr),
|
||||
"SDMA write to GART not implemented");
|
||||
|
||||
auto cb = new EventFunctionWrapper(
|
||||
[ = ]{ writeDone(q, pkt, dmaBuffer); }, name());
|
||||
gpuDevice->getMemMgr()->writeRequest(mmhubAddr, (uint8_t *)dmaBuffer,
|
||||
gpuDevice->getMemMgr()->writeRequest(mmhub_addr, (uint8_t *)dmaBuffer,
|
||||
bufferSize, 0, cb);
|
||||
} else {
|
||||
if (q->priv()) {
|
||||
@@ -663,9 +667,11 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt)
|
||||
// count represents the number of bytes - 1 to be copied
|
||||
pkt->count++;
|
||||
if (q->priv()) {
|
||||
DPRINTF(SDMAEngine, "Getting GART addr for %lx\n", pkt->source);
|
||||
pkt->source = getGARTAddr(pkt->source);
|
||||
DPRINTF(SDMAEngine, "GART addr %lx\n", pkt->source);
|
||||
if (!gpuDevice->getVM().inMMHUB(pkt->source)) {
|
||||
DPRINTF(SDMAEngine, "Getting GART addr for %lx\n", pkt->source);
|
||||
pkt->source = getGARTAddr(pkt->source);
|
||||
DPRINTF(SDMAEngine, "GART addr %lx\n", pkt->source);
|
||||
}
|
||||
}
|
||||
|
||||
// Read data from the source first, then call the copyReadData method
|
||||
@@ -742,6 +748,19 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer)
|
||||
[ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); });
|
||||
dmaWriteVirt(pkt->dest, pkt->count, cb, (void *)dmaBuffer);
|
||||
}
|
||||
|
||||
// For destinations in the GART table, gem5 uses a mapping table instead
|
||||
// of functionally going to device memory, so we need to update that copy.
|
||||
if (gpuDevice->getVM().inGARTRange(device_addr)) {
|
||||
// GART entries are always 8 bytes.
|
||||
assert((pkt->count % 8) == 0);
|
||||
for (int i = 0; i < pkt->count/8; ++i) {
|
||||
Addr gart_addr = device_addr + i*8 - gpuDevice->getVM().gartBase();
|
||||
DPRINTF(SDMAEngine, "Shadow copying to GART table %lx -> %lx\n",
|
||||
gart_addr, dmaBuffer64[i]);
|
||||
gpuDevice->getVM().gartTable[gart_addr] = dmaBuffer64[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Completion of a copy packet. */
|
||||
@@ -809,8 +828,12 @@ SDMAEngine::trap(SDMAQueue *q, sdmaTrap *pkt)
|
||||
|
||||
uint32_t ring_id = (q->queueType() == SDMAPage) ? 3 : 0;
|
||||
|
||||
int node_id = 0;
|
||||
int local_id = getId();
|
||||
|
||||
gpuDevice->getIH()->prepareInterruptCookie(pkt->intrContext, ring_id,
|
||||
getIHClientId(), TRAP_ID);
|
||||
getIHClientId(local_id),
|
||||
TRAP_ID, 2*node_id);
|
||||
gpuDevice->getIH()->submitInterruptCookie();
|
||||
|
||||
delete pkt;
|
||||
@@ -836,8 +859,7 @@ SDMAEngine::srbmWrite(SDMAQueue *q, sdmaSRBMWriteHeader *header,
|
||||
DPRINTF(SDMAEngine, "SRBM write to %#x with data %#x\n",
|
||||
reg_addr, pkt->data);
|
||||
|
||||
warn_once("SRBM write not performed, no SRBM model. This needs to be fixed"
|
||||
" if correct system simulation is relying on SRBM registers.");
|
||||
gpuDevice->setRegVal(reg_addr, pkt->data);
|
||||
|
||||
delete header;
|
||||
delete pkt;
|
||||
@@ -967,10 +989,14 @@ SDMAEngine::ptePde(SDMAQueue *q, sdmaPtePde *pkt)
|
||||
|
||||
// Writing generated data to the destination address.
|
||||
if (gpuDevice->getVM().inMMHUB(pkt->dest)) {
|
||||
Addr mmhubAddr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
|
||||
Addr mmhub_addr = pkt->dest - gpuDevice->getVM().getMMHUBBase();
|
||||
|
||||
fatal_if(gpuDevice->getVM().inGARTRange(mmhub_addr),
|
||||
"SDMA write to GART not implemented");
|
||||
|
||||
auto cb = new EventFunctionWrapper(
|
||||
[ = ]{ ptePdeDone(q, pkt, dmaBuffer); }, name());
|
||||
gpuDevice->getMemMgr()->writeRequest(mmhubAddr, (uint8_t *)dmaBuffer,
|
||||
gpuDevice->getMemMgr()->writeRequest(mmhub_addr, (uint8_t *)dmaBuffer,
|
||||
sizeof(uint64_t) * pkt->count, 0,
|
||||
cb);
|
||||
} else {
|
||||
|
||||
@@ -172,7 +172,7 @@ class SDMAEngine : public DmaVirtDevice
|
||||
/**
|
||||
* Returns the client id for the Interrupt Handler.
|
||||
*/
|
||||
int getIHClientId();
|
||||
int getIHClientId(int _id);
|
||||
|
||||
/**
|
||||
* Methods for translation.
|
||||
|
||||
Reference in New Issue
Block a user