From f36be791aa8c25cd1531ca9e84f85dcdf2acfb31 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 13 Feb 2024 16:27:55 -0600 Subject: [PATCH 1/9] arch-vega: Expand FLAT subDecode range in main decoder The main decoder for GPU instructions looks at the first 9 bits of a dword to determine either the instruction or a subDecode table with more information for specific instruction types. For flat instructions the first 9 bits currently consist of 6 fixed encoding bits, a reserved bit, and the first two bits of the opcode. Hence to support all opcodes there are four indirections to the flat subDecode table. In MI300 the reserved bit is part of a field to determine memory scope and therefore may be non-zero. This commit adds four additional calls to the subDecode table for the cases where the scope bit is 1. See page 468 (PDF page 478) below: https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/ instruction-set-architectures/ amd-instinct-mi300-cdna3-instruction-set-architecture.pdf Change-Id: Ic3c786f0ca00a758cbe87f42c5e3470576f73a32 --- src/arch/amdgpu/vega/gpu_decoder.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc index 940840719b..2220d820b1 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.cc +++ b/src/arch/amdgpu/vega/gpu_decoder.cc @@ -500,10 +500,10 @@ namespace VegaISA &Decoder::subDecode_OP_FLAT, &Decoder::subDecode_OP_FLAT, &Decoder::subDecode_OP_FLAT, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::subDecode_OP_FLAT, + &Decoder::subDecode_OP_FLAT, + &Decoder::subDecode_OP_FLAT, + &Decoder::subDecode_OP_FLAT, &Decoder::subDecode_OP_MUBUF, &Decoder::subDecode_OP_MUBUF, &Decoder::subDecode_OP_MUBUF, From 9ab004cccca501b56cd39be1cefc677fd12b7a4c Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 13 Feb 2024 16:34:05 -0600 Subject: [PATCH 2/9] arch-vega: Implement
V_LSHL_ADD_U64 This is a new instruction in MI300 and operates similar to V_LSHL_ADD_U32 but on 64-bit values. Change-Id: Ia4ac65160bdad748fccdcb28286ba03157cc4046 --- src/arch/amdgpu/vega/gpu_decoder.cc | 8 +++- src/arch/amdgpu/vega/gpu_decoder.hh | 1 + src/arch/amdgpu/vega/insts/instructions.hh | 36 ++++++++++++++++ src/arch/amdgpu/vega/insts/vop3.cc | 48 ++++++++++++++++++++++ 4 files changed, 92 insertions(+), 1 deletion(-) diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc index 2220d820b1..406ada6c52 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.cc +++ b/src/arch/amdgpu/vega/gpu_decoder.cc @@ -1091,7 +1091,7 @@ namespace VegaISA &Decoder::decode_OPU_VOP3__V_MAD_I16, &Decoder::decode_OPU_VOP3__V_FMA_F16, &Decoder::decode_OPU_VOP3__V_DIV_FIXUP_F16, - &Decoder::decode_invalid, + &Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -7054,6 +7054,12 @@ namespace VegaISA return new Inst_VOP3__V_DIV_FIXUP_F16(&iFmt->iFmt_VOP3A); } + GPUStaticInst* + Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst iFmt) + { + return new Inst_VOP3__V_LSHL_ADD_U64(&iFmt->iFmt_VOP3A); + } + GPUStaticInst* Decoder::decode_OPU_VOP3__V_INTERP_P1_F32(MachInst iFmt) { diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh index 48084a6913..d3b39fd945 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.hh +++ b/src/arch/amdgpu/vega/gpu_decoder.hh @@ -470,6 +470,7 @@ namespace VegaISA GPUStaticInst* decode_OPU_VOP3__V_MAD_I16(MachInst); GPUStaticInst* decode_OPU_VOP3__V_FMA_F16(MachInst); GPUStaticInst* decode_OPU_VOP3__V_DIV_FIXUP_F16(MachInst); + GPUStaticInst* decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst); GPUStaticInst* decode_OPU_VOP3__V_INTERP_P1_F32(MachInst); GPUStaticInst* decode_OPU_VOP3__V_INTERP_P2_F32(MachInst); GPUStaticInst* decode_OPU_VOP3__V_INTERP_MOV_F32(MachInst); diff --git a/src/arch/amdgpu/vega/insts/instructions.hh 
b/src/arch/amdgpu/vega/insts/instructions.hh index db03548a3d..4c96a3e34b 100644 --- a/src/arch/amdgpu/vega/insts/instructions.hh +++ b/src/arch/amdgpu/vega/insts/instructions.hh @@ -30158,6 +30158,42 @@ namespace VegaISA void execute(GPUDynInstPtr) override; }; // Inst_VOP3__V_DIV_FIXUP_F16 + class Inst_VOP3__V_LSHL_ADD_U64 : public Inst_VOP3A + { + public: + Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A*); + ~Inst_VOP3__V_LSHL_ADD_U64(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //src_0 + return 8; + case 1: //src_1 + return 4; + case 2: //src_2 + return 8; + case 3: //vdst + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3__V_LSHL_ADD_U64 + class Inst_VOP3__V_CVT_PKACCUM_U8_F32 : public Inst_VOP3A { public: diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc index 8f6794c9c2..f78f64bc91 100644 --- a/src/arch/amdgpu/vega/insts/vop3.cc +++ b/src/arch/amdgpu/vega/insts/vop3.cc @@ -7630,6 +7630,54 @@ namespace VegaISA { panicUnimplemented(); } // execute + // --- Inst_VOP3__V_LSHL_ADD_U64 class methods --- + + Inst_VOP3__V_LSHL_ADD_U64::Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshl_add_u64", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHL_ADD_U64 + + Inst_VOP3__V_LSHL_ADD_U64::~Inst_VOP3__V_LSHL_ADD_U64() + { + } // ~Inst_VOP3__V_LSHL_ADD_U64 + + // --- description from .arch file --- + // D.u64 = (S0.u64 << S1.u[2:0]) + S2.u64.
+ void + Inst_VOP3__V_LSHL_ADD_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU64 src2(gpuDynInst, extData.SRC2); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int shift_amount = bits(src1[lane], 2, 0); + shift_amount = shift_amount > 4 ? 0 : shift_amount; + vdst[lane] = (src0[lane] << shift_amount) + + src2[lane]; + } + } + + vdst.write(); + } // execute // --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods --- Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32( From c045c6854007dc6784105a3dcbd990e4d7b7d36d Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 13 Feb 2024 16:39:50 -0600 Subject: [PATCH 3/9] dev-amdgpu: Add node_id to interrupt handler The ROCm 6.0 driver adds a node_id field to interrupts which must match before passing on the interrupt to be cleared by the cookie from gem5's interrupt handler implementation. Add this field and enable for gfx942. 
The usage of the field can be seen in event_interrupt_isr_v9_4_3 at https://github.com/ROCm/ROCK-Kernel-Driver/blob/roc-6.0.x/drivers/ gpu/drm/amd/amdkfd/kfd_int_process_v9.c#L449 Change-Id: Iae8b8f0386a5ad2852b4a3c69f2c161d965c4922 --- src/dev/amdgpu/interrupt_handler.cc | 4 +++- src/dev/amdgpu/interrupt_handler.hh | 5 +++-- src/dev/amdgpu/pm4_packet_processor.cc | 3 ++- src/dev/amdgpu/sdma_engine.cc | 10 +++++++--- src/dev/amdgpu/sdma_engine.hh | 2 +- 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/dev/amdgpu/interrupt_handler.cc b/src/dev/amdgpu/interrupt_handler.cc index 6f277a1618..cb99ba7a39 100644 --- a/src/dev/amdgpu/interrupt_handler.cc +++ b/src/dev/amdgpu/interrupt_handler.cc @@ -75,7 +75,8 @@ void AMDGPUInterruptHandler::prepareInterruptCookie(ContextID cntxt_id, uint32_t ring_id, uint32_t client_id, - uint32_t source_id) + uint32_t source_id, + unsigned node_id) { assert(client_id == SOC15_IH_CLIENTID_RLC || client_id == SOC15_IH_CLIENTID_SDMA0 || @@ -112,6 +113,7 @@ AMDGPUInterruptHandler::prepareInterruptCookie(ContextID cntxt_id, cookie->clientId = client_id; cookie->sourceId = source_id; cookie->ringId = ring_id; + cookie->nodeId = node_id; cookie->source_data_dw1 = cntxt_id; interruptQueue.push(cookie); } diff --git a/src/dev/amdgpu/interrupt_handler.hh b/src/dev/amdgpu/interrupt_handler.hh index 9b80e081cc..a895eabafc 100644 --- a/src/dev/amdgpu/interrupt_handler.hh +++ b/src/dev/amdgpu/interrupt_handler.hh @@ -101,7 +101,8 @@ typedef struct uint32_t reserved2 : 15; uint32_t timestamp_src : 1; uint32_t pasid : 16; - uint32_t reserved3 : 15; + uint32_t nodeId : 8; + uint32_t reserved3 : 7; uint32_t pasid_src : 1; uint32_t source_data_dw1; uint32_t source_data_dw2; @@ -171,7 +172,7 @@ class AMDGPUInterruptHandler : public DmaDevice void setGPUDevice(AMDGPUDevice *gpu_device) { gpuDevice = gpu_device; } void prepareInterruptCookie(ContextID cntxtId, uint32_t ring_id, - uint32_t client_id, uint32_t source_id); + uint32_t client_id, 
uint32_t source_id, unsigned node_id); void submitInterruptCookie(); void submitWritePointer(); void intrPost(); diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index 5f270a0c70..b7952f0698 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -537,7 +537,8 @@ PM4PacketProcessor::releaseMemDone(PM4Queue *q, PM4ReleaseMem *pkt, Addr addr) ringId = (q->queue() << 4) | (q->me() << 2) | q->pipe(); } gpuDevice->getIH()->prepareInterruptCookie(pkt->intCtxId, ringId, - SOC15_IH_CLIENTID_GRBM_CP, CP_EOP); + SOC15_IH_CLIENTID_GRBM_CP, CP_EOP, + 2 * getIpId()); gpuDevice->getIH()->submitInterruptCookie(); } diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index 4015e83eaf..34ad027234 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -81,9 +81,9 @@ SDMAEngine::setGPUDevice(AMDGPUDevice *gpu_device) } int -SDMAEngine::getIHClientId() +SDMAEngine::getIHClientId(int _id) { - switch (id) { + switch (_id) { case 0: return SOC15_IH_CLIENTID_SDMA0; case 1: @@ -809,8 +809,12 @@ SDMAEngine::trap(SDMAQueue *q, sdmaTrap *pkt) uint32_t ring_id = (q->queueType() == SDMAPage) ? 3 : 0; + int node_id = 0; + int local_id = getId(); + gpuDevice->getIH()->prepareInterruptCookie(pkt->intrContext, ring_id, - getIHClientId(), TRAP_ID); + getIHClientId(local_id), + TRAP_ID, 2*node_id); gpuDevice->getIH()->submitInterruptCookie(); delete pkt; diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh index d8ab31bbde..9407b97d73 100644 --- a/src/dev/amdgpu/sdma_engine.hh +++ b/src/dev/amdgpu/sdma_engine.hh @@ -172,7 +172,7 @@ class SDMAEngine : public DmaVirtDevice /** * Returns the client id for the Interrupt Handler. */ - int getIHClientId(); + int getIHClientId(int _id); /** * Methods for translation. 
From 998709d4fcf4bb2c60c2c98e5b5a001a730e7dcc Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 7 Feb 2024 13:27:30 -0600 Subject: [PATCH 4/9] dev-amdgpu: Improve PM4 write data packet The write data packet can write multiple dwords but currently always assumes there is one dword, which can cause some write data to be missed. This case is not common, but the number of dwords is implicitly defined in the PM4 header. This changeset passes the PM4 header to write data so that the correct number of dwords can be determined. For now we assume no page crossing when writing multiple dwords as the driver should be checking for that. Change-Id: I0e8c3cbc28873779f468c2a11fdcf177210a22b7 --- src/dev/amdgpu/pm4_packet_processor.cc | 54 +++++++++++++++++++------- src/dev/amdgpu/pm4_packet_processor.hh | 2 +- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index b7952f0698..c8baa5eab4 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -227,9 +227,11 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header) } break; case IT_WRITE_DATA: { dmaBuffer = new PM4WriteData(); + DPRINTF(PM4PacketProcessor, "PM4 writeData header: %x, count: %d\n", + header.ordinal, header.count); cb = new DmaVirtCallback( [ = ] (const uint64_t &) - { writeData(q, (PM4WriteData *)dmaBuffer); }); + { writeData(q, (PM4WriteData *)dmaBuffer, header); }); dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4WriteData), cb, dmaBuffer); } break; @@ -350,21 +352,46 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header) } void -PM4PacketProcessor::writeData(PM4Queue *q, PM4WriteData *pkt) +PM4PacketProcessor::writeData(PM4Queue *q, PM4WriteData *pkt, PM4Header header) { q->incRptr(sizeof(PM4WriteData)); - Addr addr = getGARTAddr(pkt->destAddr); - DPRINTF(PM4PacketProcessor, "PM4 write addr: %p data: %p.\n", addr, - pkt->data); - auto cb = new 
DmaVirtCallback( - [ = ](const uint32_t &) { writeDataDone(q, pkt, addr); }); - //TODO: the specs indicate that pkt->data holds the number of dword that - //need to be written. - dmaWriteVirt(addr, sizeof(uint32_t), cb, &pkt->data); + DPRINTF(PM4PacketProcessor, "PM4 write addr: %p data: %p destSel: %d " + "addrIncr: %d resume: %d writeConfirm: %d cachePolicy: %d\n", + pkt->destAddr, pkt->data, pkt->destSel, pkt->addrIncr, + pkt->resume, pkt->writeConfirm, pkt->cachePolicy); - if (!pkt->writeConfirm) + if (pkt->destSel == 5) { + // Memory address destination + Addr addr = getGARTAddr(pkt->destAddr); + + // This is a variable length packet. The size of the packet is in + // the header.count field and is set as Number Of Dwords - 1. This + // packet is 4 dwords minimum meaning the count is minimum 3. To + // get the number of dwords of data subtract two from the count. + unsigned size = (header.count - 2) * sizeof(uint32_t); + + DPRINTF(PM4PacketProcessor, "Writing %d bytes to %p\n", size, addr); + auto cb = new DmaVirtCallback( + [ = ](const uint32_t &) { writeDataDone(q, pkt, addr); }); + dmaWriteVirt(addr, size, cb, &pkt->data); + + if (!pkt->writeConfirm) { + decodeNext(q); + } + } else if (pkt->destSel == 0) { + // Register dword address destination + Addr byte_addr = pkt->destAddr << 2; + + gpuDevice->setRegVal(byte_addr, pkt->data); + + // setRegVal is instant on the simulated device so we ignore write + // confirm.
+ delete pkt; decodeNext(q); + } else { + fatal("Unknown PM4 writeData destination %d\n", pkt->destSel); + } } void @@ -373,8 +400,9 @@ PM4PacketProcessor::writeDataDone(PM4Queue *q, PM4WriteData *pkt, Addr addr) DPRINTF(PM4PacketProcessor, "PM4 write completed to %p, %p.\n", addr, pkt->data); - if (pkt->writeConfirm) + if (pkt->writeConfirm) { decodeNext(q); + } delete pkt; } @@ -538,7 +566,7 @@ PM4PacketProcessor::releaseMemDone(PM4Queue *q, PM4ReleaseMem *pkt, Addr addr) } gpuDevice->getIH()->prepareInterruptCookie(pkt->intCtxId, ringId, SOC15_IH_CLIENTID_GRBM_CP, CP_EOP, - 2 * getIpId()); + 0); gpuDevice->getIH()->submitInterruptCookie(); } diff --git a/src/dev/amdgpu/pm4_packet_processor.hh b/src/dev/amdgpu/pm4_packet_processor.hh index 3fb055148c..4782e70829 100644 --- a/src/dev/amdgpu/pm4_packet_processor.hh +++ b/src/dev/amdgpu/pm4_packet_processor.hh @@ -136,7 +136,7 @@ class PM4PacketProcessor : public DmaVirtDevice void decodeHeader(PM4Queue *q, PM4Header header); /* Methods that implement PM4 packets */ - void writeData(PM4Queue *q, PM4WriteData *pkt); + void writeData(PM4Queue *q, PM4WriteData *pkt, PM4Header header); void writeDataDone(PM4Queue *q, PM4WriteData *pkt, Addr addr); void mapQueues(PM4Queue *q, PM4MapQueues *pkt); void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt); From 009cec56e0e7a082ed684e98c5600babc2d2283e Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 7 Feb 2024 13:29:44 -0600 Subject: [PATCH 5/9] dev-amdgpu: Check for SDMA copies to GART range The SDMA engine can potentially be used to write to the GART address range. Since gem5 has a shadow copy of the GART table to avoid sending functional reads to device memory, the GART table must be updated when copying to the GART range. This changeset adds a check in the VM for GART range and implements the SDMA copy packet writing to the GART range. 
A fatal is added to write and ptePde, which are the only other two ways to write to memory, as using these packets to update the GART table has not been observed. Change-Id: I1e62dfd9179cc9e987659e68414209fd77bba2bd --- src/dev/amdgpu/amdgpu_vm.hh | 6 ++++++ src/dev/amdgpu/sdma_engine.cc | 37 ++++++++++++++++++++++++++++------- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/src/dev/amdgpu/amdgpu_vm.hh b/src/dev/amdgpu/amdgpu_vm.hh index f35a735111..5af666f379 100644 --- a/src/dev/amdgpu/amdgpu_vm.hh +++ b/src/dev/amdgpu/amdgpu_vm.hh @@ -172,6 +172,12 @@ class AMDGPUVM : public Serializable */ Addr gartSize(); + bool + inGARTRange(Addr paddr) + { + return ((paddr >= gartBase()) && (paddr <= (gartBase() + gartSize()))); + } + /** * Copy of GART table. Typically resides in device memory, however we use * a copy in gem5 to simplify the interface. diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index 34ad027234..94bcdf9cb9 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -627,10 +627,14 @@ SDMAEngine::writeReadData(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer) // lastly we write read data to the destination address if (gpuDevice->getVM().inMMHUB(pkt->dest)) { - Addr mmhubAddr = pkt->dest - gpuDevice->getVM().getMMHUBBase(); + Addr mmhub_addr = pkt->dest - gpuDevice->getVM().getMMHUBBase(); + + fatal_if(gpuDevice->getVM().inGARTRange(mmhub_addr), + "SDMA write to GART not implemented"); + auto cb = new EventFunctionWrapper( [ = ]{ writeDone(q, pkt, dmaBuffer); }, name()); - gpuDevice->getMemMgr()->writeRequest(mmhubAddr, (uint8_t *)dmaBuffer, + gpuDevice->getMemMgr()->writeRequest(mmhub_addr, (uint8_t *)dmaBuffer, bufferSize, 0, cb); } else { if (q->priv()) { @@ -663,9 +667,11 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt) // count represents the number of bytes - 1 to be copied pkt->count++; if (q->priv()) { - DPRINTF(SDMAEngine, "Getting GART addr for %lx\n", pkt->source); - pkt->source 
= getGARTAddr(pkt->source); - DPRINTF(SDMAEngine, "GART addr %lx\n", pkt->source); + if (!gpuDevice->getVM().inMMHUB(pkt->source)) { + DPRINTF(SDMAEngine, "Getting GART addr for %lx\n", pkt->source); + pkt->source = getGARTAddr(pkt->source); + DPRINTF(SDMAEngine, "GART addr %lx\n", pkt->source); + } } // Read data from the source first, then call the copyReadData method @@ -742,6 +748,19 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer) [ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); }); dmaWriteVirt(pkt->dest, pkt->count, cb, (void *)dmaBuffer); } + + // For destinations in the GART table, gem5 uses a mapping table instead + // of functionally going to device memory, so we need to update that copy. + if (gpuDevice->getVM().inGARTRange(device_addr)) { + // GART entries are always 8 bytes. + assert((pkt->count % 8) == 0); + for (int i = 0; i < pkt->count/8; ++i) { + Addr gart_addr = device_addr + i*8 - gpuDevice->getVM().gartBase(); + DPRINTF(SDMAEngine, "Shadow copying to GART table %lx -> %lx\n", + gart_addr, dmaBuffer64[i]); + gpuDevice->getVM().gartTable[gart_addr] = dmaBuffer64[i]; + } + } } /* Completion of a copy packet. */ @@ -971,10 +990,14 @@ SDMAEngine::ptePde(SDMAQueue *q, sdmaPtePde *pkt) // Writing generated data to the destination address.
if (gpuDevice->getVM().inMMHUB(pkt->dest)) { - Addr mmhubAddr = pkt->dest - gpuDevice->getVM().getMMHUBBase(); + Addr mmhub_addr = pkt->dest - gpuDevice->getVM().getMMHUBBase(); + + fatal_if(gpuDevice->getVM().inGARTRange(mmhub_addr), + "SDMA write to GART not implemented"); + auto cb = new EventFunctionWrapper( [ = ]{ ptePdeDone(q, pkt, dmaBuffer); }, name()); - gpuDevice->getMemMgr()->writeRequest(mmhubAddr, (uint8_t *)dmaBuffer, + gpuDevice->getMemMgr()->writeRequest(mmhub_addr, (uint8_t *)dmaBuffer, sizeof(uint64_t) * pkt->count, 0, cb); } else { From 6bbde8fbb885abae5d7f3ed630d19a9b982dd302 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Thu, 8 Feb 2024 12:26:27 -0600 Subject: [PATCH 6/9] dev-amdgpu: Rework handling of unknown registers The top level AMDGPUDevice currently reads/writes all unknown registers to/from a map containing the previously written value. This is intended as a way to handle registers that are not part of the model but the driver requires for functionality. Since this is at the top level, it can mask changes to register values which do not go through the same interface. For example, reading an MMIO, changing via PM4 queue, and reading again returns the stale cached value. This commit removes the usage of the regs map in AMDGPUDevice, implements some important MMIOs that were previously handled by it, and moves the unknown register handling to the NBIO aperture only. To reduce the number of additional MMIOs to implement, the display manager in vega10 is now disabled. 
Change-Id: Iff0a599dd82d663c7e710b79c6ef6d0ad1fc44a2 --- configs/example/gpufs/vega10.py | 2 +- src/dev/amdgpu/amdgpu_device.cc | 75 +++++++++++---------------------- src/dev/amdgpu/amdgpu_device.hh | 7 +-- src/dev/amdgpu/amdgpu_gfx.cc | 13 ++++++ src/dev/amdgpu/amdgpu_gfx.hh | 11 ++++- src/dev/amdgpu/amdgpu_nbio.cc | 41 ++++++++++++++---- src/dev/amdgpu/amdgpu_nbio.hh | 14 +++++- 7 files changed, 96 insertions(+), 67 deletions(-) diff --git a/configs/example/gpufs/vega10.py b/configs/example/gpufs/vega10.py index ae74efd39b..9c3116d415 100644 --- a/configs/example/gpufs/vega10.py +++ b/configs/example/gpufs/vega10.py @@ -52,7 +52,7 @@ if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5." /sbin/m5 exit fi -modprobe -v amdgpu ip_block_mask=0xff ppfeaturemask=0 dpm=0 audio=0 +modprobe -v amdgpu ip_block_mask=0xdf ppfeaturemask=0 dpm=0 audio=0 echo "Running {} {}" echo "{}" | base64 -d > myapp chmod +x myapp diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 48f450c2b2..4b684aa221 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -130,6 +130,7 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) pm4PktProc->setGPUDevice(this); cp->hsaPacketProc().setGPUDevice(this); cp->setGPUDevice(this); + nbio.setGPUDevice(this); // Address aperture for device memory. We tell this to the driver and // could possibly be anything, but these are the values used by hardware. 
@@ -163,8 +164,6 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) gpuvm.setMMHUBBase(mmhubBase); gpuvm.setMMHUBTop(mmhubTop); - - nbio.setGPUDevice(this); } void @@ -365,13 +364,6 @@ AMDGPUDevice::readMMIO(PacketPtr pkt, Addr offset) DPRINTF(AMDGPUDevice, "Read MMIO %#lx\n", offset); mmioReader.readFromTrace(pkt, MMIO_BAR, offset); - if (regs.find(offset) != regs.end()) { - uint64_t value = regs[offset]; - DPRINTF(AMDGPUDevice, "Reading what kernel wrote before: %#x\n", - value); - pkt->setUintX(value, ByteOrder::little); - } - switch (aperture) { case NBIO_BASE: nbio.readMMIO(pkt, aperture_offset); @@ -610,26 +602,39 @@ AMDGPUDevice::processPendingDoorbells(uint32_t offset) } } -bool -AMDGPUDevice::haveRegVal(uint32_t addr) -{ - return regs.count(addr); -} - uint32_t -AMDGPUDevice::getRegVal(uint32_t addr) +AMDGPUDevice::getRegVal(uint64_t addr) { + // This is somewhat of a guess based on amdgpu_device_mm_access + // in amdgpu_device.c in the ROCk driver. If bit 31 is 1 then + // assume VRAM and use full address, otherwise assume register + // address and only use the lower 31 bits. + Addr fixup_addr = bits(addr, 31, 31) ?
addr : addr & 0x7fffffff; + + uint32_t pkt_data = 0; + RequestPtr request = std::make_shared(fixup_addr, + sizeof(uint32_t), 0 /* flags */, vramRequestorId()); + PacketPtr pkt = Packet::createRead(request); + pkt->dataStatic((uint8_t *)&pkt_data); + readMMIO(pkt, addr); DPRINTF(AMDGPUDevice, "Getting register 0x%lx = %x\n", - addr, regs[addr]); - return regs[addr]; + fixup_addr, pkt->getLE()); + + return pkt->getLE(); } void -AMDGPUDevice::setRegVal(uint32_t addr, uint32_t value) +AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value) { DPRINTF(AMDGPUDevice, "Setting register 0x%lx to %x\n", addr, value); - regs[addr] = value; + + uint32_t pkt_data = value; + RequestPtr request = std::make_shared(addr, + sizeof(uint32_t), 0 /* flags */, vramRequestorId()); + PacketPtr pkt = Packet::createWrite(request); + pkt->dataStatic((uint8_t *)&pkt_data); + writeMMIO(pkt, addr); } void @@ -675,20 +680,16 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const // Serialize the PciDevice base class PciDevice::serialize(cp); - uint64_t regs_size = regs.size(); uint64_t doorbells_size = doorbells.size(); uint64_t sdma_engs_size = sdmaEngs.size(); uint64_t used_vmid_map_size = usedVMIDs.size(); - SERIALIZE_SCALAR(regs_size); SERIALIZE_SCALAR(doorbells_size); SERIALIZE_SCALAR(sdma_engs_size); // Save the number of vmids used SERIALIZE_SCALAR(used_vmid_map_size); // Make a c-style array of the regs to serialize - uint32_t reg_addrs[regs_size]; - uint64_t reg_values[regs_size]; uint32_t doorbells_offset[doorbells_size]; QueueType doorbells_queues[doorbells_size]; uint32_t sdma_engs_offset[sdma_engs_size]; @@ -698,13 +699,6 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const std::vector used_vmid_sets; int idx = 0; - for (auto & it : regs) { - reg_addrs[idx] = it.first; - reg_values[idx] = it.second; - ++idx; - } - - idx = 0; for (auto & it : doorbells) { doorbells_offset[idx] = it.first; doorbells_queues[idx] = it.second; @@ -732,8 +726,6 @@ AMDGPUDevice::serialize(CheckpointOut &cp) 
const int* vmid_array = new int[num_queue_id]; std::copy(used_vmid_sets.begin(), used_vmid_sets.end(), vmid_array); - SERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0])); - SERIALIZE_ARRAY(reg_values, sizeof(reg_values)/sizeof(reg_values[0])); SERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/ sizeof(doorbells_offset[0])); SERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/ @@ -764,30 +756,15 @@ AMDGPUDevice::unserialize(CheckpointIn &cp) // Unserialize the PciDevice base class PciDevice::unserialize(cp); - uint64_t regs_size = 0; uint64_t doorbells_size = 0; uint64_t sdma_engs_size = 0; uint64_t used_vmid_map_size = 0; - UNSERIALIZE_SCALAR(regs_size); UNSERIALIZE_SCALAR(doorbells_size); UNSERIALIZE_SCALAR(sdma_engs_size); UNSERIALIZE_SCALAR(used_vmid_map_size); - if (regs_size > 0) { - uint32_t reg_addrs[regs_size]; - uint64_t reg_values[regs_size]; - - UNSERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0])); - UNSERIALIZE_ARRAY(reg_values, - sizeof(reg_values)/sizeof(reg_values[0])); - - for (int idx = 0; idx < regs_size; ++idx) { - regs.insert(std::make_pair(reg_addrs[idx], reg_values[idx])); - } - } - if (doorbells_size > 0) { uint32_t doorbells_offset[doorbells_size]; QueueType doorbells_queues[doorbells_size]; @@ -798,8 +775,6 @@ AMDGPUDevice::unserialize(CheckpointIn &cp) sizeof(doorbells_queues[0])); for (int idx = 0; idx < doorbells_size; ++idx) { - regs.insert(std::make_pair(doorbells_offset[idx], - doorbells_queues[idx])); doorbells[doorbells_offset[idx]] = doorbells_queues[idx]; } } diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh index b6b6e2a81a..fface5fb3e 100644 --- a/src/dev/amdgpu/amdgpu_device.hh +++ b/src/dev/amdgpu/amdgpu_device.hh @@ -87,8 +87,6 @@ class AMDGPUDevice : public PciDevice /** * Structures to hold registers, doorbells, and some frame memory */ - using GPURegMap = std::unordered_map; - GPURegMap regs; std::unordered_map doorbells; std::unordered_map 
pendingDoorbellPkts; @@ -195,9 +193,8 @@ class AMDGPUDevice : public PciDevice * Register value getter/setter. Used by other GPU blocks to change * values from incoming driver/user packets. */ - bool haveRegVal(uint32_t addr); - uint32_t getRegVal(uint32_t addr); - void setRegVal(uint32_t addr, uint32_t value); + uint32_t getRegVal(uint64_t addr); + void setRegVal(uint64_t addr, uint32_t value); /** * Methods related to translations and system/device memory. diff --git a/src/dev/amdgpu/amdgpu_gfx.cc b/src/dev/amdgpu/amdgpu_gfx.cc index 3d5b274b86..60fabaf31d 100644 --- a/src/dev/amdgpu/amdgpu_gfx.cc +++ b/src/dev/amdgpu/amdgpu_gfx.cc @@ -37,6 +37,13 @@ namespace gem5 { +AMDGPUGfx::AMDGPUGfx() +{ + for (int i = 0; i < SCRATCH_REGS; ++i) { + scratchRegs[i] = 0; + } +} + void AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset) { @@ -47,6 +54,9 @@ AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset) case AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB: pkt->setLE(captured_clock_count >> 32); break; + case AMDGPU_MM_SCRATCH_REG0: + pkt->setLE(scratchRegs[0]); + break; default: break; } @@ -65,6 +75,9 @@ AMDGPUGfx::writeMMIO(PacketPtr pkt, Addr offset) captured_clock_count = curTick() / sim_clock::as_int::ns; } break; + case AMDGPU_MM_SCRATCH_REG0: + scratchRegs[0] = pkt->getLE(); + break; default: break; } diff --git a/src/dev/amdgpu/amdgpu_gfx.hh b/src/dev/amdgpu/amdgpu_gfx.hh index c32b8624cf..9fb1d82553 100644 --- a/src/dev/amdgpu/amdgpu_gfx.hh +++ b/src/dev/amdgpu/amdgpu_gfx.hh @@ -52,13 +52,16 @@ #define AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB 0x13094 #define AMDGPU_MM_RLC_CAPTURE_GPU_CLOCK_COUNT 0x13098 +// Scratch registers used for GPU post +#define AMDGPU_MM_SCRATCH_REG0 0x08100 + namespace gem5 { class AMDGPUGfx { public: - AMDGPUGfx() { } + AMDGPUGfx(); void readMMIO(PacketPtr pkt, Addr offset); void writeMMIO(PacketPtr pkt, Addr offset); @@ -68,6 +71,12 @@ class AMDGPUGfx * GPU clock count at the time capture MMIO is received. 
*/ uint64_t captured_clock_count = 1; + + /* + * Scratch registers. + */ + static constexpr int SCRATCH_REGS = 8; + std::array scratchRegs; }; } // namespace gem5 diff --git a/src/dev/amdgpu/amdgpu_nbio.cc b/src/dev/amdgpu/amdgpu_nbio.cc index 07027c3765..89b1682631 100644 --- a/src/dev/amdgpu/amdgpu_nbio.cc +++ b/src/dev/amdgpu/amdgpu_nbio.cc @@ -54,13 +54,21 @@ void AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset) { switch (offset) { + case AMDGPU_PCIE_DATA: + { + uint32_t value = gpuDevice->getRegVal(pcie_index_reg); + DPRINTF(AMDGPUDevice, "Read PCIe index %lx data %x\n", + pcie_index_reg, value); + pkt->setLE(value); + } + break; // This is a PCIe status register. At some point during driver init // the driver checks that interrupts are enabled. This is only // checked once, so if the MMIO trace does not exactly line up with // what the driver is doing in gem5, this may still have the first // bit zero causing driver to fail. Therefore, we always set this // bit to one as there is no harm to do so. 
- case AMDGPU_PCIE_DATA_REG: + case AMDGPU_PCIE_DATA2: { uint32_t value = pkt->getLE() | 0x1; DPRINTF(AMDGPUDevice, "Marking interrupts enabled: %#lx\n", value); @@ -68,7 +76,6 @@ AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset) } break; case AMDGPU_MM_DATA: - //pkt->setLE(regs[mm_index_reg]); pkt->setLE(gpuDevice->getRegVal(mm_index_reg)); break; case VEGA10_INV_ENG17_ACK1: @@ -89,17 +96,17 @@ AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset) case AMDGPU_MP0_SMN_C2PMSG_35: pkt->setLE(0x80000000); break; + case AMDGPU_MP1_SMN_C2PMSG_90: + pkt->setLE(0x1); + break; default: if (triggered_reads.count(offset)) { DPRINTF(AMDGPUDevice, "Found triggered read for %#x\n", offset); pkt->setLE(triggered_reads[offset]); - } else if (gpuDevice->haveRegVal(offset)) { - uint32_t reg_val = gpuDevice->getRegVal(offset); - - DPRINTF(AMDGPUDevice, "Reading value of %#lx from regs: %#lx\n", - offset, reg_val); - - pkt->setLE(reg_val); + } else if (regs.count(offset)) { + DPRINTF(AMDGPUDevice, "Returning value of unknown MMIO offset " + "%x: %x\n", offset, regs[offset]); + pkt->setLE(regs[offset]); } else { DPRINTF(AMDGPUDevice, "NBIO Unknown MMIO %#x (%#x)\n", offset, pkt->getAddr()); @@ -123,6 +130,14 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset) DPRINTF(AMDGPUDevice, "MM write to reg %#lx data %#lx\n", mm_index_reg, pkt->getLE()); gpuDevice->setRegVal(AMDGPU_MM_DATA, pkt->getLE()); + } else if (offset == AMDGPU_PCIE_INDEX) { + assert(pkt->getSize() == 4); + pcie_index_reg = insertBits(pcie_index_reg, 31, 0, + pkt->getLE()); + } else if (offset == AMDGPU_PCIE_INDEX2) { + assert(pkt->getSize() == 4); + pcie_index_reg = insertBits(pcie_index_reg, 63, 32, + pkt->getLE()); } else if (offset == AMDGPU_MP0_SMN_C2PMSG_35) { // See psp_v3_1_bootloader_load_sos in amdgpu driver code. 
if (pkt->getLE() == 0x10000) { @@ -144,6 +159,14 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset) } else if (offset == AMDGPU_MP0_SMN_C2PMSG_71) { // PSP ring size psp_ring_size = pkt->getLE(); + } else { + // Fallback to a map of register values. This was previously in the + // AMDGPUDevice, however that short-circuited some reads from other + // IP blocks. Since this is an end point IP block it is safer to use + // here. + regs[offset] = pkt->getLE(); + DPRINTF(AMDGPUDevice, "Writing value of unknown MMIO offset " + "%x: %x\n", offset, regs[offset]); } } diff --git a/src/dev/amdgpu/amdgpu_nbio.hh b/src/dev/amdgpu/amdgpu_nbio.hh index dc95443916..0d839d0e22 100644 --- a/src/dev/amdgpu/amdgpu_nbio.hh +++ b/src/dev/amdgpu/amdgpu_nbio.hh @@ -56,7 +56,11 @@ class AMDGPUDevice; #define AMDGPU_MM_INDEX 0x00000 #define AMDGPU_MM_INDEX_HI 0x00018 #define AMDGPU_MM_DATA 0x00004 -#define AMDGPU_PCIE_DATA_REG 0x0003c + +#define AMDGPU_PCIE_INDEX 0x00030 +#define AMDGPU_PCIE_INDEX2 0x00038 +#define AMDGPU_PCIE_DATA 0x00034 +#define AMDGPU_PCIE_DATA2 0x0003c // Message bus related to psp #define AMDGPU_MP0_SMN_C2PMSG_33 0x58184 @@ -66,6 +70,7 @@ class AMDGPUDevice; #define AMDGPU_MP0_SMN_C2PMSG_70 0x58218 #define AMDGPU_MP0_SMN_C2PMSG_71 0x5821c #define AMDGPU_MP0_SMN_C2PMSG_81 0x58244 +#define AMDGPU_MP1_SMN_C2PMSG_90 0x58a68 // Device specific invalidation engines used during initialization #define VEGA10_INV_ENG17_ACK1 0x0a318 @@ -105,6 +110,7 @@ class AMDGPUNbio * Driver initialization sequence helper variables. */ uint64_t mm_index_reg = 0; + uint64_t pcie_index_reg = 0; std::unordered_map triggered_reads; /* @@ -115,6 +121,12 @@ class AMDGPUNbio Addr psp_ring_listen_addr = 0; int psp_ring_size = 0; int psp_ring_value = 0; + + /* + * Hold values of other registers not explicitly modelled by other blocks. 
+ */ + using GPURegMap = std::unordered_map; + GPURegMap regs; }; } // namespace gem5 From 047c19478023f5c467cc692e1ac717a9104a8ea7 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 13 Feb 2024 16:45:12 -0600 Subject: [PATCH 7/9] dev-amdgpu: Implement SRBM write The SRBM write packets were previously not required. This commit implements SRBM writes to set a register by using the new setRegVal interface. SRBM writes seem to be used for SRIOV enabled devices. Change-Id: I202653d339e882e8de59d69a995f65332b2dfb8c --- src/dev/amdgpu/sdma_engine.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index 94bcdf9cb9..070c04fe64 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -859,8 +859,7 @@ SDMAEngine::srbmWrite(SDMAQueue *q, sdmaSRBMWriteHeader *header, DPRINTF(SDMAEngine, "SRBM write to %#x with data %#x\n", reg_addr, pkt->data); - warn_once("SRBM write not performed, no SRBM model. This needs to be fixed" - " if correct system simulation is relying on SRBM registers."); + gpuDevice->setRegVal(reg_addr, pkt->data); delete header; delete pkt; From 39153cd234c0dacd15351680df699fcd45d3fc01 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 13 Feb 2024 17:38:20 -0600 Subject: [PATCH 8/9] dev-amdgpu: Implement PCIe indirect read/write PCIe can read/write to any 32-bit address using the PCI index/index2 registers as an address and then reading/writing the corresponding data/data2 register. This commit adds this functionality and removes one magic value being written to support GPU POST. This feature is disabled for Vega10 which relies on an MMIO trace for too many values to implement in the MMIO interface. 
Change-Id: Iacfdd1294a7652fc3e60304b57df536d318c847b --- src/dev/amdgpu/amdgpu_nbio.cc | 49 ++++++++++++++++++++++++++--------- src/dev/amdgpu/amdgpu_nbio.hh | 3 ++- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/src/dev/amdgpu/amdgpu_nbio.cc b/src/dev/amdgpu/amdgpu_nbio.cc index 89b1682631..ec44f16250 100644 --- a/src/dev/amdgpu/amdgpu_nbio.cc +++ b/src/dev/amdgpu/amdgpu_nbio.cc @@ -53,7 +53,21 @@ AMDGPUNbio::setGPUDevice(AMDGPUDevice *gpu_device) void AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset) { + // For Vega10 we rely on the golden values in an MMIO trace. Return + // immediately as to not clobber those values. + if (gpuDevice->getGfxVersion() == GfxVersion::gfx900) { + if (offset == AMDGPU_PCIE_DATA || offset == AMDGPU_PCIE_DATA2) { + return; + } + } + switch (offset) { + // PCIE_DATA, PCIE_DATA2, PCIE_INDEX, and PCIE_INDEX2 handle "indirect + // "register reads/writes from the driver. This provides a way to read + // any register by providing a 32-bit address to one of the two INDEX + // registers and then reading the corresponding DATA register. See: + // https://github.com/ROCm/ROCK-Kernel-Driver/blob/roc-6.0.x/drivers/ + // gpu/drm/amd/amdgpu/amdgpu_device.c#L459 case AMDGPU_PCIE_DATA: { uint32_t value = gpuDevice->getRegVal(pcie_index_reg); @@ -62,19 +76,20 @@ AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset) pkt->setLE(value); } break; - // This is a PCIe status register. At some point during driver init - // the driver checks that interrupts are enabled. This is only - // checked once, so if the MMIO trace does not exactly line up with - // what the driver is doing in gem5, this may still have the first - // bit zero causing driver to fail. Therefore, we always set this - // bit to one as there is no harm to do so. 
case AMDGPU_PCIE_DATA2: { - uint32_t value = pkt->getLE() | 0x1; - DPRINTF(AMDGPUDevice, "Marking interrupts enabled: %#lx\n", value); + uint32_t value = gpuDevice->getRegVal(pcie_index2_reg); + DPRINTF(AMDGPUDevice, "Read PCIe index2 %lx data2 %x\n", + pcie_index2_reg, value); pkt->setLE(value); } break; + case AMDGPU_PCIE_INDEX: + pkt->setLE(pcie_index_reg); + break; + case AMDGPU_PCIE_INDEX2: + pkt->setLE(pcie_index2_reg); + break; case AMDGPU_MM_DATA: pkt->setLE(gpuDevice->getRegVal(mm_index_reg)); break; @@ -130,14 +145,24 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset) DPRINTF(AMDGPUDevice, "MM write to reg %#lx data %#lx\n", mm_index_reg, pkt->getLE()); gpuDevice->setRegVal(AMDGPU_MM_DATA, pkt->getLE()); + // PCIE_DATA, PCIE_DATA2, PCIE_INDEX, and PCIE_INDEX2 handle "indirect + // "register reads/writes from the driver. This provides a way to read + // any register by providing a 32-bit address to one of the two INDEX + // registers and then reading the corresponding DATA register. See: + // https://github.com/ROCm/ROCK-Kernel-Driver/blob/roc-6.0.x/drivers/ + // gpu/drm/amd/amdgpu/amdgpu_device.c#L459 } else if (offset == AMDGPU_PCIE_INDEX) { assert(pkt->getSize() == 4); - pcie_index_reg = insertBits(pcie_index_reg, 31, 0, - pkt->getLE()); + pcie_index_reg = pkt->getLE(); + } else if (offset == AMDGPU_PCIE_DATA) { + assert(pkt->getSize() == 4); + gpuDevice->setRegVal(pcie_index_reg, pkt->getLE()); } else if (offset == AMDGPU_PCIE_INDEX2) { assert(pkt->getSize() == 4); - pcie_index_reg = insertBits(pcie_index_reg, 63, 32, - pkt->getLE()); + pcie_index2_reg = pkt->getLE(); + } else if (offset == AMDGPU_PCIE_DATA2) { + assert(pkt->getSize() == 4); + gpuDevice->setRegVal(pcie_index2_reg, pkt->getLE()); } else if (offset == AMDGPU_MP0_SMN_C2PMSG_35) { // See psp_v3_1_bootloader_load_sos in amdgpu driver code. 
if (pkt->getLE() == 0x10000) { diff --git a/src/dev/amdgpu/amdgpu_nbio.hh b/src/dev/amdgpu/amdgpu_nbio.hh index 0d839d0e22..87afb02c41 100644 --- a/src/dev/amdgpu/amdgpu_nbio.hh +++ b/src/dev/amdgpu/amdgpu_nbio.hh @@ -110,7 +110,8 @@ class AMDGPUNbio * Driver initialization sequence helper variables. */ uint64_t mm_index_reg = 0; - uint64_t pcie_index_reg = 0; + uint32_t pcie_index_reg = 0; + uint32_t pcie_index2_reg = 0; std::unordered_map triggered_reads; /* From 823b5a6eb87e45f2cb54d3b1c736dad11e4e70e4 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 13 Feb 2024 17:43:23 -0600 Subject: [PATCH 9/9] dev-amdgpu: Support multiple CPs and MMIO AddrRanges Currently gem5 assumes that there is only one command processor (CP) which contains the PM4 packet processor. Some GPU devices have multiple CPs which the driver tests individually during POST if they are used or not. Therefore, these additional CPs need to be supported. This commit allows for multiple PM4 packet processors which represent multiple CPs. Each of these processors will have its own independent MMIO address range. To more easily support ranges, the MMIO addresses now use AddrRange to index a PM4 packet processor instead of the hard-coded constexpr MMIO start and size pairs. By default only one PM4 packet processor is created, meaning the functionality of the simulation is unchanged for devices currently supported in gem5. 
Change-Id: I977f4fd3a169ef4a78671a4fb58c8ea0e19bf52c --- configs/example/gpufs/system/system.py | 18 ++- src/dev/amdgpu/AMDGPU.py | 6 +- src/dev/amdgpu/amdgpu_defines.hh | 37 ++---- src/dev/amdgpu/amdgpu_device.cc | 160 +++++++++++++++++-------- src/dev/amdgpu/amdgpu_device.hh | 16 ++- src/dev/amdgpu/amdgpu_vm.cc | 30 +++++ src/dev/amdgpu/amdgpu_vm.hh | 53 ++++---- src/dev/amdgpu/pm4_mmio.hh | 54 ++++----- src/dev/amdgpu/pm4_packet_processor.cc | 15 ++- src/dev/amdgpu/pm4_packet_processor.hh | 7 ++ 10 files changed, 245 insertions(+), 151 deletions(-) diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index 671d4efdc9..1f89bd935b 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -188,9 +188,15 @@ def makeGpuFSSystem(args): system.pc.south_bridge.gpu.sdmas = sdma_engines - # Setup PM4 packet processor - pm4_pkt_proc = PM4PacketProcessor() - system.pc.south_bridge.gpu.pm4_pkt_proc = pm4_pkt_proc + # Setup PM4 packet processors + pm4_procs = [] + pm4_procs.append( + PM4PacketProcessor( + ip_id=0, mmio_range=AddrRange(start=0xC000, end=0xD000) + ) + ) + + system.pc.south_bridge.gpu.pm4_pkt_procs = pm4_procs # GPU data path gpu_mem_mgr = AMDGPUMemoryManager() @@ -207,7 +213,8 @@ def makeGpuFSSystem(args): for sdma in sdma_engines: system._dma_ports.append(sdma) system._dma_ports.append(device_ih) - system._dma_ports.append(pm4_pkt_proc) + for pm4_proc in pm4_procs: + system._dma_ports.append(pm4_proc) system._dma_ports.append(system_hub) system._dma_ports.append(gpu_mem_mgr) system._dma_ports.append(hsapp_pt_walker) @@ -221,7 +228,8 @@ def makeGpuFSSystem(args): for sdma in sdma_engines: sdma.pio = system.iobus.mem_side_ports device_ih.pio = system.iobus.mem_side_ports - pm4_pkt_proc.pio = system.iobus.mem_side_ports + for pm4_proc in pm4_procs: + pm4_proc.pio = system.iobus.mem_side_ports system_hub.pio = system.iobus.mem_side_ports # Full system needs special TLBs for SQC, 
Scalar, and vector data ports diff --git a/src/dev/amdgpu/AMDGPU.py b/src/dev/amdgpu/AMDGPU.py index 0370f09e01..0e0f597927 100644 --- a/src/dev/amdgpu/AMDGPU.py +++ b/src/dev/amdgpu/AMDGPU.py @@ -95,7 +95,7 @@ class AMDGPUDevice(PciDevice): # The config script should not create a new cp here but rather assign the # same cp that is assigned to the Shader SimObject. cp = Param.GPUCommandProcessor(NULL, "Command Processor") - pm4_pkt_proc = Param.PM4PacketProcessor("PM4 Packet Processor") + pm4_pkt_procs = VectorParam.PM4PacketProcessor("PM4 Packet Processor") memory_manager = Param.AMDGPUMemoryManager("GPU Memory Manager") memories = VectorParam.AbstractMemory([], "All memories in the device") device_ih = Param.AMDGPUInterruptHandler("GPU Interrupt handler") @@ -118,6 +118,10 @@ class PM4PacketProcessor(DmaVirtDevice): cxx_header = "dev/amdgpu/pm4_packet_processor.hh" cxx_class = "gem5::PM4PacketProcessor" + # Default to 0 as the common case is one PM4 packet processor + ip_id = Param.Int(0, "Instance ID of this PM4 processor") + mmio_range = Param.AddrRange("Range of MMIO addresses") + class AMDGPUMemoryManager(ClockedObject): type = "AMDGPUMemoryManager" diff --git a/src/dev/amdgpu/amdgpu_defines.hh b/src/dev/amdgpu/amdgpu_defines.hh index bc6377fbbc..883501b84d 100644 --- a/src/dev/amdgpu/amdgpu_defines.hh +++ b/src/dev/amdgpu/amdgpu_defines.hh @@ -49,6 +49,16 @@ enum QueueType RLC }; +/* + * Hold information about doorbells including queue type and the IP + * block ID if the IP can have multiple instances. + */ +typedef struct +{ + QueueType qtype; + int ip_id; +} DoorbellInfo; + // AMD GPUs support 16 different virtual address spaces static constexpr int AMDGPU_VM_COUNT = 16; @@ -61,36 +71,11 @@ constexpr int MMIO_BAR = 5; constexpr uint32_t VGA_ROM_DEFAULT = 0xc0000; constexpr uint32_t ROM_SIZE = 0x20000; // 128kB -/* SDMA base, size, mmio offset shift. 
*/ -static constexpr uint32_t SDMA0_BASE = 0x4980; -static constexpr uint32_t SDMA1_BASE = 0x5180; -static constexpr uint32_t SDMA_SIZE = 0x800; -static constexpr uint32_t SDMA_OFFSET_SHIFT = 2; - -/* Interrupt handler base, size, mmio offset shift. */ -static constexpr uint32_t IH_BASE = 0x4280; -static constexpr uint32_t IH_SIZE = 0x700; +/* Most MMIOs use DWORD addresses and thus need to be shifted. */ static constexpr uint32_t IH_OFFSET_SHIFT = 2; - -/* Graphics register bus manager base, size, mmio offset shift. */ -static constexpr uint32_t GRBM_BASE = 0x8000; -static constexpr uint32_t GRBM_SIZE = 0x5000; static constexpr uint32_t GRBM_OFFSET_SHIFT = 2; - -/* GFX base, size, mmio offset shift. */ -static constexpr uint32_t GFX_BASE = 0x28000; -static constexpr uint32_t GFX_SIZE = 0x17000; -static constexpr uint32_t GFX_OFFSET_SHIFT = 2; - -/* MMHUB base, size, mmio offset shift. */ -static constexpr uint32_t MMHUB_BASE = 0x68000; -static constexpr uint32_t MMHUB_SIZE = 0x2120; static constexpr uint32_t MMHUB_OFFSET_SHIFT = 2; -/* NBIO base and size. 
*/ -static constexpr uint32_t NBIO_BASE = 0x0; -static constexpr uint32_t NBIO_SIZE = 0x4280; - } // namespace gem5 #endif // __DEV_AMDGPU_AMDGPU_DEFINES_HH__ diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 4b684aa221..5ddd7756ba 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -54,8 +54,7 @@ namespace gem5 AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) : PciDevice(p), gpuMemMgr(p.memory_manager), deviceIH(p.device_ih), - pm4PktProc(p.pm4_pkt_proc), cp(p.cp), - checkpoint_before_mmios(p.checkpoint_before_mmios), + cp(p.cp), checkpoint_before_mmios(p.checkpoint_before_mmios), init_interrupt_count(0), _lastVMID(0), deviceMem(name() + ".deviceMem", p.memories, false, "", false) { @@ -81,6 +80,16 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) romRange = RangeSize(VGA_ROM_DEFAULT, ROM_SIZE); } + if (p.device_name == "Vega10") { + gfx_version = GfxVersion::gfx900; + } else if (p.device_name == "MI100") { + gfx_version = GfxVersion::gfx908; + } else if (p.device_name == "MI200") { + gfx_version = GfxVersion::gfx90a; + } else { + panic("Unknown GPU device %s\n", p.device_name); + } + if (p.trace_file != "") { mmioReader.readMMIOTrace(p.trace_file); } @@ -126,8 +135,22 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) panic("Unknown GPU device %s\n", p.device_name); } + // Setup PM4 packet processors and sanity check IDs + std::set pm4_ids; + for (auto& pm4 : p.pm4_pkt_procs) { + pm4->setGPUDevice(this); + fatal_if(pm4_ids.count(pm4->getIpId()), + "Two PM4s with same IP IDs is not allowed"); + pm4_ids.insert(pm4->getIpId()); + pm4PktProcs.insert({pm4->getIpId(), pm4}); + + pm4Ranges.insert({pm4->getMMIORange(), pm4}); + } + + // There should be at least one PM4 packet processor with ID 0 + fatal_if(!pm4PktProcs.count(0), "No default PM4 processor found"); + deviceIH->setGPUDevice(this); - pm4PktProc->setGPUDevice(this); cp->hsaPacketProc().setGPUDevice(this); 
cp->setGPUDevice(this); nbio.setGPUDevice(this); @@ -136,6 +159,23 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) // could possibly be anything, but these are the values used by hardware. uint64_t mmhubBase = 0x8000ULL << 24; uint64_t mmhubTop = 0x83ffULL << 24; + uint64_t mem_size = 0x3ff0; // 16 GB of memory + + gpuvm.setMMHUBBase(mmhubBase); + gpuvm.setMMHUBTop(mmhubTop); + + // Map other MMIO apertures based on gfx version. This must be done before + // any calls to get/setRegVal. + // NBIO 0x0 - 0x4280 + // IH 0x4280 - 0x4980 + // GRBM 0x8000 - 0xC000 + // GFX 0x28000 - 0x3F000 + // MMHUB 0x68000 - 0x6a120 + gpuvm.setMMIOAperture(NBIO_MMIO_RANGE, AddrRange(0x0, 0x4280)); + gpuvm.setMMIOAperture(IH_MMIO_RANGE, AddrRange(0x4280, 0x4980)); + gpuvm.setMMIOAperture(GRBM_MMIO_RANGE, AddrRange(0x8000, 0xC000)); + gpuvm.setMMIOAperture(GFX_MMIO_RANGE, AddrRange(0x28000, 0x3F000)); + gpuvm.setMMIOAperture(MMHUB_MMIO_RANGE, AddrRange(0x68000, 0x6A120)); // These are hardcoded register values to return what the driver expects setRegVal(AMDGPU_MP0_SMN_C2PMSG_33, 0x80000000); @@ -145,25 +185,19 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) if (p.device_name == "Vega10") { setRegVal(VEGA10_FB_LOCATION_BASE, mmhubBase >> 24); setRegVal(VEGA10_FB_LOCATION_TOP, mmhubTop >> 24); - gfx_version = GfxVersion::gfx900; } else if (p.device_name == "MI100") { setRegVal(MI100_FB_LOCATION_BASE, mmhubBase >> 24); setRegVal(MI100_FB_LOCATION_TOP, mmhubTop >> 24); - setRegVal(MI100_MEM_SIZE_REG, 0x3ff0); // 16GB of memory - gfx_version = GfxVersion::gfx908; + setRegVal(MI100_MEM_SIZE_REG, mem_size); } else if (p.device_name == "MI200") { // This device can have either 64GB or 128GB of device memory. // This limits to 16GB for simulation. 
setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24); setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24); - setRegVal(MI200_MEM_SIZE_REG, 0x3ff0); - gfx_version = GfxVersion::gfx90a; + setRegVal(MI200_MEM_SIZE_REG, mem_size); } else { panic("Unknown GPU device %s\n", p.device_name); } - - gpuvm.setMMHUBBase(mmhubBase); - gpuvm.setMMHUBTop(mmhubTop); } void @@ -356,29 +390,28 @@ AMDGPUDevice::readDoorbell(PacketPtr pkt, Addr offset) void AMDGPUDevice::readMMIO(PacketPtr pkt, Addr offset) { - Addr aperture = gpuvm.getMmioAperture(offset); - Addr aperture_offset = offset - aperture; + AddrRange aperture = gpuvm.getMMIOAperture(offset); + Addr aperture_offset = offset - aperture.start(); // By default read from MMIO trace. Overwrite the packet for a select // few more dynamic MMIOs. DPRINTF(AMDGPUDevice, "Read MMIO %#lx\n", offset); mmioReader.readFromTrace(pkt, MMIO_BAR, offset); - switch (aperture) { - case NBIO_BASE: + if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "NBIO base\n"); nbio.readMMIO(pkt, aperture_offset); - break; - case GRBM_BASE: + } else if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "GRBM base\n"); gpuvm.readMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT); - break; - case GFX_BASE: + } else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "GFX base\n"); gfx.readMMIO(pkt, aperture_offset); - break; - case MMHUB_BASE: + } else if (aperture == gpuvm.getMMIORange(MMHUB_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "MMHUB base\n"); gpuvm.readMMIO(pkt, aperture_offset >> MMHUB_OFFSET_SHIFT); - break; - default: - break; + } else { + DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for read %#x\n", offset); } } @@ -422,17 +455,22 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset) DPRINTF(AMDGPUDevice, "Wrote doorbell %#lx\n", offset); if (doorbells.find(offset) != doorbells.end()) { - QueueType q_type = doorbells[offset]; + QueueType q_type = doorbells[offset].qtype; + int 
ip_id = doorbells[offset].ip_id; DPRINTF(AMDGPUDevice, "Doorbell offset %p queue: %d\n", offset, q_type); switch (q_type) { case Compute: - pm4PktProc->process(pm4PktProc->getQueue(offset), - pkt->getLE()); + assert(pm4PktProcs.count(ip_id)); + pm4PktProcs[ip_id]->process( + pm4PktProcs[ip_id]->getQueue(offset), + pkt->getLE()); break; case Gfx: - pm4PktProc->process(pm4PktProc->getQueue(offset, true), - pkt->getLE()); + assert(pm4PktProcs.count(ip_id)); + pm4PktProcs[ip_id]->process( + pm4PktProcs[ip_id]->getQueue(offset, true), + pkt->getLE()); break; case SDMAGfx: { SDMAEngine *sdmaEng = getSDMAEngine(offset); @@ -443,9 +481,11 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset) sdmaEng->processPage(pkt->getLE()); } break; case ComputeAQL: { + assert(pm4PktProcs.count(ip_id)); cp->hsaPacketProc().hwScheduler()->write(offset, pkt->getLE() + 1); - pm4PktProc->updateReadIndex(offset, pkt->getLE() + 1); + pm4PktProcs[ip_id]->updateReadIndex(offset, + pkt->getLE() + 1); } break; case InterruptHandler: deviceIH->updateRptr(pkt->getLE()); @@ -475,12 +515,12 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset) void AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset) { - Addr aperture = gpuvm.getMmioAperture(offset); - Addr aperture_offset = offset - aperture; + AddrRange aperture = gpuvm.getMMIOAperture(offset); + Addr aperture_offset = offset - aperture.start(); DPRINTF(AMDGPUDevice, "Wrote MMIO %#lx\n", offset); - // Check SDMA functions first, then fallback to switch statement + // Check SDMA functions first, then fallback to MMIO ranges. for (int idx = 0; idx < sdmaIds.size(); ++idx) { if (sdmaMmios[idx].contains(offset)) { Addr sdma_offset = (offset - sdmaMmios[idx].start()) >> 2; @@ -498,26 +538,31 @@ AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset) } } - switch (aperture) { - /* Write a general register to the graphics register bus manager. */ - case GRBM_BASE: + // Check PM4s next, returning to avoid duplicate writes. 
+ for (auto& [range, pm4_proc] : pm4Ranges) { + if (range.contains(offset)) { + // PM4 MMIOs are offset based on the MMIO range start + Addr ip_offset = offset - range.start(); + pm4_proc->writeMMIO(pkt, ip_offset >> GRBM_OFFSET_SHIFT); + + return; + } + } + + if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "GRBM base\n"); gpuvm.writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT); - pm4PktProc->writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT); - break; - /* Write a register to the interrupt handler. */ - case IH_BASE: + } else if (aperture == gpuvm.getMMIORange(IH_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "IH base\n"); deviceIH->writeMMIO(pkt, aperture_offset >> IH_OFFSET_SHIFT); - break; - /* Write an IO space register */ - case NBIO_BASE: + } else if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "NBIO base\n"); nbio.writeMMIO(pkt, aperture_offset); - break; - case GFX_BASE: + } else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "GFX base\n"); gfx.writeMMIO(pkt, aperture_offset); - break; - default: - DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for %#x\n", offset); - break; + } else { + DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for write %#x\n", offset); } } @@ -638,10 +683,11 @@ AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value) } void -AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt) +AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt, int ip_id) { DPRINTF(AMDGPUDevice, "Setting doorbell type for %x\n", offset); - doorbells[offset] = qt; + doorbells[offset].qtype = qt; + doorbells[offset].ip_id = ip_id; } void @@ -692,6 +738,7 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const // Make a c-style array of the regs to serialize uint32_t doorbells_offset[doorbells_size]; QueueType doorbells_queues[doorbells_size]; + int doorbells_ip_ids[doorbells_size]; uint32_t sdma_engs_offset[sdma_engs_size]; int sdma_engs[sdma_engs_size]; int 
used_vmids[used_vmid_map_size]; @@ -701,7 +748,8 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const int idx = 0; for (auto & it : doorbells) { doorbells_offset[idx] = it.first; - doorbells_queues[idx] = it.second; + doorbells_queues[idx] = it.second.qtype; + doorbells_ip_ids[idx] = it.second.ip_id; ++idx; } @@ -730,6 +778,8 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const sizeof(doorbells_offset[0])); SERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/ sizeof(doorbells_queues[0])); + SERIALIZE_ARRAY(doorbells_ip_ids, sizeof(doorbells_ip_ids)/ + sizeof(doorbells_ip_ids[0])); SERIALIZE_ARRAY(sdma_engs_offset, sizeof(sdma_engs_offset)/ sizeof(sdma_engs_offset[0])); SERIALIZE_ARRAY(sdma_engs, sizeof(sdma_engs)/sizeof(sdma_engs[0])); @@ -768,14 +818,18 @@ AMDGPUDevice::unserialize(CheckpointIn &cp) if (doorbells_size > 0) { uint32_t doorbells_offset[doorbells_size]; QueueType doorbells_queues[doorbells_size]; + int doorbells_ip_ids[doorbells_size]; UNSERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/ sizeof(doorbells_offset[0])); UNSERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/ sizeof(doorbells_queues[0])); + UNSERIALIZE_ARRAY(doorbells_ip_ids, sizeof(doorbells_ip_ids)/ + sizeof(doorbells_ip_ids[0])); for (int idx = 0; idx < doorbells_size; ++idx) { - doorbells[doorbells_offset[idx]] = doorbells_queues[idx]; + doorbells[doorbells_offset[idx]].qtype = doorbells_queues[idx]; + doorbells[doorbells_offset[idx]].ip_id = doorbells_ip_ids[idx]; } } diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh index fface5fb3e..33b6a9f3e7 100644 --- a/src/dev/amdgpu/amdgpu_device.hh +++ b/src/dev/amdgpu/amdgpu_device.hh @@ -87,7 +87,7 @@ class AMDGPUDevice : public PciDevice /** * Structures to hold registers, doorbells, and some frame memory */ - std::unordered_map doorbells; + std::unordered_map doorbells; std::unordered_map pendingDoorbellPkts; /** @@ -113,9 +113,19 @@ class AMDGPUDevice : public PciDevice 
AMDGPUMemoryManager *gpuMemMgr; AMDGPUInterruptHandler *deviceIH; AMDGPUVM gpuvm; - PM4PacketProcessor *pm4PktProc; GPUCommandProcessor *cp; + struct AddrRangeHasher + { + std::size_t operator()(const AddrRange& k) const + { + return k.start(); + } + }; + std::unordered_map pm4PktProcs; + std::unordered_map pm4Ranges; + // SDMAs mapped by doorbell offset std::unordered_map sdmaEngs; // SDMAs mapped by ID @@ -185,7 +195,7 @@ class AMDGPUDevice : public PciDevice /** * Set handles to GPU blocks. */ - void setDoorbellType(uint32_t offset, QueueType qt); + void setDoorbellType(uint32_t offset, QueueType qt, int ip_id = 0); void processPendingDoorbells(uint32_t offset); void setSDMAEngine(Addr offset, SDMAEngine *eng); diff --git a/src/dev/amdgpu/amdgpu_vm.cc b/src/dev/amdgpu/amdgpu_vm.cc index 5a13ac9ba0..0eea590c5a 100644 --- a/src/dev/amdgpu/amdgpu_vm.cc +++ b/src/dev/amdgpu/amdgpu_vm.cc @@ -37,6 +37,7 @@ #include "base/trace.hh" #include "debug/AMDGPUDevice.hh" #include "dev/amdgpu/amdgpu_defines.hh" +#include "dev/amdgpu/amdgpu_device.hh" #include "mem/packet_access.hh" namespace gem5 @@ -51,6 +52,35 @@ AMDGPUVM::AMDGPUVM() for (int i = 0; i < AMDGPU_VM_COUNT; ++i) { memset(&vmContexts[0], 0, sizeof(AMDGPUVMContext)); } + + for (int i = 0; i < NUM_MMIO_RANGES; ++i) { + mmioRanges[i] = AddrRange(); + } +} + +void +AMDGPUVM::setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range) +{ + mmioRanges[mmio_aperture] = range; +} + +AddrRange +AMDGPUVM::getMMIORange(mmio_range_t mmio_aperture) +{ + return mmioRanges[mmio_aperture]; +} + +const AddrRange& +AMDGPUVM::getMMIOAperture(Addr offset) +{ + for (int i = 0; i < NUM_MMIO_RANGES; ++i) { + if (mmioRanges[i].contains(offset)) { + return mmioRanges[i]; + } + } + + // Default to NBIO + return mmioRanges[NBIO_MMIO_RANGE]; } Addr diff --git a/src/dev/amdgpu/amdgpu_vm.hh b/src/dev/amdgpu/amdgpu_vm.hh index 5af666f379..857ef724da 100644 --- a/src/dev/amdgpu/amdgpu_vm.hh +++ b/src/dev/amdgpu/amdgpu_vm.hh @@ -99,9 +99,23 @@ 
static constexpr int AMDGPU_USER_PAGE_SIZE = 4096; namespace gem5 { +typedef enum : int +{ + NBIO_MMIO_RANGE, + MMHUB_MMIO_RANGE, + GFX_MMIO_RANGE, + GRBM_MMIO_RANGE, + IH_MMIO_RANGE, + NUM_MMIO_RANGES +} mmio_range_t; + +class AMDGPUDevice; + class AMDGPUVM : public Serializable { private: + AMDGPUDevice *gpuDevice; + typedef struct GEM5_PACKED { // Page table addresses: from (Base + Start) to (End) @@ -160,9 +174,13 @@ class AMDGPUVM : public Serializable */ std::vector gpu_tlbs; + std::array mmioRanges; + public: AMDGPUVM(); + void setGPUDevice(AMDGPUDevice *gpu_device) { gpuDevice = gpu_device; } + /** * Return base address of GART table in framebuffer. */ @@ -232,38 +250,11 @@ class AMDGPUVM : public Serializable Addr getSysAddrRangeLow () { return vmContext0.sysAddrL; } Addr getSysAddrRangeHigh () { return vmContext0.sysAddrH; } - Addr - getMmioAperture(Addr addr) - { - // Aperture ranges: - // NBIO 0x0 - 0x4280 - // IH 0x4280 - 0x4980 - // SDMA0 0x4980 - 0x5180 - // SDMA1 0x5180 - 0x5980 - // GRBM 0x8000 - 0xD000 - // GFX 0x28000 - 0x3F000 - // MMHUB 0x68000 - 0x6a120 + void setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range); + const AddrRange& getMMIOAperture(Addr addr); + AddrRange getMMIORange(mmio_range_t mmio_aperture); - if (IH_BASE <= addr && addr < IH_BASE + IH_SIZE) - return IH_BASE; - else if (SDMA0_BASE <= addr && addr < SDMA0_BASE + SDMA_SIZE) - return SDMA0_BASE; - else if (SDMA1_BASE <= addr && addr < SDMA1_BASE + SDMA_SIZE) - return SDMA1_BASE; - else if (GRBM_BASE <= addr && addr < GRBM_BASE + GRBM_SIZE) - return GRBM_BASE; - else if (GFX_BASE <= addr && addr < GFX_BASE + GFX_SIZE) - return GFX_BASE; - else if (MMHUB_BASE <= addr && addr < MMHUB_BASE + MMHUB_SIZE) - return MMHUB_BASE; - else { - warn_once("Accessing unsupported MMIO aperture! 
Assuming NBIO\n"); - return NBIO_BASE; - } - - } - - // Gettig mapped aperture base addresses + // Getting mapped aperture base addresses Addr getFrameAperture(Addr addr) { diff --git a/src/dev/amdgpu/pm4_mmio.hh b/src/dev/amdgpu/pm4_mmio.hh index 3801223175..e9e504c3cd 100644 --- a/src/dev/amdgpu/pm4_mmio.hh +++ b/src/dev/amdgpu/pm4_mmio.hh @@ -36,34 +36,34 @@ namespace gem5 { -#define mmCP_RB0_BASE 0x1040 -#define mmCP_RB0_CNTL 0x1041 -#define mmCP_RB_WPTR_POLL_ADDR_LO 0x1046 -#define mmCP_RB_WPTR_POLL_ADDR_HI 0x1047 -#define mmCP_RB_VMID 0x1051 -#define mmCP_RB0_RPTR_ADDR 0x1043 -#define mmCP_RB0_RPTR_ADDR_HI 0x1044 -#define mmCP_RB0_WPTR 0x1054 -#define mmCP_RB0_WPTR_HI 0x1055 -#define mmCP_RB_DOORBELL_CONTROL 0x1059 -#define mmCP_RB_DOORBELL_RANGE_LOWER 0x105a -#define mmCP_RB_DOORBELL_RANGE_UPPER 0x105b -#define mmCP_RB0_BASE_HI 0x10b1 +#define mmCP_RB0_BASE 0x040 +#define mmCP_RB0_CNTL 0x041 +#define mmCP_RB_WPTR_POLL_ADDR_LO 0x046 +#define mmCP_RB_WPTR_POLL_ADDR_HI 0x047 +#define mmCP_RB_VMID 0x051 +#define mmCP_RB0_RPTR_ADDR 0x043 +#define mmCP_RB0_RPTR_ADDR_HI 0x044 +#define mmCP_RB0_WPTR 0x054 +#define mmCP_RB0_WPTR_HI 0x055 +#define mmCP_RB_DOORBELL_CONTROL 0x059 +#define mmCP_RB_DOORBELL_RANGE_LOWER 0x05a +#define mmCP_RB_DOORBELL_RANGE_UPPER 0x05b +#define mmCP_RB0_BASE_HI 0x0b1 -#define mmCP_HQD_ACTIVE 0x1247 -#define mmCP_HQD_VMID 0x1248 -#define mmCP_HQD_PQ_BASE 0x124d -#define mmCP_HQD_PQ_BASE_HI 0x124e -#define mmCP_HQD_PQ_DOORBELL_CONTROL 0x1254 -#define mmCP_HQD_PQ_RPTR 0x124f -#define mmCP_HQD_PQ_RPTR_REPORT_ADDR 0x1250 -#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI 0x1251 -#define mmCP_HQD_PQ_WPTR_POLL_ADDR 0x1252 -#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI 0x1253 -#define mmCP_HQD_PQ_CONTROL 0x1256 -#define mmCP_HQD_IB_CONTROL 0x125a -#define mmCP_HQD_PQ_WPTR_LO 0x127b -#define mmCP_HQD_PQ_WPTR_HI 0x127c +#define mmCP_HQD_ACTIVE 0x247 +#define mmCP_HQD_VMID 0x248 +#define mmCP_HQD_PQ_BASE 0x24d +#define mmCP_HQD_PQ_BASE_HI 0x24e +#define 
mmCP_HQD_PQ_DOORBELL_CONTROL 0x254 +#define mmCP_HQD_PQ_RPTR 0x24f +#define mmCP_HQD_PQ_RPTR_REPORT_ADDR 0x250 +#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI 0x251 +#define mmCP_HQD_PQ_WPTR_POLL_ADDR 0x252 +#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI 0x253 +#define mmCP_HQD_PQ_CONTROL 0x256 +#define mmCP_HQD_IB_CONTROL 0x25a +#define mmCP_HQD_PQ_WPTR_LO 0x27b +#define mmCP_HQD_PQ_WPTR_HI 0x27c } // namespace gem5 diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index c8baa5eab4..62e817aa98 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -49,7 +49,7 @@ namespace gem5 { PM4PacketProcessor::PM4PacketProcessor(const PM4PacketProcessorParams &p) - : DmaVirtDevice(p) + : DmaVirtDevice(p), _ipId(p.ip_id), _mmioRange(p.mmio_range) { memset(&kiq, 0, sizeof(QueueDesc)); memset(&pq, 0, sizeof(QueueDesc)); @@ -144,7 +144,7 @@ PM4PacketProcessor::newQueue(QueueDesc *mqd, Addr offset, QueueType qt; qt = mqd->aql ? QueueType::ComputeAQL : QueueType::Compute; - gpuDevice->setDoorbellType(offset, qt); + gpuDevice->setDoorbellType(offset, qt, getIpId()); DPRINTF(PM4PacketProcessor, "New PM4 queue %d, base: %p offset: %p, me: " "%d, pipe %d queue: %d size: %d\n", id, q->base(), q->offset(), @@ -521,7 +521,7 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, // Register doorbell with GPU device gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng); - gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC); + gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC, getIpId()); gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2); } @@ -774,9 +774,14 @@ PM4PacketProcessor::setUconfigReg(PM4Queue *q, PM4SetUconfigReg *pkt) { q->incRptr(sizeof(PM4SetUconfigReg)); + DPRINTF(PM4PacketProcessor, "SetUconfig offset %x data %x\n", + pkt->offset, pkt->data); + // SET_UCONFIG_REG_START and pkt->offset are dword addresses uint32_t reg_addr = 
(PACKET3_SET_UCONFIG_REG_START + pkt->offset) * 4; + // Additional CPs respond to addresses 0x40000 apart. + reg_addr += 0x40000 * getIpId(); gpuDevice->setRegVal(reg_addr, pkt->data); decodeNext(q); @@ -851,7 +856,7 @@ PM4PacketProcessor::writeMMIO(PacketPtr pkt, Addr mmio_offset) break; case mmCP_HQD_PQ_DOORBELL_CONTROL: setHqdPqDoorbellCtrl(pkt->getLE()); - gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute); + gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute, getIpId()); break; case mmCP_HQD_PQ_RPTR: setHqdPqPtr(pkt->getLE()); @@ -913,7 +918,7 @@ PM4PacketProcessor::writeMMIO(PacketPtr pkt, Addr mmio_offset) break; case mmCP_RB_DOORBELL_CONTROL: setRbDoorbellCntrl(pkt->getLE()); - gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx); + gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx, getIpId()); break; case mmCP_RB_DOORBELL_RANGE_LOWER: setRbDoorbellRangeLo(pkt->getLE()); diff --git a/src/dev/amdgpu/pm4_packet_processor.hh b/src/dev/amdgpu/pm4_packet_processor.hh index 4782e70829..82c3c2716f 100644 --- a/src/dev/amdgpu/pm4_packet_processor.hh +++ b/src/dev/amdgpu/pm4_packet_processor.hh @@ -63,6 +63,10 @@ class PM4PacketProcessor : public DmaVirtDevice std::unordered_map queues; /* A map of PM4 queues based on doorbell offset */ std::unordered_map queuesMap; + + int _ipId; + AddrRange _mmioRange; + public: PM4PacketProcessor(const PM4PacketProcessorParams &p); @@ -188,6 +192,9 @@ class PM4PacketProcessor : public DmaVirtDevice void setRbDoorbellCntrl(uint32_t data); void setRbDoorbellRangeLo(uint32_t data); void setRbDoorbellRangeHi(uint32_t data); + + int getIpId() const { return _ipId; } + AddrRange getMMIORange() const { return _mmioRange; } }; } // namespace gem5