diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index 671d4efdc9..1f89bd935b 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -188,9 +188,15 @@ def makeGpuFSSystem(args): system.pc.south_bridge.gpu.sdmas = sdma_engines - # Setup PM4 packet processor - pm4_pkt_proc = PM4PacketProcessor() - system.pc.south_bridge.gpu.pm4_pkt_proc = pm4_pkt_proc + # Setup PM4 packet processors + pm4_procs = [] + pm4_procs.append( + PM4PacketProcessor( + ip_id=0, mmio_range=AddrRange(start=0xC000, end=0xD000) + ) + ) + + system.pc.south_bridge.gpu.pm4_pkt_procs = pm4_procs # GPU data path gpu_mem_mgr = AMDGPUMemoryManager() @@ -207,7 +213,8 @@ def makeGpuFSSystem(args): for sdma in sdma_engines: system._dma_ports.append(sdma) system._dma_ports.append(device_ih) - system._dma_ports.append(pm4_pkt_proc) + for pm4_proc in pm4_procs: + system._dma_ports.append(pm4_proc) system._dma_ports.append(system_hub) system._dma_ports.append(gpu_mem_mgr) system._dma_ports.append(hsapp_pt_walker) @@ -221,7 +228,8 @@ def makeGpuFSSystem(args): for sdma in sdma_engines: sdma.pio = system.iobus.mem_side_ports device_ih.pio = system.iobus.mem_side_ports - pm4_pkt_proc.pio = system.iobus.mem_side_ports + for pm4_proc in pm4_procs: + pm4_proc.pio = system.iobus.mem_side_ports system_hub.pio = system.iobus.mem_side_ports # Full system needs special TLBs for SQC, Scalar, and vector data ports diff --git a/configs/example/gpufs/vega10.py b/configs/example/gpufs/vega10.py index ae74efd39b..9c3116d415 100644 --- a/configs/example/gpufs/vega10.py +++ b/configs/example/gpufs/vega10.py @@ -52,7 +52,7 @@ if [ ! -f /lib/modules/`uname -r`/updates/dkms/amdgpu.ko ]; then echo "ERROR: Missing DKMS package for kernel `uname -r`. Exiting gem5." 
/sbin/m5 exit fi -modprobe -v amdgpu ip_block_mask=0xff ppfeaturemask=0 dpm=0 audio=0 +modprobe -v amdgpu ip_block_mask=0xdf ppfeaturemask=0 dpm=0 audio=0 echo "Running {} {}" echo "{}" | base64 -d > myapp chmod +x myapp diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc index 969d318c06..6f34301f48 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.cc +++ b/src/arch/amdgpu/vega/gpu_decoder.cc @@ -500,10 +500,10 @@ namespace VegaISA &Decoder::subDecode_OP_FLAT, &Decoder::subDecode_OP_FLAT, &Decoder::subDecode_OP_FLAT, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::subDecode_OP_FLAT, + &Decoder::subDecode_OP_FLAT, + &Decoder::subDecode_OP_FLAT, + &Decoder::subDecode_OP_FLAT, &Decoder::subDecode_OP_MUBUF, &Decoder::subDecode_OP_MUBUF, &Decoder::subDecode_OP_MUBUF, @@ -1091,7 +1091,7 @@ namespace VegaISA &Decoder::decode_OPU_VOP3__V_MAD_I16, &Decoder::decode_OPU_VOP3__V_FMA_F16, &Decoder::decode_OPU_VOP3__V_DIV_FIXUP_F16, - &Decoder::decode_invalid, + &Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -7053,6 +7053,12 @@ namespace VegaISA return new Inst_VOP3__V_DIV_FIXUP_F16(&iFmt->iFmt_VOP3A); } + GPUStaticInst* + Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst iFmt) + { + return new Inst_VOP3__V_LSHL_ADD_U64(&iFmt->iFmt_VOP3A); + } + GPUStaticInst* Decoder::decode_OPU_VOP3__V_INTERP_P1_F32(MachInst iFmt) { diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh index 48084a6913..d3b39fd945 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.hh +++ b/src/arch/amdgpu/vega/gpu_decoder.hh @@ -470,6 +470,7 @@ namespace VegaISA GPUStaticInst* decode_OPU_VOP3__V_MAD_I16(MachInst); GPUStaticInst* decode_OPU_VOP3__V_FMA_F16(MachInst); GPUStaticInst* decode_OPU_VOP3__V_DIV_FIXUP_F16(MachInst); + GPUStaticInst* decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst); GPUStaticInst* 
decode_OPU_VOP3__V_INTERP_P1_F32(MachInst); GPUStaticInst* decode_OPU_VOP3__V_INTERP_P2_F32(MachInst); GPUStaticInst* decode_OPU_VOP3__V_INTERP_MOV_F32(MachInst); diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh index 4151c2cb8b..34fc448c87 100644 --- a/src/arch/amdgpu/vega/insts/instructions.hh +++ b/src/arch/amdgpu/vega/insts/instructions.hh @@ -30192,6 +30192,42 @@ namespace VegaISA void execute(GPUDynInstPtr) override; }; // Inst_VOP3__V_DIV_FIXUP_F16 + class Inst_VOP3__V_LSHL_ADD_U64 : public Inst_VOP3A + { + public: + Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A*); + ~Inst_VOP3__V_LSHL_ADD_U64(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //src_0 + return 8; + case 1: //src_1 + return 4; + case 2: //src_2 + return 8; + case 3: //vdst + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3__V_LSHL_ADD_U64 + class Inst_VOP3__V_CVT_PKACCUM_U8_F32 : public Inst_VOP3A { public: diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc index 8f6794c9c2..f78f64bc91 100644 --- a/src/arch/amdgpu/vega/insts/vop3.cc +++ b/src/arch/amdgpu/vega/insts/vop3.cc @@ -7630,6 +7630,54 @@ namespace VegaISA { panicUnimplemented(); } // execute + // --- Inst_VOP3__V_LSHL_ADD_U64 class methods --- + + Inst_VOP3__V_LSHL_ADD_U64::Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A *iFmt) + : Inst_VOP3A(iFmt, "v_lshl_add_u64", false) + { + setFlag(ALU); + } // Inst_VOP3__V_LSHL_ADD_U64 + + Inst_VOP3__V_LSHL_ADD_U64::~Inst_VOP3__V_LSHL_ADD_U64() + { + } // ~Inst_VOP3__V_LSHL_ADD_U64 + + // --- description from .arch file --- + // D.u = (S0.u << S1.u[4:0]) + S2.u. 
+ void + Inst_VOP3__V_LSHL_ADD_U64::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + ConstVecOperandU64 src0(gpuDynInst, extData.SRC0); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); + ConstVecOperandU64 src2(gpuDynInst, extData.SRC2); + VecOperandU64 vdst(gpuDynInst, instData.VDST); + + src0.readSrc(); + src1.readSrc(); + src2.readSrc(); + + /** + * input modifiers are supported by FP operations only + */ + assert(!(instData.ABS & 0x1)); + assert(!(instData.ABS & 0x2)); + assert(!(instData.ABS & 0x4)); + assert(!(extData.NEG & 0x1)); + assert(!(extData.NEG & 0x2)); + assert(!(extData.NEG & 0x4)); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + int shift_amount = bits(src1[lane], 2, 0); + shift_amount = shift_amount > 4 ? 0 : shift_amount; + vdst[lane] = (src0[lane] << shift_amount) + + src2[lane]; + } + } + + vdst.write(); + } // execute // --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods --- Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32( diff --git a/src/dev/amdgpu/AMDGPU.py b/src/dev/amdgpu/AMDGPU.py index 0370f09e01..0e0f597927 100644 --- a/src/dev/amdgpu/AMDGPU.py +++ b/src/dev/amdgpu/AMDGPU.py @@ -95,7 +95,7 @@ class AMDGPUDevice(PciDevice): # The config script should not create a new cp here but rather assign the # same cp that is assigned to the Shader SimObject. 
cp = Param.GPUCommandProcessor(NULL, "Command Processor") - pm4_pkt_proc = Param.PM4PacketProcessor("PM4 Packet Processor") + pm4_pkt_procs = VectorParam.PM4PacketProcessor("PM4 Packet Processor") memory_manager = Param.AMDGPUMemoryManager("GPU Memory Manager") memories = VectorParam.AbstractMemory([], "All memories in the device") device_ih = Param.AMDGPUInterruptHandler("GPU Interrupt handler") @@ -118,6 +118,10 @@ class PM4PacketProcessor(DmaVirtDevice): cxx_header = "dev/amdgpu/pm4_packet_processor.hh" cxx_class = "gem5::PM4PacketProcessor" + # Default to 0 as the common case is one PM4 packet processor + ip_id = Param.Int(0, "Instance ID of this PM4 processor") + mmio_range = Param.AddrRange("Range of MMIO addresses") + class AMDGPUMemoryManager(ClockedObject): type = "AMDGPUMemoryManager" diff --git a/src/dev/amdgpu/amdgpu_defines.hh b/src/dev/amdgpu/amdgpu_defines.hh index bc6377fbbc..883501b84d 100644 --- a/src/dev/amdgpu/amdgpu_defines.hh +++ b/src/dev/amdgpu/amdgpu_defines.hh @@ -49,6 +49,16 @@ enum QueueType RLC }; +/* + * Hold information about doorbells including queue type and the IP + * block ID if the IP can have multiple instances. + */ +typedef struct +{ + QueueType qtype; + int ip_id; +} DoorbellInfo; + // AMD GPUs support 16 different virtual address spaces static constexpr int AMDGPU_VM_COUNT = 16; @@ -61,36 +71,11 @@ constexpr int MMIO_BAR = 5; constexpr uint32_t VGA_ROM_DEFAULT = 0xc0000; constexpr uint32_t ROM_SIZE = 0x20000; // 128kB -/* SDMA base, size, mmio offset shift. */ -static constexpr uint32_t SDMA0_BASE = 0x4980; -static constexpr uint32_t SDMA1_BASE = 0x5180; -static constexpr uint32_t SDMA_SIZE = 0x800; -static constexpr uint32_t SDMA_OFFSET_SHIFT = 2; - -/* Interrupt handler base, size, mmio offset shift. */ -static constexpr uint32_t IH_BASE = 0x4280; -static constexpr uint32_t IH_SIZE = 0x700; +/* Most MMIOs use DWORD addresses and thus need to be shifted. 
*/ static constexpr uint32_t IH_OFFSET_SHIFT = 2; - -/* Graphics register bus manager base, size, mmio offset shift. */ -static constexpr uint32_t GRBM_BASE = 0x8000; -static constexpr uint32_t GRBM_SIZE = 0x5000; static constexpr uint32_t GRBM_OFFSET_SHIFT = 2; - -/* GFX base, size, mmio offset shift. */ -static constexpr uint32_t GFX_BASE = 0x28000; -static constexpr uint32_t GFX_SIZE = 0x17000; -static constexpr uint32_t GFX_OFFSET_SHIFT = 2; - -/* MMHUB base, size, mmio offset shift. */ -static constexpr uint32_t MMHUB_BASE = 0x68000; -static constexpr uint32_t MMHUB_SIZE = 0x2120; static constexpr uint32_t MMHUB_OFFSET_SHIFT = 2; -/* NBIO base and size. */ -static constexpr uint32_t NBIO_BASE = 0x0; -static constexpr uint32_t NBIO_SIZE = 0x4280; - } // namespace gem5 #endif // __DEV_AMDGPU_AMDGPU_DEFINES_HH__ diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index 48f450c2b2..5ddd7756ba 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -54,8 +54,7 @@ namespace gem5 AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) : PciDevice(p), gpuMemMgr(p.memory_manager), deviceIH(p.device_ih), - pm4PktProc(p.pm4_pkt_proc), cp(p.cp), - checkpoint_before_mmios(p.checkpoint_before_mmios), + cp(p.cp), checkpoint_before_mmios(p.checkpoint_before_mmios), init_interrupt_count(0), _lastVMID(0), deviceMem(name() + ".deviceMem", p.memories, false, "", false) { @@ -81,6 +80,16 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) romRange = RangeSize(VGA_ROM_DEFAULT, ROM_SIZE); } + if (p.device_name == "Vega10") { + gfx_version = GfxVersion::gfx900; + } else if (p.device_name == "MI100") { + gfx_version = GfxVersion::gfx908; + } else if (p.device_name == "MI200") { + gfx_version = GfxVersion::gfx90a; + } else { + panic("Unknown GPU device %s\n", p.device_name); + } + if (p.trace_file != "") { mmioReader.readMMIOTrace(p.trace_file); } @@ -126,15 +135,47 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams 
&p) panic("Unknown GPU device %s\n", p.device_name); } + // Setup PM4 packet processors and sanity check IDs + std::set pm4_ids; + for (auto& pm4 : p.pm4_pkt_procs) { + pm4->setGPUDevice(this); + fatal_if(pm4_ids.count(pm4->getIpId()), + "Two PM4s with same IP IDs is not allowed"); + pm4_ids.insert(pm4->getIpId()); + pm4PktProcs.insert({pm4->getIpId(), pm4}); + + pm4Ranges.insert({pm4->getMMIORange(), pm4}); + } + + // There should be at least one PM4 packet processor with ID 0 + fatal_if(!pm4PktProcs.count(0), "No default PM4 processor found"); + deviceIH->setGPUDevice(this); - pm4PktProc->setGPUDevice(this); cp->hsaPacketProc().setGPUDevice(this); cp->setGPUDevice(this); + nbio.setGPUDevice(this); // Address aperture for device memory. We tell this to the driver and // could possibly be anything, but these are the values used by hardware. uint64_t mmhubBase = 0x8000ULL << 24; uint64_t mmhubTop = 0x83ffULL << 24; + uint64_t mem_size = 0x3ff0; // 16 GB of memory + + gpuvm.setMMHUBBase(mmhubBase); + gpuvm.setMMHUBTop(mmhubTop); + + // Map other MMIO apertures based on gfx version. This must be done before + // any calls to get/setRegVal. 
+ // NBIO 0x0 - 0x4280 + // IH 0x4280 - 0x4980 + // GRBM 0x8000 - 0xC000 + // GFX 0x28000 - 0x3F000 + // MMHUB 0x68000 - 0x6a120 + gpuvm.setMMIOAperture(NBIO_MMIO_RANGE, AddrRange(0x0, 0x4280)); + gpuvm.setMMIOAperture(IH_MMIO_RANGE, AddrRange(0x4280, 0x4980)); + gpuvm.setMMIOAperture(GRBM_MMIO_RANGE, AddrRange(0x8000, 0xC000)); + gpuvm.setMMIOAperture(GFX_MMIO_RANGE, AddrRange(0x28000, 0x3F000)); + gpuvm.setMMIOAperture(MMHUB_MMIO_RANGE, AddrRange(0x68000, 0x6A120)); // These are hardcoded register values to return what the driver expects setRegVal(AMDGPU_MP0_SMN_C2PMSG_33, 0x80000000); @@ -144,27 +185,19 @@ AMDGPUDevice::AMDGPUDevice(const AMDGPUDeviceParams &p) if (p.device_name == "Vega10") { setRegVal(VEGA10_FB_LOCATION_BASE, mmhubBase >> 24); setRegVal(VEGA10_FB_LOCATION_TOP, mmhubTop >> 24); - gfx_version = GfxVersion::gfx900; } else if (p.device_name == "MI100") { setRegVal(MI100_FB_LOCATION_BASE, mmhubBase >> 24); setRegVal(MI100_FB_LOCATION_TOP, mmhubTop >> 24); - setRegVal(MI100_MEM_SIZE_REG, 0x3ff0); // 16GB of memory - gfx_version = GfxVersion::gfx908; + setRegVal(MI100_MEM_SIZE_REG, mem_size); } else if (p.device_name == "MI200") { // This device can have either 64GB or 128GB of device memory. // This limits to 16GB for simulation. 
setRegVal(MI200_FB_LOCATION_BASE, mmhubBase >> 24); setRegVal(MI200_FB_LOCATION_TOP, mmhubTop >> 24); - setRegVal(MI200_MEM_SIZE_REG, 0x3ff0); - gfx_version = GfxVersion::gfx90a; + setRegVal(MI200_MEM_SIZE_REG, mem_size); } else { panic("Unknown GPU device %s\n", p.device_name); } - - gpuvm.setMMHUBBase(mmhubBase); - gpuvm.setMMHUBTop(mmhubTop); - - nbio.setGPUDevice(this); } void @@ -357,36 +390,28 @@ AMDGPUDevice::readDoorbell(PacketPtr pkt, Addr offset) void AMDGPUDevice::readMMIO(PacketPtr pkt, Addr offset) { - Addr aperture = gpuvm.getMmioAperture(offset); - Addr aperture_offset = offset - aperture; + AddrRange aperture = gpuvm.getMMIOAperture(offset); + Addr aperture_offset = offset - aperture.start(); // By default read from MMIO trace. Overwrite the packet for a select // few more dynamic MMIOs. DPRINTF(AMDGPUDevice, "Read MMIO %#lx\n", offset); mmioReader.readFromTrace(pkt, MMIO_BAR, offset); - if (regs.find(offset) != regs.end()) { - uint64_t value = regs[offset]; - DPRINTF(AMDGPUDevice, "Reading what kernel wrote before: %#x\n", - value); - pkt->setUintX(value, ByteOrder::little); - } - - switch (aperture) { - case NBIO_BASE: + if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "NBIO base\n"); nbio.readMMIO(pkt, aperture_offset); - break; - case GRBM_BASE: + } else if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "GRBM base\n"); gpuvm.readMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT); - break; - case GFX_BASE: + } else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "GFX base\n"); gfx.readMMIO(pkt, aperture_offset); - break; - case MMHUB_BASE: + } else if (aperture == gpuvm.getMMIORange(MMHUB_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "MMHUB base\n"); gpuvm.readMMIO(pkt, aperture_offset >> MMHUB_OFFSET_SHIFT); - break; - default: - break; + } else { + DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for read %#x\n", offset); } } @@ -430,17 +455,22 @@ 
AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset) DPRINTF(AMDGPUDevice, "Wrote doorbell %#lx\n", offset); if (doorbells.find(offset) != doorbells.end()) { - QueueType q_type = doorbells[offset]; + QueueType q_type = doorbells[offset].qtype; + int ip_id = doorbells[offset].ip_id; DPRINTF(AMDGPUDevice, "Doorbell offset %p queue: %d\n", offset, q_type); switch (q_type) { case Compute: - pm4PktProc->process(pm4PktProc->getQueue(offset), - pkt->getLE()); + assert(pm4PktProcs.count(ip_id)); + pm4PktProcs[ip_id]->process( + pm4PktProcs[ip_id]->getQueue(offset), + pkt->getLE()); break; case Gfx: - pm4PktProc->process(pm4PktProc->getQueue(offset, true), - pkt->getLE()); + assert(pm4PktProcs.count(ip_id)); + pm4PktProcs[ip_id]->process( + pm4PktProcs[ip_id]->getQueue(offset, true), + pkt->getLE()); break; case SDMAGfx: { SDMAEngine *sdmaEng = getSDMAEngine(offset); @@ -451,9 +481,11 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset) sdmaEng->processPage(pkt->getLE()); } break; case ComputeAQL: { + assert(pm4PktProcs.count(ip_id)); cp->hsaPacketProc().hwScheduler()->write(offset, pkt->getLE() + 1); - pm4PktProc->updateReadIndex(offset, pkt->getLE() + 1); + pm4PktProcs[ip_id]->updateReadIndex(offset, + pkt->getLE() + 1); } break; case InterruptHandler: deviceIH->updateRptr(pkt->getLE()); @@ -483,12 +515,12 @@ AMDGPUDevice::writeDoorbell(PacketPtr pkt, Addr offset) void AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset) { - Addr aperture = gpuvm.getMmioAperture(offset); - Addr aperture_offset = offset - aperture; + AddrRange aperture = gpuvm.getMMIOAperture(offset); + Addr aperture_offset = offset - aperture.start(); DPRINTF(AMDGPUDevice, "Wrote MMIO %#lx\n", offset); - // Check SDMA functions first, then fallback to switch statement + // Check SDMA functions first, then fallback to MMIO ranges. 
for (int idx = 0; idx < sdmaIds.size(); ++idx) { if (sdmaMmios[idx].contains(offset)) { Addr sdma_offset = (offset - sdmaMmios[idx].start()) >> 2; @@ -506,26 +538,31 @@ AMDGPUDevice::writeMMIO(PacketPtr pkt, Addr offset) } } - switch (aperture) { - /* Write a general register to the graphics register bus manager. */ - case GRBM_BASE: + // Check PM4s next, returning to avoid duplicate writes. + for (auto& [range, pm4_proc] : pm4Ranges) { + if (range.contains(offset)) { + // PM4 MMIOs are offset based on the MMIO range start + Addr ip_offset = offset - range.start(); + pm4_proc->writeMMIO(pkt, ip_offset >> GRBM_OFFSET_SHIFT); + + return; + } + } + + if (aperture == gpuvm.getMMIORange(GRBM_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "GRBM base\n"); gpuvm.writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT); - pm4PktProc->writeMMIO(pkt, aperture_offset >> GRBM_OFFSET_SHIFT); - break; - /* Write a register to the interrupt handler. */ - case IH_BASE: + } else if (aperture == gpuvm.getMMIORange(IH_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "IH base\n"); deviceIH->writeMMIO(pkt, aperture_offset >> IH_OFFSET_SHIFT); - break; - /* Write an IO space register */ - case NBIO_BASE: + } else if (aperture == gpuvm.getMMIORange(NBIO_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "NBIO base\n"); nbio.writeMMIO(pkt, aperture_offset); - break; - case GFX_BASE: + } else if (aperture == gpuvm.getMMIORange(GFX_MMIO_RANGE)) { + DPRINTF(AMDGPUDevice, "GFX base\n"); gfx.writeMMIO(pkt, aperture_offset); - break; - default: - DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for %#x\n", offset); - break; + } else { + DPRINTF(AMDGPUDevice, "Unknown MMIO aperture for write %#x\n", offset); } } @@ -610,33 +647,47 @@ AMDGPUDevice::processPendingDoorbells(uint32_t offset) } } -bool -AMDGPUDevice::haveRegVal(uint32_t addr) -{ - return regs.count(addr); -} - uint32_t -AMDGPUDevice::getRegVal(uint32_t addr) +AMDGPUDevice::getRegVal(uint64_t addr) { + // This is somewhat of a guess based on amdgpu_device_mm_access + // in 
amdgpu_device.c in the ROCk driver. If bit 32 is 1 then + // assume VRAM and use full address, otherwise assume register + // address and only use the lower 31 bits. + Addr fixup_addr = bits(addr, 31, 31) ? addr : addr & 0x7fffffff; + + uint32_t pkt_data = 0; + RequestPtr request = std::make_shared(fixup_addr, + sizeof(uint32_t), 0 /* flags */, vramRequestorId()); + PacketPtr pkt = Packet::createRead(request); + pkt->dataStatic((uint8_t *)&pkt_data); + readMMIO(pkt, addr); DPRINTF(AMDGPUDevice, "Getting register 0x%lx = %x\n", - addr, regs[addr]); - return regs[addr]; + fixup_addr, pkt->getLE()); + + return pkt->getLE(); } void -AMDGPUDevice::setRegVal(uint32_t addr, uint32_t value) +AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value) { DPRINTF(AMDGPUDevice, "Setting register 0x%lx to %x\n", addr, value); - regs[addr] = value; + + uint32_t pkt_data = value; + RequestPtr request = std::make_shared(addr, + sizeof(uint32_t), 0 /* flags */, vramRequestorId()); + PacketPtr pkt = Packet::createWrite(request); + pkt->dataStatic((uint8_t *)&pkt_data); + writeMMIO(pkt, addr); } void -AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt) +AMDGPUDevice::setDoorbellType(uint32_t offset, QueueType qt, int ip_id) { DPRINTF(AMDGPUDevice, "Setting doorbell type for %x\n", offset); - doorbells[offset] = qt; + doorbells[offset].qtype = qt; + doorbells[offset].ip_id = ip_id; } void @@ -675,22 +726,19 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const // Serialize the PciDevice base class PciDevice::serialize(cp); - uint64_t regs_size = regs.size(); uint64_t doorbells_size = doorbells.size(); uint64_t sdma_engs_size = sdmaEngs.size(); uint64_t used_vmid_map_size = usedVMIDs.size(); - SERIALIZE_SCALAR(regs_size); SERIALIZE_SCALAR(doorbells_size); SERIALIZE_SCALAR(sdma_engs_size); // Save the number of vmids used SERIALIZE_SCALAR(used_vmid_map_size); // Make a c-style array of the regs to serialize - uint32_t reg_addrs[regs_size]; - uint64_t reg_values[regs_size]; uint32_t 
doorbells_offset[doorbells_size]; QueueType doorbells_queues[doorbells_size]; + int doorbells_ip_ids[doorbells_size]; uint32_t sdma_engs_offset[sdma_engs_size]; int sdma_engs[sdma_engs_size]; int used_vmids[used_vmid_map_size]; @@ -698,16 +746,10 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const std::vector used_vmid_sets; int idx = 0; - for (auto & it : regs) { - reg_addrs[idx] = it.first; - reg_values[idx] = it.second; - ++idx; - } - - idx = 0; for (auto & it : doorbells) { doorbells_offset[idx] = it.first; - doorbells_queues[idx] = it.second; + doorbells_queues[idx] = it.second.qtype; + doorbells_ip_ids[idx] = it.second.ip_id; ++idx; } @@ -732,12 +774,12 @@ AMDGPUDevice::serialize(CheckpointOut &cp) const int* vmid_array = new int[num_queue_id]; std::copy(used_vmid_sets.begin(), used_vmid_sets.end(), vmid_array); - SERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0])); - SERIALIZE_ARRAY(reg_values, sizeof(reg_values)/sizeof(reg_values[0])); SERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/ sizeof(doorbells_offset[0])); SERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/ sizeof(doorbells_queues[0])); + SERIALIZE_ARRAY(doorbells_ip_ids, sizeof(doorbells_ip_ids)/ + sizeof(doorbells_ip_ids[0])); SERIALIZE_ARRAY(sdma_engs_offset, sizeof(sdma_engs_offset)/ sizeof(sdma_engs_offset[0])); SERIALIZE_ARRAY(sdma_engs, sizeof(sdma_engs)/sizeof(sdma_engs[0])); @@ -764,43 +806,30 @@ AMDGPUDevice::unserialize(CheckpointIn &cp) // Unserialize the PciDevice base class PciDevice::unserialize(cp); - uint64_t regs_size = 0; uint64_t doorbells_size = 0; uint64_t sdma_engs_size = 0; uint64_t used_vmid_map_size = 0; - UNSERIALIZE_SCALAR(regs_size); UNSERIALIZE_SCALAR(doorbells_size); UNSERIALIZE_SCALAR(sdma_engs_size); UNSERIALIZE_SCALAR(used_vmid_map_size); - if (regs_size > 0) { - uint32_t reg_addrs[regs_size]; - uint64_t reg_values[regs_size]; - - UNSERIALIZE_ARRAY(reg_addrs, sizeof(reg_addrs)/sizeof(reg_addrs[0])); - UNSERIALIZE_ARRAY(reg_values, - 
sizeof(reg_values)/sizeof(reg_values[0])); - - for (int idx = 0; idx < regs_size; ++idx) { - regs.insert(std::make_pair(reg_addrs[idx], reg_values[idx])); - } - } - if (doorbells_size > 0) { uint32_t doorbells_offset[doorbells_size]; QueueType doorbells_queues[doorbells_size]; + int doorbells_ip_ids[doorbells_size]; UNSERIALIZE_ARRAY(doorbells_offset, sizeof(doorbells_offset)/ sizeof(doorbells_offset[0])); UNSERIALIZE_ARRAY(doorbells_queues, sizeof(doorbells_queues)/ sizeof(doorbells_queues[0])); + UNSERIALIZE_ARRAY(doorbells_ip_ids, sizeof(doorbells_ip_ids)/ + sizeof(doorbells_ip_ids[0])); for (int idx = 0; idx < doorbells_size; ++idx) { - regs.insert(std::make_pair(doorbells_offset[idx], - doorbells_queues[idx])); - doorbells[doorbells_offset[idx]] = doorbells_queues[idx]; + doorbells[doorbells_offset[idx]].qtype = doorbells_queues[idx]; + doorbells[doorbells_offset[idx]].ip_id = doorbells_ip_ids[idx]; } } diff --git a/src/dev/amdgpu/amdgpu_device.hh b/src/dev/amdgpu/amdgpu_device.hh index b6b6e2a81a..33b6a9f3e7 100644 --- a/src/dev/amdgpu/amdgpu_device.hh +++ b/src/dev/amdgpu/amdgpu_device.hh @@ -87,9 +87,7 @@ class AMDGPUDevice : public PciDevice /** * Structures to hold registers, doorbells, and some frame memory */ - using GPURegMap = std::unordered_map; - GPURegMap regs; - std::unordered_map doorbells; + std::unordered_map doorbells; std::unordered_map pendingDoorbellPkts; /** @@ -115,9 +113,19 @@ class AMDGPUDevice : public PciDevice AMDGPUMemoryManager *gpuMemMgr; AMDGPUInterruptHandler *deviceIH; AMDGPUVM gpuvm; - PM4PacketProcessor *pm4PktProc; GPUCommandProcessor *cp; + struct AddrRangeHasher + { + std::size_t operator()(const AddrRange& k) const + { + return k.start(); + } + }; + std::unordered_map pm4PktProcs; + std::unordered_map pm4Ranges; + // SDMAs mapped by doorbell offset std::unordered_map sdmaEngs; // SDMAs mapped by ID @@ -187,7 +195,7 @@ class AMDGPUDevice : public PciDevice /** * Set handles to GPU blocks. 
*/ - void setDoorbellType(uint32_t offset, QueueType qt); + void setDoorbellType(uint32_t offset, QueueType qt, int ip_id = 0); void processPendingDoorbells(uint32_t offset); void setSDMAEngine(Addr offset, SDMAEngine *eng); @@ -195,9 +203,8 @@ class AMDGPUDevice : public PciDevice * Register value getter/setter. Used by other GPU blocks to change * values from incoming driver/user packets. */ - bool haveRegVal(uint32_t addr); - uint32_t getRegVal(uint32_t addr); - void setRegVal(uint32_t addr, uint32_t value); + uint32_t getRegVal(uint64_t addr); + void setRegVal(uint64_t addr, uint32_t value); /** * Methods related to translations and system/device memory. diff --git a/src/dev/amdgpu/amdgpu_gfx.cc b/src/dev/amdgpu/amdgpu_gfx.cc index 3d5b274b86..60fabaf31d 100644 --- a/src/dev/amdgpu/amdgpu_gfx.cc +++ b/src/dev/amdgpu/amdgpu_gfx.cc @@ -37,6 +37,13 @@ namespace gem5 { +AMDGPUGfx::AMDGPUGfx() +{ + for (int i = 0; i < SCRATCH_REGS; ++i) { + scratchRegs[i] = 0; + } +} + void AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset) { @@ -47,6 +54,9 @@ AMDGPUGfx::readMMIO(PacketPtr pkt, Addr offset) case AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB: pkt->setLE(captured_clock_count >> 32); break; + case AMDGPU_MM_SCRATCH_REG0: + pkt->setLE(scratchRegs[0]); + break; default: break; } @@ -65,6 +75,9 @@ AMDGPUGfx::writeMMIO(PacketPtr pkt, Addr offset) captured_clock_count = curTick() / sim_clock::as_int::ns; } break; + case AMDGPU_MM_SCRATCH_REG0: + scratchRegs[0] = pkt->getLE(); + break; default: break; } diff --git a/src/dev/amdgpu/amdgpu_gfx.hh b/src/dev/amdgpu/amdgpu_gfx.hh index c32b8624cf..9fb1d82553 100644 --- a/src/dev/amdgpu/amdgpu_gfx.hh +++ b/src/dev/amdgpu/amdgpu_gfx.hh @@ -52,13 +52,16 @@ #define AMDGPU_MM_RLC_GPU_CLOCK_COUNT_MSB 0x13094 #define AMDGPU_MM_RLC_CAPTURE_GPU_CLOCK_COUNT 0x13098 +// Scratch registers used for GPU post +#define AMDGPU_MM_SCRATCH_REG0 0x08100 + namespace gem5 { class AMDGPUGfx { public: - AMDGPUGfx() { } + AMDGPUGfx(); void readMMIO(PacketPtr pkt, Addr 
offset); void writeMMIO(PacketPtr pkt, Addr offset); @@ -68,6 +71,12 @@ class AMDGPUGfx * GPU clock count at the time capture MMIO is received. */ uint64_t captured_clock_count = 1; + + /* + * Scratch registers. + */ + static constexpr int SCRATCH_REGS = 8; + std::array scratchRegs; }; } // namespace gem5 diff --git a/src/dev/amdgpu/amdgpu_nbio.cc b/src/dev/amdgpu/amdgpu_nbio.cc index 07027c3765..ec44f16250 100644 --- a/src/dev/amdgpu/amdgpu_nbio.cc +++ b/src/dev/amdgpu/amdgpu_nbio.cc @@ -53,22 +53,44 @@ AMDGPUNbio::setGPUDevice(AMDGPUDevice *gpu_device) void AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset) { + // For Vega10 we rely on the golden values in an MMIO trace. Return + // immediately so as not to clobber those values. + if (gpuDevice->getGfxVersion() == GfxVersion::gfx900) { + if (offset == AMDGPU_PCIE_DATA || offset == AMDGPU_PCIE_DATA2) { + return; + } + } + switch (offset) { - // This is a PCIe status register. At some point during driver init - // the driver checks that interrupts are enabled. This is only - // checked once, so if the MMIO trace does not exactly line up with - // what the driver is doing in gem5, this may still have the first - // bit zero causing driver to fail. Therefore, we always set this - // bit to one as there is no harm to do so. - case AMDGPU_PCIE_DATA_REG: + // PCIE_DATA, PCIE_DATA2, PCIE_INDEX, and PCIE_INDEX2 handle "indirect + // register" reads/writes from the driver. This provides a way to read + // any register by providing a 32-bit address to one of the two INDEX + // registers and then reading the corresponding DATA register. 
See: + // https://github.com/ROCm/ROCK-Kernel-Driver/blob/roc-6.0.x/drivers/ + // gpu/drm/amd/amdgpu/amdgpu_device.c#L459 + case AMDGPU_PCIE_DATA: { - uint32_t value = pkt->getLE() | 0x1; - DPRINTF(AMDGPUDevice, "Marking interrupts enabled: %#lx\n", value); + uint32_t value = gpuDevice->getRegVal(pcie_index_reg); + DPRINTF(AMDGPUDevice, "Read PCIe index %lx data %x\n", + pcie_index_reg, value); pkt->setLE(value); } break; + case AMDGPU_PCIE_DATA2: + { + uint32_t value = gpuDevice->getRegVal(pcie_index2_reg); + DPRINTF(AMDGPUDevice, "Read PCIe index2 %lx data2 %x\n", + pcie_index2_reg, value); + pkt->setLE(value); + } + break; + case AMDGPU_PCIE_INDEX: + pkt->setLE(pcie_index_reg); + break; + case AMDGPU_PCIE_INDEX2: + pkt->setLE(pcie_index2_reg); + break; case AMDGPU_MM_DATA: - //pkt->setLE(regs[mm_index_reg]); pkt->setLE(gpuDevice->getRegVal(mm_index_reg)); break; case VEGA10_INV_ENG17_ACK1: @@ -89,17 +111,17 @@ AMDGPUNbio::readMMIO(PacketPtr pkt, Addr offset) case AMDGPU_MP0_SMN_C2PMSG_35: pkt->setLE(0x80000000); break; + case AMDGPU_MP1_SMN_C2PMSG_90: + pkt->setLE(0x1); + break; default: if (triggered_reads.count(offset)) { DPRINTF(AMDGPUDevice, "Found triggered read for %#x\n", offset); pkt->setLE(triggered_reads[offset]); - } else if (gpuDevice->haveRegVal(offset)) { - uint32_t reg_val = gpuDevice->getRegVal(offset); - - DPRINTF(AMDGPUDevice, "Reading value of %#lx from regs: %#lx\n", - offset, reg_val); - - pkt->setLE(reg_val); + } else if (regs.count(offset)) { + DPRINTF(AMDGPUDevice, "Returning value of unknown MMIO offset " + "%x: %x\n", offset, regs[offset]); + pkt->setLE(regs[offset]); } else { DPRINTF(AMDGPUDevice, "NBIO Unknown MMIO %#x (%#x)\n", offset, pkt->getAddr()); @@ -123,6 +145,24 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset) DPRINTF(AMDGPUDevice, "MM write to reg %#lx data %#lx\n", mm_index_reg, pkt->getLE()); gpuDevice->setRegVal(AMDGPU_MM_DATA, pkt->getLE()); + // PCIE_DATA, PCIE_DATA2, PCIE_INDEX, and PCIE_INDEX2 handle "indirect + 
// "register reads/writes from the driver. This provides a way to read + // any register by providing a 32-bit address to one of the two INDEX + // registers and then reading the corresponding DATA register. See: + // https://github.com/ROCm/ROCK-Kernel-Driver/blob/roc-6.0.x/drivers/ + // gpu/drm/amd/amdgpu/amdgpu_device.c#L459 + } else if (offset == AMDGPU_PCIE_INDEX) { + assert(pkt->getSize() == 4); + pcie_index_reg = pkt->getLE(); + } else if (offset == AMDGPU_PCIE_DATA) { + assert(pkt->getSize() == 4); + gpuDevice->setRegVal(pcie_index_reg, pkt->getLE()); + } else if (offset == AMDGPU_PCIE_INDEX2) { + assert(pkt->getSize() == 4); + pcie_index2_reg = pkt->getLE(); + } else if (offset == AMDGPU_PCIE_DATA2) { + assert(pkt->getSize() == 4); + gpuDevice->setRegVal(pcie_index2_reg, pkt->getLE()); } else if (offset == AMDGPU_MP0_SMN_C2PMSG_35) { // See psp_v3_1_bootloader_load_sos in amdgpu driver code. if (pkt->getLE() == 0x10000) { @@ -144,6 +184,14 @@ AMDGPUNbio::writeMMIO(PacketPtr pkt, Addr offset) } else if (offset == AMDGPU_MP0_SMN_C2PMSG_71) { // PSP ring size psp_ring_size = pkt->getLE(); + } else { + // Fallback to a map of register values. This was previously in the + // AMDGPUDevice, however that short-circuited some reads from other + // IP blocks. Since this is an end point IP block it is safer to use + // here. 
+ regs[offset] = pkt->getLE(); + DPRINTF(AMDGPUDevice, "Writing value of unknown MMIO offset " + "%x: %x\n", offset, regs[offset]); } } diff --git a/src/dev/amdgpu/amdgpu_nbio.hh b/src/dev/amdgpu/amdgpu_nbio.hh index dc95443916..87afb02c41 100644 --- a/src/dev/amdgpu/amdgpu_nbio.hh +++ b/src/dev/amdgpu/amdgpu_nbio.hh @@ -56,7 +56,11 @@ class AMDGPUDevice; #define AMDGPU_MM_INDEX 0x00000 #define AMDGPU_MM_INDEX_HI 0x00018 #define AMDGPU_MM_DATA 0x00004 -#define AMDGPU_PCIE_DATA_REG 0x0003c + +#define AMDGPU_PCIE_INDEX 0x00030 +#define AMDGPU_PCIE_INDEX2 0x00038 +#define AMDGPU_PCIE_DATA 0x00034 +#define AMDGPU_PCIE_DATA2 0x0003c // Message bus related to psp #define AMDGPU_MP0_SMN_C2PMSG_33 0x58184 @@ -66,6 +70,7 @@ class AMDGPUDevice; #define AMDGPU_MP0_SMN_C2PMSG_70 0x58218 #define AMDGPU_MP0_SMN_C2PMSG_71 0x5821c #define AMDGPU_MP0_SMN_C2PMSG_81 0x58244 +#define AMDGPU_MP1_SMN_C2PMSG_90 0x58a68 // Device specific invalidation engines used during initialization #define VEGA10_INV_ENG17_ACK1 0x0a318 @@ -105,6 +110,8 @@ class AMDGPUNbio * Driver initialization sequence helper variables. */ uint64_t mm_index_reg = 0; + uint32_t pcie_index_reg = 0; + uint32_t pcie_index2_reg = 0; std::unordered_map triggered_reads; /* @@ -115,6 +122,12 @@ class AMDGPUNbio Addr psp_ring_listen_addr = 0; int psp_ring_size = 0; int psp_ring_value = 0; + + /* + * Hold values of other registers not explicitly modelled by other blocks. 
+ */ + using GPURegMap = std::unordered_map; + GPURegMap regs; }; } // namespace gem5 diff --git a/src/dev/amdgpu/amdgpu_vm.cc b/src/dev/amdgpu/amdgpu_vm.cc index 5a13ac9ba0..0eea590c5a 100644 --- a/src/dev/amdgpu/amdgpu_vm.cc +++ b/src/dev/amdgpu/amdgpu_vm.cc @@ -37,6 +37,7 @@ #include "base/trace.hh" #include "debug/AMDGPUDevice.hh" #include "dev/amdgpu/amdgpu_defines.hh" +#include "dev/amdgpu/amdgpu_device.hh" #include "mem/packet_access.hh" namespace gem5 @@ -51,6 +52,35 @@ AMDGPUVM::AMDGPUVM() for (int i = 0; i < AMDGPU_VM_COUNT; ++i) { memset(&vmContexts[0], 0, sizeof(AMDGPUVMContext)); } + + for (int i = 0; i < NUM_MMIO_RANGES; ++i) { + mmioRanges[i] = AddrRange(); + } +} + +void +AMDGPUVM::setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range) +{ + mmioRanges[mmio_aperture] = range; +} + +AddrRange +AMDGPUVM::getMMIORange(mmio_range_t mmio_aperture) +{ + return mmioRanges[mmio_aperture]; +} + +const AddrRange& +AMDGPUVM::getMMIOAperture(Addr offset) +{ + for (int i = 0; i < NUM_MMIO_RANGES; ++i) { + if (mmioRanges[i].contains(offset)) { + return mmioRanges[i]; + } + } + + // Default to NBIO + return mmioRanges[NBIO_MMIO_RANGE]; } Addr diff --git a/src/dev/amdgpu/amdgpu_vm.hh b/src/dev/amdgpu/amdgpu_vm.hh index f35a735111..857ef724da 100644 --- a/src/dev/amdgpu/amdgpu_vm.hh +++ b/src/dev/amdgpu/amdgpu_vm.hh @@ -99,9 +99,23 @@ static constexpr int AMDGPU_USER_PAGE_SIZE = 4096; namespace gem5 { +typedef enum : int +{ + NBIO_MMIO_RANGE, + MMHUB_MMIO_RANGE, + GFX_MMIO_RANGE, + GRBM_MMIO_RANGE, + IH_MMIO_RANGE, + NUM_MMIO_RANGES +} mmio_range_t; + +class AMDGPUDevice; + class AMDGPUVM : public Serializable { private: + AMDGPUDevice *gpuDevice; + typedef struct GEM5_PACKED { // Page table addresses: from (Base + Start) to (End) @@ -160,9 +174,13 @@ class AMDGPUVM : public Serializable */ std::vector gpu_tlbs; + std::array mmioRanges; + public: AMDGPUVM(); + void setGPUDevice(AMDGPUDevice *gpu_device) { gpuDevice = gpu_device; } + /** * Return base address of 
GART table in framebuffer. */ @@ -172,6 +190,12 @@ class AMDGPUVM : public Serializable */ Addr gartSize(); + bool + inGARTRange(Addr paddr) + { + return ((paddr >= gartBase()) && (paddr <= (gartBase() + gartSize()))); + } + /** * Copy of GART table. Typically resides in device memory, however we use * a copy in gem5 to simplify the interface. @@ -226,38 +250,11 @@ class AMDGPUVM : public Serializable Addr getSysAddrRangeLow () { return vmContext0.sysAddrL; } Addr getSysAddrRangeHigh () { return vmContext0.sysAddrH; } - Addr - getMmioAperture(Addr addr) - { - // Aperture ranges: - // NBIO 0x0 - 0x4280 - // IH 0x4280 - 0x4980 - // SDMA0 0x4980 - 0x5180 - // SDMA1 0x5180 - 0x5980 - // GRBM 0x8000 - 0xD000 - // GFX 0x28000 - 0x3F000 - // MMHUB 0x68000 - 0x6a120 + void setMMIOAperture(mmio_range_t mmio_aperture, AddrRange range); + const AddrRange& getMMIOAperture(Addr addr); + AddrRange getMMIORange(mmio_range_t mmio_aperture); - if (IH_BASE <= addr && addr < IH_BASE + IH_SIZE) - return IH_BASE; - else if (SDMA0_BASE <= addr && addr < SDMA0_BASE + SDMA_SIZE) - return SDMA0_BASE; - else if (SDMA1_BASE <= addr && addr < SDMA1_BASE + SDMA_SIZE) - return SDMA1_BASE; - else if (GRBM_BASE <= addr && addr < GRBM_BASE + GRBM_SIZE) - return GRBM_BASE; - else if (GFX_BASE <= addr && addr < GFX_BASE + GFX_SIZE) - return GFX_BASE; - else if (MMHUB_BASE <= addr && addr < MMHUB_BASE + MMHUB_SIZE) - return MMHUB_BASE; - else { - warn_once("Accessing unsupported MMIO aperture! 
Assuming NBIO\n"); - return NBIO_BASE; - } - - } - - // Gettig mapped aperture base addresses + // Getting mapped aperture base addresses Addr getFrameAperture(Addr addr) { diff --git a/src/dev/amdgpu/interrupt_handler.cc b/src/dev/amdgpu/interrupt_handler.cc index 6f277a1618..cb99ba7a39 100644 --- a/src/dev/amdgpu/interrupt_handler.cc +++ b/src/dev/amdgpu/interrupt_handler.cc @@ -75,7 +75,8 @@ void AMDGPUInterruptHandler::prepareInterruptCookie(ContextID cntxt_id, uint32_t ring_id, uint32_t client_id, - uint32_t source_id) + uint32_t source_id, + unsigned node_id) { assert(client_id == SOC15_IH_CLIENTID_RLC || client_id == SOC15_IH_CLIENTID_SDMA0 || @@ -112,6 +113,7 @@ AMDGPUInterruptHandler::prepareInterruptCookie(ContextID cntxt_id, cookie->clientId = client_id; cookie->sourceId = source_id; cookie->ringId = ring_id; + cookie->nodeId = node_id; cookie->source_data_dw1 = cntxt_id; interruptQueue.push(cookie); } diff --git a/src/dev/amdgpu/interrupt_handler.hh b/src/dev/amdgpu/interrupt_handler.hh index 9b80e081cc..a895eabafc 100644 --- a/src/dev/amdgpu/interrupt_handler.hh +++ b/src/dev/amdgpu/interrupt_handler.hh @@ -101,7 +101,8 @@ typedef struct uint32_t reserved2 : 15; uint32_t timestamp_src : 1; uint32_t pasid : 16; - uint32_t reserved3 : 15; + uint32_t nodeId : 8; + uint32_t reserved3 : 7; uint32_t pasid_src : 1; uint32_t source_data_dw1; uint32_t source_data_dw2; @@ -171,7 +172,7 @@ class AMDGPUInterruptHandler : public DmaDevice void setGPUDevice(AMDGPUDevice *gpu_device) { gpuDevice = gpu_device; } void prepareInterruptCookie(ContextID cntxtId, uint32_t ring_id, - uint32_t client_id, uint32_t source_id); + uint32_t client_id, uint32_t source_id, unsigned node_id); void submitInterruptCookie(); void submitWritePointer(); void intrPost(); diff --git a/src/dev/amdgpu/pm4_mmio.hh b/src/dev/amdgpu/pm4_mmio.hh index 3801223175..e9e504c3cd 100644 --- a/src/dev/amdgpu/pm4_mmio.hh +++ b/src/dev/amdgpu/pm4_mmio.hh @@ -36,34 +36,34 @@ namespace gem5 { -#define 
mmCP_RB0_BASE 0x1040 -#define mmCP_RB0_CNTL 0x1041 -#define mmCP_RB_WPTR_POLL_ADDR_LO 0x1046 -#define mmCP_RB_WPTR_POLL_ADDR_HI 0x1047 -#define mmCP_RB_VMID 0x1051 -#define mmCP_RB0_RPTR_ADDR 0x1043 -#define mmCP_RB0_RPTR_ADDR_HI 0x1044 -#define mmCP_RB0_WPTR 0x1054 -#define mmCP_RB0_WPTR_HI 0x1055 -#define mmCP_RB_DOORBELL_CONTROL 0x1059 -#define mmCP_RB_DOORBELL_RANGE_LOWER 0x105a -#define mmCP_RB_DOORBELL_RANGE_UPPER 0x105b -#define mmCP_RB0_BASE_HI 0x10b1 +#define mmCP_RB0_BASE 0x040 +#define mmCP_RB0_CNTL 0x041 +#define mmCP_RB_WPTR_POLL_ADDR_LO 0x046 +#define mmCP_RB_WPTR_POLL_ADDR_HI 0x047 +#define mmCP_RB_VMID 0x051 +#define mmCP_RB0_RPTR_ADDR 0x043 +#define mmCP_RB0_RPTR_ADDR_HI 0x044 +#define mmCP_RB0_WPTR 0x054 +#define mmCP_RB0_WPTR_HI 0x055 +#define mmCP_RB_DOORBELL_CONTROL 0x059 +#define mmCP_RB_DOORBELL_RANGE_LOWER 0x05a +#define mmCP_RB_DOORBELL_RANGE_UPPER 0x05b +#define mmCP_RB0_BASE_HI 0x0b1 -#define mmCP_HQD_ACTIVE 0x1247 -#define mmCP_HQD_VMID 0x1248 -#define mmCP_HQD_PQ_BASE 0x124d -#define mmCP_HQD_PQ_BASE_HI 0x124e -#define mmCP_HQD_PQ_DOORBELL_CONTROL 0x1254 -#define mmCP_HQD_PQ_RPTR 0x124f -#define mmCP_HQD_PQ_RPTR_REPORT_ADDR 0x1250 -#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI 0x1251 -#define mmCP_HQD_PQ_WPTR_POLL_ADDR 0x1252 -#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI 0x1253 -#define mmCP_HQD_PQ_CONTROL 0x1256 -#define mmCP_HQD_IB_CONTROL 0x125a -#define mmCP_HQD_PQ_WPTR_LO 0x127b -#define mmCP_HQD_PQ_WPTR_HI 0x127c +#define mmCP_HQD_ACTIVE 0x247 +#define mmCP_HQD_VMID 0x248 +#define mmCP_HQD_PQ_BASE 0x24d +#define mmCP_HQD_PQ_BASE_HI 0x24e +#define mmCP_HQD_PQ_DOORBELL_CONTROL 0x254 +#define mmCP_HQD_PQ_RPTR 0x24f +#define mmCP_HQD_PQ_RPTR_REPORT_ADDR 0x250 +#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI 0x251 +#define mmCP_HQD_PQ_WPTR_POLL_ADDR 0x252 +#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI 0x253 +#define mmCP_HQD_PQ_CONTROL 0x256 +#define mmCP_HQD_IB_CONTROL 0x25a +#define mmCP_HQD_PQ_WPTR_LO 0x27b +#define mmCP_HQD_PQ_WPTR_HI 0x27c } // namespace gem5 
diff --git a/src/dev/amdgpu/pm4_packet_processor.cc b/src/dev/amdgpu/pm4_packet_processor.cc index 5f270a0c70..62e817aa98 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -49,7 +49,7 @@ namespace gem5 { PM4PacketProcessor::PM4PacketProcessor(const PM4PacketProcessorParams &p) - : DmaVirtDevice(p) + : DmaVirtDevice(p), _ipId(p.ip_id), _mmioRange(p.mmio_range) { memset(&kiq, 0, sizeof(QueueDesc)); memset(&pq, 0, sizeof(QueueDesc)); @@ -144,7 +144,7 @@ PM4PacketProcessor::newQueue(QueueDesc *mqd, Addr offset, QueueType qt; qt = mqd->aql ? QueueType::ComputeAQL : QueueType::Compute; - gpuDevice->setDoorbellType(offset, qt); + gpuDevice->setDoorbellType(offset, qt, getIpId()); DPRINTF(PM4PacketProcessor, "New PM4 queue %d, base: %p offset: %p, me: " "%d, pipe %d queue: %d size: %d\n", id, q->base(), q->offset(), @@ -227,9 +227,11 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header) } break; case IT_WRITE_DATA: { dmaBuffer = new PM4WriteData(); + DPRINTF(PM4PacketProcessor, "PM4 writeData header: %x, count: %d\n", + header.ordinal, header.count); cb = new DmaVirtCallback( [ = ] (const uint64_t &) - { writeData(q, (PM4WriteData *)dmaBuffer); }); + { writeData(q, (PM4WriteData *)dmaBuffer, header); }); dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4WriteData), cb, dmaBuffer); } break; @@ -350,21 +352,46 @@ PM4PacketProcessor::decodeHeader(PM4Queue *q, PM4Header header) } void -PM4PacketProcessor::writeData(PM4Queue *q, PM4WriteData *pkt) +PM4PacketProcessor::writeData(PM4Queue *q, PM4WriteData *pkt, PM4Header header) { q->incRptr(sizeof(PM4WriteData)); - Addr addr = getGARTAddr(pkt->destAddr); - DPRINTF(PM4PacketProcessor, "PM4 write addr: %p data: %p.\n", addr, - pkt->data); - auto cb = new DmaVirtCallback( - [ = ](const uint32_t &) { writeDataDone(q, pkt, addr); }); - //TODO: the specs indicate that pkt->data holds the number of dword that - //need to be written. 
- dmaWriteVirt(addr, sizeof(uint32_t), cb, &pkt->data); + DPRINTF(PM4PacketProcessor, "PM4 write addr: %p data: %p destSel: %d " + "addrIncr: %d resume: %d writeConfirm: %d cachePolicy: %d\n", + pkt->destAddr, pkt->data, pkt->destSel, pkt->addrIncr, + pkt->resume, pkt->writeConfirm, pkt->cachePolicy); - if (!pkt->writeConfirm) + if (pkt->destSel == 5) { + // Memory address destination + Addr addr = getGARTAddr(pkt->destAddr); + + // This is a variable length packet. The size of the packet is in + // the header.count field and is set as Number Of Dwords - 1. This + // packet is 4 dwords minimum meaning the count is minimum 3. To + // get the number of dwords of data subtract two from the count. + unsigned size = (header.count - 2) * sizeof(uint32_t); + + DPRINTF(PM4PacketProcessor, "Writing %d bytes to %p\n", size, addr); + auto cb = new DmaVirtCallback( + [ = ](const uint32_t &) { writeDataDone(q, pkt, addr); }); + dmaWriteVirt(addr, size, cb, &pkt->data); + + if (!pkt->writeConfirm) { + decodeNext(q); + } + } else if (pkt->destSel == 0) { + // Register dword address destination + Addr byte_addr = pkt->destAddr << 2; + + gpuDevice->setRegVal(byte_addr, pkt->data); + + // setRegVal is instant on the simulated device so we ignore write + // confirm.
+ delete pkt; decodeNext(q); + } else { + fatal("Unknown PM4 writeData destination %d\n", pkt->destSel); + } } void @@ -373,8 +400,9 @@ PM4PacketProcessor::writeDataDone(PM4Queue *q, PM4WriteData *pkt, Addr addr) DPRINTF(PM4PacketProcessor, "PM4 write completed to %p, %p.\n", addr, pkt->data); - if (pkt->writeConfirm) + if (pkt->writeConfirm) { decodeNext(q); + } delete pkt; } @@ -493,7 +521,7 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, // Register doorbell with GPU device gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng); - gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC); + gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC, getIpId()); gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2); } @@ -537,7 +565,8 @@ PM4PacketProcessor::releaseMemDone(PM4Queue *q, PM4ReleaseMem *pkt, Addr addr) ringId = (q->queue() << 4) | (q->me() << 2) | q->pipe(); } gpuDevice->getIH()->prepareInterruptCookie(pkt->intCtxId, ringId, - SOC15_IH_CLIENTID_GRBM_CP, CP_EOP); + SOC15_IH_CLIENTID_GRBM_CP, CP_EOP, + 0); gpuDevice->getIH()->submitInterruptCookie(); } @@ -745,9 +774,14 @@ PM4PacketProcessor::setUconfigReg(PM4Queue *q, PM4SetUconfigReg *pkt) { q->incRptr(sizeof(PM4SetUconfigReg)); + DPRINTF(PM4PacketProcessor, "SetUconfig offset %x data %x\n", + pkt->offset, pkt->data); + // SET_UCONFIG_REG_START and pkt->offset are dword addresses uint32_t reg_addr = (PACKET3_SET_UCONFIG_REG_START + pkt->offset) * 4; + // Additional CPs respond to addresses 0x40000 apart. 
+ reg_addr += 0x40000 * getIpId(); gpuDevice->setRegVal(reg_addr, pkt->data); decodeNext(q); @@ -822,7 +856,7 @@ PM4PacketProcessor::writeMMIO(PacketPtr pkt, Addr mmio_offset) break; case mmCP_HQD_PQ_DOORBELL_CONTROL: setHqdPqDoorbellCtrl(pkt->getLE()); - gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute); + gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute, getIpId()); break; case mmCP_HQD_PQ_RPTR: setHqdPqPtr(pkt->getLE()); @@ -884,7 +918,7 @@ PM4PacketProcessor::writeMMIO(PacketPtr pkt, Addr mmio_offset) break; case mmCP_RB_DOORBELL_CONTROL: setRbDoorbellCntrl(pkt->getLE()); - gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx); + gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx, getIpId()); break; case mmCP_RB_DOORBELL_RANGE_LOWER: setRbDoorbellRangeLo(pkt->getLE()); diff --git a/src/dev/amdgpu/pm4_packet_processor.hh b/src/dev/amdgpu/pm4_packet_processor.hh index 3fb055148c..82c3c2716f 100644 --- a/src/dev/amdgpu/pm4_packet_processor.hh +++ b/src/dev/amdgpu/pm4_packet_processor.hh @@ -63,6 +63,10 @@ class PM4PacketProcessor : public DmaVirtDevice std::unordered_map queues; /* A map of PM4 queues based on doorbell offset */ std::unordered_map queuesMap; + + int _ipId; + AddrRange _mmioRange; + public: PM4PacketProcessor(const PM4PacketProcessorParams &p); @@ -136,7 +140,7 @@ class PM4PacketProcessor : public DmaVirtDevice void decodeHeader(PM4Queue *q, PM4Header header); /* Methods that implement PM4 packets */ - void writeData(PM4Queue *q, PM4WriteData *pkt); + void writeData(PM4Queue *q, PM4WriteData *pkt, PM4Header header); void writeDataDone(PM4Queue *q, PM4WriteData *pkt, Addr addr); void mapQueues(PM4Queue *q, PM4MapQueues *pkt); void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt); @@ -188,6 +192,9 @@ class PM4PacketProcessor : public DmaVirtDevice void setRbDoorbellCntrl(uint32_t data); void setRbDoorbellRangeLo(uint32_t data); void setRbDoorbellRangeHi(uint32_t data); + + int getIpId() const { return _ipId; } + AddrRange 
getMMIORange() const { return _mmioRange; } }; } // namespace gem5 diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index 4015e83eaf..070c04fe64 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -81,9 +81,9 @@ SDMAEngine::setGPUDevice(AMDGPUDevice *gpu_device) } int -SDMAEngine::getIHClientId() +SDMAEngine::getIHClientId(int _id) { - switch (id) { + switch (_id) { case 0: return SOC15_IH_CLIENTID_SDMA0; case 1: @@ -627,10 +627,14 @@ SDMAEngine::writeReadData(SDMAQueue *q, sdmaWrite *pkt, uint32_t *dmaBuffer) // lastly we write read data to the destination address if (gpuDevice->getVM().inMMHUB(pkt->dest)) { - Addr mmhubAddr = pkt->dest - gpuDevice->getVM().getMMHUBBase(); + Addr mmhub_addr = pkt->dest - gpuDevice->getVM().getMMHUBBase(); + + fatal_if(gpuDevice->getVM().inGARTRange(mmhub_addr), + "SDMA write to GART not implemented"); + auto cb = new EventFunctionWrapper( [ = ]{ writeDone(q, pkt, dmaBuffer); }, name()); - gpuDevice->getMemMgr()->writeRequest(mmhubAddr, (uint8_t *)dmaBuffer, + gpuDevice->getMemMgr()->writeRequest(mmhub_addr, (uint8_t *)dmaBuffer, bufferSize, 0, cb); } else { if (q->priv()) { @@ -663,9 +667,11 @@ SDMAEngine::copy(SDMAQueue *q, sdmaCopy *pkt) // count represents the number of bytes - 1 to be copied pkt->count++; if (q->priv()) { - DPRINTF(SDMAEngine, "Getting GART addr for %lx\n", pkt->source); - pkt->source = getGARTAddr(pkt->source); - DPRINTF(SDMAEngine, "GART addr %lx\n", pkt->source); + if (!gpuDevice->getVM().inMMHUB(pkt->source)) { + DPRINTF(SDMAEngine, "Getting GART addr for %lx\n", pkt->source); + pkt->source = getGARTAddr(pkt->source); + DPRINTF(SDMAEngine, "GART addr %lx\n", pkt->source); + } } // Read data from the source first, then call the copyReadData method @@ -742,6 +748,19 @@ SDMAEngine::copyReadData(SDMAQueue *q, sdmaCopy *pkt, uint8_t *dmaBuffer) [ = ] (const uint64_t &) { copyDone(q, pkt, dmaBuffer); }); dmaWriteVirt(pkt->dest, pkt->count, cb, (void 
*)dmaBuffer); } + + // For destinations in the GART table, gem5 uses a mapping table instead + // of functionally going to device memory, so we need to update that copy. + if (gpuDevice->getVM().inGARTRange(device_addr)) { + // GART entries are always 8 bytes. + assert((pkt->count % 8) == 0); + for (int i = 0; i < pkt->count/8; ++i) { + Addr gart_addr = device_addr + i*8 - gpuDevice->getVM().gartBase(); + DPRINTF(SDMAEngine, "Shadow copying to GART table %lx -> %lx\n", + gart_addr, dmaBuffer64[i]); + gpuDevice->getVM().gartTable[gart_addr] = dmaBuffer64[i]; + } + } } /* Completion of a copy packet. */ @@ -809,8 +828,12 @@ SDMAEngine::trap(SDMAQueue *q, sdmaTrap *pkt) uint32_t ring_id = (q->queueType() == SDMAPage) ? 3 : 0; + int node_id = 0; + int local_id = getId(); + gpuDevice->getIH()->prepareInterruptCookie(pkt->intrContext, ring_id, - getIHClientId(), TRAP_ID); + getIHClientId(local_id), + TRAP_ID, 2*node_id); gpuDevice->getIH()->submitInterruptCookie(); delete pkt; @@ -836,8 +859,7 @@ SDMAEngine::srbmWrite(SDMAQueue *q, sdmaSRBMWriteHeader *header, DPRINTF(SDMAEngine, "SRBM write to %#x with data %#x\n", reg_addr, pkt->data); - warn_once("SRBM write not performed, no SRBM model. This needs to be fixed" - " if correct system simulation is relying on SRBM registers."); + gpuDevice->setRegVal(reg_addr, pkt->data); delete header; delete pkt; @@ -967,10 +989,14 @@ SDMAEngine::ptePde(SDMAQueue *q, sdmaPtePde *pkt) // Writing generated data to the destination address.
if (gpuDevice->getVM().inMMHUB(pkt->dest)) { - Addr mmhubAddr = pkt->dest - gpuDevice->getVM().getMMHUBBase(); + Addr mmhub_addr = pkt->dest - gpuDevice->getVM().getMMHUBBase(); + + fatal_if(gpuDevice->getVM().inGARTRange(mmhub_addr), + "SDMA write to GART not implemented"); + auto cb = new EventFunctionWrapper( [ = ]{ ptePdeDone(q, pkt, dmaBuffer); }, name()); - gpuDevice->getMemMgr()->writeRequest(mmhubAddr, (uint8_t *)dmaBuffer, + gpuDevice->getMemMgr()->writeRequest(mmhub_addr, (uint8_t *)dmaBuffer, sizeof(uint64_t) * pkt->count, 0, cb); } else { diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh index d8ab31bbde..9407b97d73 100644 --- a/src/dev/amdgpu/sdma_engine.hh +++ b/src/dev/amdgpu/sdma_engine.hh @@ -172,7 +172,7 @@ class SDMAEngine : public DmaVirtDevice /** * Returns the client id for the Interrupt Handler. */ - int getIHClientId(); + int getIHClientId(int _id); /** * Methods for translation.