From 0faa9510f95c06047709eb3dac1bb89242d0a61f Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 24 Apr 2024 10:42:22 -0700 Subject: [PATCH 1/6] arch-vega,gpu-compute: Fix misc ubsan runtime errors Three main fixes: - Remove the initDynOperandInfo. UBSAN errors and exits due to things not being captured properly. After a few failed attempts playing with the capture list, just move the lambda to a new method. - Invalid data type size for some thread mask instructions. This might actually have caused silent bugs when the thread id was > 31. - Alignment issues with the operands. Change-Id: I0297e10df0f0ab9730b6f1bd132602cd36b5e7ac --- src/arch/amdgpu/vega/insts/sop2.cc | 3 +- src/arch/amdgpu/vega/insts/vop3.cc | 4 +- src/arch/amdgpu/vega/operand.hh | 4 +- src/gpu-compute/gpu_static_inst.cc | 82 ++++++++++++++++-------------- src/gpu-compute/gpu_static_inst.hh | 3 ++ 5 files changed, 54 insertions(+), 42 deletions(-) diff --git a/src/arch/amdgpu/vega/insts/sop2.cc b/src/arch/amdgpu/vega/insts/sop2.cc index 93618b2124..a2965763f7 100644 --- a/src/arch/amdgpu/vega/insts/sop2.cc +++ b/src/arch/amdgpu/vega/insts/sop2.cc @@ -1224,7 +1224,8 @@ namespace VegaISA src0.read(); src1.read(); - sdst = src0.rawData() * src1.rawData(); + ScalarRegI64 tmp = src0.rawData() * src1.rawData(); + sdst = tmp & mask(32); sdst.write(); } // execute diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc index f78f64bc91..59d72ac9ed 100644 --- a/src/arch/amdgpu/vega/insts/vop3.cc +++ b/src/arch/amdgpu/vega/insts/vop3.cc @@ -8583,7 +8583,7 @@ namespace VegaISA for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { - threadMask = ((1LL << lane) - 1LL); + threadMask = ((1ULL << lane) - 1ULL); vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) + src1[lane]; } @@ -8633,7 +8633,7 @@ namespace VegaISA for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { - threadMask = ((1LL << lane) - 1LL); + 
threadMask = ((1ULL << lane) - 1ULL); vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) + src1[lane]; } diff --git a/src/arch/amdgpu/vega/operand.hh b/src/arch/amdgpu/vega/operand.hh index 698161d918..d4a7436c75 100644 --- a/src/arch/amdgpu/vega/operand.hh +++ b/src/arch/amdgpu/vega/operand.hh @@ -490,7 +490,7 @@ namespace VegaISA typename std::enable_if::type setBit(int bit, int bit_val) { - DataType &sgpr = *((DataType*)srfData.data()); + GEM5_ALIGNED(8) DataType &sgpr = *((DataType*)srfData.data()); replaceBits(sgpr, bit, bit_val); } @@ -739,7 +739,7 @@ namespace VegaISA * of a register is 1 dword. this class will take care to do the * proper packing/unpacking of sub-dword operands. */ - std::array srfData; + GEM5_ALIGNED(8) std::array srfData; }; // typedefs for the various sizes/types of scalar operands diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc index 063e87eee1..e2dd9f54f2 100644 --- a/src/gpu-compute/gpu_static_inst.cc +++ b/src/gpu-compute/gpu_static_inst.cc @@ -54,55 +54,63 @@ GPUStaticInst::disassemble() return disassembly; } + +void +GPUStaticInst::generateVirtToPhysMap(Wavefront *wf, ComputeUnit *cu, + OperandInfo& op, + std::vector& opVec, + OpType opType) +{ + std::vector virt_idxs; + std::vector phys_idxs; + + int num_dwords = op.sizeInDWords(); + int virt_idx = op.registerIndex(wf->reservedScalarRegs); + + int phys_idx = -1; + for (int i = 0; i < num_dwords; i++) { + if (opType == OpType::SRC_VEC || opType == OpType::DST_VEC) { + phys_idx = cu->registerManager->mapVgpr(wf, virt_idx + i); + } else { + assert(opType == OpType::SRC_SCALAR || + opType == OpType::DST_SCALAR); + phys_idx = cu->registerManager->mapSgpr(wf, virt_idx + i); + } + virt_idxs.push_back(virt_idx + i); + phys_idxs.push_back(phys_idx); + } + DPRINTF(GPUInst, "%s adding %s %s (%d->%d) operand that uses " + "%d registers.\n", disassemble(), + (opType == OpType::SRC_VEC || opType == OpType::DST_VEC) ? 
+ "vector" : "scalar", + (opType == OpType::SRC_VEC || opType == OpType::SRC_SCALAR) ? + "src" : "dst", virt_idxs[0], phys_idxs[0], num_dwords); + + op.setVirtToPhysMapping(virt_idxs, phys_idxs); + + opVec.emplace_back(op); +} + void GPUStaticInst::initDynOperandInfo(Wavefront *wf, ComputeUnit *cu) { - // Lambda function, as this is only ever used here - auto generateVirtToPhysMap = [&](OperandInfo& op, - std::vector& opVec, - MapRegFn mapFn, OpType opType) - { - std::vector virt_idxs; - std::vector phys_idxs; - - int num_dwords = op.sizeInDWords(); - int virt_idx = op.registerIndex(wf->reservedScalarRegs); - - int phys_idx = -1; - for (int i = 0; i < num_dwords; i++){ - phys_idx = (cu->registerManager->*mapFn)(wf, virt_idx + i); - virt_idxs.push_back(virt_idx + i); - phys_idxs.push_back(phys_idx); - } - DPRINTF(GPUInst, "%s adding %s %s (%d->%d) operand that uses " - "%d registers.\n", disassemble(), - (opType == OpType::SRC_VEC || opType == OpType::DST_VEC) ? - "vector" : "scalar", - (opType == OpType::SRC_VEC || opType == OpType::SRC_SCALAR) ? 
- "src" : "dst", virt_idxs[0], phys_idxs[0], num_dwords); - - op.setVirtToPhysMapping(virt_idxs, phys_idxs); - - opVec.emplace_back(op); - }; - for (auto& srcOp : srcOps) { if (srcOp.isVectorReg()) { - generateVirtToPhysMap(srcOp, srcVecRegOps, - &RegisterManager::mapVgpr, OpType::SRC_VEC); + generateVirtToPhysMap(wf, cu, srcOp, srcVecRegOps, + OpType::SRC_VEC); } else if (srcOp.isScalarReg()) { - generateVirtToPhysMap(srcOp, srcScalarRegOps, - &RegisterManager::mapSgpr, OpType::SRC_SCALAR); + generateVirtToPhysMap(wf, cu, srcOp, srcScalarRegOps, + OpType::SRC_SCALAR); } } for (auto& dstOp : dstOps) { if (dstOp.isVectorReg()) { - generateVirtToPhysMap(dstOp, dstVecRegOps, - &RegisterManager::mapVgpr, OpType::DST_VEC); + generateVirtToPhysMap(wf, cu, dstOp, dstVecRegOps, + OpType::DST_VEC); } else if (dstOp.isScalarReg()) { - generateVirtToPhysMap(dstOp, dstScalarRegOps, - &RegisterManager::mapSgpr, OpType::DST_SCALAR); + generateVirtToPhysMap(wf, cu, dstOp, dstScalarRegOps, + OpType::DST_SCALAR); } } } diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index 156f0e529d..6132ab2d29 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -321,6 +321,9 @@ class GPUStaticInst : public GPUStaticInstFlags int _ipdInstNum; std::bitset _flags; + + void generateVirtToPhysMap(Wavefront *wf, ComputeUnit *cu, OperandInfo& op, + std::vector& opVec, OpType opType); }; class KernelLaunchStaticInst : public GPUStaticInst From 386fb3d1cc0836a512f1f00a190f6e58b7d15d13 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 24 Apr 2024 17:55:22 -0700 Subject: [PATCH 2/6] configs: Fix HSA packet processor address The address has one too many zeros and is therefore placed in a memory region usually used for system memory. As a result this causes failure when trying to run a simulation with a huge amount of memory. Change the address to be within the C000'0000h - FFFF'FFFFh X86 I/O hole as was intended. 
Change-Id: I5d03ac19ea3b2c01a8c431073c12fa1868b3df24 --- configs/example/gpufs/system/system.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py index b3943843e3..7c596f0ccf 100644 --- a/configs/example/gpufs/system/system.py +++ b/configs/example/gpufs/system/system.py @@ -108,7 +108,7 @@ def makeGpuFSSystem(args): system.cpu.append(shader) # This arbitrary address is something in the X86 I/O hole - hsapp_gpu_map_paddr = 0xE00000000 + hsapp_gpu_map_paddr = 0xE0000000 hsapp_pt_walker = VegaPagetableWalker() gpu_hsapp = HSAPacketProcessor( pioAddr=hsapp_gpu_map_paddr, From 2703fb56991f034bd7c633ae3ec367dcc7af7073 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Thu, 2 May 2024 11:39:48 -0700 Subject: [PATCH 3/6] gpu-compute: Fix valgrind memleak complaints Fixes several memory leaks, mostly of small and medium severity. Fixes mismatched new/new[] and delete/delete[] calls. Change-Id: Iedafc409389bd94e45f330bc587d6d72d1971219 --- src/dev/amdgpu/amdgpu_device.cc | 8 +++++++- src/dev/amdgpu/interrupt_handler.cc | 8 ++++++-- src/dev/amdgpu/interrupt_handler.hh | 6 ++++-- src/dev/amdgpu/pm4_packet_processor.cc | 13 +++++++++++-- src/dev/amdgpu/sdma_engine.cc | 2 +- src/gpu-compute/compute_unit.cc | 4 ++-- 6 files changed, 31 insertions(+), 10 deletions(-) diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc index f5bf0192bc..6bb5f9c2c5 100644 --- a/src/dev/amdgpu/amdgpu_device.cc +++ b/src/dev/amdgpu/amdgpu_device.cc @@ -453,6 +453,8 @@ AMDGPUDevice::writeFrame(PacketPtr pkt, Addr offset) auto system = cp->shader()->gpuCmdProc.system(); system->getDeviceMemory(writePkt)->access(writePkt); + + delete writePkt; } void @@ -671,7 +673,10 @@ AMDGPUDevice::getRegVal(uint64_t addr) DPRINTF(AMDGPUDevice, "Getting register 0x%lx = %x\n", fixup_addr, pkt->getLE()); - return pkt->getLE(); + pkt_data = pkt->getLE(); + delete pkt; + + return pkt_data; } void 
@@ -686,6 +691,7 @@ AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value) PacketPtr pkt = Packet::createWrite(request); pkt->dataStatic((uint8_t *)&pkt_data); writeMMIO(pkt, addr); + delete pkt; } void diff --git a/src/dev/amdgpu/interrupt_handler.cc b/src/dev/amdgpu/interrupt_handler.cc index cb99ba7a39..4ad2527108 100644 --- a/src/dev/amdgpu/interrupt_handler.cc +++ b/src/dev/amdgpu/interrupt_handler.cc @@ -130,6 +130,10 @@ AMDGPUInterruptHandler::DmaEvent::process() } else { fatal("Interrupt Handler DMA event returned bad value: %d\n", data); } + + if (dataPtr) { + delete [] dataPtr; + } } void @@ -140,7 +144,7 @@ AMDGPUInterruptHandler::submitWritePointer() Addr paddr = regs.WptrAddr; std::memcpy(dataPtr, ®s.IH_Wptr, sizeof(uint32_t)); - dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 2); + dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 2, dataPtr); dmaWrite(paddr, sizeof(uint32_t), dmaEvent, dataPtr); } @@ -157,7 +161,7 @@ AMDGPUInterruptHandler::submitInterruptCookie() DPRINTF(AMDGPUDevice, "InterruptHandler rptr: 0x%x wptr: 0x%x\n", regs.IH_Rptr, regs.IH_Wptr); - dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 1); + dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 1, dataPtr); dmaWrite(paddr, cookieSize, dmaEvent, dataPtr); interruptQueue.pop(); diff --git a/src/dev/amdgpu/interrupt_handler.hh b/src/dev/amdgpu/interrupt_handler.hh index a895eabafc..b7ac4b29ee 100644 --- a/src/dev/amdgpu/interrupt_handler.hh +++ b/src/dev/amdgpu/interrupt_handler.hh @@ -136,10 +136,12 @@ class AMDGPUInterruptHandler : public DmaDevice private: AMDGPUInterruptHandler *deviceIh; uint32_t data; + uint8_t *dataPtr; public: - DmaEvent(AMDGPUInterruptHandler *deviceIh, uint32_t data) - : Event(), deviceIh(deviceIh), data(data) + DmaEvent(AMDGPUInterruptHandler *deviceIh, uint32_t data, + uint8_t* _dataPtr) + : Event(), deviceIh(deviceIh), data(data), dataPtr(_dataPtr) { setFlags(Event::AutoDelete); } diff --git a/src/dev/amdgpu/pm4_packet_processor.cc 
b/src/dev/amdgpu/pm4_packet_processor.cc index 62e817aa98..a921942678 100644 --- a/src/dev/amdgpu/pm4_packet_processor.cc +++ b/src/dev/amdgpu/pm4_packet_processor.cc @@ -456,8 +456,6 @@ PM4PacketProcessor::mapQueues(PM4Queue *q, PM4MapQueues *pkt) } else { panic("Unknown engine for MQD: %d\n", pkt->engineSel); } - - decodeNext(q); } void @@ -494,6 +492,9 @@ PM4PacketProcessor::processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, "hqdAQL %d.\n", mqd->base, mqd->mqdBase, mqd->aql); gpuDevice->processPendingDoorbells(offset); + + delete pkt; + decodeNext(q); } void @@ -524,6 +525,9 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC, getIpId()); gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2); + + delete pkt; + decodeNext(q); } void @@ -656,6 +660,7 @@ PM4PacketProcessor::unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt) dmaWriteVirt(addr, sizeof(QueueDesc), cb, mqd); queues.erase(id); hsa_pp.unsetDeviceQueueDesc(id, 8); + delete mqd; } } gpuDevice->deallocateAllQueues(); @@ -754,6 +759,7 @@ PM4PacketProcessor::indirectBuffer(PM4Queue *q, PM4IndirectBuf *pkt) q->ibBase(pkt->ibBase); q->wptr(pkt->ibSize * sizeof(uint32_t)); + delete pkt; decodeNext(q); } @@ -766,6 +772,7 @@ PM4PacketProcessor::switchBuffer(PM4Queue *q, PM4SwitchBuf *pkt) DPRINTF(PM4PacketProcessor, "PM4 switching buffer, rptr: %p.\n", q->wptr()); + delete pkt; decodeNext(q); } @@ -784,6 +791,7 @@ PM4PacketProcessor::setUconfigReg(PM4Queue *q, PM4SetUconfigReg *pkt) reg_addr += 0x40000 * getIpId(); gpuDevice->setRegVal(reg_addr, pkt->data); + delete pkt; decodeNext(q); } @@ -800,6 +808,7 @@ PM4PacketProcessor::waitRegMem(PM4Queue *q, PM4WaitRegMem *pkt) DPRINTF(PM4PacketProcessor, " Mask: %lx\n", pkt->mask); DPRINTF(PM4PacketProcessor, " Poll Interval: %lx\n", pkt->pollInterval); + delete pkt; decodeNext(q); } diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index 
070c04fe64..dcf0acac1a 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -1132,7 +1132,7 @@ SDMAEngine::constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data) { DPRINTF(SDMAEngine, "ConstFill to %lx done\n", pkt->addr); - delete fill_data; + delete [] fill_data; delete pkt; decodeNext(q); } diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index daad5e9b40..f12293500d 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -1735,7 +1735,7 @@ ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt) SystemHubEvent *resp_event = new SystemHubEvent(pkt, this); compute_unit->shader->systemHub->sendRequest(pkt, resp_event); } else if (!(sendTimingReq(pkt))) { - retries.push_back(std::make_pair(pkt, gpuDynInst)); + retries.emplace_back(pkt, gpuDynInst); if (gpuDynInst) { DPRINTF(GPUPort, @@ -1772,7 +1772,7 @@ ComputeUnit::ScalarDataPort::MemReqEvent::process() SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort); compute_unit->shader->systemHub->sendRequest(pkt, resp_event); } else if (!(scalarDataPort.sendTimingReq(pkt))) { - scalarDataPort.retries.push_back(pkt); + scalarDataPort.retries.emplace_back(pkt); DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x data req failed!\n", From 8249d6d1cd767949ac908d593c34901eab2b4116 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Thu, 2 May 2024 15:58:47 -0700 Subject: [PATCH 4/6] arch-vega: Remove FP asserts in VOP3 lane manip insts The VOP3 instruction encoding generally states that ABS/NEG modifiers in the instruction encoding are only valid on floating point data types. This is currently coded in gem5 to mean floating point *instructions*. For untyped instructions like V_CNDMASK_B32, we don't actually know what the data type is. We must trust that the compiler did not attempt to apply these bits to non-FP data types. This commit simply removes the asserts. 
The ABS/NEG modifiers are therefore ignored which is consistent with the ISA documentation. This is done on the lane manipulation instructions V_CNDMASK_B32, V_READLANE_B32, and V_WRITELANE_B32 which are typically used to mask off or move data between registers. Other bitwise instructions (e.g., V_OR_B32) keep the asserts as bitwise operations on FP types are generally illegal in languages like C++. Change-Id: I478c5272ba96383a063b2828de21d60948b25c8f --- src/arch/amdgpu/vega/insts/vop3.cc | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/src/arch/amdgpu/vega/insts/vop3.cc b/src/arch/amdgpu/vega/insts/vop3.cc index 59d72ac9ed..18446d2e2b 100644 --- a/src/arch/amdgpu/vega/insts/vop3.cc +++ b/src/arch/amdgpu/vega/insts/vop3.cc @@ -66,16 +66,6 @@ namespace VegaISA src1.readSrc(); vcc.read(); - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { if (wf->execMask(lane)) { vdst[lane] = bits(vcc.rawData(), lane) @@ -8440,16 +8430,6 @@ namespace VegaISA src0.readSrc(); src1.read(); - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - sdst = src0[src1.rawData() & 0x3f]; sdst.write(); @@ -8484,16 +8464,6 @@ namespace VegaISA src1.read(); vdst.read(); - /** - * input modifiers are supported by FP operations only - */ - assert(!(instData.ABS & 0x1)); - assert(!(instData.ABS & 0x2)); - assert(!(instData.ABS & 0x4)); - assert(!(extData.NEG & 0x1)); - assert(!(extData.NEG & 0x2)); - assert(!(extData.NEG & 0x4)); - vdst[src1.rawData() & 0x3f] = src0.rawData(); vdst.write(); From 
29f63f630b3db1435569dd2157a6198dc4155084 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 3 May 2024 10:06:34 -0700 Subject: [PATCH 5/6] dev-amdgpu: Correct missing GART warning SDMA ptePde packets are generating a warning that a GART address is missing, causing a wrong address to be clobbered by the operation. This commit fixes this by converting the GART address when the queue is running in privileged mode, which is the only mode allowed to use GART addresses. This removes the warnings and writes to the correct memory region. Change-Id: I64acac308db2431c5996b876bf4cda704f51cf25 --- src/dev/amdgpu/sdma_engine.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc index dcf0acac1a..735be554b4 100644 --- a/src/dev/amdgpu/sdma_engine.cc +++ b/src/dev/amdgpu/sdma_engine.cc @@ -1000,6 +1000,9 @@ SDMAEngine::ptePde(SDMAQueue *q, sdmaPtePde *pkt) sizeof(uint64_t) * pkt->count, 0, cb); } else { + if (q->priv()) { + pkt->dest = getGARTAddr(pkt->dest); + } auto cb = new DmaVirtCallback( [ = ] (const uint64_t &) { ptePdeDone(q, pkt, dmaBuffer); }); dmaWriteVirt(pkt->dest, sizeof(uint64_t) * pkt->count, cb, From 3490d5bf189eb2600a35572341a8073fdbd0d333 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 24 Apr 2024 10:42:58 -0700 Subject: [PATCH 6/6] gpu-compute: Add DebugFlag for LDS This prints what values are read/written to LDS and the previous value on write. This is useful for debugging problems with LDS instructions. 
Change-Id: I30063327bec1a1a808914a018467d5d78d5d58b4 --- src/gpu-compute/SConscript | 3 ++- src/gpu-compute/lds_state.hh | 50 ++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript index e4536ba2a5..23e3377f50 100644 --- a/src/gpu-compute/SConscript +++ b/src/gpu-compute/SConscript @@ -84,6 +84,7 @@ DebugFlag('GPUExec') DebugFlag('GPUFetch') DebugFlag('GPUInst') DebugFlag('GPUKernelInfo') +DebugFlag('GPULDS') DebugFlag('GPUMem') DebugFlag('GPUPort') DebugFlag('GPUPrefetch') @@ -106,4 +107,4 @@ DebugFlag('WavefrontStack') CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch', 'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync', 'GPUTLB', 'GPUVRF', 'GPURFC', 'GPUWgLatency', - 'GPUKernelInfo', 'GPUInitAbi']) + 'GPUKernelInfo', 'GPUInitAbi', 'GPULDS']) diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh index 3228b7822c..d336d35079 100644 --- a/src/gpu-compute/lds_state.hh +++ b/src/gpu-compute/lds_state.hh @@ -39,6 +39,7 @@ #include #include +#include "debug/GPULDS.hh" #include "gpu-compute/misc.hh" #include "mem/port.hh" #include "params/LdsState.hh" @@ -75,10 +76,30 @@ class LdsChunk * chunk allocated to this WG we return 0. 
*/ if (index >= chunk.size()) { + DPRINTF(GPULDS, "LDS[%d][%d]: Read 0 beyond size (%ld)\n", + dispatchId, wgId, chunk.size()); return (T)0; } T *p0 = (T *) (&(chunk.at(index))); + + if (sizeof(T) <= 4) { + [[maybe_unused]] uint32_t int_val = + *reinterpret_cast(p0); + DPRINTF(GPULDS, "LDS[%d][%d]: Read %08x from index %d\n", + dispatchId, wgId, int_val, index); + } else if (sizeof(T) <= 8) { + [[maybe_unused]] uint64_t int_val = + *reinterpret_cast(p0); + DPRINTF(GPULDS, "LDS[%d][%d]: Read %016lx from index %d\n", + dispatchId, wgId, int_val, index); + } else if (sizeof(T) <= 16) { + [[maybe_unused]] uint64_t *int_vals = + reinterpret_cast(p0); + DPRINTF(GPULDS, "LDS[%d][%d]: Read %016lx%016lx from index %d\n", + dispatchId, wgId, int_vals[1], int_vals[0], index); + } + return *p0; } @@ -94,10 +115,33 @@ class LdsChunk * chunk allocated to this WG are dropped. */ if (index >= chunk.size()) { + DPRINTF(GPULDS, "LDS[%d][%d]: Ignoring write beyond size (%ld)\n", + dispatchId, wgId, chunk.size()); return; } T *p0 = (T *) (&(chunk.at(index))); + + if (sizeof(T) <= 4) { + [[maybe_unused]] uint32_t prev_val = + *reinterpret_cast(p0); + DPRINTF(GPULDS, "LDS[%d][%d]: Write %08lx to index %d (was " + "%08lx)\n", dispatchId, wgId, value, index, prev_val); + } else if (sizeof(T) <= 8) { + [[maybe_unused]] uint64_t prev_val = + *reinterpret_cast(p0); + DPRINTF(GPULDS, "LDS[%d][%d]: Write %016lx to index %d (was " + "%016lx)\n", dispatchId, wgId, value, index, prev_val); + } else if (sizeof(T) <= 16) { + [[maybe_unused]] uint64_t *prev_vals = + reinterpret_cast(p0); + [[maybe_unused]] const uint64_t *next_vals = + reinterpret_cast(&value); + DPRINTF(GPULDS, "LDS[%d][%d]: Write %016lx%016lx to index %d " + "(was %016lx%016lx)\n", dispatchId, wgId, next_vals[1], + next_vals[0], index, prev_vals[1], prev_vals[0]); + } + *p0 = value; } @@ -131,6 +175,9 @@ class LdsChunk return chunk.size(); } + uint32_t dispatchId; + uint32_t wgId; + protected: // the actual data store for this 
slice of the LDS std::vector chunk; @@ -402,6 +449,9 @@ class LdsState: public ClockedObject // make an entry for this workgroup refCounter[dispatchId][wgId] = 0; + chunkMap[dispatchId][wgId].dispatchId = dispatchId; + chunkMap[dispatchId][wgId].wgId = wgId; + return &chunkMap[dispatchId][wgId]; } }