gpu: Consolidated fixes for v24.0 (#1103)

Includes fixes for several bugs reported via email, self-found issues, and
internal reports. Also includes runs through Valgrind and UBSan. See
individual commits for more details.
This commit is contained in:
Matthew Poremba
2024-05-06 07:35:57 -07:00
committed by GitHub
14 changed files with 141 additions and 84 deletions

View File

@@ -108,7 +108,7 @@ def makeGpuFSSystem(args):
system.cpu.append(shader)
# This arbitrary address is something in the X86 I/O hole
hsapp_gpu_map_paddr = 0xE00000000
hsapp_gpu_map_paddr = 0xE0000000
hsapp_pt_walker = VegaPagetableWalker()
gpu_hsapp = HSAPacketProcessor(
pioAddr=hsapp_gpu_map_paddr,

View File

@@ -1224,7 +1224,8 @@ namespace VegaISA
src0.read();
src1.read();
sdst = src0.rawData() * src1.rawData();
ScalarRegI64 tmp = src0.rawData() * src1.rawData();
sdst = tmp & mask(32);
sdst.write();
} // execute

View File

@@ -66,16 +66,6 @@ namespace VegaISA
src1.readSrc();
vcc.read();
/**
* input modifiers are supported by FP operations only
*/
assert(!(instData.ABS & 0x1));
assert(!(instData.ABS & 0x2));
assert(!(instData.ABS & 0x4));
assert(!(extData.NEG & 0x1));
assert(!(extData.NEG & 0x2));
assert(!(extData.NEG & 0x4));
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
vdst[lane] = bits(vcc.rawData(), lane)
@@ -8440,16 +8430,6 @@ namespace VegaISA
src0.readSrc();
src1.read();
/**
* input modifiers are supported by FP operations only
*/
assert(!(instData.ABS & 0x1));
assert(!(instData.ABS & 0x2));
assert(!(instData.ABS & 0x4));
assert(!(extData.NEG & 0x1));
assert(!(extData.NEG & 0x2));
assert(!(extData.NEG & 0x4));
sdst = src0[src1.rawData() & 0x3f];
sdst.write();
@@ -8484,16 +8464,6 @@ namespace VegaISA
src1.read();
vdst.read();
/**
* input modifiers are supported by FP operations only
*/
assert(!(instData.ABS & 0x1));
assert(!(instData.ABS & 0x2));
assert(!(instData.ABS & 0x4));
assert(!(extData.NEG & 0x1));
assert(!(extData.NEG & 0x2));
assert(!(extData.NEG & 0x4));
vdst[src1.rawData() & 0x3f] = src0.rawData();
vdst.write();
@@ -8583,7 +8553,7 @@ namespace VegaISA
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
threadMask = ((1LL << lane) - 1LL);
threadMask = ((1ULL << lane) - 1ULL);
vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) +
src1[lane];
}
@@ -8633,7 +8603,7 @@ namespace VegaISA
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
threadMask = ((1LL << lane) - 1LL);
threadMask = ((1ULL << lane) - 1ULL);
vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) +
src1[lane];
}

View File

@@ -490,7 +490,7 @@ namespace VegaISA
typename std::enable_if<Condition, void>::type
setBit(int bit, int bit_val)
{
DataType &sgpr = *((DataType*)srfData.data());
GEM5_ALIGNED(8) DataType &sgpr = *((DataType*)srfData.data());
replaceBits(sgpr, bit, bit_val);
}
@@ -739,7 +739,7 @@ namespace VegaISA
* of a register is 1 dword. this class will take care to do the
* proper packing/unpacking of sub-dword operands.
*/
std::array<ScalarRegU32, NumDwords> srfData;
GEM5_ALIGNED(8) std::array<ScalarRegU32, NumDwords> srfData;
};
// typedefs for the various sizes/types of scalar operands

View File

@@ -453,6 +453,8 @@ AMDGPUDevice::writeFrame(PacketPtr pkt, Addr offset)
auto system = cp->shader()->gpuCmdProc.system();
system->getDeviceMemory(writePkt)->access(writePkt);
delete writePkt;
}
void
@@ -671,7 +673,10 @@ AMDGPUDevice::getRegVal(uint64_t addr)
DPRINTF(AMDGPUDevice, "Getting register 0x%lx = %x\n",
fixup_addr, pkt->getLE<uint32_t>());
return pkt->getLE<uint32_t>();
pkt_data = pkt->getLE<uint32_t>();
delete pkt;
return pkt_data;
}
void
@@ -686,6 +691,7 @@ AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value)
PacketPtr pkt = Packet::createWrite(request);
pkt->dataStatic((uint8_t *)&pkt_data);
writeMMIO(pkt, addr);
delete pkt;
}
void

View File

@@ -130,6 +130,10 @@ AMDGPUInterruptHandler::DmaEvent::process()
} else {
fatal("Interrupt Handler DMA event returned bad value: %d\n", data);
}
if (dataPtr) {
delete [] dataPtr;
}
}
void
@@ -140,7 +144,7 @@ AMDGPUInterruptHandler::submitWritePointer()
Addr paddr = regs.WptrAddr;
std::memcpy(dataPtr, &regs.IH_Wptr, sizeof(uint32_t));
dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 2);
dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 2, dataPtr);
dmaWrite(paddr, sizeof(uint32_t), dmaEvent, dataPtr);
}
@@ -157,7 +161,7 @@ AMDGPUInterruptHandler::submitInterruptCookie()
DPRINTF(AMDGPUDevice, "InterruptHandler rptr: 0x%x wptr: 0x%x\n",
regs.IH_Rptr, regs.IH_Wptr);
dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 1);
dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 1, dataPtr);
dmaWrite(paddr, cookieSize, dmaEvent, dataPtr);
interruptQueue.pop();

View File

@@ -136,10 +136,12 @@ class AMDGPUInterruptHandler : public DmaDevice
private:
AMDGPUInterruptHandler *deviceIh;
uint32_t data;
uint8_t *dataPtr;
public:
DmaEvent(AMDGPUInterruptHandler *deviceIh, uint32_t data)
: Event(), deviceIh(deviceIh), data(data)
DmaEvent(AMDGPUInterruptHandler *deviceIh, uint32_t data,
uint8_t* _dataPtr)
: Event(), deviceIh(deviceIh), data(data), dataPtr(_dataPtr)
{
setFlags(Event::AutoDelete);
}

View File

@@ -456,8 +456,6 @@ PM4PacketProcessor::mapQueues(PM4Queue *q, PM4MapQueues *pkt)
} else {
panic("Unknown engine for MQD: %d\n", pkt->engineSel);
}
decodeNext(q);
}
void
@@ -494,6 +492,9 @@ PM4PacketProcessor::processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
"hqdAQL %d.\n", mqd->base, mqd->mqdBase, mqd->aql);
gpuDevice->processPendingDoorbells(offset);
delete pkt;
decodeNext(q);
}
void
@@ -524,6 +525,9 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC, getIpId());
gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2);
delete pkt;
decodeNext(q);
}
void
@@ -656,6 +660,7 @@ PM4PacketProcessor::unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt)
dmaWriteVirt(addr, sizeof(QueueDesc), cb, mqd);
queues.erase(id);
hsa_pp.unsetDeviceQueueDesc(id, 8);
delete mqd;
}
}
gpuDevice->deallocateAllQueues();
@@ -754,6 +759,7 @@ PM4PacketProcessor::indirectBuffer(PM4Queue *q, PM4IndirectBuf *pkt)
q->ibBase(pkt->ibBase);
q->wptr(pkt->ibSize * sizeof(uint32_t));
delete pkt;
decodeNext(q);
}
@@ -766,6 +772,7 @@ PM4PacketProcessor::switchBuffer(PM4Queue *q, PM4SwitchBuf *pkt)
DPRINTF(PM4PacketProcessor, "PM4 switching buffer, rptr: %p.\n",
q->wptr());
delete pkt;
decodeNext(q);
}
@@ -784,6 +791,7 @@ PM4PacketProcessor::setUconfigReg(PM4Queue *q, PM4SetUconfigReg *pkt)
reg_addr += 0x40000 * getIpId();
gpuDevice->setRegVal(reg_addr, pkt->data);
delete pkt;
decodeNext(q);
}
@@ -800,6 +808,7 @@ PM4PacketProcessor::waitRegMem(PM4Queue *q, PM4WaitRegMem *pkt)
DPRINTF(PM4PacketProcessor, " Mask: %lx\n", pkt->mask);
DPRINTF(PM4PacketProcessor, " Poll Interval: %lx\n", pkt->pollInterval);
delete pkt;
decodeNext(q);
}

View File

@@ -1000,6 +1000,9 @@ SDMAEngine::ptePde(SDMAQueue *q, sdmaPtePde *pkt)
sizeof(uint64_t) * pkt->count, 0,
cb);
} else {
if (q->priv()) {
pkt->dest = getGARTAddr(pkt->dest);
}
auto cb = new DmaVirtCallback<uint64_t>(
[ = ] (const uint64_t &) { ptePdeDone(q, pkt, dmaBuffer); });
dmaWriteVirt(pkt->dest, sizeof(uint64_t) * pkt->count, cb,
@@ -1132,7 +1135,7 @@ SDMAEngine::constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data)
{
DPRINTF(SDMAEngine, "ConstFill to %lx done\n", pkt->addr);
delete fill_data;
delete [] fill_data;
delete pkt;
decodeNext(q);
}

View File

@@ -84,6 +84,7 @@ DebugFlag('GPUExec')
DebugFlag('GPUFetch')
DebugFlag('GPUInst')
DebugFlag('GPUKernelInfo')
DebugFlag('GPULDS')
DebugFlag('GPUMem')
DebugFlag('GPUPort')
DebugFlag('GPUPrefetch')
@@ -106,4 +107,4 @@ DebugFlag('WavefrontStack')
CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync',
'GPUTLB', 'GPUVRF', 'GPURFC', 'GPUWgLatency',
'GPUKernelInfo', 'GPUInitAbi'])
'GPUKernelInfo', 'GPUInitAbi', 'GPULDS'])

View File

@@ -1746,7 +1746,7 @@ ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
} else if (!(sendTimingReq(pkt))) {
retries.push_back(std::make_pair(pkt, gpuDynInst));
retries.emplace_back(pkt, gpuDynInst);
if (gpuDynInst) {
DPRINTF(GPUPort,
@@ -1783,7 +1783,7 @@ ComputeUnit::ScalarDataPort::MemReqEvent::process()
SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort);
compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
} else if (!(scalarDataPort.sendTimingReq(pkt))) {
scalarDataPort.retries.push_back(pkt);
scalarDataPort.retries.emplace_back(pkt);
DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: addr %#x data req failed!\n",

View File

@@ -54,55 +54,63 @@ GPUStaticInst::disassemble()
return disassembly;
}
void
GPUStaticInst::generateVirtToPhysMap(Wavefront *wf, ComputeUnit *cu,
OperandInfo& op,
std::vector<OperandInfo>& opVec,
OpType opType)
{
std::vector<int> virt_idxs;
std::vector<int> phys_idxs;
int num_dwords = op.sizeInDWords();
int virt_idx = op.registerIndex(wf->reservedScalarRegs);
int phys_idx = -1;
for (int i = 0; i < num_dwords; i++) {
if (opType == OpType::SRC_VEC || opType == OpType::DST_VEC) {
phys_idx = cu->registerManager->mapVgpr(wf, virt_idx + i);
} else {
assert(opType == OpType::SRC_SCALAR ||
opType == OpType::DST_SCALAR);
phys_idx = cu->registerManager->mapSgpr(wf, virt_idx + i);
}
virt_idxs.push_back(virt_idx + i);
phys_idxs.push_back(phys_idx);
}
DPRINTF(GPUInst, "%s adding %s %s (%d->%d) operand that uses "
"%d registers.\n", disassemble(),
(opType == OpType::SRC_VEC || opType == OpType::DST_VEC) ?
"vector" : "scalar",
(opType == OpType::SRC_VEC || opType == OpType::SRC_SCALAR) ?
"src" : "dst", virt_idxs[0], phys_idxs[0], num_dwords);
op.setVirtToPhysMapping(virt_idxs, phys_idxs);
opVec.emplace_back(op);
}
void
GPUStaticInst::initDynOperandInfo(Wavefront *wf, ComputeUnit *cu)
{
// Lambda function, as this is only ever used here
auto generateVirtToPhysMap = [&](OperandInfo& op,
std::vector<OperandInfo>& opVec,
MapRegFn mapFn, OpType opType)
{
std::vector<int> virt_idxs;
std::vector<int> phys_idxs;
int num_dwords = op.sizeInDWords();
int virt_idx = op.registerIndex(wf->reservedScalarRegs);
int phys_idx = -1;
for (int i = 0; i < num_dwords; i++){
phys_idx = (cu->registerManager->*mapFn)(wf, virt_idx + i);
virt_idxs.push_back(virt_idx + i);
phys_idxs.push_back(phys_idx);
}
DPRINTF(GPUInst, "%s adding %s %s (%d->%d) operand that uses "
"%d registers.\n", disassemble(),
(opType == OpType::SRC_VEC || opType == OpType::DST_VEC) ?
"vector" : "scalar",
(opType == OpType::SRC_VEC || opType == OpType::SRC_SCALAR) ?
"src" : "dst", virt_idxs[0], phys_idxs[0], num_dwords);
op.setVirtToPhysMapping(virt_idxs, phys_idxs);
opVec.emplace_back(op);
};
for (auto& srcOp : srcOps) {
if (srcOp.isVectorReg()) {
generateVirtToPhysMap(srcOp, srcVecRegOps,
&RegisterManager::mapVgpr, OpType::SRC_VEC);
generateVirtToPhysMap(wf, cu, srcOp, srcVecRegOps,
OpType::SRC_VEC);
} else if (srcOp.isScalarReg()) {
generateVirtToPhysMap(srcOp, srcScalarRegOps,
&RegisterManager::mapSgpr, OpType::SRC_SCALAR);
generateVirtToPhysMap(wf, cu, srcOp, srcScalarRegOps,
OpType::SRC_SCALAR);
}
}
for (auto& dstOp : dstOps) {
if (dstOp.isVectorReg()) {
generateVirtToPhysMap(dstOp, dstVecRegOps,
&RegisterManager::mapVgpr, OpType::DST_VEC);
generateVirtToPhysMap(wf, cu, dstOp, dstVecRegOps,
OpType::DST_VEC);
} else if (dstOp.isScalarReg()) {
generateVirtToPhysMap(dstOp, dstScalarRegOps,
&RegisterManager::mapSgpr, OpType::DST_SCALAR);
generateVirtToPhysMap(wf, cu, dstOp, dstScalarRegOps,
OpType::DST_SCALAR);
}
}
}

View File

@@ -321,6 +321,9 @@ class GPUStaticInst : public GPUStaticInstFlags
int _ipdInstNum;
std::bitset<Num_Flags> _flags;
void generateVirtToPhysMap(Wavefront *wf, ComputeUnit *cu, OperandInfo& op,
std::vector<OperandInfo>& opVec, OpType opType);
};
class KernelLaunchStaticInst : public GPUStaticInst

View File

@@ -39,6 +39,7 @@
#include <utility>
#include <vector>
#include "debug/GPULDS.hh"
#include "gpu-compute/misc.hh"
#include "mem/port.hh"
#include "params/LdsState.hh"
@@ -75,10 +76,30 @@ class LdsChunk
* chunk allocated to this WG we return 0.
*/
if (index >= chunk.size()) {
DPRINTF(GPULDS, "LDS[%d][%d]: Read 0 beyond size (%ld)\n",
dispatchId, wgId, chunk.size());
return (T)0;
}
T *p0 = (T *) (&(chunk.at(index)));
if (sizeof(T) <= 4) {
[[maybe_unused]] uint32_t int_val =
*reinterpret_cast<uint32_t*>(p0);
DPRINTF(GPULDS, "LDS[%d][%d]: Read %08x from index %d\n",
dispatchId, wgId, int_val, index);
} else if (sizeof(T) <= 8) {
[[maybe_unused]] uint64_t int_val =
*reinterpret_cast<uint64_t*>(p0);
DPRINTF(GPULDS, "LDS[%d][%d]: Read %016lx from index %d\n",
dispatchId, wgId, int_val, index);
} else if (sizeof(T) <= 16) {
[[maybe_unused]] uint64_t *int_vals =
reinterpret_cast<uint64_t*>(p0);
DPRINTF(GPULDS, "LDS[%d][%d]: Read %016lx%016lx from index %d\n",
dispatchId, wgId, int_vals[1], int_vals[0], index);
}
return *p0;
}
@@ -94,10 +115,33 @@ class LdsChunk
* chunk allocated to this WG are dropped.
*/
if (index >= chunk.size()) {
DPRINTF(GPULDS, "LDS[%d][%d]: Ignoring write beyond size (%ld)\n",
dispatchId, wgId, chunk.size());
return;
}
T *p0 = (T *) (&(chunk.at(index)));
if (sizeof(T) <= 4) {
[[maybe_unused]] uint32_t prev_val =
*reinterpret_cast<uint32_t*>(p0);
DPRINTF(GPULDS, "LDS[%d][%d]: Write %08lx to index %d (was "
"%08lx)\n", dispatchId, wgId, value, index, prev_val);
} else if (sizeof(T) <= 8) {
[[maybe_unused]] uint64_t prev_val =
*reinterpret_cast<uint64_t*>(p0);
DPRINTF(GPULDS, "LDS[%d][%d]: Write %016lx to index %d (was "
"%016lx)\n", dispatchId, wgId, value, index, prev_val);
} else if (sizeof(T) <= 16) {
[[maybe_unused]] uint64_t *prev_vals =
reinterpret_cast<uint64_t*>(p0);
[[maybe_unused]] const uint64_t *next_vals =
reinterpret_cast<const uint64_t*>(&value);
DPRINTF(GPULDS, "LDS[%d][%d]: Write %016lx%016lx to index %d "
"(was %016lx%016lx)\n", dispatchId, wgId, next_vals[1],
next_vals[0], index, prev_vals[1], prev_vals[0]);
}
*p0 = value;
}
@@ -131,6 +175,9 @@ class LdsChunk
return chunk.size();
}
uint32_t dispatchId;
uint32_t wgId;
protected:
// the actual data store for this slice of the LDS
std::vector<uint8_t> chunk;
@@ -402,6 +449,9 @@ class LdsState: public ClockedObject
// make an entry for this workgroup
refCounter[dispatchId][wgId] = 0;
chunkMap[dispatchId][wgId].dispatchId = dispatchId;
chunkMap[dispatchId][wgId].wgId = wgId;
return &chunkMap[dispatchId][wgId];
}
}