gpu: Consolidated fixes for v24.0 (#1103)
Includes fixes for several bugs that were reported via email, found by the authors, or raised in internal reports. Also includes fixes from runs through Valgrind and UBSan. See the individual commits for more details.
This commit is contained in:
@@ -108,7 +108,7 @@ def makeGpuFSSystem(args):
|
||||
system.cpu.append(shader)
|
||||
|
||||
# This arbitrary address is something in the X86 I/O hole
|
||||
hsapp_gpu_map_paddr = 0xE00000000
|
||||
hsapp_gpu_map_paddr = 0xE0000000
|
||||
hsapp_pt_walker = VegaPagetableWalker()
|
||||
gpu_hsapp = HSAPacketProcessor(
|
||||
pioAddr=hsapp_gpu_map_paddr,
|
||||
|
||||
@@ -1224,7 +1224,8 @@ namespace VegaISA
|
||||
src0.read();
|
||||
src1.read();
|
||||
|
||||
sdst = src0.rawData() * src1.rawData();
|
||||
ScalarRegI64 tmp = src0.rawData() * src1.rawData();
|
||||
sdst = tmp & mask(32);
|
||||
|
||||
sdst.write();
|
||||
} // execute
|
||||
|
||||
@@ -66,16 +66,6 @@ namespace VegaISA
|
||||
src1.readSrc();
|
||||
vcc.read();
|
||||
|
||||
/**
|
||||
* input modifiers are supported by FP operations only
|
||||
*/
|
||||
assert(!(instData.ABS & 0x1));
|
||||
assert(!(instData.ABS & 0x2));
|
||||
assert(!(instData.ABS & 0x4));
|
||||
assert(!(extData.NEG & 0x1));
|
||||
assert(!(extData.NEG & 0x2));
|
||||
assert(!(extData.NEG & 0x4));
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (wf->execMask(lane)) {
|
||||
vdst[lane] = bits(vcc.rawData(), lane)
|
||||
@@ -8440,16 +8430,6 @@ namespace VegaISA
|
||||
src0.readSrc();
|
||||
src1.read();
|
||||
|
||||
/**
|
||||
* input modifiers are supported by FP operations only
|
||||
*/
|
||||
assert(!(instData.ABS & 0x1));
|
||||
assert(!(instData.ABS & 0x2));
|
||||
assert(!(instData.ABS & 0x4));
|
||||
assert(!(extData.NEG & 0x1));
|
||||
assert(!(extData.NEG & 0x2));
|
||||
assert(!(extData.NEG & 0x4));
|
||||
|
||||
sdst = src0[src1.rawData() & 0x3f];
|
||||
|
||||
sdst.write();
|
||||
@@ -8484,16 +8464,6 @@ namespace VegaISA
|
||||
src1.read();
|
||||
vdst.read();
|
||||
|
||||
/**
|
||||
* input modifiers are supported by FP operations only
|
||||
*/
|
||||
assert(!(instData.ABS & 0x1));
|
||||
assert(!(instData.ABS & 0x2));
|
||||
assert(!(instData.ABS & 0x4));
|
||||
assert(!(extData.NEG & 0x1));
|
||||
assert(!(extData.NEG & 0x2));
|
||||
assert(!(extData.NEG & 0x4));
|
||||
|
||||
vdst[src1.rawData() & 0x3f] = src0.rawData();
|
||||
|
||||
vdst.write();
|
||||
@@ -8583,7 +8553,7 @@ namespace VegaISA
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (wf->execMask(lane)) {
|
||||
threadMask = ((1LL << lane) - 1LL);
|
||||
threadMask = ((1ULL << lane) - 1ULL);
|
||||
vdst[lane] = popCount(src0[lane] & bits(threadMask, 31, 0)) +
|
||||
src1[lane];
|
||||
}
|
||||
@@ -8633,7 +8603,7 @@ namespace VegaISA
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (wf->execMask(lane)) {
|
||||
threadMask = ((1LL << lane) - 1LL);
|
||||
threadMask = ((1ULL << lane) - 1ULL);
|
||||
vdst[lane] = popCount(src0[lane] & bits(threadMask, 63, 32)) +
|
||||
src1[lane];
|
||||
}
|
||||
|
||||
@@ -490,7 +490,7 @@ namespace VegaISA
|
||||
typename std::enable_if<Condition, void>::type
|
||||
setBit(int bit, int bit_val)
|
||||
{
|
||||
DataType &sgpr = *((DataType*)srfData.data());
|
||||
GEM5_ALIGNED(8) DataType &sgpr = *((DataType*)srfData.data());
|
||||
replaceBits(sgpr, bit, bit_val);
|
||||
}
|
||||
|
||||
@@ -739,7 +739,7 @@ namespace VegaISA
|
||||
* of a register is 1 dword. this class will take care to do the
|
||||
* proper packing/unpacking of sub-dword operands.
|
||||
*/
|
||||
std::array<ScalarRegU32, NumDwords> srfData;
|
||||
GEM5_ALIGNED(8) std::array<ScalarRegU32, NumDwords> srfData;
|
||||
};
|
||||
|
||||
// typedefs for the various sizes/types of scalar operands
|
||||
|
||||
@@ -453,6 +453,8 @@ AMDGPUDevice::writeFrame(PacketPtr pkt, Addr offset)
|
||||
|
||||
auto system = cp->shader()->gpuCmdProc.system();
|
||||
system->getDeviceMemory(writePkt)->access(writePkt);
|
||||
|
||||
delete writePkt;
|
||||
}
|
||||
|
||||
void
|
||||
@@ -671,7 +673,10 @@ AMDGPUDevice::getRegVal(uint64_t addr)
|
||||
DPRINTF(AMDGPUDevice, "Getting register 0x%lx = %x\n",
|
||||
fixup_addr, pkt->getLE<uint32_t>());
|
||||
|
||||
return pkt->getLE<uint32_t>();
|
||||
pkt_data = pkt->getLE<uint32_t>();
|
||||
delete pkt;
|
||||
|
||||
return pkt_data;
|
||||
}
|
||||
|
||||
void
|
||||
@@ -686,6 +691,7 @@ AMDGPUDevice::setRegVal(uint64_t addr, uint32_t value)
|
||||
PacketPtr pkt = Packet::createWrite(request);
|
||||
pkt->dataStatic((uint8_t *)&pkt_data);
|
||||
writeMMIO(pkt, addr);
|
||||
delete pkt;
|
||||
}
|
||||
|
||||
void
|
||||
|
||||
@@ -130,6 +130,10 @@ AMDGPUInterruptHandler::DmaEvent::process()
|
||||
} else {
|
||||
fatal("Interrupt Handler DMA event returned bad value: %d\n", data);
|
||||
}
|
||||
|
||||
if (dataPtr) {
|
||||
delete [] dataPtr;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
@@ -140,7 +144,7 @@ AMDGPUInterruptHandler::submitWritePointer()
|
||||
Addr paddr = regs.WptrAddr;
|
||||
std::memcpy(dataPtr, &regs.IH_Wptr, sizeof(uint32_t));
|
||||
|
||||
dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 2);
|
||||
dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 2, dataPtr);
|
||||
dmaWrite(paddr, sizeof(uint32_t), dmaEvent, dataPtr);
|
||||
}
|
||||
|
||||
@@ -157,7 +161,7 @@ AMDGPUInterruptHandler::submitInterruptCookie()
|
||||
|
||||
DPRINTF(AMDGPUDevice, "InterruptHandler rptr: 0x%x wptr: 0x%x\n",
|
||||
regs.IH_Rptr, regs.IH_Wptr);
|
||||
dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 1);
|
||||
dmaEvent = new AMDGPUInterruptHandler::DmaEvent(this, 1, dataPtr);
|
||||
dmaWrite(paddr, cookieSize, dmaEvent, dataPtr);
|
||||
|
||||
interruptQueue.pop();
|
||||
|
||||
@@ -136,10 +136,12 @@ class AMDGPUInterruptHandler : public DmaDevice
|
||||
private:
|
||||
AMDGPUInterruptHandler *deviceIh;
|
||||
uint32_t data;
|
||||
uint8_t *dataPtr;
|
||||
|
||||
public:
|
||||
DmaEvent(AMDGPUInterruptHandler *deviceIh, uint32_t data)
|
||||
: Event(), deviceIh(deviceIh), data(data)
|
||||
DmaEvent(AMDGPUInterruptHandler *deviceIh, uint32_t data,
|
||||
uint8_t* _dataPtr)
|
||||
: Event(), deviceIh(deviceIh), data(data), dataPtr(_dataPtr)
|
||||
{
|
||||
setFlags(Event::AutoDelete);
|
||||
}
|
||||
|
||||
@@ -456,8 +456,6 @@ PM4PacketProcessor::mapQueues(PM4Queue *q, PM4MapQueues *pkt)
|
||||
} else {
|
||||
panic("Unknown engine for MQD: %d\n", pkt->engineSel);
|
||||
}
|
||||
|
||||
decodeNext(q);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -494,6 +492,9 @@ PM4PacketProcessor::processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
|
||||
"hqdAQL %d.\n", mqd->base, mqd->mqdBase, mqd->aql);
|
||||
|
||||
gpuDevice->processPendingDoorbells(offset);
|
||||
|
||||
delete pkt;
|
||||
decodeNext(q);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -524,6 +525,9 @@ PM4PacketProcessor::processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr,
|
||||
gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC, getIpId());
|
||||
|
||||
gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2);
|
||||
|
||||
delete pkt;
|
||||
decodeNext(q);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -656,6 +660,7 @@ PM4PacketProcessor::unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt)
|
||||
dmaWriteVirt(addr, sizeof(QueueDesc), cb, mqd);
|
||||
queues.erase(id);
|
||||
hsa_pp.unsetDeviceQueueDesc(id, 8);
|
||||
delete mqd;
|
||||
}
|
||||
}
|
||||
gpuDevice->deallocateAllQueues();
|
||||
@@ -754,6 +759,7 @@ PM4PacketProcessor::indirectBuffer(PM4Queue *q, PM4IndirectBuf *pkt)
|
||||
q->ibBase(pkt->ibBase);
|
||||
q->wptr(pkt->ibSize * sizeof(uint32_t));
|
||||
|
||||
delete pkt;
|
||||
decodeNext(q);
|
||||
}
|
||||
|
||||
@@ -766,6 +772,7 @@ PM4PacketProcessor::switchBuffer(PM4Queue *q, PM4SwitchBuf *pkt)
|
||||
DPRINTF(PM4PacketProcessor, "PM4 switching buffer, rptr: %p.\n",
|
||||
q->wptr());
|
||||
|
||||
delete pkt;
|
||||
decodeNext(q);
|
||||
}
|
||||
|
||||
@@ -784,6 +791,7 @@ PM4PacketProcessor::setUconfigReg(PM4Queue *q, PM4SetUconfigReg *pkt)
|
||||
reg_addr += 0x40000 * getIpId();
|
||||
gpuDevice->setRegVal(reg_addr, pkt->data);
|
||||
|
||||
delete pkt;
|
||||
decodeNext(q);
|
||||
}
|
||||
|
||||
@@ -800,6 +808,7 @@ PM4PacketProcessor::waitRegMem(PM4Queue *q, PM4WaitRegMem *pkt)
|
||||
DPRINTF(PM4PacketProcessor, " Mask: %lx\n", pkt->mask);
|
||||
DPRINTF(PM4PacketProcessor, " Poll Interval: %lx\n", pkt->pollInterval);
|
||||
|
||||
delete pkt;
|
||||
decodeNext(q);
|
||||
}
|
||||
|
||||
|
||||
@@ -1000,6 +1000,9 @@ SDMAEngine::ptePde(SDMAQueue *q, sdmaPtePde *pkt)
|
||||
sizeof(uint64_t) * pkt->count, 0,
|
||||
cb);
|
||||
} else {
|
||||
if (q->priv()) {
|
||||
pkt->dest = getGARTAddr(pkt->dest);
|
||||
}
|
||||
auto cb = new DmaVirtCallback<uint64_t>(
|
||||
[ = ] (const uint64_t &) { ptePdeDone(q, pkt, dmaBuffer); });
|
||||
dmaWriteVirt(pkt->dest, sizeof(uint64_t) * pkt->count, cb,
|
||||
@@ -1132,7 +1135,7 @@ SDMAEngine::constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data)
|
||||
{
|
||||
DPRINTF(SDMAEngine, "ConstFill to %lx done\n", pkt->addr);
|
||||
|
||||
delete fill_data;
|
||||
delete [] fill_data;
|
||||
delete pkt;
|
||||
decodeNext(q);
|
||||
}
|
||||
|
||||
@@ -84,6 +84,7 @@ DebugFlag('GPUExec')
|
||||
DebugFlag('GPUFetch')
|
||||
DebugFlag('GPUInst')
|
||||
DebugFlag('GPUKernelInfo')
|
||||
DebugFlag('GPULDS')
|
||||
DebugFlag('GPUMem')
|
||||
DebugFlag('GPUPort')
|
||||
DebugFlag('GPUPrefetch')
|
||||
@@ -106,4 +107,4 @@ DebugFlag('WavefrontStack')
|
||||
CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
|
||||
'GPUMem', 'GPUPort', 'GPUSched', 'GPUSRF', 'GPUSync',
|
||||
'GPUTLB', 'GPUVRF', 'GPURFC', 'GPUWgLatency',
|
||||
'GPUKernelInfo', 'GPUInitAbi'])
|
||||
'GPUKernelInfo', 'GPUInitAbi', 'GPULDS'])
|
||||
|
||||
@@ -1746,7 +1746,7 @@ ComputeUnit::DataPort::processMemReqEvent(PacketPtr pkt)
|
||||
SystemHubEvent *resp_event = new SystemHubEvent(pkt, this);
|
||||
compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
|
||||
} else if (!(sendTimingReq(pkt))) {
|
||||
retries.push_back(std::make_pair(pkt, gpuDynInst));
|
||||
retries.emplace_back(pkt, gpuDynInst);
|
||||
|
||||
if (gpuDynInst) {
|
||||
DPRINTF(GPUPort,
|
||||
@@ -1783,7 +1783,7 @@ ComputeUnit::ScalarDataPort::MemReqEvent::process()
|
||||
SystemHubEvent *resp_event = new SystemHubEvent(pkt, &scalarDataPort);
|
||||
compute_unit->shader->systemHub->sendRequest(pkt, resp_event);
|
||||
} else if (!(scalarDataPort.sendTimingReq(pkt))) {
|
||||
scalarDataPort.retries.push_back(pkt);
|
||||
scalarDataPort.retries.emplace_back(pkt);
|
||||
|
||||
DPRINTF(GPUPort,
|
||||
"CU%d: WF[%d][%d]: addr %#x data req failed!\n",
|
||||
|
||||
@@ -54,55 +54,63 @@ GPUStaticInst::disassemble()
|
||||
return disassembly;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Build the virtual-to-physical register index mapping for one operand and
 * append the operand to the given operand list.
 *
 * One virtual/physical index pair is produced per dword of the operand
 * (op.sizeInDWords()). The resulting index lists are stored on the operand
 * via setVirtToPhysMapping() before it is added to opVec.
 *
 * @param wf     wavefront passed to the register manager's map functions;
 *               its reservedScalarRegs is used to obtain the operand's
 *               register index
 * @param cu     compute unit whose registerManager performs the mapping
 * @param op     operand to map; updated in place with the computed mapping
 * @param opVec  list that receives the operand after it has been mapped
 * @param opType SRC_VEC/DST_VEC select mapVgpr(); any other value is
 *               asserted to be SRC_SCALAR/DST_SCALAR and uses mapSgpr()
 */
void
|
||||
GPUStaticInst::generateVirtToPhysMap(Wavefront *wf, ComputeUnit *cu,
|
||||
OperandInfo& op,
|
||||
std::vector<OperandInfo>& opVec,
|
||||
OpType opType)
|
||||
{
|
||||
// Parallel lists: entry i holds the virtual index of dword i and the
// physical index it was mapped to.
std::vector<int> virt_idxs;
|
||||
std::vector<int> phys_idxs;
|
||||
|
||||
int num_dwords = op.sizeInDWords();
|
||||
// Index of the operand's first register; subsequent dwords use
// virt_idx + i below.
int virt_idx = op.registerIndex(wf->reservedScalarRegs);
|
||||
|
||||
int phys_idx = -1;
|
||||
for (int i = 0; i < num_dwords; i++) {
|
||||
// Vector operands are mapped through the VGPR mapping, everything else
// through the SGPR mapping.
if (opType == OpType::SRC_VEC || opType == OpType::DST_VEC) {
|
||||
phys_idx = cu->registerManager->mapVgpr(wf, virt_idx + i);
|
||||
} else {
|
||||
assert(opType == OpType::SRC_SCALAR ||
|
||||
opType == OpType::DST_SCALAR);
|
||||
phys_idx = cu->registerManager->mapSgpr(wf, virt_idx + i);
|
||||
}
|
||||
virt_idxs.push_back(virt_idx + i);
|
||||
phys_idxs.push_back(phys_idx);
|
||||
}
|
||||
// Only the first virtual/physical pair is printed, together with the
// number of dwords.
// NOTE(review): element 0 of both lists is read here, so this presumably
// relies on sizeInDWords() never returning 0 -- confirm.
DPRINTF(GPUInst, "%s adding %s %s (%d->%d) operand that uses "
|
||||
"%d registers.\n", disassemble(),
|
||||
(opType == OpType::SRC_VEC || opType == OpType::DST_VEC) ?
|
||||
"vector" : "scalar",
|
||||
(opType == OpType::SRC_VEC || opType == OpType::SRC_SCALAR) ?
|
||||
"src" : "dst", virt_idxs[0], phys_idxs[0], num_dwords);
|
||||
|
||||
op.setVirtToPhysMapping(virt_idxs, phys_idxs);
|
||||
|
||||
// The mapping is stored on op before op is added to the list.
opVec.emplace_back(op);
|
||||
}
|
||||
|
||||
void
|
||||
GPUStaticInst::initDynOperandInfo(Wavefront *wf, ComputeUnit *cu)
|
||||
{
|
||||
// Lambda function, as this is only ever used here
|
||||
auto generateVirtToPhysMap = [&](OperandInfo& op,
|
||||
std::vector<OperandInfo>& opVec,
|
||||
MapRegFn mapFn, OpType opType)
|
||||
{
|
||||
std::vector<int> virt_idxs;
|
||||
std::vector<int> phys_idxs;
|
||||
|
||||
int num_dwords = op.sizeInDWords();
|
||||
int virt_idx = op.registerIndex(wf->reservedScalarRegs);
|
||||
|
||||
int phys_idx = -1;
|
||||
for (int i = 0; i < num_dwords; i++){
|
||||
phys_idx = (cu->registerManager->*mapFn)(wf, virt_idx + i);
|
||||
virt_idxs.push_back(virt_idx + i);
|
||||
phys_idxs.push_back(phys_idx);
|
||||
}
|
||||
DPRINTF(GPUInst, "%s adding %s %s (%d->%d) operand that uses "
|
||||
"%d registers.\n", disassemble(),
|
||||
(opType == OpType::SRC_VEC || opType == OpType::DST_VEC) ?
|
||||
"vector" : "scalar",
|
||||
(opType == OpType::SRC_VEC || opType == OpType::SRC_SCALAR) ?
|
||||
"src" : "dst", virt_idxs[0], phys_idxs[0], num_dwords);
|
||||
|
||||
op.setVirtToPhysMapping(virt_idxs, phys_idxs);
|
||||
|
||||
opVec.emplace_back(op);
|
||||
};
|
||||
|
||||
for (auto& srcOp : srcOps) {
|
||||
if (srcOp.isVectorReg()) {
|
||||
generateVirtToPhysMap(srcOp, srcVecRegOps,
|
||||
&RegisterManager::mapVgpr, OpType::SRC_VEC);
|
||||
generateVirtToPhysMap(wf, cu, srcOp, srcVecRegOps,
|
||||
OpType::SRC_VEC);
|
||||
} else if (srcOp.isScalarReg()) {
|
||||
generateVirtToPhysMap(srcOp, srcScalarRegOps,
|
||||
&RegisterManager::mapSgpr, OpType::SRC_SCALAR);
|
||||
generateVirtToPhysMap(wf, cu, srcOp, srcScalarRegOps,
|
||||
OpType::SRC_SCALAR);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto& dstOp : dstOps) {
|
||||
if (dstOp.isVectorReg()) {
|
||||
generateVirtToPhysMap(dstOp, dstVecRegOps,
|
||||
&RegisterManager::mapVgpr, OpType::DST_VEC);
|
||||
generateVirtToPhysMap(wf, cu, dstOp, dstVecRegOps,
|
||||
OpType::DST_VEC);
|
||||
} else if (dstOp.isScalarReg()) {
|
||||
generateVirtToPhysMap(dstOp, dstScalarRegOps,
|
||||
&RegisterManager::mapSgpr, OpType::DST_SCALAR);
|
||||
generateVirtToPhysMap(wf, cu, dstOp, dstScalarRegOps,
|
||||
OpType::DST_SCALAR);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -321,6 +321,9 @@ class GPUStaticInst : public GPUStaticInstFlags
|
||||
int _ipdInstNum;
|
||||
|
||||
std::bitset<Num_Flags> _flags;
|
||||
|
||||
void generateVirtToPhysMap(Wavefront *wf, ComputeUnit *cu, OperandInfo& op,
|
||||
std::vector<OperandInfo>& opVec, OpType opType);
|
||||
};
|
||||
|
||||
class KernelLaunchStaticInst : public GPUStaticInst
|
||||
|
||||
@@ -39,6 +39,7 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "debug/GPULDS.hh"
|
||||
#include "gpu-compute/misc.hh"
|
||||
#include "mem/port.hh"
|
||||
#include "params/LdsState.hh"
|
||||
@@ -75,10 +76,30 @@ class LdsChunk
|
||||
* chunk allocated to this WG we return 0.
|
||||
*/
|
||||
if (index >= chunk.size()) {
|
||||
DPRINTF(GPULDS, "LDS[%d][%d]: Read 0 beyond size (%ld)\n",
|
||||
dispatchId, wgId, chunk.size());
|
||||
return (T)0;
|
||||
}
|
||||
|
||||
T *p0 = (T *) (&(chunk.at(index)));
|
||||
|
||||
if (sizeof(T) <= 4) {
|
||||
[[maybe_unused]] uint32_t int_val =
|
||||
*reinterpret_cast<uint32_t*>(p0);
|
||||
DPRINTF(GPULDS, "LDS[%d][%d]: Read %08x from index %d\n",
|
||||
dispatchId, wgId, int_val, index);
|
||||
} else if (sizeof(T) <= 8) {
|
||||
[[maybe_unused]] uint64_t int_val =
|
||||
*reinterpret_cast<uint64_t*>(p0);
|
||||
DPRINTF(GPULDS, "LDS[%d][%d]: Read %016lx from index %d\n",
|
||||
dispatchId, wgId, int_val, index);
|
||||
} else if (sizeof(T) <= 16) {
|
||||
[[maybe_unused]] uint64_t *int_vals =
|
||||
reinterpret_cast<uint64_t*>(p0);
|
||||
DPRINTF(GPULDS, "LDS[%d][%d]: Read %016lx%016lx from index %d\n",
|
||||
dispatchId, wgId, int_vals[1], int_vals[0], index);
|
||||
}
|
||||
|
||||
return *p0;
|
||||
}
|
||||
|
||||
@@ -94,10 +115,33 @@ class LdsChunk
|
||||
* chunk allocated to this WG are dropped.
|
||||
*/
|
||||
if (index >= chunk.size()) {
|
||||
DPRINTF(GPULDS, "LDS[%d][%d]: Ignoring write beyond size (%ld)\n",
|
||||
dispatchId, wgId, chunk.size());
|
||||
return;
|
||||
}
|
||||
|
||||
T *p0 = (T *) (&(chunk.at(index)));
|
||||
|
||||
if (sizeof(T) <= 4) {
|
||||
[[maybe_unused]] uint32_t prev_val =
|
||||
*reinterpret_cast<uint32_t*>(p0);
|
||||
DPRINTF(GPULDS, "LDS[%d][%d]: Write %08lx to index %d (was "
|
||||
"%08lx)\n", dispatchId, wgId, value, index, prev_val);
|
||||
} else if (sizeof(T) <= 8) {
|
||||
[[maybe_unused]] uint64_t prev_val =
|
||||
*reinterpret_cast<uint64_t*>(p0);
|
||||
DPRINTF(GPULDS, "LDS[%d][%d]: Write %016lx to index %d (was "
|
||||
"%016lx)\n", dispatchId, wgId, value, index, prev_val);
|
||||
} else if (sizeof(T) <= 16) {
|
||||
[[maybe_unused]] uint64_t *prev_vals =
|
||||
reinterpret_cast<uint64_t*>(p0);
|
||||
[[maybe_unused]] const uint64_t *next_vals =
|
||||
reinterpret_cast<const uint64_t*>(&value);
|
||||
DPRINTF(GPULDS, "LDS[%d][%d]: Write %016lx%016lx to index %d "
|
||||
"(was %016lx%016lx)\n", dispatchId, wgId, next_vals[1],
|
||||
next_vals[0], index, prev_vals[1], prev_vals[0]);
|
||||
}
|
||||
|
||||
*p0 = value;
|
||||
}
|
||||
|
||||
@@ -131,6 +175,9 @@ class LdsChunk
|
||||
return chunk.size();
|
||||
}
|
||||
|
||||
uint32_t dispatchId;
|
||||
uint32_t wgId;
|
||||
|
||||
protected:
|
||||
// the actual data store for this slice of the LDS
|
||||
std::vector<uint8_t> chunk;
|
||||
@@ -402,6 +449,9 @@ class LdsState: public ClockedObject
|
||||
// make an entry for this workgroup
|
||||
refCounter[dispatchId][wgId] = 0;
|
||||
|
||||
chunkMap[dispatchId][wgId].dispatchId = dispatchId;
|
||||
chunkMap[dispatchId][wgId].wgId = wgId;
|
||||
|
||||
return &chunkMap[dispatchId][wgId];
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user