arch-vega: Implement buffer_atomic_cmpswap
This is a standard compare-and-swap, but implemented on vector-memory buffer instructions (i.e., it is the same as FLAT_ATOMIC_CMPSWAP, with MUBUF's special address calculation). This was tested using a Tensile kernel; Tensile is a backend for rocBLAS, which is in turn used by PyTorch and TensorFlow. Prior to this patch both ML frameworks crashed; with this patch both make forward progress.

Change-Id: Ie76447a72d210f81624e01e1fa374e41c2c21e06
This commit is contained in:
@@ -40581,8 +40581,87 @@ namespace VegaISA
|
||||
void
|
||||
Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
|
||||
if (gpuDynInst->exec_mask.none()) {
|
||||
wf->decVMemInstsIssued();
|
||||
return;
|
||||
}
|
||||
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
|
||||
|
||||
ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR);
|
||||
ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1);
|
||||
ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4);
|
||||
ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET);
|
||||
ConstVecOperandU32 src(gpuDynInst, extData.VDATA);
|
||||
ConstVecOperandU32 cmp(gpuDynInst, extData.VDATA + 1);
|
||||
|
||||
rsrcDesc.read();
|
||||
offset.read();
|
||||
src.read();
|
||||
cmp.read();
|
||||
|
||||
int inst_offset = instData.OFFSET;
|
||||
|
||||
if (!instData.IDXEN && !instData.OFFEN) {
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (!instData.IDXEN && instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr0, addr1, rsrcDesc, offset, inst_offset);
|
||||
} else if (instData.IDXEN && !instData.OFFEN) {
|
||||
addr0.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
} else {
|
||||
addr0.read();
|
||||
addr1.read();
|
||||
calcAddr<ConstVecOperandU32, ConstVecOperandU32,
|
||||
ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst,
|
||||
addr1, addr0, rsrcDesc, offset, inst_offset);
|
||||
}
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
(reinterpret_cast<VecElemU32*>(gpuDynInst->x_data))[lane]
|
||||
= src[lane];
|
||||
(reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane]
|
||||
= cmp[lane];
|
||||
}
|
||||
}
|
||||
|
||||
gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst);
|
||||
} // execute
|
||||
|
||||
void
|
||||
Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
initAtomicAccess<VecElemU32>(gpuDynInst);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
if (isAtomicRet()) {
|
||||
VecOperandU32 vdst(gpuDynInst, extData.VDATA);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
vdst[lane] = (reinterpret_cast<VecElemU32*>(
|
||||
gpuDynInst->d_data))[lane];
|
||||
}
|
||||
}
|
||||
|
||||
vdst.write();
|
||||
}
|
||||
} // completeAcc
|
||||
// --- Inst_MUBUF__BUFFER_ATOMIC_ADD class methods ---
|
||||
|
||||
Inst_MUBUF__BUFFER_ATOMIC_ADD
|
||||
|
||||
@@ -37220,6 +37220,8 @@ namespace VegaISA
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
void initiateAcc(GPUDynInstPtr) override;
|
||||
void completeAcc(GPUDynInstPtr) override;
|
||||
}; // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP
|
||||
|
||||
class Inst_MUBUF__BUFFER_ATOMIC_ADD : public Inst_MUBUF
|
||||
|
||||
@@ -713,6 +713,19 @@ namespace VegaISA
|
||||
gpuDynInst->exec_mask = old_exec_mask;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void
|
||||
initAtomicAccess(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
// temporarily modify exec_mask to supress memory accesses to oob
|
||||
// regions. Only issue memory requests for lanes that have their
|
||||
// exec_mask set and are not out of bounds.
|
||||
VectorMask old_exec_mask = gpuDynInst->exec_mask;
|
||||
gpuDynInst->exec_mask &= ~oobMask;
|
||||
initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
|
||||
gpuDynInst->exec_mask = old_exec_mask;
|
||||
}
|
||||
|
||||
void
|
||||
injectGlobalMemFence(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user