From 4b85a1710e0b33424de7315a08a9fbd08abd0ae2 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 6 Oct 2023 11:22:46 -0500 Subject: [PATCH] arch-vega: Implement buffer_atomic_cmpswap This is a standard compare and swap but implemented on vector memory buffer instructions (i.e., it is the same as FLAT_ATOMIC_CMPSWAP with MUBUF's special address calculation). This was tested using a Tensile kernel, a backend for rocBLAS, which is used by PyTorch and Tensorflow. Prior to this patch both ML frameworks crashed. With this patch they both make forward progress. Change-Id: Ie76447a72d210f81624e01e1fa374e41c2c21e06 --- src/arch/amdgpu/vega/insts/instructions.cc | 81 +++++++++++++++++++++- src/arch/amdgpu/vega/insts/instructions.hh | 2 + src/arch/amdgpu/vega/insts/op_encodings.hh | 13 ++++ 3 files changed, 95 insertions(+), 1 deletion(-) diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc index bb6a2233cd..287c9a9541 100644 --- a/src/arch/amdgpu/vega/insts/instructions.cc +++ b/src/arch/amdgpu/vega/insts/instructions.cc @@ -40581,8 +40581,87 @@ namespace VegaISA void Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::execute(GPUDynInstPtr gpuDynInst) { - panicUnimplemented(); + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + ConstVecOperandU32 src(gpuDynInst, extData.VDATA); + ConstVecOperandU32 cmp(gpuDynInst, extData.VDATA + 1); + + rsrcDesc.read(); + offset.read(); + src.read(); + cmp.read(); + + int inst_offset = instData.OFFSET; + + if 
(!instData.IDXEN && !instData.OFFEN) { + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + (reinterpret_cast<VecElemU32*>(gpuDynInst->x_data))[lane] + = src[lane]; + (reinterpret_cast<VecElemU32*>(gpuDynInst->a_data))[lane] + = cmp[lane]; + } + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); } // execute + + void + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initAtomicAccess<VecElemU32>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP::completeAcc(GPUDynInstPtr gpuDynInst) + { + if (isAtomicRet()) { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + vdst[lane] = (reinterpret_cast<VecElemU32*>( + gpuDynInst->d_data))[lane]; + } + } + + vdst.write(); + } + } // completeAcc // --- Inst_MUBUF__BUFFER_ATOMIC_ADD class methods --- Inst_MUBUF__BUFFER_ATOMIC_ADD diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh index 0e4ec04764..ca349c365f 100644 --- a/src/arch/amdgpu/vega/insts/instructions.hh +++ b/src/arch/amdgpu/vega/insts/instructions.hh @@ -37220,6 +37220,8 @@ namespace VegaISA } // getOperandSize void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; }; // Inst_MUBUF__BUFFER_ATOMIC_CMPSWAP class Inst_MUBUF__BUFFER_ATOMIC_ADD : public Inst_MUBUF diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh
b/src/arch/amdgpu/vega/insts/op_encodings.hh index 613d78b25e..a1c5e99c91 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.hh +++ b/src/arch/amdgpu/vega/insts/op_encodings.hh @@ -713,6 +713,19 @@ namespace VegaISA gpuDynInst->exec_mask = old_exec_mask; } + template<typename T> + void + initAtomicAccess(GPUDynInstPtr gpuDynInst) + { + // temporarily modify exec_mask to suppress memory accesses to oob + // regions. Only issue memory requests for lanes that have their + // exec_mask set and are not out of bounds. + VectorMask old_exec_mask = gpuDynInst->exec_mask; + gpuDynInst->exec_mask &= ~oobMask; + initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true); + gpuDynInst->exec_mask = old_exec_mask; + } + void injectGlobalMemFence(GPUDynInstPtr gpuDynInst) {