diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc index 0d3f2dc00b..ab9c1cecf2 100644 --- a/src/arch/amdgpu/vega/insts/instructions.cc +++ b/src/arch/amdgpu/vega/insts/instructions.cc @@ -6394,7 +6394,7 @@ namespace VegaISA } }; - vop2Helper(gpuDynInst, opImpl); + vop2Helper(gpuDynInst, opImpl); } // execute // --- Inst_VOP2__V_MUL_HI_U32_U24 class methods --- diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh index f1954723af..0f5f502add 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.hh +++ b/src/arch/amdgpu/vega/insts/op_encodings.hh @@ -339,7 +339,7 @@ namespace VegaISA return src0_dpp; } - template + template void vop2Helper(GPUDynInstPtr gpuDynInst, void (*fOpImpl)(T&, T&, T&, Wavefront*)) { @@ -359,7 +359,19 @@ namespace VegaISA T src0_dpp = dppHelper(gpuDynInst, src1); fOpImpl(src0_dpp, src1, vdst, wf); } else { - fOpImpl(src0, src1, vdst, wf); + // src0 is unmodified. We need to use the const container + // type to allow reading scalar operands from src0. Only + // src0 can index scalar operands. We copy this to vdst + // temporarily to pass to the lambda so the instruction + // does not need to write two lambda functions (one for + // a const src0 and one for a mutable src0). + ConstT const_src0(gpuDynInst, instData.SRC0); + const_src0.readSrc(); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + vdst[lane] = const_src0[lane]; + } + fOpImpl(vdst, src1, vdst, wf); } vdst.write();