diff --git a/src/arch/gcn3/insts/inst_util.hh b/src/arch/gcn3/insts/inst_util.hh index a3b2f4a51b..292e3ba301 100644 --- a/src/arch/gcn3/insts/inst_util.hh +++ b/src/arch/gcn3/insts/inst_util.hh @@ -547,8 +547,8 @@ namespace Gcn3ISA * operations are done on it. */ template - T sdwaInstSrcImpl_helper(T currOperVal, T origOperVal, SDWASelVals sel, - bool signExt) + T sdwaInstSrcImpl_helper(T currOperVal, const T origOperVal, + const SDWASelVals sel, const bool signExt) { // local variables int first_bit = 0, last_bit = 0; @@ -635,16 +635,14 @@ namespace Gcn3ISA * 2. if sign extend is set, then sign extend the value */ template - void sdwaInstSrcImpl(T & currOper, T & origCurrOper, SDWASelVals sel, - bool signExt) + void sdwaInstSrcImpl(T & currOper, T & origCurrOper, + const SDWASelVals sel, const bool signExt) { // iterate over all lanes, setting appropriate, selected value - currOper.read(); - origCurrOper.read(); for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { currOper[lane] = sdwaInstSrcImpl_helper(currOper[lane], - origCurrOper[lane], sel, - signExt); + origCurrOper[lane], sel, + signExt); } } @@ -656,8 +654,9 @@ namespace Gcn3ISA * operations are done on it. */ template - T sdwaInstDstImpl_helper(T currDstVal, T origDstVal, bool clamp, - SDWASelVals sel, SDWADstVals unusedBits_format) + T sdwaInstDstImpl_helper(T currDstVal, const T origDstVal, + const bool clamp, const SDWASelVals sel, + const SDWADstVals unusedBits_format) { // local variables int first_bit = 0, last_bit = 0; @@ -756,12 +755,11 @@ namespace Gcn3ISA * 2 (SDWA_UNUSED_PRESERVE): select data[31:0] */ template - void sdwaInstDstImpl(T & dstOper, T & origDstOper, bool clamp, - SDWASelVals sel, SDWADstVals unusedBits_format) + void sdwaInstDstImpl(T & dstOper, T & origDstOper, const bool clamp, + const SDWASelVals sel, + const SDWADstVals unusedBits_format) { // iterate over all lanes, setting appropriate, selected value - dstOper.read(); - origDstOper.read(); for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { dstOper[lane] = sdwaInstDstImpl_helper(dstOper[lane], origDstOper[lane], clamp, @@ -779,8 +777,9 @@ namespace Gcn3ISA */ template void processSDWA_src_helper(T & currSrc, T & origCurrSrc, - SDWASelVals src_sel, bool src_signExt, - bool src_abs, bool src_neg) + const SDWASelVals src_sel, + const bool src_signExt, const bool src_abs, + const bool src_neg) { /** * STEP 1: check if the absolute value (ABS) or negation (NEG) tags @@ -812,14 +811,13 @@ namespace Gcn3ISA * processSDWA_src is called before the math. */ template - void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst, - T & src0, T & origSrc0) + void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0) { // local variables - SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL; - bool src0_signExt = sdwaInst.SRC0_SEXT; - bool src0_neg = sdwaInst.SRC0_NEG; - bool src0_abs = sdwaInst.SRC0_ABS; + const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL; + const bool src0_signExt = sdwaInst.SRC0_SEXT; + const bool src0_neg = sdwaInst.SRC0_NEG; + const bool src0_abs = sdwaInst.SRC0_ABS; // NOTE: difference between VOP1 and VOP2/VOPC is that there is no src1 // operand. So ensure that SRC1 fields are not set, then call helper @@ -841,18 +839,18 @@ namespace Gcn3ISA * called before the math. */ template - void processSDWA_src(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst, - T & src0, T & origSrc0, T & src1, T & origSrc1) + void processSDWA_src(InFmt_VOP_SDWA sdwaInst, T & src0, T & origSrc0, + T & src1, T & origSrc1) { // local variables - SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL; - bool src0_signExt = sdwaInst.SRC0_SEXT; - bool src0_neg = sdwaInst.SRC0_NEG; - bool src0_abs = sdwaInst.SRC0_ABS; - SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL; - bool src1_signExt = sdwaInst.SRC1_SEXT; - bool src1_neg = sdwaInst.SRC1_NEG; - bool src1_abs = sdwaInst.SRC1_ABS; + const SDWASelVals src0_sel = (SDWASelVals)sdwaInst.SRC0_SEL; + const bool src0_signExt = sdwaInst.SRC0_SEXT; + const bool src0_neg = sdwaInst.SRC0_NEG; + const bool src0_abs = sdwaInst.SRC0_ABS; + const SDWASelVals src1_sel = (SDWASelVals)sdwaInst.SRC1_SEL; + const bool src1_signExt = sdwaInst.SRC1_SEXT; + const bool src1_neg = sdwaInst.SRC1_NEG; + const bool src1_abs = sdwaInst.SRC1_ABS; processSDWA_src_helper(src0, origSrc0, src0_sel, src0_signExt, src0_abs, src0_neg); @@ -869,13 +867,13 @@ namespace Gcn3ISA * processSDWA_src is called before the math. */ template - void processSDWA_dst(GPUDynInstPtr gpuDynInst, InFmt_VOP_SDWA sdwaInst, - T & dst, T & origDst) + void processSDWA_dst(InFmt_VOP_SDWA sdwaInst, T & dst, T & origDst) { // local variables - SDWADstVals dst_unusedBits_format = (SDWADstVals)sdwaInst.DST_UNUSED; - SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL; - bool clamp = sdwaInst.CLAMP; + const SDWADstVals dst_unusedBits_format = + (SDWADstVals)sdwaInst.DST_UNUSED; + const SDWASelVals dst_sel = (SDWASelVals)sdwaInst.DST_SEL; + const bool clamp = sdwaInst.CLAMP; /** * STEP 1: select the appropriate bits for dst and pad/sign-extend as diff --git a/src/arch/gcn3/insts/instructions.cc b/src/arch/gcn3/insts/instructions.cc index bd6e4f44e7..2789f3e7f8 100644 --- a/src/arch/gcn3/insts/instructions.cc +++ b/src/arch/gcn3/insts/instructions.cc @@ -5543,12 +5543,20 @@ namespace Gcn3ISA VecOperandU32 src1(gpuDynInst, instData.VSRC1); VecOperandU32 vdst(gpuDynInst, instData.VDST); + src0.readSrc(); + src1.read(); + if (isSDWAInst()) { VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0 and src1 during selecting + // use copies of original src0, src1, and dest during selecting VecOperandU32 origSrc0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); + VecOperandU32 origVdst(gpuDynInst, instData.VDST); + + src0_sdwa.read(); + origSrc0_sdwa.read(); + origSrc1.read(); DPRINTF(GCN3, "Handling V_MUL_U32_U24 SRC SDWA. SRC0: register " "v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: " @@ -5566,27 +5574,27 @@ namespace Gcn3ISA extData.iFmt_VOP_SDWA.SRC1_NEG, extData.iFmt_VOP_SDWA.SRC1_ABS); - processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa, - origSrc0_sdwa, src1, origSrc1); - } + processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, + src1, origSrc1); - src0.readSrc(); - src1.read(); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = bits(src0_sdwa[lane], 23, 0) * + bits(src1[lane], 23, 0); + origVdst[lane] = vdst[lane]; // keep copy consistent + } + } - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = bits(src0[lane], 23, 0) * bits(src1[lane], 23, 0); + processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = bits(src0[lane], 23, 0) * + bits(src1[lane], 23, 0); + } } } - // SDWA instructions also may select bytes/words of dest register - // (vdst) - if (isSDWAInst()) { - // use extra copy of dest to retain original values - VecOperandU32 vdst_orig(gpuDynInst, instData.VDST); - processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst, - vdst_orig); - } vdst.write(); } @@ -5895,12 +5903,20 @@ namespace Gcn3ISA VecOperandU32 src1(gpuDynInst, instData.VSRC1); VecOperandU32 vdst(gpuDynInst, instData.VDST); + src0.readSrc(); + src1.read(); + if (isSDWAInst()) { VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0 and src1 during selecting + // use copies of original src0, src1, and vdst during selecting VecOperandU32 origSrc0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); + VecOperandU32 origVdst(gpuDynInst, instData.VDST); + + src0_sdwa.read(); + origSrc0_sdwa.read(); + origSrc1.read(); DPRINTF(GCN3, "Handling V_LSHLREV_B32 SRC SDWA. SRC0: register " "v[%d], DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: " @@ -5918,26 +5934,23 @@ namespace Gcn3ISA extData.iFmt_VOP_SDWA.SRC1_NEG, extData.iFmt_VOP_SDWA.SRC1_ABS); - processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa, - origSrc0_sdwa, src1, origSrc1); - } + processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, + src1, origSrc1); - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src1[lane] << bits(src0[lane], 4, 0); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] << bits(src0_sdwa[lane], 4, 0); + origVdst[lane] = vdst[lane]; // keep copy consistent + } } - } - // SDWA instructions also may select bytes/words of dest register - // (vdst) - if (isSDWAInst()) { - // use extra copy of dest to retain original values - VecOperandU32 vdst_orig(gpuDynInst, instData.VDST); - processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst, - vdst_orig); + processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src1[lane] << bits(src0[lane], 4, 0); + } + } } vdst.write(); @@ -5995,12 +6008,20 @@ namespace Gcn3ISA VecOperandU32 src1(gpuDynInst, instData.VSRC1); VecOperandU32 vdst(gpuDynInst, instData.VDST); + src0.readSrc(); + src1.read(); + if (isSDWAInst()) { VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0 and src1 during selecting + // use copies of original src0, src1, and dest during selecting VecOperandU32 origSrc0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); + VecOperandU32 origVdst(gpuDynInst, instData.VDST); + + src0_sdwa.read(); + origSrc0_sdwa.read(); + origSrc1.read(); DPRINTF(GCN3, "Handling V_OR_B32 SRC SDWA. SRC0: register v[%d], " "DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, " @@ -6018,26 +6039,23 @@ namespace Gcn3ISA extData.iFmt_VOP_SDWA.SRC1_NEG, extData.iFmt_VOP_SDWA.SRC1_ABS); - processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa, - origSrc0_sdwa, src1, origSrc1); - } + processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, + src1, origSrc1); - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] | src1[lane]; + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0_sdwa[lane] | src1[lane]; + origVdst[lane] = vdst[lane]; // keep copy consistent + } } - } - // SDWA instructions also may select bytes/words of dest register - // (vdst) - if (isSDWAInst()) { - // use extra copy of dest to retain original values - VecOperandU32 vdst_orig(gpuDynInst, instData.VDST); - processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst, - vdst_orig); + processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] | src1[lane]; + } + } } vdst.write(); @@ -6222,12 +6240,20 @@ namespace Gcn3ISA VecOperandU32 vdst(gpuDynInst, instData.VDST); ScalarOperandU64 vcc(gpuDynInst, REG_VCC_LO); + src0.readSrc(); + src1.read(); + if (isSDWAInst()) { VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); - // use copies of original src0 and src1 during selecting + // use copies of original src0, src1, and dest during selecting VecOperandU32 origSrc0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0); VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1); + VecOperandU32 origVdst(gpuDynInst, instData.VDST); + + src0_sdwa.read(); + origSrc0_sdwa.read(); + origSrc1.read(); DPRINTF(GCN3, "Handling V_ADD_U32 SRC SDWA. SRC0: register v[%d], " "DST_SEL: %d, DST_UNUSED: %d, CLAMP: %d, SRC0_SEL: %d, " @@ -6245,28 +6271,27 @@ namespace Gcn3ISA extData.iFmt_VOP_SDWA.SRC1_NEG, extData.iFmt_VOP_SDWA.SRC1_ABS); - processSDWA_src(gpuDynInst, extData.iFmt_VOP_SDWA, src0_sdwa, - origSrc0_sdwa, src1, origSrc1); - } + processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa, + src1, origSrc1); - src0.readSrc(); - src1.read(); - - for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { - if (wf->execMask(lane)) { - vdst[lane] = src0[lane] + src1[lane]; - vcc.setBit(lane, ((VecElemU64)src0[lane] - + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0_sdwa[lane] + src1[lane]; + origVdst[lane] = vdst[lane]; // keep copy consistent + vcc.setBit(lane, ((VecElemU64)src0_sdwa[lane] + + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); + } } - } - // SDWA instructions also may select bytes/words of dest register - // (vdst) - if (isSDWAInst()) { - // use extra copy of dest to retain original values - VecOperandU32 vdst_orig(gpuDynInst, instData.VDST); - processSDWA_dst(gpuDynInst, extData.iFmt_VOP_SDWA, vdst, - vdst_orig); + processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst); + } else { + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (wf->execMask(lane)) { + vdst[lane] = src0[lane] + src1[lane]; + vcc.setBit(lane, ((VecElemU64)src0[lane] + + (VecElemU64)src1[lane] >= 0x100000000ULL) ? 1 : 0); + } + } } vcc.write();