diff --git a/src/arch/amdgpu/vega/insts/inst_util.hh b/src/arch/amdgpu/vega/insts/inst_util.hh index 01925f9d07..7ec2e2ddd3 100644 --- a/src/arch/amdgpu/vega/insts/inst_util.hh +++ b/src/arch/amdgpu/vega/insts/inst_util.hh @@ -303,9 +303,9 @@ namespace VegaISA * Currently the values are: * 0x0 - 0xFF: full permute of four threads * 0x100: reserved - * 0x101 - 0x10F: row shift right by 1-15 threads + * 0x101 - 0x10F: row shift left by 1-15 threads * 0x111 - 0x11F: row shift right by 1-15 threads - * 0x121 - 0x12F: row shift right by 1-15 threads + * 0x121 - 0x12F: row rotate right by 1-15 threads * 0x130: wavefront left shift by 1 thread * 0x134: wavefront left rotate by 1 thread * 0x138: wavefront right shift by 1 thread @@ -322,7 +322,8 @@ namespace VegaISA // newLane will be the same as the input lane unless swizzling happens int newLane = currLane; // for shift/rotate permutations; positive values are LEFT rotates - int count = 1; + // shift/rotate left means lane n -> lane n-1 (e.g., lane 1 -> lane 0) + int count = 0; int localRowOffset = rowOffset; int localRowNum = rowNum; @@ -335,51 +336,47 @@ namespace VegaISA panic("ERROR: instruction using reserved DPP_CTRL value\n"); } else if ((dppCtrl >= SQ_DPP_ROW_SL1) && (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15} - count -= (dppCtrl - SQ_DPP_ROW_SL1 + 1); + count = (dppCtrl - SQ_DPP_ROW_SL1 + 1); if ((localRowOffset + count >= 0) && (localRowOffset + count < ROW_SIZE)) { localRowOffset += count; - newLane = (rowNum | localRowOffset); + newLane = ((rowNum * ROW_SIZE) | localRowOffset); } else { outOfBounds = true; } } else if ((dppCtrl >= SQ_DPP_ROW_SR1) && (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15} - count -= (dppCtrl - SQ_DPP_ROW_SR1 + 1); + count = -(dppCtrl - SQ_DPP_ROW_SR1 + 1); if ((localRowOffset + count >= 0) && (localRowOffset + count < ROW_SIZE)) { localRowOffset += count; - newLane = (rowNum | localRowOffset); + newLane = ((rowNum * ROW_SIZE) | localRowOffset); } else { outOfBounds = true; } } else if ((dppCtrl >= SQ_DPP_ROW_RR1) && (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15} - count -= (dppCtrl - SQ_DPP_ROW_RR1 + 1); + count = -(dppCtrl - SQ_DPP_ROW_RR1 + 1); localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE; - newLane = (rowNum | localRowOffset); + newLane = ((rowNum * ROW_SIZE) | localRowOffset); } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1 - count = 1; if ((currLane >= 0) && (currLane < NumVecElemPerVecReg)) { - newLane += count; + newLane += 1; } else { outOfBounds = true; } } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1 - count = 1; - newLane = (currLane + count + NumVecElemPerVecReg) % + newLane = (currLane - 1 + NumVecElemPerVecReg) % NumVecElemPerVecReg; } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1 - count = -1; - int currVal = (currLane + count); + int currVal = (currLane - 1); if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) { - newLane += count; + newLane -= 1; } else { outOfBounds = true; } } else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1 - count = -1; - newLane = (currLane + count + NumVecElemPerVecReg) % + newLane = (currLane - 1 + NumVecElemPerVecReg) % NumVecElemPerVecReg; } else if (dppCtrl == SQ_DPP_ROW_MIRROR) { // DPP_ROW_MIRROR localRowOffset = (15 - localRowOffset); @@ -392,12 +389,22 @@ namespace VegaISA } else if (dppCtrl == SQ_DPP_ROW_BCAST15) { // DPP_ROW_BCAST15 count = 15; if (currLane > count) { - newLane = (currLane & ~count) - 1; + // 0x30 selects which set of 16 lanes to use. We broadcast the + // last lane of one set to all lanes of the next set (e.g., + // lane 15 is written to 16-31, 31 to 32-47, 47 to 48-63). + newLane = (currLane & 0x30) - 1; + } else { + outOfBounds = true; } } else if (dppCtrl == SQ_DPP_ROW_BCAST31) { // DPP_ROW_BCAST31 count = 31; if (currLane > count) { - newLane = (currLane & ~count) - 1; + // 0x20 selects either the upper 32 or lower 32 lanes and + // broadcasts the last lane of one set to all lanes of the + // next set (e.g., lane 31 is written to 32-63). + newLane = (currLane & 0x20) - 1; + } else { + outOfBounds = true; } } else { panic("Unimplemented DPP control operation: %d\n", dppCtrl); @@ -443,6 +450,9 @@ namespace VegaISA src0.absModifier(); } + // Need a copy of the original data since we update one lane at a time + T src0_copy = src0; + // iterate over all register lanes, performing steps 2-4 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { threadValid = (0x1LL << lane); @@ -458,7 +468,6 @@ namespace VegaISA if (((rowMask & (0x1 << rowNum)) == 0) /* row mask */ || ((bankMask & (0x1 << bankNum)) == 0) /* bank mask */) { laneDisabled = true; - continue; } /** @@ -495,7 +504,7 @@ namespace VegaISA } else { threadValid = 0; } - } else if (!gpuDynInst->exec_mask[lane]) { + } else if (!gpuDynInst->wavefront()->execMask(lane)) { if (boundCtrl == 1) { zeroSrc = true; } else { @@ -505,13 +514,15 @@ namespace VegaISA if (threadValid != 0 && !outOfBounds && !zeroSrc) { assert(!laneDisabled); - src0[outLane] = src0[lane]; + src0[lane] = src0_copy[outLane]; } else if (zeroSrc) { src0[lane] = 0; } // reset for next iteration laneDisabled = false; + outOfBounds = false; + zeroSrc = false; } }