From bbdebc25daec686cc06e159c28de005ca5875f5b Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Fri, 16 Dec 2022 13:35:02 -0800 Subject: [PATCH] arch-vega: Fix several issues with DPP DPP processing has several issues which are fixed in this changeset: 1) Incorrect comment is updated 2) newLane calculation for shift/rotate instructions is corrected 3) A copy of original data is made so that a copy of a copy is not made 4) Reset all booleans (OOB, zeroSrc, laneDisabled) after each lane iteration The shift, rotate, and broadcast variants were tested by implementing them in assembly and running on silicon. Change-Id: If86fbb26c87eaca4ef0587fd846978115858b168 Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/66752 Maintainer: Matt Sinclair Tested-by: kokoro Reviewed-by: Matt Sinclair --- src/arch/amdgpu/vega/insts/inst_util.hh | 57 +++++++++++++++---------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/src/arch/amdgpu/vega/insts/inst_util.hh b/src/arch/amdgpu/vega/insts/inst_util.hh index 01925f9d07..7ec2e2ddd3 100644 --- a/src/arch/amdgpu/vega/insts/inst_util.hh +++ b/src/arch/amdgpu/vega/insts/inst_util.hh @@ -303,9 +303,9 @@ namespace VegaISA * Currently the values are: * 0x0 - 0xFF: full permute of four threads * 0x100: reserved - * 0x101 - 0x10F: row shift right by 1-15 threads + * 0x101 - 0x10F: row shift left by 1-15 threads * 0x111 - 0x11F: row shift right by 1-15 threads - * 0x121 - 0x12F: row shift right by 1-15 threads + * 0x121 - 0x12F: row rotate right by 1-15 threads * 0x130: wavefront left shift by 1 thread * 0x134: wavefront left rotate by 1 thread * 0x138: wavefront right shift by 1 thread @@ -322,7 +322,8 @@ namespace VegaISA // newLane will be the same as the input lane unless swizzling happens int newLane = currLane; // for shift/rotate permutations; positive values are LEFT rotates - int count = 1; + // shift/rotate left means lane n -> lane n-1 (e.g., lane 1 -> lane 0) + int count = 0; int localRowOffset = rowOffset; int localRowNum = rowNum; @@ -335,51 +336,47 @@ namespace VegaISA panic("ERROR: instruction using reserved DPP_CTRL value\n"); } else if ((dppCtrl >= SQ_DPP_ROW_SL1) && (dppCtrl <= SQ_DPP_ROW_SL15)) { // DPP_ROW_SL{1:15} - count -= (dppCtrl - SQ_DPP_ROW_SL1 + 1); + count = (dppCtrl - SQ_DPP_ROW_SL1 + 1); if ((localRowOffset + count >= 0) && (localRowOffset + count < ROW_SIZE)) { localRowOffset += count; - newLane = (rowNum | localRowOffset); + newLane = ((rowNum * ROW_SIZE) | localRowOffset); } else { outOfBounds = true; } } else if ((dppCtrl >= SQ_DPP_ROW_SR1) && (dppCtrl <= SQ_DPP_ROW_SR15)) { // DPP_ROW_SR{1:15} - count -= (dppCtrl - SQ_DPP_ROW_SR1 + 1); + count = -(dppCtrl - SQ_DPP_ROW_SR1 + 1); if ((localRowOffset + count >= 0) && (localRowOffset + count < ROW_SIZE)) { localRowOffset += count; - newLane = (rowNum | localRowOffset); + newLane = ((rowNum * ROW_SIZE) | localRowOffset); } else { outOfBounds = true; } } else if ((dppCtrl >= SQ_DPP_ROW_RR1) && (dppCtrl <= SQ_DPP_ROW_RR15)) { // DPP_ROW_RR{1:15} - count -= (dppCtrl - SQ_DPP_ROW_RR1 + 1); + count = -(dppCtrl - SQ_DPP_ROW_RR1 + 1); localRowOffset = (localRowOffset + count + ROW_SIZE) % ROW_SIZE; - newLane = (rowNum | localRowOffset); + newLane = ((rowNum * ROW_SIZE) | localRowOffset); } else if (dppCtrl == SQ_DPP_WF_SL1) { // DPP_WF_SL1 - count = 1; if ((currLane >= 0) && (currLane < NumVecElemPerVecReg)) { - newLane += count; + newLane += 1; } else { outOfBounds = true; } } else if (dppCtrl == SQ_DPP_WF_RL1) { // DPP_WF_RL1 - count = 1; - newLane = (currLane + count + NumVecElemPerVecReg) % + newLane = (currLane - 1 + NumVecElemPerVecReg) % NumVecElemPerVecReg; } else if (dppCtrl == SQ_DPP_WF_SR1) { // DPP_WF_SR1 - count = -1; - int currVal = (currLane + count); + int currVal = (currLane - 1); if ((currVal >= 0) && (currVal < NumVecElemPerVecReg)) { - newLane += count; + newLane -= 1; } else { outOfBounds = true; } } else if (dppCtrl == SQ_DPP_WF_RR1) { // DPP_WF_RR1 - count = -1; - newLane = (currLane + count + NumVecElemPerVecReg) % + newLane = (currLane - 1 + NumVecElemPerVecReg) % NumVecElemPerVecReg; } else if (dppCtrl == SQ_DPP_ROW_MIRROR) { // DPP_ROW_MIRROR localRowOffset = (15 - localRowOffset); @@ -392,12 +389,22 @@ namespace VegaISA } else if (dppCtrl == SQ_DPP_ROW_BCAST15) { // DPP_ROW_BCAST15 count = 15; if (currLane > count) { - newLane = (currLane & ~count) - 1; + // 0x30 selects which set of 16 lanes to use. We broadcast the + // last lane of one set to all lanes of the next set (e.g., + // lane 15 is written to 16-31, 31 to 32-47, 47 to 48-63). + newLane = (currLane & 0x30) - 1; + } else { + outOfBounds = true; } } else if (dppCtrl == SQ_DPP_ROW_BCAST31) { // DPP_ROW_BCAST31 count = 31; if (currLane > count) { - newLane = (currLane & ~count) - 1; + // 0x20 selects either the upper 32 or lower 32 lanes and + // broadcasts the last lane of one set to all lanes of the + // next set (e.g., lane 31 is written to 32-63). + newLane = (currLane & 0x20) - 1; + } else { + outOfBounds = true; } } else { panic("Unimplemented DPP control operation: %d\n", dppCtrl); @@ -443,6 +450,9 @@ namespace VegaISA src0.absModifier(); } + // Need a copy of the original data since we update one lane at a time + T src0_copy = src0; + // iterate over all register lanes, performing steps 2-4 for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { threadValid = (0x1LL << lane); @@ -458,7 +468,6 @@ namespace VegaISA if (((rowMask & (0x1 << rowNum)) == 0) /* row mask */ || ((bankMask & (0x1 << bankNum)) == 0) /* bank mask */) { laneDisabled = true; - continue; } /** @@ -495,7 +504,7 @@ namespace VegaISA } else { threadValid = 0; } - } else if (!gpuDynInst->exec_mask[lane]) { + } else if (!gpuDynInst->wavefront()->execMask(lane)) { if (boundCtrl == 1) { zeroSrc = true; } else { @@ -505,13 +514,15 @@ namespace VegaISA if (threadValid != 0 && !outOfBounds && !zeroSrc) { assert(!laneDisabled); - src0[outLane] = src0[lane]; + src0[lane] = src0_copy[outLane]; } else if (zeroSrc) { src0[lane] = 0; } // reset for next iteration laneDisabled = false; + outOfBounds = false; + zeroSrc = false; } }