From 8722aef2e21620341c028e94bc1075d88ca9b989 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Tue, 20 Feb 2024 13:34:51 -0600 Subject: [PATCH 1/3] gpu-compute: Store accum_offset from code object in WF The accumulation offset is needed for some instructions. In order to access this value we need to place it somewhere instruction definitions can access. The most logical place is in the wavefront. This commit simply copies the value from the HSA task to the wavefront object. Change-Id: I44ef62ef32d2421953f096c431dd758e882245b4 --- src/gpu-compute/gpu_command_processor.cc | 1 - src/gpu-compute/hsa_queue_entry.hh | 13 +++++++++++++ src/gpu-compute/wavefront.cc | 3 +++ src/gpu-compute/wavefront.hh | 2 ++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/gpu-compute/gpu_command_processor.cc b/src/gpu-compute/gpu_command_processor.cc index dbb909f624..02b1bb174a 100644 --- a/src/gpu-compute/gpu_command_processor.cc +++ b/src/gpu-compute/gpu_command_processor.cc @@ -723,7 +723,6 @@ GPUCommandProcessor::sanityCheckAKC(AMDKernelCode *akc) warn_if(akc->kernarg_preload_spec_length || akc->kernarg_preload_spec_offset, "Kernarg preload not implemented\n"); - warn_if(akc->accum_offset, "ACC offset not implemented\n"); warn_if(akc->tg_split, "TG split not implemented\n"); } diff --git a/src/gpu-compute/hsa_queue_entry.hh b/src/gpu-compute/hsa_queue_entry.hh index a464e4882d..f015b091fc 100644 --- a/src/gpu-compute/hsa_queue_entry.hh +++ b/src/gpu-compute/hsa_queue_entry.hh @@ -122,6 +122,11 @@ class HSAQueueEntry } parseKernelCode(akc); + + // Offset of a first AccVGPR in the unified register file. + // Granularity 4. Value 0-63. 0 - accum-offset = 4, + // 1 - accum-offset = 8, ..., 63 - accum-offset = 256. 
+ _accumOffset = (akc->accum_offset + 1) * 4; } const GfxVersion& @@ -394,6 +399,12 @@ class HSAQueueEntry assert(_outstandingWbs >= 0); } + unsigned + accumOffset() const + { + return _accumOffset; + } + private: void parseKernelCode(AMDKernelCode *akc) @@ -489,6 +500,8 @@ class HSAQueueEntry std::bitset initialVgprState; std::bitset initialSgprState; + + unsigned _accumOffset; }; } // namespace gem5 diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index cb8b6220e7..98d882b20e 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -430,6 +430,9 @@ Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems) } } + // Save the offset to the first accumulation VGPR number from HSA task. + accumOffset = task->accumOffset(); + regInitIdx = 0; // VGPRs are initialized to the work item IDs for a given thread. There diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh index 43ac3e9ffc..82035f7d47 100644 --- a/src/gpu-compute/wavefront.hh +++ b/src/gpu-compute/wavefront.hh @@ -131,6 +131,8 @@ class Wavefront : public SimObject uint32_t maxVgprs; // number of SGPRs required by WF uint32_t maxSgprs; + // first accumulation vgpr number + uint32_t accumOffset; void freeResources(); GPUDynInstPtr nextInstr(); void setStatus(status_e newStatus); From e0e65221b47dfa03fb5e3676cb8340b2278fb854 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Mon, 26 Feb 2024 12:48:48 -0600 Subject: [PATCH 2/3] arch-vega: Use accum offset for v_accvgpr_read/write The accum offset is used as an index into the unified VGPR register file in MI200 and is not the same as a move if accum_offset in the dispatch packet is non-zero. Change these instructions to use the stored accum_offset value. 
Change-Id: Ib661804f8f5b8392e4c586082c423645f539e641 --- src/arch/amdgpu/vega/insts/vop3p.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/arch/amdgpu/vega/insts/vop3p.cc b/src/arch/amdgpu/vega/insts/vop3p.cc index 85f0af2a51..224c525e0f 100644 --- a/src/arch/amdgpu/vega/insts/vop3p.cc +++ b/src/arch/amdgpu/vega/insts/vop3p.cc @@ -596,10 +596,10 @@ void Inst_VOP3P__V_DOT8_U32_U4::execute(GPUDynInstPtr gpuDynInst) void Inst_VOP3P__V_ACCVGPR_READ::execute(GPUDynInstPtr gpuDynInst) { - // The Acc register file is not supported in gem5 and has been removed - // in MI200. Therefore this instruction becomes a mov. Wavefront *wf = gpuDynInst->wavefront(); - ConstVecOperandU32 src(gpuDynInst, extData.SRC0); + unsigned accum_offset = wf->accumOffset; + + ConstVecOperandU32 src(gpuDynInst, extData.SRC0+accum_offset); VecOperandU32 vdst(gpuDynInst, instData.VDST); src.readSrc(); @@ -615,11 +615,11 @@ void Inst_VOP3P__V_ACCVGPR_READ::execute(GPUDynInstPtr gpuDynInst) void Inst_VOP3P__V_ACCVGPR_WRITE::execute(GPUDynInstPtr gpuDynInst) { - // The Acc register file is not supported in gem5 and has been removed - // in MI200. Therefore this instruction becomes a mov. Wavefront *wf = gpuDynInst->wavefront(); + unsigned accum_offset = wf->accumOffset; + ConstVecOperandU32 src(gpuDynInst, extData.SRC0); - VecOperandU32 vdst(gpuDynInst, instData.VDST); + VecOperandU32 vdst(gpuDynInst, instData.VDST+accum_offset); src.readSrc(); From 2ca7f48828782368bed8d02a1a6f8c537b5af15a Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Mon, 26 Feb 2024 13:18:43 -0600 Subject: [PATCH 3/3] arch-vega: Accumulation offset for existing MFMA insts This commit updates the two existing MFMA instructions to support the accumulation offset for A, B, and C/D matrix. Additionally, this uses array indexed C/D matrix registers to reduce duplicate code. Future MFMA instructions have up to 16 registers for C/D and this reduces the amount of code being written. 
Change-Id: Ibdc3b6255234a3bab99f115c79e8a0248c800400 --- src/arch/amdgpu/vega/insts/vop3p_mai.cc | 169 ++++++++++++++---------- 1 file changed, 97 insertions(+), 72 deletions(-) diff --git a/src/arch/amdgpu/vega/insts/vop3p_mai.cc b/src/arch/amdgpu/vega/insts/vop3p_mai.cc index 943aa72cfd..6136a94da9 100644 --- a/src/arch/amdgpu/vega/insts/vop3p_mai.cc +++ b/src/arch/amdgpu/vega/insts/vop3p_mai.cc @@ -56,9 +56,19 @@ namespace VegaISA void Inst_VOP3P_MAI__V_MFMA_I32_16X16X16I8::execute(GPUDynInstPtr gpuDynInst) { - int acc_offset = 0; + // Accumulation register offsets for A, B, and C/D matrix. + int a_offset = 0; + int b_offset = 0; + int cd_offset = 0; if (instData.ACC_CD) { - warn("ACC_CD not yet implemented\n"); + cd_offset = gpuDynInst->wavefront()->accumOffset; + } + if (extData.ACC) { + if (extData.ACC & 0x1) { + a_offset = gpuDynInst->wavefront()->accumOffset; + } else if (extData.ACC & 0x2) { + b_offset = gpuDynInst->wavefront()->accumOffset; + } } // int8 size allows for 4 elements per lane. At 16x16 this means 4 @@ -71,24 +81,27 @@ namespace VegaISA // VecOperandI8 will read 8 bits and sign extend, so used U32 to read // as "untyped" 32-bit values. 
- ConstVecOperandU32 src0(gpuDynInst, extData.SRC0); - ConstVecOperandU32 src1(gpuDynInst, extData.SRC1); - ConstVecOperandI32 src2a(gpuDynInst, extData.SRC2+acc_offset); - ConstVecOperandI32 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta); - ConstVecOperandI32 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta); - ConstVecOperandI32 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta); + ConstVecOperandU32 src0(gpuDynInst, extData.SRC0+a_offset); + ConstVecOperandU32 src1(gpuDynInst, extData.SRC1+b_offset); + ConstVecOperandI32 src2[4] = { + ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset), + ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset+1*delta), + ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset+2*delta), + ConstVecOperandI32(gpuDynInst, extData.SRC2+cd_offset+3*delta), + }; - VecOperandI32 vdsta(gpuDynInst, instData.VDST+acc_offset); - VecOperandI32 vdstb(gpuDynInst, instData.VDST+acc_offset+1); - VecOperandI32 vdstc(gpuDynInst, instData.VDST+acc_offset+2); - VecOperandI32 vdstd(gpuDynInst, instData.VDST+acc_offset+3); + VecOperandI32 vdst[4] = { + VecOperandI32(gpuDynInst, instData.VDST+cd_offset), + VecOperandI32(gpuDynInst, instData.VDST+cd_offset+1), + VecOperandI32(gpuDynInst, instData.VDST+cd_offset+2), + VecOperandI32(gpuDynInst, instData.VDST+cd_offset+3), + }; src0.readSrc(); src1.readSrc(); - src2a.readSrc(); - src2b.readSrc(); - src2c.readSrc(); - src2d.readSrc(); + for (int i = 0; i < 4; ++i) { + src2[i].readSrc(); + } int32_t A[16][16]; for (int i = 0; i < 64; ++i) { @@ -124,14 +137,14 @@ namespace VegaISA // Load accumulation matrix C into result for (int i = 0; i < 64; ++i) { - // src2a contains rows 0, 4, 8, 12 - result[(i/16)*4][(i%16)] = src2a[i]; - // src2b contains rows 1, 5, 9, 13 - result[(i/16)*4+1][(i%16)] = src2b[i]; - // src2c contains rows 2, 6, 10, 14 - result[(i/16)*4+2][(i%16)] = src2c[i]; - // src2d contains rows 3, 7, 11, 15 - result[(i/16)*4+3][(i%16)] = src2d[i]; + // src2[0] contains rows 0, 4, 8, 12 + 
result[(i/16)*4][(i%16)] = src2[0][i]; + // src2[1] contains rows 1, 5, 9, 13 + result[(i/16)*4+1][(i%16)] = src2[1][i]; + // src2[2] contains rows 2, 6, 10, 14 + result[(i/16)*4+2][(i%16)] = src2[2][i]; + // src2[3] contains rows 3, 7, 11, 15 + result[(i/16)*4+3][(i%16)] = src2[3][i]; } // Compute new result - This is (obviously) not optimized @@ -145,20 +158,19 @@ namespace VegaISA // Put result in dest VGPRs for (int i = 0; i < 64; ++i) { - // vdsta contains rows 0, 4, 8, 12 - vdsta[i] = result[(i/16)*4][(i%16)]; - // vdstb contains rows 1, 5, 9, 13 - vdstb[i] = result[(i/16)*4+1][(i%16)]; - // vdstc contains rows 2, 6, 10, 14 - vdstc[i] = result[(i/16)*4+2][(i%16)]; - // vdstd contains rows 3, 7, 11, 15 - vdstd[i] = result[(i/16)*4+3][(i%16)]; + // vdst[0] contains rows 0, 4, 8, 12 + vdst[0][i] = result[(i/16)*4][(i%16)]; + // vdst[1] contains rows 1, 5, 9, 13 + vdst[1][i] = result[(i/16)*4+1][(i%16)]; + // vdst[2] contains rows 2, 6, 10, 14 + vdst[2][i] = result[(i/16)*4+2][(i%16)]; + // vdst[3] contains rows 3, 7, 11, 15 + vdst[3][i] = result[(i/16)*4+3][(i%16)]; } - vdsta.write(); - vdstb.write(); - vdstc.write(); - vdstd.write(); + for (int i = 0; i < 4; ++i) { + vdst[i].write(); + } } // execute // --- Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 class methods --- @@ -179,9 +191,19 @@ namespace VegaISA void Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::execute(GPUDynInstPtr gpuDynInst) { - int acc_offset = 0; + // Accumulation register offsets for A, B, and C/D matrix. + int a_offset = 0; + int b_offset = 0; + int cd_offset = 0; if (instData.ACC_CD) { - warn("ACC_CD not yet implemented\n"); + cd_offset = gpuDynInst->wavefront()->accumOffset; + } + if (extData.ACC) { + if (extData.ACC & 0x1) { + a_offset = gpuDynInst->wavefront()->accumOffset; + } else if (extData.ACC & 0x2) { + b_offset = gpuDynInst->wavefront()->accumOffset; + } } // Handling of src2 is a bit tricky. 
The operator[] overload cannot @@ -191,37 +213,41 @@ namespace VegaISA // a delta for each of the pairs of src2 GPRs. int delta = isVectorReg(extData.SRC2) ? 2 : 0; - ConstVecOperandF64 src0(gpuDynInst, extData.SRC0); - ConstVecOperandF64 src1(gpuDynInst, extData.SRC1); - ConstVecOperandF64 src2a(gpuDynInst, extData.SRC2+acc_offset); - ConstVecOperandF64 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta); - ConstVecOperandF64 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta); - ConstVecOperandF64 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta); + ConstVecOperandF64 src0(gpuDynInst, extData.SRC0+a_offset); + ConstVecOperandF64 src1(gpuDynInst, extData.SRC1+b_offset); + ConstVecOperandF64 src2[4] = { + ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset), + ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset+1*delta), + ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset+2*delta), + ConstVecOperandF64(gpuDynInst, extData.SRC2+cd_offset+3*delta), + }; - VecOperandF64 vdsta(gpuDynInst, instData.VDST+acc_offset); - VecOperandF64 vdstb(gpuDynInst, instData.VDST+acc_offset+2); - VecOperandF64 vdstc(gpuDynInst, instData.VDST+acc_offset+4); - VecOperandF64 vdstd(gpuDynInst, instData.VDST+acc_offset+6); + VecOperandF64 vdst[4] = { + VecOperandF64(gpuDynInst, instData.VDST+cd_offset), + VecOperandF64(gpuDynInst, instData.VDST+cd_offset+2), + VecOperandF64(gpuDynInst, instData.VDST+cd_offset+4), + VecOperandF64(gpuDynInst, instData.VDST+cd_offset+6), + }; src0.readSrc(); src1.readSrc(); - src2a.readSrc(); - src2b.readSrc(); - src2c.readSrc(); - src2d.readSrc(); + + for (int i = 0; i < 4; ++i) { + src2[i].readSrc(); + } double result[16][16]; // Load src2 into result. 
src2 is row major for (int i = 0; i < 64; ++i) { - // src2a contains rows 0 - 3 - result[(i/16)][(i%16)] = src2a[i]; - // src2b contains rows 4 - 7 - result[(i/16)+4][(i%16)] = src2b[i]; - // src2c contains rows 8 - 11 - result[(i/16)+8][(i%16)] = src2c[i]; - // src2d contains rows 12 - 15 - result[(i/16)+12][(i%16)] = src2d[i]; + // src2[0] contains rows 0 - 3 + result[(i/16)][(i%16)] = src2[0][i]; + // src2[1] contains rows 4 - 7 + result[(i/16)+4][(i%16)] = src2[1][i]; + // src2[2] contains rows 8 - 11 + result[(i/16)+8][(i%16)] = src2[2][i]; + // src2[3] contains rows 12 - 15 + result[(i/16)+12][(i%16)] = src2[3][i]; } // Compute new result @@ -238,20 +264,19 @@ namespace VegaISA // Put result in dest VGPRs for (int i = 0; i < 64; ++i) { - // vdsta contains rows 0 - 3 - vdsta[i] = result[(i/16)][(i%16)]; - // src2b contains rows 4 - 7 - vdstb[i] = result[(i/16)+4][(i%16)]; - // src2c contains rows 8 - 11 - vdstc[i] = result[(i/16)+8][(i%16)]; - // src2d contains rows 12 - 15 - vdstd[i] = result[(i/16)+12][(i%16)]; + // vdst[0] contains rows 0 - 3 + vdst[0][i] = result[(i/16)][(i%16)]; + // vdst[1] contains rows 4 - 7 + vdst[1][i] = result[(i/16)+4][(i%16)]; + // vdst[2] contains rows 8 - 11 + vdst[2][i] = result[(i/16)+8][(i%16)]; + // vdst[3] contains rows 12 - 15 + vdst[3][i] = result[(i/16)+12][(i%16)]; } - vdsta.write(); - vdstb.write(); - vdstc.write(); - vdstd.write(); + for (int i = 0; i < 4; ++i) { + vdst[i].write(); + } } // execute } // namespace VegaISA } // namespace gem5