From b91c9be10283e450b3c806c4d8360318f0baca71 Mon Sep 17 00:00:00 2001 From: Matthew Poremba Date: Wed, 15 May 2024 16:28:36 -0700 Subject: [PATCH] arch-vega: Load/stores commonly used with 16b MFMA This implements some missing loads and store that are commonly used in applications with MFMA instructions to load 16-bit data types into specific register locations: DS_READ_U16_D16, DS_READ_U16_D16_HI, BUFFER_LOAD_SHORT_D16, BUFFER_LOAD_SHORT_D16_HI. Change-Id: Ie22d81ef010328f4541553a9a674764dc16a9f4d --- src/arch/amdgpu/vega/gpu_decoder.cc | 12 +- src/arch/amdgpu/vega/insts/ds.cc | 128 +++++++++++++ src/arch/amdgpu/vega/insts/instructions.hh | 144 ++++++++++++++ src/arch/amdgpu/vega/insts/mubuf.cc | 207 +++++++++++++++++++++ 4 files changed, 483 insertions(+), 8 deletions(-) diff --git a/src/arch/amdgpu/vega/gpu_decoder.cc b/src/arch/amdgpu/vega/gpu_decoder.cc index 6924ea6922..0f4b1e9872 100644 --- a/src/arch/amdgpu/vega/gpu_decoder.cc +++ b/src/arch/amdgpu/vega/gpu_decoder.cc @@ -7808,15 +7808,13 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_DS__DS_READ_U16_D16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_DS__DS_READ_U16_D16(&iFmt->iFmt_DS); } GPUStaticInst* Decoder::decode_OP_DS__DS_READ_U16_D16_HI(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_DS__DS_READ_U16_D16_HI(&iFmt->iFmt_DS); } GPUStaticInst* @@ -10207,14 +10205,12 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_MUBUF__BUFFER_LOAD_SHORT_D16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_MUBUF__BUFFER_LOAD_SHORT_D16(&iFmt->iFmt_MUBUF); } GPUStaticInst* Decoder::decode_OP_MUBUF__BUFFER_LOAD_SHORT_D16_HI(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(&iFmt->iFmt_MUBUF); } GPUStaticInst* 
Decoder::decode_OP_MUBUF__BUFFER_LOAD_FORMAT_D16_HI_X(MachInst iFmt) diff --git a/src/arch/amdgpu/vega/insts/ds.cc b/src/arch/amdgpu/vega/insts/ds.cc index 17acdaa287..57d58638c5 100644 --- a/src/arch/amdgpu/vega/insts/ds.cc +++ b/src/arch/amdgpu/vega/insts/ds.cc @@ -1858,6 +1858,134 @@ namespace VegaISA vdst.write(); } // completeAcc + // --- Inst_DS__DS_READ_U16_D16 class methods --- + + Inst_DS__DS_READ_U16_D16:: + Inst_DS__DS_READ_U16_D16(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_u16_d16") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_U16_D16 + + Inst_DS__DS_READ_U16_D16::~Inst_DS__DS_READ_U16_D16() + { + } // ~Inst_DS__DS_READ_U16_D16 + + // --- description from .arch file --- + // RETURN_DATA[15 : 0].u16 = MEM[ADDR].u16; + // // RETURN_DATA[31:16] is preserved. + void + Inst_DS__DS_READ_U16_D16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + void + Inst_DS__DS_READ_U16_D16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead<VecElemU16>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_U16_D16::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + VecElemU16 ds_val = reinterpret_cast<VecElemU16*>( + gpuDynInst->d_data)[lane]; + replaceBits(vdst[lane], 15, 0, ds_val); + } + } + + vdst.write(); + } // 
completeAcc + // --- Inst_DS__DS_READ_U16_D16_HI class methods --- + + Inst_DS__DS_READ_U16_D16_HI:: + Inst_DS__DS_READ_U16_D16_HI(InFmt_DS *iFmt) + : Inst_DS(iFmt, "ds_read_u16_d16_hi") + { + setFlag(MemoryRef); + setFlag(Load); + } // Inst_DS__DS_READ_U16_D16_HI + + Inst_DS__DS_READ_U16_D16_HI::~Inst_DS__DS_READ_U16_D16_HI() + { + } // ~Inst_DS__DS_READ_U16_D16_HI + + // --- description from .arch file --- + // RETURN_DATA[31 : 16].u16 = MEM[ADDR].u16; + // // RETURN_DATA[15:0] is preserved. + void + Inst_DS__DS_READ_U16_D16_HI::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decLGKMInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set( + gpuDynInst->computeUnit()->cyclesToTicks(Cycles(24))); + ConstVecOperandU32 addr(gpuDynInst, extData.ADDR); + + addr.read(); + + calcAddr(gpuDynInst, addr); + + gpuDynInst->computeUnit()->localMemoryPipe.issueRequest(gpuDynInst); + } // execute + void + Inst_DS__DS_READ_U16_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) + { + Addr offset0 = instData.OFFSET0; + Addr offset1 = instData.OFFSET1; + Addr offset = (offset1 << 8) | offset0; + + initMemRead<VecElemU16>(gpuDynInst, offset); + } // initiateAcc + + void + Inst_DS__DS_READ_U16_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDST); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + VecElemU16 ds_val = reinterpret_cast<VecElemU16*>( + gpuDynInst->d_data)[lane]; + replaceBits(vdst[lane], 31, 16, ds_val); + } + } + + vdst.write(); + } // completeAcc // --- Inst_DS__DS_SWIZZLE_B32 class methods --- Inst_DS__DS_SWIZZLE_B32::Inst_DS__DS_SWIZZLE_B32(InFmt_DS *iFmt) diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh index 8195e52341..587058cb89 100644 --- 
a/src/arch/amdgpu/vega/insts/instructions.hh +++ b/src/arch/amdgpu/vega/insts/instructions.hh @@ -33126,6 +33126,74 @@ namespace VegaISA void completeAcc(GPUDynInstPtr) override; }; // Inst_DS__DS_READ_U16 + class Inst_DS__DS_READ_U16_D16 : public Inst_DS + { + public: + Inst_DS__DS_READ_U16_D16(InFmt_DS*); + ~Inst_DS__DS_READ_U16_D16(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 1; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_a + return 4; + case 1: //vgpr_rtn + return 2; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_DS__DS_READ_U16_D16 + + class Inst_DS__DS_READ_U16_D16_HI : public Inst_DS + { + public: + Inst_DS__DS_READ_U16_D16_HI(InFmt_DS*); + ~Inst_DS__DS_READ_U16_D16_HI(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 1; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_a + return 4; + case 1: //vgpr_rtn + return 2; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_DS__DS_READ_U16_D16_HI + class Inst_DS__DS_SWIZZLE_B32 : public Inst_DS { public: @@ -36810,6 +36878,82 @@ namespace VegaISA void completeAcc(GPUDynInstPtr) override; }; // Inst_MUBUF__BUFFER_LOAD_SSHORT + class Inst_MUBUF__BUFFER_LOAD_SHORT_D16 : public Inst_MUBUF + { + public: + 
Inst_MUBUF__BUFFER_LOAD_SHORT_D16(InFmt_MUBUF*); + ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_a + return 8; + case 1: //sgpr_r + return 16; + case 2: //sgpr_o + return 4; + case 3: //vgpr_d + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_MUBUF__BUFFER_LOAD_SHORT_D16 + + class Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI : public Inst_MUBUF + { + public: + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(InFmt_MUBUF*); + ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: //vgpr_a + return 8; + case 1: //sgpr_r + return 16; + case 2: //sgpr_o + return 4; + case 3: //vgpr_d + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + void initiateAcc(GPUDynInstPtr) override; + void completeAcc(GPUDynInstPtr) override; + }; // Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI + class Inst_MUBUF__BUFFER_LOAD_DWORD : public Inst_MUBUF { public: diff --git a/src/arch/amdgpu/vega/insts/mubuf.cc b/src/arch/amdgpu/vega/insts/mubuf.cc index ff8bae2475..ffc68e8c2b 100644 --- a/src/arch/amdgpu/vega/insts/mubuf.cc +++ b/src/arch/amdgpu/vega/insts/mubuf.cc @@ -823,6 +823,209 @@ namespace VegaISA Inst_MUBUF__BUFFER_LOAD_SSHORT::completeAcc(GPUDynInstPtr 
gpuDynInst) { } // execute + // --- Inst_MUBUF__BUFFER_LOAD_SHORT_D16 class methods --- + + Inst_MUBUF__BUFFER_LOAD_SHORT_D16 + ::Inst_MUBUF__BUFFER_LOAD_SHORT_D16(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_short_d16") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + warn("BUFFER.LDS not implemented!"); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_SHORT_D16 + + Inst_MUBUF__BUFFER_LOAD_SHORT_D16::~Inst_MUBUF__BUFFER_LOAD_SHORT_D16() + { + } // ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16 + + // --- description from .arch file --- + // RETURN_DATA[15 : 0].u16 = MEM[ADDR].u16; + // // RETURN_DATA[31:16] is preserved. + void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + // For explanation of buffer addressing, see section 9.1.5 in: + // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/ + // instruction-set-architectures/ + // amd-instinct-mi300-cdna3-instruction-set-architecture.pdf + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr1, addr0, rsrcDesc, 
offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<VecElemU16>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + VecElemU16 buf_val = (reinterpret_cast<VecElemU16*>( + gpuDynInst->d_data))[lane]; + replaceBits(vdst[lane], 15, 0, buf_val); + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + } // completeAcc + // --- Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI class methods --- + + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI + ::Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI(InFmt_MUBUF *iFmt) + : Inst_MUBUF(iFmt, "buffer_load_short_d16_hi") + { + setFlag(MemoryRef); + setFlag(Load); + if (instData.LDS) { + setFlag(GroupSegment); + warn("BUFFER.LDS not implemented!"); + } else { + setFlag(GlobalSegment); + } + } // Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI + + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI:: + ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI() + { + } // ~Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI + + // --- description from .arch file --- + // VDATA[31 : 16].b16 = MEM[ADDR].b16; + // // VDATA[15:0] is preserved. 
+ void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::execute(GPUDynInstPtr gpuDynInst) + { + Wavefront *wf = gpuDynInst->wavefront(); + + if (gpuDynInst->exec_mask.none()) { + wf->decVMemInstsIssued(); + return; + } + + gpuDynInst->execUnitId = wf->execUnitId; + gpuDynInst->latency.init(gpuDynInst->computeUnit()); + gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod()); + + ConstVecOperandU32 addr0(gpuDynInst, extData.VADDR); + ConstVecOperandU32 addr1(gpuDynInst, extData.VADDR + 1); + ConstScalarOperandU128 rsrcDesc(gpuDynInst, extData.SRSRC * 4); + ConstScalarOperandU32 offset(gpuDynInst, extData.SOFFSET); + + rsrcDesc.read(); + offset.read(); + + int inst_offset = instData.OFFSET; + + // For explanation of buffer addressing, see section 9.1.5 in: + // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/ + // instruction-set-architectures/ + // amd-instinct-mi300-cdna3-instruction-set-architecture.pdf + if (!instData.IDXEN && !instData.OFFEN) { + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (!instData.IDXEN && instData.OFFEN) { + addr0.read(); + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr0, addr1, rsrcDesc, offset, inst_offset); + } else if (instData.IDXEN && !instData.OFFEN) { + addr0.read(); + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } else { + addr0.read(); + addr1.read(); + calcAddr<ConstVecOperandU32, ConstVecOperandU32, + ConstScalarOperandU128, ConstScalarOperandU32>(gpuDynInst, + addr1, addr0, rsrcDesc, offset, inst_offset); + } + + gpuDynInst->computeUnit()->globalMemoryPipe.issueRequest(gpuDynInst); + } // execute + + void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst) + { + initMemRead<VecElemU16>(gpuDynInst); + } // initiateAcc + + void + Inst_MUBUF__BUFFER_LOAD_SHORT_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst) + { + VecOperandU32 vdst(gpuDynInst, extData.VDATA); + + for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) { + if (gpuDynInst->exec_mask[lane]) { + if (!oobMask[lane]) { + VecElemU16 buf_val = (reinterpret_cast<VecElemU16*>( + 
gpuDynInst->d_data))[lane]; + replaceBits(vdst[lane], 31, 16, buf_val); + } else { + vdst[lane] = 0; + } + } + } + + vdst.write(); + } // completeAcc // --- Inst_MUBUF__BUFFER_LOAD_DWORD class methods --- Inst_MUBUF__BUFFER_LOAD_DWORD @@ -868,6 +1071,10 @@ namespace VegaISA int inst_offset = instData.OFFSET; + // For explanation of buffer addressing, see section 9.1.5 in: + // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/ + // instruction-set-architectures/ + // amd-instinct-mi300-cdna3-instruction-set-architecture.pdf if (!instData.IDXEN && !instData.OFFEN) { calcAddr(gpuDynInst,