diff --git a/src/arch/amdgpu/vega/SConscript b/src/arch/amdgpu/vega/SConscript index 9c6a01bf81..019ef279b3 100644 --- a/src/arch/amdgpu/vega/SConscript +++ b/src/arch/amdgpu/vega/SConscript @@ -56,4 +56,6 @@ if env['CONF']['TARGET_GPU_ISA'] == 'vega': Source('isa.cc') Source('registers.cc') + Source('insts/vop3p.cc') + DebugFlag('VEGA', 'Debug flag for VEGA GPU ISA') diff --git a/src/arch/amdgpu/vega/decoder.cc b/src/arch/amdgpu/vega/decoder.cc index 065f8c8493..2850640af2 100644 --- a/src/arch/amdgpu/vega/decoder.cc +++ b/src/arch/amdgpu/vega/decoder.cc @@ -34,6 +34,7 @@ #include "arch/amdgpu/vega/gpu_decoder.hh" #include "arch/amdgpu/vega/insts/gpu_static_inst.hh" #include "arch/amdgpu/vega/insts/instructions.hh" +#include "arch/amdgpu/vega/insts/vop3p.hh" namespace gem5 { @@ -3629,6 +3630,7 @@ namespace VegaISA &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_PK_MOV_B32, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -3687,8 +3689,7 @@ namespace VegaISA &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, - &Decoder::decode_invalid, - &Decoder::decode_invalid, + &Decoder::decode_OP_VOP3P__V_MFMA_F64_16X16X4F64, &Decoder::decode_invalid, &Decoder::decode_invalid, &Decoder::decode_invalid, @@ -12920,134 +12921,115 @@ namespace VegaISA GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MAD_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MAD_I16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MUL_LO_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MUL_LO_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_ADD_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new 
Inst_VOP3P__V_PK_ADD_I16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_SUB_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_SUB_I16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_LSHLREV_B16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_LSHLREV_B16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_LSHRREV_B16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_LSHRREV_B16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_ASHRREV_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_ASHRREV_B16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MAX_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MAX_I16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MIN_I16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MIN_I16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MAD_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MAD_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_ADD_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_ADD_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_SUB_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_SUB_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* 
Decoder::decode_OP_VOP3P__V_PK_MAX_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MAX_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MIN_U16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MIN_U16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_FMA_F16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_FMA_F16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_ADD_F16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_ADD_F16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MUL_F16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MUL_F16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MIN_F16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MIN_F16(&iFmt->iFmt_VOP3P); } GPUStaticInst* Decoder::decode_OP_VOP3P__V_PK_MAX_F16(MachInst iFmt) { - fatal("Trying to decode instruction without a class\n"); - return nullptr; + return new Inst_VOP3P__V_PK_MAX_F16(&iFmt->iFmt_VOP3P); } GPUStaticInst* @@ -13071,6 +13053,19 @@ namespace VegaISA return nullptr; } + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_PK_MOV_B32(MachInst iFmt) + { + return new Inst_VOP3P__V_PK_MOV_B32(&iFmt->iFmt_VOP3P); + } + + GPUStaticInst* + Decoder::decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst iFmt) + { + return new Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64( + &iFmt->iFmt_VOP3P_MAI); + } + GPUStaticInst* Decoder::decode_invalid(MachInst iFmt) { diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh index af989e0cc7..2a6f9370fb 
100644 --- a/src/arch/amdgpu/vega/gpu_decoder.hh +++ b/src/arch/amdgpu/vega/gpu_decoder.hh @@ -1585,6 +1585,8 @@ namespace VegaISA GPUStaticInst* decode_OP_VOP3P__V_MAD_MIX_F32(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXLO_F16(MachInst); GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXHI_F16(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_PK_MOV_B32(MachInst); + GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst); GPUStaticInst* subDecode_OPU_VOP3(MachInst); GPUStaticInst* subDecode_OP_DS(MachInst); GPUStaticInst* subDecode_OP_FLAT(MachInst); @@ -1908,7 +1910,27 @@ namespace VegaISA unsigned int NEG : 3; }; - union InstFormat { + struct InFmt_VOP3P_MAI + { + unsigned int VDST : 8; + unsigned int CBSZ : 3; + unsigned int ABID : 4; + unsigned int ACC_CD : 1; + unsigned int OP : 7; + unsigned int ENCODING : 9; + }; + + struct InFmt_VOP3P_MAI_1 + { + unsigned int SRC0 : 9; + unsigned int SRC1 : 9; + unsigned int SRC2 : 9; + unsigned int ACC : 2; + unsigned int BLGP : 3; + }; + + union InstFormat + { InFmt_DS iFmt_DS; InFmt_DS_1 iFmt_DS_1; InFmt_EXP iFmt_EXP; @@ -1941,6 +1963,8 @@ namespace VegaISA InFmt_VOP_SDWAB iFmt_VOP_SDWAB; InFmt_VOP3P iFmt_VOP3P; InFmt_VOP3P_1 iFmt_VOP3P_1; + InFmt_VOP3P_MAI iFmt_VOP3P_MAI; + InFmt_VOP3P_MAI_1 iFmt_VOP3P_MAI_1; uint32_t imm_u32; float imm_f32; }; // union InstFormat diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc index cd4ad74e6e..855f91699f 100644 --- a/src/arch/amdgpu/vega/insts/instructions.cc +++ b/src/arch/amdgpu/vega/insts/instructions.cc @@ -45908,5 +45908,143 @@ namespace VegaISA { panicUnimplemented(); } // execute + // --- Inst_VOP3P__V_PK_MOV_B32 class methods --- + + Inst_VOP3P__V_PK_MOV_B32::Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *iFmt) + : Inst_VOP3P(iFmt, "v_pk_mov_b32") + { + setFlag(ALU); + } // Inst_VOP3P__V_PK_MOV_B32 + + Inst_VOP3P__V_PK_MOV_B32::~Inst_VOP3P__V_PK_MOV_B32() + { + } // ~Inst_VOP3P__V_PK_MOV_B32 + + // D.u[63:32] = S1.u[31:0]; 
D.u[31:0] = S0.u[31:0].
+    void
+    Inst_VOP3P__V_PK_MOV_B32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        // This is a special case of packed instructions which operates on
+        // 64-bit inputs/outputs and not 32-bit.
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
+        VecOperandU64 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.readSrc();
+
+        // Only OPSEL[1:0] are used
+        // OPSEL[0] 0/1: Lower dest dword = lower/upper dword of src0
+
+        int opsel = instData.OPSEL;
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                // OPSEL[1] 0/1: Upper dest dword = lower/upper dword of src1
+                uint64_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
+                                                   : bits(src0[lane], 31, 0);
+                uint64_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
+                                                   : bits(src1[lane], 31, 0);
+
+                vdst[lane] = upper_dword << 32 | lower_dword;
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 class methods ---
+
+    Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::
+    Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64(InFmt_VOP3P_MAI *iFmt)
+        : Inst_VOP3P_MAI(iFmt, "v_mfma_f64_16x16x4f64")
+    {
+        setFlag(ALU);
+    } // Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64
+
+    Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::
+    ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64()
+    {
+    } // ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64
+
+    // D(16x16F64) = A(16x4F64) x B(4x16F64) + C(16x16F64), 1 Blocks, 8
+    // pass, srcA/srcB 2 VGPR, srcC/D 8 VGPR
+    void
+    Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64::execute(GPUDynInstPtr gpuDynInst)
+    {
+        int acc_offset = 0;
+        if (instData.ACC_CD) {
+            warn("ACC_CD not yet implemented\n");
+        }
+
+        // Handling of src2 is a bit tricky. The operator[] overload cannot
+        // be used for dword count > 2, and the dword count here is 8. Usually
+        // src2 is a VGPR/AccGPR, but it might also be constant. In order to
+        // use operator[] and handle constants, check for VGPR here and set
+        // a delta for each of the pairs of src2 GPRs.
+        int delta = isVectorReg(extData.SRC2) ? 2 : 0;
+
+        ConstVecOperandF64 src0(gpuDynInst, extData.SRC0);
+        ConstVecOperandF64 src1(gpuDynInst, extData.SRC1);
+        ConstVecOperandF64 src2a(gpuDynInst, extData.SRC2+acc_offset);
+        ConstVecOperandF64 src2b(gpuDynInst, extData.SRC2+acc_offset+1*delta);
+        ConstVecOperandF64 src2c(gpuDynInst, extData.SRC2+acc_offset+2*delta);
+        ConstVecOperandF64 src2d(gpuDynInst, extData.SRC2+acc_offset+3*delta);
+
+        VecOperandF64 vdsta(gpuDynInst, instData.VDST+acc_offset);
+        VecOperandF64 vdstb(gpuDynInst, instData.VDST+acc_offset+2);
+        VecOperandF64 vdstc(gpuDynInst, instData.VDST+acc_offset+4);
+        VecOperandF64 vdstd(gpuDynInst, instData.VDST+acc_offset+6);
+
+        src0.readSrc();
+        src1.readSrc();
+        src2a.readSrc();
+        src2b.readSrc();
+        src2c.readSrc();
+        src2d.readSrc();
+
+        double result[16][16];
+
+        // Load src2 into result. src2 is row major
+        for (int i = 0; i < 64; ++i) {
+            // src2a contains rows 0 - 3
+            result[(i/16)][(i%16)] = src2a[i];
+            // src2b contains rows 4 - 7
+            result[(i/16)+4][(i%16)] = src2b[i];
+            // src2c contains rows 8 - 11
+            result[(i/16)+8][(i%16)] = src2c[i];
+            // src2d contains rows 12 - 15
+            result[(i/16)+12][(i%16)] = src2d[i];
+        }
+
+        // Compute new result
+        for (int i = 0; i < 16; ++i) {
+            for (int j = 0; j < 16; ++j) {
+                for (int k = 0; k < 4; ++k) {
+                    // src0 is column major, src1 is row major
+                    int lane_A = 16*k + i;
+                    int lane_B = 16*k + j;
+                    result[i][j] += src0[lane_A] * src1[lane_B];
+                }
+            }
+        }
+
+        // Put result in dest VGPRs
+        for (int i = 0; i < 64; ++i) {
+            // vdsta contains rows 0 - 3
+            vdsta[i] = result[(i/16)][(i%16)];
+            // vdstb contains rows 4 - 7
+            vdstb[i] = result[(i/16)+4][(i%16)];
+            // vdstc contains rows 8 - 11
+            vdstc[i] = result[(i/16)+8][(i%16)];
+            // vdstd contains rows 12 - 15
+            vdstd[i] = result[(i/16)+12][(i%16)];
+        }
+
+        vdsta.write();
+        vdstb.write();
+
vdstc.write(); + vdstd.write(); + } // execute } // namespace VegaISA } // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh index ca349c365f..22423e14c6 100644 --- a/src/arch/amdgpu/vega/insts/instructions.hh +++ b/src/arch/amdgpu/vega/insts/instructions.hh @@ -43405,6 +43405,76 @@ namespace VegaISA void execute(GPUDynInstPtr) override; }; // Inst_FLAT__FLAT_ATOMIC_DEC_X2 + + class Inst_VOP3P__V_PK_MOV_B32 : public Inst_VOP3P + { + public: + Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P*); + ~Inst_VOP3P__V_PK_MOV_B32(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 + return 8; + case 1: // src1 + return 8; + case 2: // dst + return 8; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; // Inst_VOP3P__V_PK_MOV_B32 + + class Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64 : public Inst_VOP3P_MAI + { + public: + Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64(InFmt_VOP3P_MAI*); + ~Inst_VOP3P_MAI__V_MFMA_F64_16X16X4F64(); + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 "A" + return 8; + case 1: // src1 "B" + return 8; + case 2: // src2 "C" + return 32; + case 3: // dst + return 32; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } // getOperandSize + + void execute(GPUDynInstPtr) override; + }; } // namespace VegaISA } // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/op_encodings.cc 
b/src/arch/amdgpu/vega/insts/op_encodings.cc index c934094d9b..c1302b8b49 100644 --- a/src/arch/amdgpu/vega/insts/op_encodings.cc +++ b/src/arch/amdgpu/vega/insts/op_encodings.cc @@ -1178,6 +1178,158 @@ namespace VegaISA disassembly = dis_stream.str(); } + // --- Inst_VOP3P base class methods --- + + Inst_VOP3P::Inst_VOP3P(InFmt_VOP3P *iFmt, const std::string &opcode) + : VEGAGPUStaticInst(opcode) + { + // copy first instruction DWORD + instData = iFmt[0]; + // copy second instruction DWORD + extData = ((InFmt_VOP3P_1 *)iFmt)[1]; + } // Inst_VOP3P + + Inst_VOP3P::~Inst_VOP3P() + { + } // ~Inst_VOP3P + + void + Inst_VOP3P::initOperandInfo() + { + // Also takes care of bitfield addr issue + unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2}; + + int opNum = 0; + + int numSrc = numSrcRegOperands(); + + for (opNum = 0; opNum < numSrc; opNum++) { + srcOps.emplace_back(srcs[opNum], getOperandSize(opNum), true, + isScalarReg(srcs[opNum]), + isVectorReg(srcs[opNum]), false); + } + + // There is always one dest + // Needed because can't take addr of bitfield + int reg = instData.VDST; + dstOps.emplace_back(reg, getOperandSize(opNum), false, + false, true, false); + opNum++; + + assert(srcOps.size() == numSrcRegOperands()); + assert(dstOps.size() == numDstRegOperands()); + } + + int + Inst_VOP3P::instSize() const + { + return 8; + } // instSize + + void + Inst_VOP3P::generateDisassembly() + { + std::stringstream dis_stream; + dis_stream << _opcode << " "; + + // There is always a dest and the index is after the src operands + // The output size much be a multiple of dword size + int dst_size = getOperandSize(numSrcRegOperands()); + + dis_stream << opSelectorToRegSym(instData.VDST + 0x100, dst_size / 4); + + unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2}; + for (int opnum = 0; opnum < numSrcRegOperands(); opnum++) { + int num_regs = getOperandSize(opnum) / 4; + dis_stream << ", " << opSelectorToRegSym(srcs[opnum], num_regs); + } + + // Print 
op_sel only if one is non-zero
+        if (instData.OPSEL) {
+            int opsel = instData.OPSEL;
+
+            dis_stream << " op_sel:[" << bits(opsel, 0, 0) << ","
+                << bits(opsel, 1, 1) << "," << bits(opsel, 2, 2) << "]";
+        }
+
+        disassembly = dis_stream.str();
+    }
+
+    // --- Inst_VOP3P_MAI base class methods ---
+
+    Inst_VOP3P_MAI::Inst_VOP3P_MAI(InFmt_VOP3P_MAI *iFmt,
+                                   const std::string &opcode)
+        : VEGAGPUStaticInst(opcode)
+    {
+        // copy first instruction DWORD
+        instData = iFmt[0];
+        // copy second instruction DWORD
+        extData = ((InFmt_VOP3P_MAI_1 *)iFmt)[1];
+    } // Inst_VOP3P_MAI
+
+    Inst_VOP3P_MAI::~Inst_VOP3P_MAI()
+    {
+    } // ~Inst_VOP3P_MAI
+
+    void
+    Inst_VOP3P_MAI::initOperandInfo()
+    {
+        // Also takes care of bitfield addr issue
+        unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2};
+
+        int opNum = 0;
+
+        int numSrc = numSrcRegOperands();
+
+        for (opNum = 0; opNum < numSrc; opNum++) {
+            srcOps.emplace_back(srcs[opNum], getOperandSize(opNum), true,
+                                isScalarReg(srcs[opNum]),
+                                isVectorReg(srcs[opNum]), false);
+        }
+
+        // There is always one dest
+        // Needed because can't take addr of bitfield
+        int reg = instData.VDST;
+        dstOps.emplace_back(reg, getOperandSize(opNum), false,
+                            false, true, false);
+        opNum++;
+
+        assert(srcOps.size() == numSrcRegOperands());
+        assert(dstOps.size() == numDstRegOperands());
+    }
+
+    int
+    Inst_VOP3P_MAI::instSize() const
+    {
+        return 8;
+    } // instSize
+
+    void
+    Inst_VOP3P_MAI::generateDisassembly()
+    {
+        std::stringstream dis_stream;
+        dis_stream << _opcode << " ";
+
+        // There is always a dest and the index is after the src operands
+        // The output size must be a multiple of dword size
+        int dst_size = getOperandSize(numSrcRegOperands());
+
+        // opSelectorToRegSym handles formatting for us. VDST is always VGPR
+        // so only the last 8 bits are used. This adds the implicit 9th bit
+        // which is 1 for VGPRs as VGPR op nums are from 256-511.
+        int dst_opnum = instData.VDST + 0x100;
+
+        dis_stream << opSelectorToRegSym(dst_opnum, dst_size / 4);
+
+        unsigned int srcs[3] = {extData.SRC0, extData.SRC1, extData.SRC2};
+        for (int opnum = 0; opnum < numSrcRegOperands(); opnum++) {
+            int num_regs = getOperandSize(opnum) / 4;
+            dis_stream << ", " << opSelectorToRegSym(srcs[opnum], num_regs);
+        }
+
+        disassembly = dis_stream.str();
+    }
+
     // --- Inst_DS base class methods ---
 
     Inst_DS::Inst_DS(InFmt_DS *iFmt, const std::string &opcode)
diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh
index a1c5e99c91..bf4ee2b0c9 100644
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -491,6 +491,149 @@ namespace VegaISA
         bool hasSecondDword(InFmt_VOP3B *);
     }; // Inst_VOP3B
 
+    class Inst_VOP3P : public VEGAGPUStaticInst
+    {
+      public:
+        Inst_VOP3P(InFmt_VOP3P*, const std::string &opcode);
+        ~Inst_VOP3P();
+
+        int instSize() const override;
+        void generateDisassembly() override;
+
+        void initOperandInfo() override;
+
+      protected:
+        // first instruction DWORD
+        InFmt_VOP3P instData;
+        // second instruction DWORD
+        InFmt_VOP3P_1 extData;
+
+        template<typename T>
+        void vop3pHelper(GPUDynInstPtr gpuDynInst,
+                         T (*fOpImpl)(T, T, bool))
+        {
+            Wavefront *wf = gpuDynInst->wavefront();
+            ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
+            ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
+            VecOperandU32 D(gpuDynInst, instData.VDST);
+
+            S0.readSrc();
+            S1.readSrc();
+
+            int opLo = instData.OPSEL;
+            int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI;
+            int negLo = extData.NEG;
+            int negHi = instData.NEG_HI;
+            bool clamp = instData.CLMP;
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
+                                          word<T>(S1[lane], opHi, negHi, 1),
+                                          clamp);
+                    T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
+                                          word<T>(S1[lane], opLo, negLo, 1),
+                                          clamp);
+
+                    uint16_t upper_raw =
+                        *reinterpret_cast<uint16_t*>(&upper_val);
+                    uint16_t lower_raw =
+                        *reinterpret_cast<uint16_t*>(&lower_val);
+
+                    D[lane] = upper_raw << 16 | lower_raw;
+                }
+            }
+
+            D.write();
+        }
+
+        template<typename T>
+        void vop3pHelper(GPUDynInstPtr gpuDynInst,
+                         T (*fOpImpl)(T, T, T, bool))
+        {
+            Wavefront *wf = gpuDynInst->wavefront();
+            ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
+            ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
+            ConstVecOperandU32 S2(gpuDynInst, extData.SRC2);
+            VecOperandU32 D(gpuDynInst, instData.VDST);
+
+            S0.readSrc();
+            S1.readSrc();
+            S2.readSrc();
+
+            int opLo = instData.OPSEL;
+            int opHi = instData.OPSEL_HI2 << 2 | extData.OPSEL_HI;
+            int negLo = extData.NEG;
+            int negHi = instData.NEG_HI;
+            bool clamp = instData.CLMP;
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    T upper_val = fOpImpl(word<T>(S0[lane], opHi, negHi, 0),
+                                          word<T>(S1[lane], opHi, negHi, 1),
+                                          word<T>(S2[lane], opHi, negHi, 2),
+                                          clamp);
+                    T lower_val = fOpImpl(word<T>(S0[lane], opLo, negLo, 0),
+                                          word<T>(S1[lane], opLo, negLo, 1),
+                                          word<T>(S2[lane], opLo, negLo, 2),
+                                          clamp);
+
+                    uint16_t upper_raw =
+                        *reinterpret_cast<uint16_t*>(&upper_val);
+                    uint16_t lower_raw =
+                        *reinterpret_cast<uint16_t*>(&lower_val);
+
+                    D[lane] = upper_raw << 16 | lower_raw;
+                }
+            }
+
+            D.write();
+        }
+
+      private:
+        bool hasSecondDword(InFmt_VOP3P *);
+
+        template<typename T>
+        T
+        word(uint32_t data, int opSel, int neg, int opSelBit)
+        {
+            // This method assumes two words packed into a dword
+            static_assert(sizeof(T) == 2);
+
+            bool select = bits(opSel, opSelBit, opSelBit);
+            uint16_t raw = select ? bits(data, 31, 16)
+                                  : bits(data, 15, 0);
+
+            // Apply input modifiers. This may seem odd, but the hardware
+            // just flips the MSb instead of doing unary negation.
+            bool negate = bits(neg, opSelBit, opSelBit);
+            if (negate) {
+                raw ^= 0x8000;
+            }
+
+            return *reinterpret_cast<T*>(&raw);
+        }
+    }; // Inst_VOP3P
+
+    class Inst_VOP3P_MAI : public VEGAGPUStaticInst
+    {
+      public:
+        Inst_VOP3P_MAI(InFmt_VOP3P_MAI*, const std::string &opcode);
+        ~Inst_VOP3P_MAI();
+
+        int instSize() const override;
+        void generateDisassembly() override;
+
+        void initOperandInfo() override;
+
+      protected:
+        // first instruction DWORD
+        InFmt_VOP3P_MAI instData;
+        // second instruction DWORD
+        InFmt_VOP3P_MAI_1 extData;
+
+      private:
+        bool hasSecondDword(InFmt_VOP3P_MAI *);
+    }; // Inst_VOP3P_MAI
+
     class Inst_DS : public VEGAGPUStaticInst
     {
       public:
diff --git a/src/arch/amdgpu/vega/insts/vop3p.cc b/src/arch/amdgpu/vega/insts/vop3p.cc
new file mode 100644
index 0000000000..a356d3bab3
--- /dev/null
+++ b/src/arch/amdgpu/vega/insts/vop3p.cc
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "arch/amdgpu/vega/insts/vop3p.hh"
+
+#include "arch/arm/insts/fplib.hh"
+
+namespace gem5
+{
+
+namespace VegaISA
+{
+
+using half = uint16_t;
+
+// Helper functions
+int16_t
+clampI16(int32_t value, bool clamp)
+{
+    if (!clamp) {
+        return static_cast<int16_t>(value);
+    }
+
+    return std::clamp(value,
+        static_cast<int32_t>(std::numeric_limits<int16_t>::min()),
+        static_cast<int32_t>(std::numeric_limits<int16_t>::max()));
+}
+
+uint16_t
+clampU16(uint32_t value, bool clamp)
+{
+    if (!clamp) {
+        return static_cast<uint16_t>(value);
+    }
+
+    return std::clamp(value,
+        static_cast<uint32_t>(std::numeric_limits<uint16_t>::min()),
+        static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()));
+}
+
+uint16_t
+clampF16(uint16_t value, bool clamp)
+{
+    if (!clamp) {
+        return value;
+    }
+
+    // Values of one and zero in fp16.
+    constexpr uint16_t one = 0x3c00;
+    constexpr uint16_t zero = 0x0;
+    ArmISA::FPSCR fpscr1, fpscr2;
+
+    // If value > one, set to one, then if value < zero set to zero.
+ uint16_t imm = fplibMin(value, one, fpscr1); + return fplibMax(imm, zero, fpscr2); +} + + + + +// Begin instruction execute definitions +void Inst_VOP3P__V_PK_MAD_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](int16_t S0, int16_t S1, int16_t S2, bool clamp) -> int16_t + { + return clampI16(S0 * S1 + S2, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void +Inst_VOP3P__V_PK_MUL_LO_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t + { + // Only return lower 16 bits of result - This operation cannot clamp. + uint32_t D = S0 * S1; + uint16_t Dh = D & 0xFFFF; + return Dh; + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_ADD_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t + { + return clampI16(S0 + S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_SUB_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t + { + return clampI16(S0 - S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_LSHLREV_B16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t + { + unsigned shift_val = bits(S0, 3, 0); + + // Shift does not clamp + return S1 << shift_val; + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_LSHRREV_B16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool) -> uint16_t + { + unsigned shift_val = bits(S0, 3, 0); + + return S1 >> shift_val; + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_ASHRREV_B16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t + { + // Sign extend to larger type to ensure we don't lose sign bits when + // shifting. 
+ int32_t S1e = S1; + unsigned shift_val = bits(S0, 3, 0); + + return S1e >> shift_val; + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MAX_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t + { + return clampI16((S0 >= S1) ? S0 : S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MIN_I16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](int16_t S0, int16_t S1, bool clamp) -> int16_t + { + return clampI16((S0 < S1) ? S0 : S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MAD_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = + [](uint16_t S0, uint16_t S1, uint16_t S2, bool clamp) -> uint16_t + { + return clampU16(S0 * S1 + S2, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_ADD_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t + { + return clampU16(S0 + S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_SUB_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t + { + return clampU16(S0 - S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MAX_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t + { + return clampU16((S0 >= S1) ? S0 : S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MIN_U16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](uint16_t S0, uint16_t S1, bool clamp) -> uint16_t + { + return clampU16((S0 < S1) ? 
S0 : S1, clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_FMA_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](half S0, half S1, half S2, bool clamp) -> half + { + ArmISA::FPSCR fpscr; + return clampF16(fplibMulAdd(S2, S0, S1, fpscr), clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_ADD_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](half S0, half S1, bool clamp) -> half + { + ArmISA::FPSCR fpscr; + return clampF16(fplibAdd(S0, S1, fpscr), clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MUL_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](half S0, half S1, bool clamp) -> half + { + ArmISA::FPSCR fpscr; + return clampF16(fplibMul(S0, S1, fpscr), clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MIN_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](half S0, half S1, bool clamp) -> half + { + ArmISA::FPSCR fpscr; + return clampF16(fplibMin(S0, S1, fpscr), clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +void Inst_VOP3P__V_PK_MAX_F16::execute(GPUDynInstPtr gpuDynInst) +{ + auto opImpl = [](half S0, half S1, bool clamp) -> half + { + ArmISA::FPSCR fpscr; + return clampF16(fplibMax(S0, S1, fpscr), clamp); + }; + + vop3pHelper(gpuDynInst, opImpl); +} + +} // namespace VegaISA +} // namespace gem5 diff --git a/src/arch/amdgpu/vega/insts/vop3p.hh b/src/arch/amdgpu/vega/insts/vop3p.hh new file mode 100644 index 0000000000..56f0f80435 --- /dev/null +++ b/src/arch/amdgpu/vega/insts/vop3p.hh @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ARCH_VEGA_INSTS_VOP3P_HH__ +#define __ARCH_VEGA_INSTS_VOP3P_HH__ + +#include "arch/amdgpu/vega/gpu_decoder.hh" +#include "arch/amdgpu/vega/insts/gpu_static_inst.hh" +#include "arch/amdgpu/vega/insts/op_encodings.hh" +#include "debug/VEGA.hh" + +namespace gem5 +{ + +namespace VegaISA +{ + // Two source operands with two 16-bit values in a dword + class Inst_VOP3P__2OP_X16 : public Inst_VOP3P + { + public: + Inst_VOP3P__2OP_X16(InFmt_VOP3P *iFmt, const std::string& name) + : Inst_VOP3P(iFmt, name) + { + setFlag(ALU); + } + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 2; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 + return 4; + case 1: // src1 + return 4; + case 2: // dst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } + + }; + + // Three source operands with two 16-bit values in a dword + class Inst_VOP3P__3OP_X16 : public Inst_VOP3P + { + public: + Inst_VOP3P__3OP_X16(InFmt_VOP3P *iFmt, const std::string& name) + : Inst_VOP3P(iFmt, name) + { + setFlag(ALU); + } + + int + getNumOperands() override + { + return numDstRegOperands() + numSrcRegOperands(); + } // getNumOperands + + int numDstRegOperands() override { return 1; } + int numSrcRegOperands() override { return 3; } + + int + getOperandSize(int opIdx) override + { + switch (opIdx) { + case 0: // src0 + return 4; + case 1: // src1 + return 4; + case 2: // src2 + return 4; + case 3: // dst + return 4; + default: + fatal("op idx %i out of bounds\n", opIdx); + return -1; + } + } + + }; + + + + // Begin instruction implementations + class Inst_VOP3P__V_PK_MAD_I16 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_PK_MAD_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_pk_mad_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + 
}; + + class Inst_VOP3P__V_PK_MUL_LO_U16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MUL_LO_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_mul_lo_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_ADD_I16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_ADD_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_add_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_SUB_I16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_SUB_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_sub_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_LSHLREV_B16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_LSHLREV_B16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_lshlrev_b16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_LSHRREV_B16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_LSHRREV_B16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_lshrrev_b16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_ASHRREV_B16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_ASHRREV_B16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_ashrrev_b16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MAX_I16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MAX_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_max_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MIN_I16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MIN_I16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_min_i16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MAD_U16 : public Inst_VOP3P__3OP_X16 + { + public: + 
Inst_VOP3P__V_PK_MAD_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_pk_mad_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_ADD_U16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_ADD_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_add_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_SUB_U16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_SUB_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_sub_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MAX_U16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MAX_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_max_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MIN_U16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MIN_U16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_min_u16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_FMA_F16 : public Inst_VOP3P__3OP_X16 + { + public: + Inst_VOP3P__V_PK_FMA_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__3OP_X16(iFmt, "v_pk_fma_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_ADD_F16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_ADD_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_add_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MUL_F16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MUL_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_mul_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MIN_F16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MIN_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_min_f16") + { } + + void execute(GPUDynInstPtr 
gpuDynInst) override; + }; + + class Inst_VOP3P__V_PK_MAX_F16 : public Inst_VOP3P__2OP_X16 + { + public: + Inst_VOP3P__V_PK_MAX_F16(InFmt_VOP3P *iFmt) + : Inst_VOP3P__2OP_X16(iFmt, "v_pk_max_f16") + { } + + void execute(GPUDynInstPtr gpuDynInst) override; + }; +} // namespace VegaISA +} // namespace gem5 + +#endif // __ARCH_VEGA_INSTS_VOP3P_HH__