arch-vega: Implement V_LSHL_ADD_U64

This is a new instruction in MI300 that operates similarly to
V_LSHL_ADD_U32, but on 64-bit values.

Change-Id: Ia4ac65160bdad748fccdcb28286ba03157cc4046
This commit is contained in:
Matthew Poremba
2024-02-13 16:34:05 -06:00
parent f36be791aa
commit 9ab004cccc
4 changed files with 92 additions and 1 deletions

View File

@@ -1091,7 +1091,7 @@ namespace VegaISA
&Decoder::decode_OPU_VOP3__V_MAD_I16,
&Decoder::decode_OPU_VOP3__V_FMA_F16,
&Decoder::decode_OPU_VOP3__V_DIV_FIXUP_F16,
&Decoder::decode_invalid,
&Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
@@ -7054,6 +7054,12 @@ namespace VegaISA
return new Inst_VOP3__V_DIV_FIXUP_F16(&iFmt->iFmt_VOP3A);
}
/**
 * Decode entry for V_LSHL_ADD_U64: builds the instruction object from the
 * VOP3A view of the raw machine instruction. The returned object is
 * heap-allocated and handed to the caller.
 */
GPUStaticInst*
Decoder::decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst iFmt)
{
    // Select the VOP3A layout of the encoding before constructing.
    auto *vop3aFields = &iFmt->iFmt_VOP3A;
    return new Inst_VOP3__V_LSHL_ADD_U64(vop3aFields);
}
GPUStaticInst*
Decoder::decode_OPU_VOP3__V_INTERP_P1_F32(MachInst iFmt)
{

View File

@@ -470,6 +470,7 @@ namespace VegaISA
GPUStaticInst* decode_OPU_VOP3__V_MAD_I16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_FMA_F16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_DIV_FIXUP_F16(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_LSHL_ADD_U64(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_INTERP_P1_F32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_INTERP_P2_F32(MachInst);
GPUStaticInst* decode_OPU_VOP3__V_INTERP_MOV_F32(MachInst);

View File

@@ -30158,6 +30158,42 @@ namespace VegaISA
void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_DIV_FIXUP_F16
/**
 * VOP3A instruction V_LSHL_ADD_U64.
 *
 * Declares three source operands (src_0, src_1, src_2) and one destination
 * (vdst). src_0, src_2 and vdst are reported with size 8; src_1, which
 * supplies the shift amount, is reported with size 4.
 */
class Inst_VOP3__V_LSHL_ADD_U64 : public Inst_VOP3A
{
  public:
    Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A*);
    ~Inst_VOP3__V_LSHL_ADD_U64();

    // One destination register, three source registers.
    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 3; }

    // Total operand count is the sum of sources and destinations.
    int
    getNumOperands() override
    {
        return numSrcRegOperands() + numDstRegOperands();
    } // getNumOperands

    /**
     * Size reported for the operand at index opIdx.
     * Indices 0-2 are src_0..src_2 and index 3 is vdst; any other index
     * is reported through fatal().
     */
    int
    getOperandSize(int opIdx) override
    {
        switch (opIdx) {
          case 1: // src_1 (shift amount)
            return 4;
          case 0: // src_0
          case 2: // src_2
          case 3: // vdst
            return 8;
          default:
            fatal("op idx %i out of bounds\n", opIdx);
            return -1;
        }
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3__V_LSHL_ADD_U64
class Inst_VOP3__V_CVT_PKACCUM_U8_F32 : public Inst_VOP3A
{
public:

View File

@@ -7630,6 +7630,54 @@ namespace VegaISA
{
panicUnimplemented();
} // execute
// --- Inst_VOP3__V_LSHL_ADD_U64 class methods ---

/**
 * Construct the instruction from its VOP3A encoding, registering it under
 * the mnemonic "v_lshl_add_u64" and flagging it as an ALU operation.
 */
Inst_VOP3__V_LSHL_ADD_U64::Inst_VOP3__V_LSHL_ADD_U64(InFmt_VOP3A *iFmt)
    // NOTE(review): the trailing 'false' presumably selects a vector
    // (non-scalar) destination -- confirm against Inst_VOP3A.
    : Inst_VOP3A(iFmt, "v_lshl_add_u64", false)
{
    this->setFlag(ALU);
} // Inst_VOP3__V_LSHL_ADD_U64
// Nothing to release beyond what the base class handles.
Inst_VOP3__V_LSHL_ADD_U64::~Inst_VOP3__V_LSHL_ADD_U64()
    = default; // ~Inst_VOP3__V_LSHL_ADD_U64
// --- description (as implemented below) ---
// D.u64 = (S0.u64 << S1.u[2:0]) + S2.u64.
// Only the low three bits of S1 are used as the shift amount, and a
// value above 4 is treated as a shift of 0.
/**
 * Execute V_LSHL_ADD_U64 for every lane enabled in the wavefront's exec
 * mask: shift the 64-bit S0 left by the amount taken from S1, add the
 * 64-bit S2, and write the result to the destination vector register.
 */
void
Inst_VOP3__V_LSHL_ADD_U64::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wave = gpuDynInst->wavefront();

    ConstVecOperandU64 shiftee(gpuDynInst, extData.SRC0);
    ConstVecOperandU32 shiftSrc(gpuDynInst, extData.SRC1);
    ConstVecOperandU64 addend(gpuDynInst, extData.SRC2);
    VecOperandU64 result(gpuDynInst, instData.VDST);

    shiftee.readSrc();
    shiftSrc.readSrc();
    addend.readSrc();

    /**
     * input modifiers are supported by FP operations only, so none of
     * the per-source ABS/NEG bits (0x1, 0x2, 0x4) may be set here
     */
    constexpr unsigned allSrcModBits = 0x1 | 0x2 | 0x4;
    assert(!(instData.ABS & allSrcModBits));
    assert(!(extData.NEG & allSrcModBits));

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        // Lanes masked off in EXEC keep their previous destination value.
        if (!wave->execMask(lane)) {
            continue;
        }

        // Take S1[2:0]; anything above 4 falls back to no shift.
        const int lowBits = bits(shiftSrc[lane], 2, 0);
        const int shiftBy = (lowBits <= 4) ? lowBits : 0;

        result[lane] = (shiftee[lane] << shiftBy) + addend[lane];
    }

    result.write();
} // execute
// --- Inst_VOP3__V_CVT_PKACCUM_U8_F32 class methods ---
Inst_VOP3__V_CVT_PKACCUM_U8_F32::Inst_VOP3__V_CVT_PKACCUM_U8_F32(