arch-vega: Implement FP32 packed math
Starting with MI200, packed math can operate on double dword inputs. In this case, 64-bits of inputs (two VGPRs per lane) contain two FP32 values. Add instructions to perform add, multiply, and FMA on packed FP32 types. Change-Id: Ib838bff91a10e02e013cc7c33ec3d91ff08647b0
This commit is contained in:
@@ -3627,9 +3627,9 @@ namespace VegaISA
|
||||
&Decoder::decode_invalid,
|
||||
&Decoder::decode_invalid,
|
||||
&Decoder::decode_invalid,
|
||||
&Decoder::decode_invalid,
|
||||
&Decoder::decode_invalid,
|
||||
&Decoder::decode_invalid,
|
||||
&Decoder::decode_OP_VOP3P__V_PK_FMA_F32,
|
||||
&Decoder::decode_OP_VOP3P__V_PK_MUL_F32,
|
||||
&Decoder::decode_OP_VOP3P__V_PK_ADD_F32,
|
||||
&Decoder::decode_OP_VOP3P__V_PK_MOV_B32,
|
||||
&Decoder::decode_invalid,
|
||||
&Decoder::decode_invalid,
|
||||
@@ -4203,8 +4203,7 @@ namespace VegaISA
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_VOP2__V_FMAC_F32(MachInst iFmt)
|
||||
{
|
||||
fatal("Trying to decode instruction without a class\n");
|
||||
return nullptr;
|
||||
return new Inst_VOP2__V_FMAC_F32(&iFmt->iFmt_VOP2);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
@@ -8293,8 +8292,7 @@ namespace VegaISA
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_FLAT__FLAT_STORE_SHORT_D16_HI(MachInst iFmt)
|
||||
{
|
||||
fatal("Trying to decode instruction without a class\n");
|
||||
return nullptr;
|
||||
return new Inst_FLAT__FLAT_STORE_SHORT_D16_HI(&iFmt->iFmt_FLAT);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
@@ -8607,8 +8605,7 @@ namespace VegaISA
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_GLOBAL__GLOBAL_STORE_SHORT_D16_HI(MachInst iFmt)
|
||||
{
|
||||
fatal("Trying to decode instruction without a class\n");
|
||||
return nullptr;
|
||||
return new Inst_FLAT__FLAT_STORE_SHORT_D16_HI(&iFmt->iFmt_FLAT);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
@@ -9968,8 +9965,7 @@ namespace VegaISA
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_SHORT_D16_HI(MachInst iFmt)
|
||||
{
|
||||
fatal("Trying to decode instruction without a class\n");
|
||||
return nullptr;
|
||||
return new Inst_FLAT__FLAT_STORE_SHORT_D16_HI(&iFmt->iFmt_FLAT);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
@@ -13105,6 +13101,24 @@ namespace VegaISA
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_VOP3P__V_PK_FMA_F32(MachInst iFmt)
|
||||
{
|
||||
return new Inst_VOP3P__V_PK_FMA_F32(&iFmt->iFmt_VOP3P);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_VOP3P__V_PK_MUL_F32(MachInst iFmt)
|
||||
{
|
||||
return new Inst_VOP3P__V_PK_MUL_F32(&iFmt->iFmt_VOP3P);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_VOP3P__V_PK_ADD_F32(MachInst iFmt)
|
||||
{
|
||||
return new Inst_VOP3P__V_PK_ADD_F32(&iFmt->iFmt_VOP3P);
|
||||
}
|
||||
|
||||
GPUStaticInst*
|
||||
Decoder::decode_OP_VOP3P__V_PK_MOV_B32(MachInst iFmt)
|
||||
{
|
||||
|
||||
@@ -1593,6 +1593,9 @@ namespace VegaISA
|
||||
GPUStaticInst* decode_OP_VOP3P__V_MAD_MIX_F32(MachInst);
|
||||
GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXLO_F16(MachInst);
|
||||
GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXHI_F16(MachInst);
|
||||
GPUStaticInst* decode_OP_VOP3P__V_PK_FMA_F32(MachInst);
|
||||
GPUStaticInst* decode_OP_VOP3P__V_PK_MUL_F32(MachInst);
|
||||
GPUStaticInst* decode_OP_VOP3P__V_PK_ADD_F32(MachInst);
|
||||
GPUStaticInst* decode_OP_VOP3P__V_PK_MOV_B32(MachInst);
|
||||
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X16I8(MachInst);
|
||||
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst);
|
||||
|
||||
@@ -8129,6 +8129,40 @@ namespace VegaISA
|
||||
|
||||
vdst.write();
|
||||
} // execute
|
||||
// --- Inst_VOP2__V_FMAC_F32 class methods ---
|
||||
|
||||
Inst_VOP2__V_FMAC_F32::Inst_VOP2__V_FMAC_F32(InFmt_VOP2 *iFmt)
|
||||
: Inst_VOP2(iFmt, "v_fmac_f32")
|
||||
{
|
||||
setFlag(ALU);
|
||||
} // Inst_VOP2__V_FMAC_F32
|
||||
|
||||
Inst_VOP2__V_FMAC_F32::~Inst_VOP2__V_FMAC_F32()
|
||||
{
|
||||
} // ~Inst_VOP2__V_FMAC_F32
|
||||
|
||||
// --- description from .arch file ---
|
||||
// D.u = S1.u - S0.u;
|
||||
void
|
||||
Inst_VOP2__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
|
||||
ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
|
||||
VecOperandU32 vdst(gpuDynInst, instData.VDST);
|
||||
|
||||
src0.readSrc();
|
||||
src1.read();
|
||||
vdst.read();
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (wf->execMask(lane)) {
|
||||
vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
|
||||
}
|
||||
}
|
||||
|
||||
vdst.write();
|
||||
} // execute
|
||||
// --- Inst_VOP1__V_NOP class methods ---
|
||||
|
||||
Inst_VOP1__V_NOP::Inst_VOP1__V_NOP(InFmt_VOP1 *iFmt)
|
||||
@@ -44497,6 +44531,66 @@ namespace VegaISA
|
||||
Inst_FLAT__FLAT_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
} // completeAcc
|
||||
// --- Inst_FLAT__FLAT_STORE_SHORT_D16_HI class methods ---
|
||||
|
||||
Inst_FLAT__FLAT_STORE_SHORT_D16_HI::
|
||||
Inst_FLAT__FLAT_STORE_SHORT_D16_HI(InFmt_FLAT *iFmt)
|
||||
: Inst_FLAT(iFmt, "flat_store_short_d16_hi")
|
||||
{
|
||||
setFlag(MemoryRef);
|
||||
setFlag(Store);
|
||||
} // Inst_FLAT__FLAT_STORE_SHORT_D16_HI
|
||||
|
||||
Inst_FLAT__FLAT_STORE_SHORT_D16_HI::~Inst_FLAT__FLAT_STORE_SHORT_D16_HI()
|
||||
{
|
||||
} // ~Inst_FLAT__FLAT_STORE_SHORT_D16_HI
|
||||
|
||||
// --- description from .arch file ---
|
||||
// Untyped buffer store short.
|
||||
void
|
||||
Inst_FLAT__FLAT_STORE_SHORT_D16_HI::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
|
||||
if (gpuDynInst->exec_mask.none()) {
|
||||
wf->decVMemInstsIssued();
|
||||
if (isFlat()) {
|
||||
wf->decLGKMInstsIssued();
|
||||
}
|
||||
wf->decExpInstsIssued();
|
||||
return;
|
||||
}
|
||||
|
||||
gpuDynInst->execUnitId = wf->execUnitId;
|
||||
gpuDynInst->latency.init(gpuDynInst->computeUnit());
|
||||
gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());
|
||||
|
||||
ConstVecOperandU32 data(gpuDynInst, extData.DATA);
|
||||
|
||||
data.read();
|
||||
|
||||
calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (gpuDynInst->exec_mask[lane]) {
|
||||
(reinterpret_cast<VecElemU16*>(gpuDynInst->d_data))[lane]
|
||||
= (data[lane] >> 16);
|
||||
}
|
||||
}
|
||||
|
||||
issueRequestHelper(gpuDynInst);
|
||||
} // execute
|
||||
|
||||
void
|
||||
Inst_FLAT__FLAT_STORE_SHORT_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
initMemWrite<VecElemU16>(gpuDynInst);
|
||||
} // initiateAcc
|
||||
|
||||
void
|
||||
Inst_FLAT__FLAT_STORE_SHORT_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
} // completeAcc
|
||||
// --- Inst_FLAT__FLAT_STORE_DWORD class methods ---
|
||||
|
||||
Inst_FLAT__FLAT_STORE_DWORD::Inst_FLAT__FLAT_STORE_DWORD(InFmt_FLAT *iFmt)
|
||||
@@ -45995,6 +46089,191 @@ namespace VegaISA
|
||||
{
|
||||
atomicComplete<VecOperandF64, VecElemF64>(gpuDynInst);
|
||||
} // completeAcc
|
||||
// --- Inst_VOP3P__V_PK_FMA_F32 class methods ---
|
||||
|
||||
Inst_VOP3P__V_PK_FMA_F32::Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P *iFmt)
|
||||
: Inst_VOP3P(iFmt, "v_pk_fma_f32")
|
||||
{
|
||||
setFlag(ALU);
|
||||
} // Inst_VOP3P__V_PK_FMA_F32
|
||||
|
||||
Inst_VOP3P__V_PK_FMA_F32::~Inst_VOP3P__V_PK_FMA_F32()
|
||||
{
|
||||
} // ~Inst_VOP3P__V_PK_FMA_F32
|
||||
|
||||
// D.f[63:32] = S0.f[63:32] * S1.f[63:32] + S2.f[63:32] . D.f[31:0] =
|
||||
// S0.f[31:0] * S1.f[31:0] + S2.f[31:0] .
|
||||
void
|
||||
Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
// This is a special case of packed instructions which operates on
|
||||
// 64-bit inputs/outputs and not 32-bit. U64 is used here as float
|
||||
// values cannot use bitwise operations. Consider the U64 to imply
|
||||
// untyped 64-bits of data.
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
|
||||
ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
|
||||
ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
|
||||
VecOperandU64 vdst(gpuDynInst, instData.VDST);
|
||||
|
||||
src0.readSrc();
|
||||
src1.readSrc();
|
||||
src2.readSrc();
|
||||
|
||||
int opsel = instData.OPSEL;
|
||||
int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2);
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (wf->execMask(lane)) {
|
||||
uint32_t s0l = (opsel & 1) ? bits(src0[lane], 63, 32)
|
||||
: bits(src0[lane], 31, 0);
|
||||
uint32_t s1l = (opsel & 2) ? bits(src1[lane], 63, 32)
|
||||
: bits(src1[lane], 31, 0);
|
||||
uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32)
|
||||
: bits(src2[lane], 31, 0);
|
||||
|
||||
float dword1 = std::fma(*reinterpret_cast<float*>(&s0l),
|
||||
*reinterpret_cast<float*>(&s1l),
|
||||
*reinterpret_cast<float*>(&s2l));
|
||||
|
||||
uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
|
||||
: bits(src0[lane], 31, 0);
|
||||
uint32_t s1h = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
|
||||
: bits(src1[lane], 31, 0);
|
||||
uint32_t s2h = (opsel_hi & 4) ? bits(src2[lane], 63, 32)
|
||||
: bits(src2[lane], 31, 0);
|
||||
|
||||
float dword2 = std::fma(*reinterpret_cast<float*>(&s0h),
|
||||
*reinterpret_cast<float*>(&s1h),
|
||||
*reinterpret_cast<float*>(&s2h));
|
||||
|
||||
uint64_t result1 = *reinterpret_cast<uint64_t*>(&dword1);
|
||||
uint64_t result2 = *reinterpret_cast<uint64_t*>(&dword2);
|
||||
|
||||
vdst[lane] = (result2 << 32) | result1;
|
||||
}
|
||||
}
|
||||
|
||||
vdst.write();
|
||||
} // execute
|
||||
// --- Inst_VOP3P__V_PK_MUL_F32 class methods ---
|
||||
|
||||
Inst_VOP3P__V_PK_MUL_F32::Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P *iFmt)
|
||||
: Inst_VOP3P(iFmt, "v_pk_mul_f32")
|
||||
{
|
||||
setFlag(ALU);
|
||||
} // Inst_VOP3P__V_PK_MUL_F32
|
||||
|
||||
Inst_VOP3P__V_PK_MUL_F32::~Inst_VOP3P__V_PK_MUL_F32()
|
||||
{
|
||||
} // ~Inst_VOP3P__V_PK_MUL_F32
|
||||
|
||||
// D.f[63:32] = S0.f[63:32] * S1.f[63:32] . D.f[31:0] = S0.f[31:0] *
|
||||
// S1.f[31:0]
|
||||
void
|
||||
Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
// This is a special case of packed instructions which operates on
|
||||
// 64-bit inputs/outputs and not 32-bit. U64 is used here as float
|
||||
// values cannot use bitwise operations. Consider the U64 to imply
|
||||
// untyped 64-bits of data.
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
|
||||
ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
|
||||
VecOperandU64 vdst(gpuDynInst, instData.VDST);
|
||||
|
||||
src0.readSrc();
|
||||
src1.readSrc();
|
||||
|
||||
int opsel = instData.OPSEL;
|
||||
int opsel_hi = extData.OPSEL_HI;
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (wf->execMask(lane)) {
|
||||
uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
|
||||
: bits(src0[lane], 31, 0);
|
||||
uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
|
||||
: bits(src1[lane], 31, 0);
|
||||
|
||||
float dword1 = *reinterpret_cast<float*>(&lower_dword)
|
||||
* *reinterpret_cast<float*>(&upper_dword);
|
||||
|
||||
lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
|
||||
: bits(src0[lane], 31, 0);
|
||||
upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
|
||||
: bits(src1[lane], 31, 0);
|
||||
|
||||
float dword2 = *reinterpret_cast<float*>(&lower_dword)
|
||||
* *reinterpret_cast<float*>(&upper_dword);
|
||||
|
||||
uint64_t result1 = *reinterpret_cast<uint64_t*>(&dword1);
|
||||
uint64_t result2 = *reinterpret_cast<uint64_t*>(&dword2);
|
||||
|
||||
vdst[lane] = (result2 << 32) | result1;
|
||||
}
|
||||
}
|
||||
|
||||
vdst.write();
|
||||
} // execute
|
||||
// --- Inst_VOP3P__V_PK_ADD_F32 class methods ---
|
||||
|
||||
Inst_VOP3P__V_PK_ADD_F32::Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P *iFmt)
|
||||
: Inst_VOP3P(iFmt, "v_pk_add_f32")
|
||||
{
|
||||
setFlag(ALU);
|
||||
} // Inst_VOP3P__V_PK_ADD_F32
|
||||
|
||||
Inst_VOP3P__V_PK_ADD_F32::~Inst_VOP3P__V_PK_ADD_F32()
|
||||
{
|
||||
} // ~Inst_VOP3P__V_PK_ADD_F32
|
||||
|
||||
// D.f[63:32] = S0.f[63:32] + S1.f[63:32] . D.f[31:0] = S0.f[31:0] +
|
||||
// S1.f[31:0]
|
||||
void
|
||||
Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
// This is a special case of packed instructions which operates on
|
||||
// 64-bit inputs/outputs and not 32-bit. U64 is used here as float
|
||||
// values cannot use bitwise operations. Consider the U64 to imply
|
||||
// untyped 64-bits of data.
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
|
||||
ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
|
||||
VecOperandU64 vdst(gpuDynInst, instData.VDST);
|
||||
|
||||
src0.readSrc();
|
||||
src1.readSrc();
|
||||
|
||||
int opsel = instData.OPSEL;
|
||||
int opsel_hi = extData.OPSEL_HI;
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (wf->execMask(lane)) {
|
||||
uint32_t lower_dword = (opsel & 1) ? bits(src0[lane], 63, 32)
|
||||
: bits(src0[lane], 31, 0);
|
||||
uint32_t upper_dword = (opsel & 2) ? bits(src1[lane], 63, 32)
|
||||
: bits(src1[lane], 31, 0);
|
||||
|
||||
float dword1 = *reinterpret_cast<float*>(&lower_dword)
|
||||
+ *reinterpret_cast<float*>(&upper_dword);
|
||||
|
||||
lower_dword = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
|
||||
: bits(src0[lane], 31, 0);
|
||||
upper_dword = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
|
||||
: bits(src1[lane], 31, 0);
|
||||
|
||||
float dword2 = *reinterpret_cast<float*>(&lower_dword)
|
||||
+ *reinterpret_cast<float*>(&upper_dword);
|
||||
|
||||
uint64_t result1 = *reinterpret_cast<uint64_t*>(&dword1);
|
||||
uint64_t result2 = *reinterpret_cast<uint64_t*>(&dword2);
|
||||
|
||||
vdst[lane] = (result2 << 32) | result1;
|
||||
}
|
||||
}
|
||||
|
||||
vdst.write();
|
||||
} // execute
|
||||
// --- Inst_VOP3P__V_PK_MOV_B32 class methods ---
|
||||
|
||||
Inst_VOP3P__V_PK_MOV_B32::Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *iFmt)
|
||||
|
||||
@@ -8098,6 +8098,40 @@ namespace VegaISA
|
||||
void execute(GPUDynInstPtr) override;
|
||||
}; // Inst_VOP2__V_SUBREV_U32
|
||||
|
||||
class Inst_VOP2__V_FMAC_F32 : public Inst_VOP2
|
||||
{
|
||||
public:
|
||||
Inst_VOP2__V_FMAC_F32(InFmt_VOP2*);
|
||||
~Inst_VOP2__V_FMAC_F32();
|
||||
|
||||
int
|
||||
getNumOperands() override
|
||||
{
|
||||
return numDstRegOperands() + numSrcRegOperands();
|
||||
} // getNumOperands
|
||||
|
||||
int numDstRegOperands() override { return 1; }
|
||||
int numSrcRegOperands() override { return 2; }
|
||||
|
||||
int
|
||||
getOperandSize(int opIdx) override
|
||||
{
|
||||
switch (opIdx) {
|
||||
case 0: //src_0
|
||||
return 4;
|
||||
case 1: //src_1
|
||||
return 4;
|
||||
case 2: //vdst
|
||||
return 4;
|
||||
default:
|
||||
fatal("op idx %i out of bounds\n", opIdx);
|
||||
return -1;
|
||||
}
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
}; // Inst_VOP2__V_FMAC_F32
|
||||
|
||||
class Inst_VOP1__V_NOP : public Inst_VOP1
|
||||
{
|
||||
public:
|
||||
@@ -42280,6 +42314,43 @@ namespace VegaISA
|
||||
void completeAcc(GPUDynInstPtr) override;
|
||||
}; // Inst_FLAT__FLAT_STORE_SHORT
|
||||
|
||||
class Inst_FLAT__FLAT_STORE_SHORT_D16_HI : public Inst_FLAT
|
||||
{
|
||||
public:
|
||||
Inst_FLAT__FLAT_STORE_SHORT_D16_HI(InFmt_FLAT*);
|
||||
~Inst_FLAT__FLAT_STORE_SHORT_D16_HI();
|
||||
|
||||
int
|
||||
getNumOperands() override
|
||||
{
|
||||
return numDstRegOperands() + numSrcRegOperands();
|
||||
} // getNumOperands
|
||||
|
||||
int numDstRegOperands() override { return 0; }
|
||||
int numSrcRegOperands() override { return isFlat() ? 2 : 3; }
|
||||
|
||||
int
|
||||
getOperandSize(int opIdx) override
|
||||
{
|
||||
switch (opIdx) {
|
||||
case 0: //vgpr_addr
|
||||
return vgprIsOffset() ? 4 : 8;
|
||||
case 1: //vgpr_src
|
||||
return 2;
|
||||
case 2: //saddr
|
||||
assert(!isFlat());
|
||||
return 8;
|
||||
default:
|
||||
fatal("op idx %i out of bounds\n", opIdx);
|
||||
return -1;
|
||||
}
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
void initiateAcc(GPUDynInstPtr) override;
|
||||
void completeAcc(GPUDynInstPtr) override;
|
||||
}; // Inst_FLAT__FLAT_STORE_SHORT_D16_HI
|
||||
|
||||
class Inst_FLAT__FLAT_STORE_DWORD : public Inst_FLAT
|
||||
{
|
||||
public:
|
||||
@@ -43637,6 +43708,110 @@ namespace VegaISA
|
||||
void completeAcc(GPUDynInstPtr) override;
|
||||
}; // Inst_FLAT__FLAT_ATOMIC_MAX_F64
|
||||
|
||||
class Inst_VOP3P__V_PK_FMA_F32 : public Inst_VOP3P
|
||||
{
|
||||
public:
|
||||
Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P*);
|
||||
~Inst_VOP3P__V_PK_FMA_F32();
|
||||
|
||||
int
|
||||
getNumOperands() override
|
||||
{
|
||||
return numDstRegOperands() + numSrcRegOperands();
|
||||
} // getNumOperands
|
||||
|
||||
int numDstRegOperands() override { return 1; }
|
||||
int numSrcRegOperands() override { return 3; }
|
||||
|
||||
int
|
||||
getOperandSize(int opIdx) override
|
||||
{
|
||||
switch (opIdx) {
|
||||
case 0: // src0
|
||||
return 8;
|
||||
case 1: // src1
|
||||
return 8;
|
||||
case 2: // src2
|
||||
return 8;
|
||||
case 3: // dst
|
||||
return 8;
|
||||
default:
|
||||
fatal("op idx %i out of bounds\n", opIdx);
|
||||
return -1;
|
||||
}
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
}; // Inst_VOP3P__V_PK_FMA_F32
|
||||
|
||||
class Inst_VOP3P__V_PK_MUL_F32 : public Inst_VOP3P
|
||||
{
|
||||
public:
|
||||
Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P*);
|
||||
~Inst_VOP3P__V_PK_MUL_F32();
|
||||
|
||||
int
|
||||
getNumOperands() override
|
||||
{
|
||||
return numDstRegOperands() + numSrcRegOperands();
|
||||
} // getNumOperands
|
||||
|
||||
int numDstRegOperands() override { return 1; }
|
||||
int numSrcRegOperands() override { return 2; }
|
||||
|
||||
int
|
||||
getOperandSize(int opIdx) override
|
||||
{
|
||||
switch (opIdx) {
|
||||
case 0: // src0
|
||||
return 8;
|
||||
case 1: // src1
|
||||
return 8;
|
||||
case 2: // dst
|
||||
return 8;
|
||||
default:
|
||||
fatal("op idx %i out of bounds\n", opIdx);
|
||||
return -1;
|
||||
}
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
}; // Inst_VOP3P__V_PK_MUL_F32
|
||||
|
||||
class Inst_VOP3P__V_PK_ADD_F32 : public Inst_VOP3P
|
||||
{
|
||||
public:
|
||||
Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P*);
|
||||
~Inst_VOP3P__V_PK_ADD_F32();
|
||||
|
||||
int
|
||||
getNumOperands() override
|
||||
{
|
||||
return numDstRegOperands() + numSrcRegOperands();
|
||||
} // getNumOperands
|
||||
|
||||
int numDstRegOperands() override { return 1; }
|
||||
int numSrcRegOperands() override { return 2; }
|
||||
|
||||
int
|
||||
getOperandSize(int opIdx) override
|
||||
{
|
||||
switch (opIdx) {
|
||||
case 0: // src0
|
||||
return 8;
|
||||
case 1: // src1
|
||||
return 8;
|
||||
case 2: // dst
|
||||
return 8;
|
||||
default:
|
||||
fatal("op idx %i out of bounds\n", opIdx);
|
||||
return -1;
|
||||
}
|
||||
} // getOperandSize
|
||||
|
||||
void execute(GPUDynInstPtr) override;
|
||||
}; // Inst_VOP3P__V_PK_ADD_F32
|
||||
|
||||
class Inst_VOP3P__V_PK_MOV_B32 : public Inst_VOP3P
|
||||
{
|
||||
public:
|
||||
|
||||
Reference in New Issue
Block a user