arch-vega: Implement FP32 packed math

Starting with MI200, packed math can operate on double-dword inputs. In
this case, 64 bits of input (two VGPRs per lane) contain two FP32
values.

Add instructions to perform add, multiply, and FMA on packed FP32 types.

Change-Id: Ib838bff91a10e02e013cc7c33ec3d91ff08647b0
This commit is contained in:
Matthew Poremba
2023-12-08 12:12:16 -06:00
parent 7b0c47d52f
commit 420cda1bef
4 changed files with 482 additions and 11 deletions

View File

@@ -3627,9 +3627,9 @@ namespace VegaISA
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_OP_VOP3P__V_PK_FMA_F32,
&Decoder::decode_OP_VOP3P__V_PK_MUL_F32,
&Decoder::decode_OP_VOP3P__V_PK_ADD_F32,
&Decoder::decode_OP_VOP3P__V_PK_MOV_B32,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
@@ -4203,8 +4203,7 @@ namespace VegaISA
// Decode a VOP2 V_FMAC_F32 encoding. Returns a newly allocated
// instruction object built from the VOP2 view of the machine instruction.
GPUStaticInst*
Decoder::decode_OP_VOP2__V_FMAC_F32(MachInst iFmt)
{
    // The instruction class is implemented, so construct it directly; a
    // leading fatal()/return nullptr would make this return unreachable.
    return new Inst_VOP2__V_FMAC_F32(&iFmt->iFmt_VOP2);
}
GPUStaticInst*
@@ -8293,8 +8292,7 @@ namespace VegaISA
// Decode a FLAT FLAT_STORE_SHORT_D16_HI encoding. Returns a newly
// allocated instruction object built from the FLAT view of the machine
// instruction.
GPUStaticInst*
Decoder::decode_OP_FLAT__FLAT_STORE_SHORT_D16_HI(MachInst iFmt)
{
    // The instruction class is implemented, so construct it directly; a
    // leading fatal()/return nullptr would make this return unreachable.
    return new Inst_FLAT__FLAT_STORE_SHORT_D16_HI(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
@@ -8607,8 +8605,7 @@ namespace VegaISA
// Decode a GLOBAL GLOBAL_STORE_SHORT_D16_HI encoding. The GLOBAL opcode
// is served by the FLAT instruction class, constructed from the FLAT view
// of the machine instruction.
GPUStaticInst*
Decoder::decode_OP_GLOBAL__GLOBAL_STORE_SHORT_D16_HI(MachInst iFmt)
{
    // The instruction class is implemented, so construct it directly; a
    // leading fatal()/return nullptr would make this return unreachable.
    return new Inst_FLAT__FLAT_STORE_SHORT_D16_HI(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
@@ -9968,8 +9965,7 @@ namespace VegaISA
// Decode a SCRATCH SCRATCH_STORE_SHORT_D16_HI encoding. The SCRATCH opcode
// is served by the FLAT instruction class, constructed from the FLAT view
// of the machine instruction.
GPUStaticInst*
Decoder::decode_OP_SCRATCH__SCRATCH_STORE_SHORT_D16_HI(MachInst iFmt)
{
    // The instruction class is implemented, so construct it directly; a
    // leading fatal()/return nullptr would make this return unreachable.
    return new Inst_FLAT__FLAT_STORE_SHORT_D16_HI(&iFmt->iFmt_FLAT);
}
GPUStaticInst*
@@ -13105,6 +13101,24 @@ namespace VegaISA
return nullptr;
}
// Decode a VOP3P V_PK_FMA_F32 encoding into a newly allocated
// instruction object.
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_PK_FMA_F32(MachInst iFmt)
{
    // View the raw machine instruction through its VOP3P layout.
    auto *vop3pFmt = &iFmt->iFmt_VOP3P;

    return new Inst_VOP3P__V_PK_FMA_F32(vop3pFmt);
}
// Decode a VOP3P V_PK_MUL_F32 encoding into a newly allocated
// instruction object.
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_PK_MUL_F32(MachInst iFmt)
{
    // View the raw machine instruction through its VOP3P layout.
    auto *vop3pFmt = &iFmt->iFmt_VOP3P;

    return new Inst_VOP3P__V_PK_MUL_F32(vop3pFmt);
}
// Decode a VOP3P V_PK_ADD_F32 encoding into a newly allocated
// instruction object.
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_PK_ADD_F32(MachInst iFmt)
{
    // View the raw machine instruction through its VOP3P layout.
    auto *vop3pFmt = &iFmt->iFmt_VOP3P;

    return new Inst_VOP3P__V_PK_ADD_F32(vop3pFmt);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_PK_MOV_B32(MachInst iFmt)
{

View File

@@ -1593,6 +1593,9 @@ namespace VegaISA
GPUStaticInst* decode_OP_VOP3P__V_MAD_MIX_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXLO_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MAD_MIXHI_F16(MachInst);
// Decoders for the packed FP32 VOP3P opcodes (fused multiply-add,
// multiply, add). Each returns the instruction object for its opcode.
GPUStaticInst* decode_OP_VOP3P__V_PK_FMA_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_PK_MUL_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_PK_ADD_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_PK_MOV_B32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X16I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst);

View File

@@ -8129,6 +8129,40 @@ namespace VegaISA
vdst.write();
} // execute
// --- Inst_VOP2__V_FMAC_F32 class methods ---

// Registers the instruction under the mnemonic "v_fmac_f32" and tags it
// as an ALU operation.
Inst_VOP2__V_FMAC_F32::Inst_VOP2__V_FMAC_F32(InFmt_VOP2 *iFmt)
    : Inst_VOP2(iFmt, "v_fmac_f32")
{
    setFlag(ALU);
} // Inst_VOP2__V_FMAC_F32

// Nothing to release beyond what the base class handles.
Inst_VOP2__V_FMAC_F32::~Inst_VOP2__V_FMAC_F32() = default;
// ~Inst_VOP2__V_FMAC_F32
// D.f = S0.f * S1.f + D.f (fused multiply-accumulate into the destination).
//
// For every lane enabled in the wavefront's exec mask, multiplies src0 by
// src1, adds the current destination value with a single rounding
// (std::fma), and writes the result back to the destination VGPR.
void
Inst_VOP2__V_FMAC_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();

    // The operands must be typed as FP32. With U32 operands std::fma
    // would pick its integer-promoting overload and perform the
    // multiply-add on the raw bit patterns treated as integers.
    ConstVecOperandF32 src0(gpuDynInst, instData.SRC0);
    ConstVecOperandF32 src1(gpuDynInst, instData.VSRC1);
    VecOperandF32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.read();
    // The destination doubles as the accumulator input, so it has to be
    // read before it is overwritten.
    vdst.read();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            vdst[lane] = std::fma(src0[lane], src1[lane], vdst[lane]);
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_NOP class methods ---
Inst_VOP1__V_NOP::Inst_VOP1__V_NOP(InFmt_VOP1 *iFmt)
@@ -44497,6 +44531,66 @@ namespace VegaISA
Inst_FLAT__FLAT_STORE_SHORT::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_FLAT__FLAT_STORE_SHORT_D16_HI class methods ---

// Registers the instruction under the mnemonic "flat_store_short_d16_hi"
// and tags it as a memory-referencing store.
Inst_FLAT__FLAT_STORE_SHORT_D16_HI::
Inst_FLAT__FLAT_STORE_SHORT_D16_HI(InFmt_FLAT *iFmt)
    : Inst_FLAT(iFmt, "flat_store_short_d16_hi")
{
    setFlag(MemoryRef);
    setFlag(Store);
} // Inst_FLAT__FLAT_STORE_SHORT_D16_HI

// Nothing to release beyond what the base class handles.
Inst_FLAT__FLAT_STORE_SHORT_D16_HI::~Inst_FLAT__FLAT_STORE_SHORT_D16_HI()
    = default;
// ~Inst_FLAT__FLAT_STORE_SHORT_D16_HI
// Store short, high half: for every active lane, bits [31:16] of the
// 32-bit data register are staged as a 16-bit element in d_data and a
// memory request is issued.
void
Inst_FLAT__FLAT_STORE_SHORT_D16_HI::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();

    // No lane is active: undo the issue-time counter increments and
    // finish without touching memory.
    if (gpuDynInst->exec_mask.none()) {
        wf->decVMemInstsIssued();
        if (isFlat()) {
            wf->decLGKMInstsIssued();
        }
        wf->decExpInstsIssued();
        return;
    }

    gpuDynInst->execUnitId = wf->execUnitId;
    gpuDynInst->latency.init(gpuDynInst->computeUnit());
    gpuDynInst->latency.set(gpuDynInst->computeUnit()->clockPeriod());

    ConstVecOperandU32 data(gpuDynInst, extData.DATA);
    data.read();

    calcAddr(gpuDynInst, extData.ADDR, extData.SADDR, instData.OFFSET);

    // d_data is addressed as an array of 16-bit elements, one per lane.
    VecElemU16 *staged =
        reinterpret_cast<VecElemU16*>(gpuDynInst->d_data);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (!gpuDynInst->exec_mask[lane]) {
            continue;
        }
        // Keep only the upper half of the source dword.
        staged[lane] = (data[lane] >> 16);
    }

    issueRequestHelper(gpuDynInst);
} // execute
// Start the memory write for this store, using 16-bit elements.
void
Inst_FLAT__FLAT_STORE_SHORT_D16_HI::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    initMemWrite<VecElemU16>(gpuDynInst);
} // initiateAcc
// Intentionally empty: this store does nothing on completion.
void
Inst_FLAT__FLAT_STORE_SHORT_D16_HI::completeAcc(GPUDynInstPtr gpuDynInst)
{
} // completeAcc
// --- Inst_FLAT__FLAT_STORE_DWORD class methods ---
Inst_FLAT__FLAT_STORE_DWORD::Inst_FLAT__FLAT_STORE_DWORD(InFmt_FLAT *iFmt)
@@ -45995,6 +46089,191 @@ namespace VegaISA
{
atomicComplete<VecOperandF64, VecElemF64>(gpuDynInst);
} // completeAcc
// --- Inst_VOP3P__V_PK_FMA_F32 class methods ---

// Registers the instruction under the mnemonic "v_pk_fma_f32" and tags
// it as an ALU operation.
Inst_VOP3P__V_PK_FMA_F32::Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P *iFmt)
    : Inst_VOP3P(iFmt, "v_pk_fma_f32")
{
    setFlag(ALU);
} // Inst_VOP3P__V_PK_FMA_F32

// Nothing to release beyond what the base class handles.
Inst_VOP3P__V_PK_FMA_F32::~Inst_VOP3P__V_PK_FMA_F32() = default;
// ~Inst_VOP3P__V_PK_FMA_F32
// D.f[63:32] = S0.f[63:32] * S1.f[63:32] + S2.f[63:32] . D.f[31:0] =
// S0.f[31:0] * S1.f[31:0] + S2.f[31:0] .
//
// Packed FP32 fused multiply-add: each 64-bit operand holds two FP32
// values. For every active lane two FMAs are computed and packed into
// the 64-bit destination (first result in bits [31:0], second in
// bits [63:32]).
void
Inst_VOP3P__V_PK_FMA_F32::execute(GPUDynInstPtr gpuDynInst)
{
    // This is a special case of packed instructions which operates on
    // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
    // values cannot use bitwise operations. Consider the U64 to imply
    // untyped 64-bits of data.
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
    ConstVecOperandU64 src2(gpuDynInst, extData.SRC2);
    VecOperandU64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();
    src2.readSrc();

    // Bit i of opsel picks which dword of source i feeds the low result;
    // bit i of opsel_hi does the same for the high result. The third
    // opsel_hi bit lives in a separate encoding field (OPSEL_HI2).
    int opsel = instData.OPSEL;
    int opsel_hi = extData.OPSEL_HI | (instData.OPSEL_HI2 << 2);

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // Low result.
            uint32_t s0l = (opsel & 1) ? bits(src0[lane], 63, 32)
                                       : bits(src0[lane], 31, 0);
            uint32_t s1l = (opsel & 2) ? bits(src1[lane], 63, 32)
                                       : bits(src1[lane], 31, 0);
            uint32_t s2l = (opsel & 4) ? bits(src2[lane], 63, 32)
                                       : bits(src2[lane], 31, 0);

            float dword1 = std::fma(*reinterpret_cast<float*>(&s0l),
                                    *reinterpret_cast<float*>(&s1l),
                                    *reinterpret_cast<float*>(&s2l));

            // High result.
            uint32_t s0h = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
                                          : bits(src0[lane], 31, 0);
            uint32_t s1h = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
                                          : bits(src1[lane], 31, 0);
            uint32_t s2h = (opsel_hi & 4) ? bits(src2[lane], 63, 32)
                                          : bits(src2[lane], 31, 0);

            float dword2 = std::fma(*reinterpret_cast<float*>(&s0h),
                                    *reinterpret_cast<float*>(&s1h),
                                    *reinterpret_cast<float*>(&s2h));

            // Take exactly the 32 bits of each float. Reading them
            // through a uint64_t pointer would run 4 bytes past the
            // float and leak stray upper bits into the high dword.
            uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
            uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);

            vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3P__V_PK_MUL_F32 class methods ---

// Registers the instruction under the mnemonic "v_pk_mul_f32" and tags
// it as an ALU operation.
Inst_VOP3P__V_PK_MUL_F32::Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P *iFmt)
    : Inst_VOP3P(iFmt, "v_pk_mul_f32")
{
    setFlag(ALU);
} // Inst_VOP3P__V_PK_MUL_F32

// Nothing to release beyond what the base class handles.
Inst_VOP3P__V_PK_MUL_F32::~Inst_VOP3P__V_PK_MUL_F32() = default;
// ~Inst_VOP3P__V_PK_MUL_F32
// D.f[63:32] = S0.f[63:32] * S1.f[63:32] . D.f[31:0] = S0.f[31:0] *
// S1.f[31:0]
//
// Packed FP32 multiply: each 64-bit operand holds two FP32 values. For
// every active lane two products are computed and packed into the 64-bit
// destination (first result in bits [31:0], second in bits [63:32]).
void
Inst_VOP3P__V_PK_MUL_F32::execute(GPUDynInstPtr gpuDynInst)
{
    // This is a special case of packed instructions which operates on
    // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
    // values cannot use bitwise operations. Consider the U64 to imply
    // untyped 64-bits of data.
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
    VecOperandU64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Bit i of opsel picks which dword of source i feeds the low result;
    // bit i of opsel_hi does the same for the high result.
    int opsel = instData.OPSEL;
    int opsel_hi = extData.OPSEL_HI;

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // Low result.
            uint32_t s0 = (opsel & 1) ? bits(src0[lane], 63, 32)
                                      : bits(src0[lane], 31, 0);
            uint32_t s1 = (opsel & 2) ? bits(src1[lane], 63, 32)
                                      : bits(src1[lane], 31, 0);

            float dword1 = *reinterpret_cast<float*>(&s0)
                         * *reinterpret_cast<float*>(&s1);

            // High result.
            s0 = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
                                : bits(src0[lane], 31, 0);
            s1 = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
                                : bits(src1[lane], 31, 0);

            float dword2 = *reinterpret_cast<float*>(&s0)
                         * *reinterpret_cast<float*>(&s1);

            // Take exactly the 32 bits of each float. Reading them
            // through a uint64_t pointer would run 4 bytes past the
            // float and leak stray upper bits into the high dword.
            uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
            uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);

            vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3P__V_PK_ADD_F32 class methods ---

// Registers the instruction under the mnemonic "v_pk_add_f32" and tags
// it as an ALU operation.
Inst_VOP3P__V_PK_ADD_F32::Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P *iFmt)
    : Inst_VOP3P(iFmt, "v_pk_add_f32")
{
    setFlag(ALU);
} // Inst_VOP3P__V_PK_ADD_F32

// Nothing to release beyond what the base class handles.
Inst_VOP3P__V_PK_ADD_F32::~Inst_VOP3P__V_PK_ADD_F32() = default;
// ~Inst_VOP3P__V_PK_ADD_F32
// D.f[63:32] = S0.f[63:32] + S1.f[63:32] . D.f[31:0] = S0.f[31:0] +
// S1.f[31:0]
//
// Packed FP32 add: each 64-bit operand holds two FP32 values. For every
// active lane two sums are computed and packed into the 64-bit
// destination (first result in bits [31:0], second in bits [63:32]).
void
Inst_VOP3P__V_PK_ADD_F32::execute(GPUDynInstPtr gpuDynInst)
{
    // This is a special case of packed instructions which operates on
    // 64-bit inputs/outputs and not 32-bit. U64 is used here as float
    // values cannot use bitwise operations. Consider the U64 to imply
    // untyped 64-bits of data.
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU64 src0(gpuDynInst, extData.SRC0);
    ConstVecOperandU64 src1(gpuDynInst, extData.SRC1);
    VecOperandU64 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    src1.readSrc();

    // Bit i of opsel picks which dword of source i feeds the low result;
    // bit i of opsel_hi does the same for the high result.
    int opsel = instData.OPSEL;
    int opsel_hi = extData.OPSEL_HI;

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (wf->execMask(lane)) {
            // Low result.
            uint32_t s0 = (opsel & 1) ? bits(src0[lane], 63, 32)
                                      : bits(src0[lane], 31, 0);
            uint32_t s1 = (opsel & 2) ? bits(src1[lane], 63, 32)
                                      : bits(src1[lane], 31, 0);

            float dword1 = *reinterpret_cast<float*>(&s0)
                         + *reinterpret_cast<float*>(&s1);

            // High result.
            s0 = (opsel_hi & 1) ? bits(src0[lane], 63, 32)
                                : bits(src0[lane], 31, 0);
            s1 = (opsel_hi & 2) ? bits(src1[lane], 63, 32)
                                : bits(src1[lane], 31, 0);

            float dword2 = *reinterpret_cast<float*>(&s0)
                         + *reinterpret_cast<float*>(&s1);

            // Take exactly the 32 bits of each float. Reading them
            // through a uint64_t pointer would run 4 bytes past the
            // float and leak stray upper bits into the high dword.
            uint32_t result1 = *reinterpret_cast<uint32_t*>(&dword1);
            uint32_t result2 = *reinterpret_cast<uint32_t*>(&dword2);

            vdst[lane] = (static_cast<uint64_t>(result2) << 32) | result1;
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3P__V_PK_MOV_B32 class methods ---
Inst_VOP3P__V_PK_MOV_B32::Inst_VOP3P__V_PK_MOV_B32(InFmt_VOP3P *iFmt)

View File

@@ -8098,6 +8098,40 @@ namespace VegaISA
void execute(GPUDynInstPtr) override;
}; // Inst_VOP2__V_SUBREV_U32
// Instruction class for the VOP2 V_FMAC_F32 opcode. Reports one
// destination and two source register operands, each 4 bytes wide.
class Inst_VOP2__V_FMAC_F32 : public Inst_VOP2
{
  public:
    Inst_VOP2__V_FMAC_F32(InFmt_VOP2*);
    ~Inst_VOP2__V_FMAC_F32();

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 2; }

    // Total operand count: sources plus destinations.
    int
    getNumOperands() override
    {
        return numSrcRegOperands() + numDstRegOperands();
    } // getNumOperands

    // Size in bytes of operand opIdx: 0 = src_0, 1 = src_1, 2 = vdst.
    // Any other index is a fatal error.
    int
    getOperandSize(int opIdx) override
    {
        if (opIdx >= 0 && opIdx <= 2) {
            return 4;
        }

        fatal("op idx %i out of bounds\n", opIdx);
        return -1;
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP2__V_FMAC_F32
class Inst_VOP1__V_NOP : public Inst_VOP1
{
public:
@@ -42280,6 +42314,43 @@ namespace VegaISA
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_STORE_SHORT
// Instruction class for FLAT_STORE_SHORT_D16_HI. It has no destination
// registers; the source count depends on whether the instruction is a
// true flat access (2) or not (3, adding saddr).
class Inst_FLAT__FLAT_STORE_SHORT_D16_HI : public Inst_FLAT
{
  public:
    Inst_FLAT__FLAT_STORE_SHORT_D16_HI(InFmt_FLAT*);
    ~Inst_FLAT__FLAT_STORE_SHORT_D16_HI();

    int numDstRegOperands() override { return 0; }
    int numSrcRegOperands() override { return isFlat() ? 2 : 3; }

    // Total operand count: sources plus destinations.
    int
    getNumOperands() override
    {
        return numSrcRegOperands() + numDstRegOperands();
    } // getNumOperands

    // Size in bytes of operand opIdx: 0 = vgpr_addr, 1 = vgpr_src,
    // 2 = saddr. Any other index is a fatal error.
    int
    getOperandSize(int opIdx) override
    {
        if (opIdx == 0) {
            // vgpr_addr: 4 bytes when used as an offset, else 8.
            return vgprIsOffset() ? 4 : 8;
        }
        if (opIdx == 1) {
            // vgpr_src
            return 2;
        }
        if (opIdx == 2) {
            // saddr only exists for non-flat forms.
            assert(!isFlat());
            return 8;
        }

        fatal("op idx %i out of bounds\n", opIdx);
        return -1;
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
    void initiateAcc(GPUDynInstPtr) override;
    void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_STORE_SHORT_D16_HI
class Inst_FLAT__FLAT_STORE_DWORD : public Inst_FLAT
{
public:
@@ -43637,6 +43708,110 @@ namespace VegaISA
void completeAcc(GPUDynInstPtr) override;
}; // Inst_FLAT__FLAT_ATOMIC_MAX_F64
// Instruction class for the VOP3P V_PK_FMA_F32 opcode. Reports one
// destination and three source register operands, each 8 bytes wide.
class Inst_VOP3P__V_PK_FMA_F32 : public Inst_VOP3P
{
  public:
    Inst_VOP3P__V_PK_FMA_F32(InFmt_VOP3P*);
    ~Inst_VOP3P__V_PK_FMA_F32();

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 3; }

    // Total operand count: sources plus destinations.
    int
    getNumOperands() override
    {
        return numSrcRegOperands() + numDstRegOperands();
    } // getNumOperands

    // Size in bytes of operand opIdx: 0 = src0, 1 = src1, 2 = src2,
    // 3 = dst. Any other index is a fatal error.
    int
    getOperandSize(int opIdx) override
    {
        if (opIdx >= 0 && opIdx <= 3) {
            return 8;
        }

        fatal("op idx %i out of bounds\n", opIdx);
        return -1;
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3P__V_PK_FMA_F32
// Instruction class for the VOP3P V_PK_MUL_F32 opcode. Reports one
// destination and two source register operands, each 8 bytes wide.
class Inst_VOP3P__V_PK_MUL_F32 : public Inst_VOP3P
{
  public:
    Inst_VOP3P__V_PK_MUL_F32(InFmt_VOP3P*);
    ~Inst_VOP3P__V_PK_MUL_F32();

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 2; }

    // Total operand count: sources plus destinations.
    int
    getNumOperands() override
    {
        return numSrcRegOperands() + numDstRegOperands();
    } // getNumOperands

    // Size in bytes of operand opIdx: 0 = src0, 1 = src1, 2 = dst.
    // Any other index is a fatal error.
    int
    getOperandSize(int opIdx) override
    {
        if (opIdx >= 0 && opIdx <= 2) {
            return 8;
        }

        fatal("op idx %i out of bounds\n", opIdx);
        return -1;
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3P__V_PK_MUL_F32
// Instruction class for the VOP3P V_PK_ADD_F32 opcode. Reports one
// destination and two source register operands, each 8 bytes wide.
class Inst_VOP3P__V_PK_ADD_F32 : public Inst_VOP3P
{
  public:
    Inst_VOP3P__V_PK_ADD_F32(InFmt_VOP3P*);
    ~Inst_VOP3P__V_PK_ADD_F32();

    int numDstRegOperands() override { return 1; }
    int numSrcRegOperands() override { return 2; }

    // Total operand count: sources plus destinations.
    int
    getNumOperands() override
    {
        return numSrcRegOperands() + numDstRegOperands();
    } // getNumOperands

    // Size in bytes of operand opIdx: 0 = src0, 1 = src1, 2 = dst.
    // Any other index is a fatal error.
    int
    getOperandSize(int opIdx) override
    {
        if (opIdx >= 0 && opIdx <= 2) {
            return 8;
        }

        fatal("op idx %i out of bounds\n", opIdx);
        return -1;
    } // getOperandSize

    void execute(GPUDynInstPtr) override;
}; // Inst_VOP3P__V_PK_ADD_F32
class Inst_VOP3P__V_PK_MOV_B32 : public Inst_VOP3P
{
public: