arch-vega: Add vop3p DOT instructions

Implemented according to the ISA spec. Validated against silicon. In
particular, the sign extend is important for the signed variants, and the
unsigned variants seem to overflow across lanes (hence why there is no
mask() in the unsigned variants). FP16 -> FP32 continues using ARM's fplib.

Tested vs. an MI210. Clamp has not been verified.

Change-Id: Ifc09aecbc1ef2c92a5524a43ca529983018a6d59
This commit is contained in:
Matthew Poremba
2023-12-12 00:59:33 -06:00
parent a40f8f0efa
commit 31e63b01ad
5 changed files with 580 additions and 9 deletions

View File

@@ -3614,15 +3614,15 @@ namespace VegaISA
&Decoder::decode_OP_VOP3P__V_MAD_MIX_F32,
&Decoder::decode_OP_VOP3P__V_MAD_MIXLO_F16,
&Decoder::decode_OP_VOP3P__V_MAD_MIXHI_F16,
&Decoder::decode_OP_VOP3P__V_DOT2_F32_F16,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_OP_VOP3P__V_DOT2_I32_I16,
&Decoder::decode_OP_VOP3P__V_DOT2_U32_U16,
&Decoder::decode_OP_VOP3P__V_DOT4_I32_I8,
&Decoder::decode_OP_VOP3P__V_DOT4_U32_U8,
&Decoder::decode_OP_VOP3P__V_DOT8_I32_I4,
&Decoder::decode_OP_VOP3P__V_DOT8_U32_U4,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
@@ -3667,8 +3667,8 @@ namespace VegaISA
&Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X16I8,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_OP_VOP3P__V_ACCVGPR_READ,
&Decoder::decode_OP_VOP3P__V_ACCVGPR_WRITE,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
&Decoder::decode_invalid,
@@ -13125,6 +13125,48 @@ namespace VegaISA
return new Inst_VOP3P__V_PK_MOV_B32(&iFmt->iFmt_VOP3P);
}
// Decode entries for the VOP3P dot product instructions. Each simply
// constructs the corresponding static instruction from the VOP3P encoding.
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_DOT2_F32_F16(MachInst iFmt)
{
return new Inst_VOP3P__V_DOT2_F32_F16(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_DOT2_I32_I16(MachInst iFmt)
{
return new Inst_VOP3P__V_DOT2_I32_I16(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_DOT2_U32_U16(MachInst iFmt)
{
return new Inst_VOP3P__V_DOT2_U32_U16(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_DOT4_I32_I8(MachInst iFmt)
{
return new Inst_VOP3P__V_DOT4_I32_I8(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_DOT4_U32_U8(MachInst iFmt)
{
return new Inst_VOP3P__V_DOT4_U32_U8(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_DOT8_I32_I4(MachInst iFmt)
{
return new Inst_VOP3P__V_DOT8_I32_I4(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_DOT8_U32_U4(MachInst iFmt)
{
return new Inst_VOP3P__V_DOT8_U32_U4(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_MFMA_I32_16X16X16I8(MachInst iFmt)
{
@@ -13132,6 +13174,18 @@ namespace VegaISA
&iFmt->iFmt_VOP3P_MAI);
}
// Decode entries for the AccVGPR read/write instructions. These use the
// plain VOP3P format (not VOP3P_MAI) for construction.
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_ACCVGPR_READ(MachInst iFmt)
{
return new Inst_VOP3P__V_ACCVGPR_READ(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_ACCVGPR_WRITE(MachInst iFmt)
{
return new Inst_VOP3P__V_ACCVGPR_WRITE(&iFmt->iFmt_VOP3P);
}
GPUStaticInst*
Decoder::decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst iFmt)
{

View File

@@ -1597,7 +1597,16 @@ namespace VegaISA
GPUStaticInst* decode_OP_VOP3P__V_PK_MUL_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_PK_ADD_F32(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_PK_MOV_B32(MachInst);
// VOP3P dot product decoders (see the decoder table for opcode slots).
GPUStaticInst* decode_OP_VOP3P__V_DOT2_F32_F16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT2_I32_I16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT2_U32_U16(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT4_I32_I8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT4_U32_U8(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT8_I32_I4(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_DOT8_U32_U4(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_I32_16X16X16I8(MachInst);
// AccVGPR moves; gem5 models these as plain VGPR moves (no Acc file).
GPUStaticInst* decode_OP_VOP3P__V_ACCVGPR_READ(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_ACCVGPR_WRITE(MachInst);
GPUStaticInst* decode_OP_VOP3P__V_MFMA_F64_16X16X4F64(MachInst);
GPUStaticInst* subDecode_OPU_VOP3(MachInst);
GPUStaticInst* subDecode_OP_DS(MachInst);

View File

@@ -588,6 +588,56 @@ namespace VegaISA
D.write();
}
// Common lane loop for the VOP3P dot product instructions. Reads the three
// packed source operands, applies the OPSEL/OPSEL_HI half-word selection and
// the NEG modifiers, and hands the resulting dwords to the per-instruction
// arithmetic callback fOpImpl, whose result is written to VDST.
// NOTE(review): word<>() presumably selects the high/low 16-bit half and
// applies the negate bit — its definition is not visible here; confirm.
void
dotHelper(GPUDynInstPtr gpuDynInst,
uint32_t (*fOpImpl)(uint32_t, uint32_t, uint32_t, bool))
{
Wavefront *wf = gpuDynInst->wavefront();
ConstVecOperandU32 S0(gpuDynInst, extData.SRC0);
ConstVecOperandU32 S1(gpuDynInst, extData.SRC1);
ConstVecOperandU32 S2(gpuDynInst, extData.SRC2);
VecOperandU32 D(gpuDynInst, instData.VDST);
S0.readSrc();
S1.readSrc();
S2.readSrc();
// OPSEL[2] and OPSEL_HI2 are unused. Craft two dwords where:
// dword1[15:0] is upper/lower 16b of src0 based on opsel[0]
// dword1[31:16] is upper/lower 16b of src0 based on opsel_hi[0]
// dword2[15:0] is upper/lower 16b of src1 based on opsel[1]
// dword2[31:16] is upper/lower 16b of src1 based on opsel_hi[1]
int opLo = instData.OPSEL;
int opHi = extData.OPSEL_HI;
int negLo = extData.NEG;
int negHi = instData.NEG_HI;
bool clamp = instData.CLMP;
// Only lanes active in the exec mask are computed and written back.
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
if (wf->execMask(lane)) {
uint32_t dword1l =
word<uint16_t>(S0[lane], opLo, negLo, 0);
uint32_t dword1h =
word<uint16_t>(S0[lane], opHi, negHi, 0);
uint32_t dword2l =
word<uint16_t>(S1[lane], opLo, negLo, 1);
uint32_t dword2h =
word<uint16_t>(S1[lane], opHi, negHi, 1);
uint32_t dword1 = (dword1h << 16) | dword1l;
uint32_t dword2 = (dword2h << 16) | dword2l;
// Take in two uint32_t dwords and one src2 dword. The
// function will need to call bits to break up to the
// correct size and then reinterpret cast to the correct
// value.
D[lane] = fOpImpl(dword1, dword2, S2[lane], clamp);
}
}
D.write();
}
private:
bool hasSecondDword(InFmt_VOP3P *);

View File

@@ -42,6 +42,38 @@ namespace VegaISA
using half = uint16_t;
// Helper functions
template<int N>
int32_t
dotClampI(int32_t value, bool clamp)
{
    // Saturate a signed dot-product term to the range of an N-bit
    // two's-complement integer when the instruction's clamp bit is set;
    // otherwise pass the value through unchanged.
    static_assert(N < 32);  // shifting by N-1 below requires N < 32
    if (clamp) {
        constexpr int32_t lo = -(1 << (N - 1));
        constexpr int32_t hi = (1 << (N - 1)) - 1;
        return std::clamp<int32_t>(value, lo, hi);
    }
    return value;
}
template<int N>
uint32_t
dotClampU(uint32_t value, bool clamp)
{
    // Saturate an unsigned dot-product term to [0, 2^N - 1] when the
    // instruction's clamp bit is set; otherwise pass through unchanged.
    // Only valid for N < 32.
    static_assert(N < 32);
    if (!clamp) {
        // Return the value as-is; the previous int32_t cast here was a
        // misleading (though value-preserving) sign reinterpretation.
        return value;
    }
    uint32_t min = 0;
    // Use an unsigned literal so the shift never touches the sign bit.
    uint32_t max = (1u << N) - 1;
    // Clamp in the unsigned domain. The previous std::clamp<int32_t>
    // misinterpreted values above INT32_MAX as negative and incorrectly
    // clamped them to 0 instead of max.
    return std::clamp<uint32_t>(value, min, max);
}
int16_t
clampI16(int32_t value, bool clamp)
{
@@ -83,6 +115,16 @@ clampF16(uint16_t value, bool clamp)
return fplibMax(imm, zero, fpscr2);
}
float
clampF32(float value, bool clamp)
{
    // Saturate a single-precision result to [0.0, 1.0] when the
    // instruction's clamp bit is set; otherwise return it untouched.
    return clamp ? std::clamp(value, 0.0f, 1.0f) : value;
}
@@ -298,5 +340,296 @@ void Inst_VOP3P__V_PK_MAX_F16::execute(GPUDynInstPtr gpuDynInst)
vop3pHelper<half>(gpuDynInst, opImpl);
}
// D.f32 = S2.f32 + S0.f16[0]*S1.f16[0] + S0.f16[1]*S1.f16[1]
// FP16 arithmetic is done with ARM's fplib soft-float routines; each FP16
// product is widened to FP32 and accumulated in single precision.
void Inst_VOP3P__V_DOT2_F32_F16::execute(GPUDynInstPtr gpuDynInst)
{
auto opImpl =
[](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
{
constexpr unsigned INBITS = 16;
constexpr unsigned elems = 32 / INBITS;
// 'half' is an alias for uint16_t holding raw FP16 bit patterns.
half S0[elems];
half S1[elems];
for (int i = 0; i < elems; ++i) {
S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
}
// Reinterpret the raw accumulator dword as an IEEE-754 float.
float S2 = *reinterpret_cast<float*>(&S2r);
// Compute components individually to prevent overflow across packing
half C[elems];
float Csum = 0.0f;
for (int i = 0; i < elems; ++i) {
ArmISA::FPSCR fpscr;
// Multiply in FP16, then widen the product to FP32.
C[i] = fplibMul(S0[i], S1[i], fpscr);
uint32_t conv =
ArmISA::fplibConvert<uint16_t, uint32_t>(
C[i], ArmISA::FPRounding_TIEEVEN, fpscr);
// Each widened term is clamped before accumulation. NOTE: per the
// commit message, clamp behavior has not been verified vs hardware.
Csum += clampF32(*reinterpret_cast<float*>(&conv), clamp);
}
// S2 is added after the (optionally clamped) products.
Csum += S2;
uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
return rv;
};
dotHelper(gpuDynInst, opImpl);
}
// D.i32 = S2.i32 + sext(S0.i16[0])*sext(S1.i16[0])
//                + sext(S0.i16[1])*sext(S1.i16[1])
void Inst_VOP3P__V_DOT2_I32_I16::execute(GPUDynInstPtr gpuDynInst)
{
auto opImpl =
[](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
{
constexpr unsigned INBITS = 16;
constexpr unsigned elems = 32 / INBITS;
uint32_t S0[elems];
uint32_t S1[elems];
// Unpack the packed 16-bit elements of each source dword.
for (int i = 0; i < elems; ++i) {
S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
}
int32_t S2 = *reinterpret_cast<int32_t*>(&S2r);
// Compute components individually to prevent overflow across packing
int32_t C[elems];
int32_t Csum = 0;
for (int i = 0; i < elems; ++i) {
// Sign extension of each element is required for correct results.
C[i] = sext<INBITS>(S0[i]) * sext<INBITS>(S1[i]);
// Optionally clamp, then wrap each product back to INBITS bits and
// re-sign-extend. Per the commit message this per-element masking of
// signed products matches hardware (validated against an MI210);
// clamp itself has not been verified.
C[i] = sext<INBITS>(dotClampI<INBITS>(C[i], clamp) & mask(INBITS));
Csum += C[i];
}
Csum += S2;
uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
return rv;
};
dotHelper(gpuDynInst, opImpl);
}
// D.u32 = S2.u32 + S0.u16[0]*S1.u16[0] + S0.u16[1]*S1.u16[1]
// Unsigned products are deliberately not masked per element: overflow is
// allowed to carry across element boundaries, matching observed hardware.
void Inst_VOP3P__V_DOT2_U32_U16::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl =
        [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t
    {
        constexpr unsigned INBITS = 16;
        constexpr unsigned elems = 32 / INBITS;
        // Seed the accumulator with S2; unsigned addition is associative
        // mod 2^32, so this is equivalent to adding it last.
        uint32_t acc = S2;
        for (unsigned e = 0; e < elems; ++e) {
            uint32_t a = bits(S0r, e * INBITS + INBITS - 1, e * INBITS);
            uint32_t b = bits(S1r, e * INBITS + INBITS - 1, e * INBITS);
            acc += dotClampU<INBITS>(a * b, clamp);
        }
        return acc;
    };
    dotHelper(gpuDynInst, opImpl);
}
// D.i32 = S2.i32 + sum over i in [0,3] of sext(S0.i8[i]) * sext(S1.i8[i])
void Inst_VOP3P__V_DOT4_I32_I8::execute(GPUDynInstPtr gpuDynInst)
{
auto opImpl =
[](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
{
constexpr unsigned INBITS = 8;
constexpr unsigned elems = 32 / INBITS;
uint32_t S0[elems];
uint32_t S1[elems];
// Unpack the packed 8-bit elements of each source dword.
for (int i = 0; i < elems; ++i) {
S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
}
int32_t S2 = *reinterpret_cast<int32_t*>(&S2r);
// Compute components individually to prevent overflow across packing
int32_t C[elems];
int32_t Csum = 0;
for (int i = 0; i < elems; ++i) {
// Sign extension of each element is required for correct results.
C[i] = sext<INBITS>(S0[i]) * sext<INBITS>(S1[i]);
// Optionally clamp, then wrap each product back to INBITS bits and
// re-sign-extend. Per the commit message this per-element masking of
// signed products matches hardware (validated against an MI210);
// clamp itself has not been verified.
C[i] = sext<INBITS>(dotClampI<INBITS>(C[i], clamp) & mask(INBITS));
Csum += C[i];
}
Csum += S2;
uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
return rv;
};
dotHelper(gpuDynInst, opImpl);
}
// D.u32 = S2.u32 + sum over i in [0,3] of S0.u8[i] * S1.u8[i]
// Unsigned products are deliberately not masked per element: overflow is
// allowed to carry across element boundaries, matching observed hardware.
void Inst_VOP3P__V_DOT4_U32_U8::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl =
        [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t
    {
        constexpr unsigned INBITS = 8;
        constexpr unsigned elems = 32 / INBITS;
        // Seed the accumulator with S2; unsigned addition is associative
        // mod 2^32, so this is equivalent to adding it last.
        uint32_t acc = S2;
        for (unsigned e = 0; e < elems; ++e) {
            uint32_t a = bits(S0r, e * INBITS + INBITS - 1, e * INBITS);
            uint32_t b = bits(S1r, e * INBITS + INBITS - 1, e * INBITS);
            acc += dotClampU<INBITS>(a * b, clamp);
        }
        return acc;
    };
    dotHelper(gpuDynInst, opImpl);
}
// D.i32 = S2.i32 + sum over i in [0,7] of sext(S0.i4[i]) * sext(S1.i4[i])
void Inst_VOP3P__V_DOT8_I32_I4::execute(GPUDynInstPtr gpuDynInst)
{
auto opImpl =
[](uint32_t S0r, uint32_t S1r, uint32_t S2r, bool clamp) -> uint32_t
{
constexpr unsigned INBITS = 4;
constexpr unsigned elems = 32 / INBITS;
uint32_t S0[elems];
uint32_t S1[elems];
// Unpack the packed 4-bit elements of each source dword.
for (int i = 0; i < elems; ++i) {
S0[i] = bits(S0r, i*INBITS+INBITS-1, i*INBITS);
S1[i] = bits(S1r, i*INBITS+INBITS-1, i*INBITS);
}
int32_t S2 = *reinterpret_cast<int32_t*>(&S2r);
// Compute components individually to prevent overflow across packing
int32_t C[elems];
int32_t Csum = 0;
for (int i = 0; i < elems; ++i) {
// Sign extension of each element is required for correct results.
C[i] = sext<INBITS>(S0[i]) * sext<INBITS>(S1[i]);
// Optionally clamp, then wrap each product back to INBITS bits and
// re-sign-extend. Per the commit message this per-element masking of
// signed products matches hardware (validated against an MI210);
// clamp itself has not been verified.
C[i] = sext<INBITS>(dotClampI<INBITS>(C[i], clamp) & mask(INBITS));
Csum += C[i];
}
Csum += S2;
uint32_t rv = *reinterpret_cast<uint32_t*>(&Csum);
return rv;
};
dotHelper(gpuDynInst, opImpl);
}
// D.u32 = S2.u32 + sum over i in [0,7] of S0.u4[i] * S1.u4[i]
// Unsigned products are deliberately not masked per element: overflow is
// allowed to carry across element boundaries, matching observed hardware.
void Inst_VOP3P__V_DOT8_U32_U4::execute(GPUDynInstPtr gpuDynInst)
{
    auto opImpl =
        [](uint32_t S0r, uint32_t S1r, uint32_t S2, bool clamp) -> uint32_t
    {
        constexpr unsigned INBITS = 4;
        constexpr unsigned elems = 32 / INBITS;
        // Seed the accumulator with S2; unsigned addition is associative
        // mod 2^32, so this is equivalent to adding it last.
        uint32_t acc = S2;
        for (unsigned e = 0; e < elems; ++e) {
            uint32_t a = bits(S0r, e * INBITS + INBITS - 1, e * INBITS);
            uint32_t b = bits(S1r, e * INBITS + INBITS - 1, e * INBITS);
            acc += dotClampU<INBITS>(a * b, clamp);
        }
        return acc;
    };
    dotHelper(gpuDynInst, opImpl);
}
void Inst_VOP3P__V_ACCVGPR_READ::execute(GPUDynInstPtr gpuDynInst)
{
    // gem5 does not model a separate Acc register file (it was removed in
    // MI200-class hardware), so reading an AccVGPR degenerates to a plain
    // register-to-register move for every active lane.
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);
    src.readSrc();
    for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
        if (!wf->execMask(ln)) {
            continue;  // inactive lanes keep their destination value
        }
        vdst[ln] = src[ln];
    }
    vdst.write();
}
void Inst_VOP3P__V_ACCVGPR_WRITE::execute(GPUDynInstPtr gpuDynInst)
{
    // gem5 does not model a separate Acc register file (it was removed in
    // MI200-class hardware), so writing an AccVGPR degenerates to a plain
    // register-to-register move for every active lane.
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src(gpuDynInst, extData.SRC0);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);
    src.readSrc();
    for (int ln = 0; ln < NumVecElemPerVecReg; ++ln) {
        if (!wf->execMask(ln)) {
            continue;  // inactive lanes keep their destination value
        }
        vdst[ln] = src[ln];
    }
    vdst.write();
}
} // namespace VegaISA
} // namespace gem5

View File

@@ -42,6 +42,41 @@ namespace gem5
namespace VegaISA
{
// One source operand
// Base class for VOP3P instructions with a single dword source operand and
// a single dword destination (used by the AccVGPR read/write moves).
class Inst_VOP3P__1OP : public Inst_VOP3P
{
public:
Inst_VOP3P__1OP(InFmt_VOP3P *iFmt, const std::string& name)
: Inst_VOP3P(iFmt, name)
{
setFlag(ALU);
}
int
getNumOperands() override
{
return numDstRegOperands() + numSrcRegOperands();
} // getNumOperands
int numDstRegOperands() override { return 1; }
int numSrcRegOperands() override { return 1; }
// Both operands are a single 32-bit register (4 bytes).
int
getOperandSize(int opIdx) override
{
switch (opIdx) {
case 0: // src
return 4;
case 1: // dst
return 4;
default:
fatal("op idx %i out of bounds\n", opIdx);
return -1;
}
}
};
// Two source operands with two 16-bit values in a dword
class Inst_VOP3P__2OP_X16 : public Inst_VOP3P
{
@@ -310,6 +345,96 @@ namespace VegaISA
void execute(GPUDynInstPtr gpuDynInst) override;
};
// D.f32 = S2.f32 + S0.f16[0]*S1.f16[0] + S0.f16[1]*S1.f16[1]
class Inst_VOP3P__V_DOT2_F32_F16 : public Inst_VOP3P__3OP_X16
{
public:
Inst_VOP3P__V_DOT2_F32_F16(InFmt_VOP3P *iFmt)
: Inst_VOP3P__3OP_X16(iFmt, "v_dot2_f32_f16")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// D.i32 = S2.i32 + dot product of the two signed 16-bit element pairs.
class Inst_VOP3P__V_DOT2_I32_I16 : public Inst_VOP3P__3OP_X16
{
public:
Inst_VOP3P__V_DOT2_I32_I16(InFmt_VOP3P *iFmt)
: Inst_VOP3P__3OP_X16(iFmt, "v_dot2_i32_i16")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// D.u32 = S2.u32 + dot product of the two unsigned 16-bit element pairs.
class Inst_VOP3P__V_DOT2_U32_U16 : public Inst_VOP3P__3OP_X16
{
public:
Inst_VOP3P__V_DOT2_U32_U16(InFmt_VOP3P *iFmt)
: Inst_VOP3P__3OP_X16(iFmt, "v_dot2_u32_u16")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// D.i32 = S2.i32 + dot product of the four signed 8-bit element pairs.
class Inst_VOP3P__V_DOT4_I32_I8 : public Inst_VOP3P__3OP_X16
{
public:
Inst_VOP3P__V_DOT4_I32_I8(InFmt_VOP3P *iFmt)
: Inst_VOP3P__3OP_X16(iFmt, "v_dot4_i32_i8")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// D.u32 = S2.u32 + dot product of the four unsigned 8-bit element pairs.
class Inst_VOP3P__V_DOT4_U32_U8 : public Inst_VOP3P__3OP_X16
{
public:
Inst_VOP3P__V_DOT4_U32_U8(InFmt_VOP3P *iFmt)
: Inst_VOP3P__3OP_X16(iFmt, "v_dot4_u32_u8")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// D.i32 = S2.i32 + dot product of the eight signed 4-bit element pairs.
class Inst_VOP3P__V_DOT8_I32_I4 : public Inst_VOP3P__3OP_X16
{
public:
Inst_VOP3P__V_DOT8_I32_I4(InFmt_VOP3P *iFmt)
: Inst_VOP3P__3OP_X16(iFmt, "v_dot8_i32_i4")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// D.u32 = S2.u32 + dot product of the eight unsigned 4-bit element pairs.
class Inst_VOP3P__V_DOT8_U32_U4 : public Inst_VOP3P__3OP_X16
{
public:
Inst_VOP3P__V_DOT8_U32_U4(InFmt_VOP3P *iFmt)
: Inst_VOP3P__3OP_X16(iFmt, "v_dot8_u32_u4")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// AccVGPR read; implemented as a plain VGPR move in gem5 (no Acc file).
class Inst_VOP3P__V_ACCVGPR_READ : public Inst_VOP3P__1OP
{
public:
Inst_VOP3P__V_ACCVGPR_READ(InFmt_VOP3P *iFmt)
: Inst_VOP3P__1OP(iFmt, "v_accvgpr_read")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
// AccVGPR write; implemented as a plain VGPR move in gem5 (no Acc file).
class Inst_VOP3P__V_ACCVGPR_WRITE : public Inst_VOP3P__1OP
{
public:
Inst_VOP3P__V_ACCVGPR_WRITE(InFmt_VOP3P *iFmt)
: Inst_VOP3P__1OP(iFmt, "v_accvgpr_write")
{ }
void execute(GPUDynInstPtr gpuDynInst) override;
};
} // namespace VegaISA
} // namespace gem5