arch-vega: Implement F32 <-> F16 conversions

These instructions are used in some of the F16 MFMA example applications
to convert to/from floating point types.

Change-Id: I7426ea663ce11a39fe8c60c8006d8cca11cfaf07
This commit is contained in:
Matthew Poremba
2024-05-14 08:14:41 -07:00
parent a062229ac3
commit de11daec5f
3 changed files with 117 additions and 4 deletions

View File

@@ -455,6 +455,29 @@ namespace VegaISA
// second instruction DWORD
InFmt_VOP3_1 extData;
/**
 * Apply the VOP3 output modifier (OMOD) to a result value.
 *
 * The 2-bit OMOD field selects a scale factor:
 *   0 - value is returned unchanged
 *   1 - value is multiplied by 2
 *   2 - value is multiplied by 4
 *   3 - value is divided by 2
 *
 * If the instruction supports clamping, the output modifier is applied
 * *before* the clamp but after the abs and neg input modifiers.
 *
 * @tparam T Arithmetic (integral or floating point) type of the value.
 * @param val Value to scale.
 * @param omod Output modifier encoding; must be less than 4.
 * @return val scaled according to omod.
 */
template<typename T>
T omodModifier(T val, unsigned omod)
{
    // Reject unsupported types when the template is instantiated instead
    // of relying on a runtime assert that is compiled out under NDEBUG.
    static_assert(std::is_arithmetic_v<T>,
                  "omodModifier requires an integral or floating point type");
    assert(omod < 4);

    switch (omod) {
      case 1:
        return val * T(2);
      case 2:
        return val * T(4);
      case 3:
        return val / T(2);
      default:
        // omod == 0: no output modification.
        return val;
    }
}
private:
bool hasSecondDword(InFmt_VOP3A *);
/**

View File

@@ -433,7 +433,22 @@ namespace VegaISA
/**
 * V_CVT_F16_F32 (VOP1 encoding): for every lane enabled in the exec mask,
 * convert the 32-bit float in SRC0 to a 16-bit float and write the
 * resulting 16-bit pattern to VDST.
 */
void
Inst_VOP1__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (!wf->execMask(lane)) {
            continue;
        }

        float tmp = src[lane];
        AMDGPU::mxfloat16 out(tmp);

        // The converted encoding is taken from bits 31:16 of out.data.
        // NOTE(review): presumably mxfloat16 keeps its bits left-aligned
        // in a 32-bit word -- confirm against the mxfp definition.
        vdst[lane] = (out.data >> 16);
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_CVT_F32_F16 class methods ---
@@ -454,7 +469,20 @@ namespace VegaISA
/**
 * V_CVT_F32_F16 (VOP1 encoding): for every lane enabled in the exec mask,
 * interpret the raw SRC0 register value as a 16-bit float and write it to
 * VDST widened to a 32-bit float.
 */
void
Inst_VOP1__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
    VecOperandF32 vdst(gpuDynInst, instData.VDST);

    src.readSrc();

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (!wf->execMask(lane)) {
            continue;
        }

        // NOTE(review): assumes constructing mxfloat16 from the U32
        // register value loads raw bits rather than converting the
        // integer numerically -- confirm against the mxfp definition.
        AMDGPU::mxfloat16 tmp(src[lane]);
        vdst[lane] = float(tmp);
    }

    vdst.write();
} // execute
// --- Inst_VOP1__V_CVT_RPI_I32_F32 class methods ---

View File

@@ -2790,7 +2790,42 @@ namespace VegaISA
/**
 * V_CVT_F16_F32 (VOP3 encoding): for every lane enabled in the exec mask,
 * convert the 32-bit float in SRC0 to a 16-bit float and store it in one
 * half of VDST.
 *
 * Modifiers are applied in this order: abs, neg, output modifier (OMOD),
 * then clamp to [0.0, 1.0] only when the CLAMP bit is set. OPSEL[3]
 * selects the upper (set) or lower (clear) 16 bits of the destination;
 * the other half is preserved. SDWA and DPP forms are not implemented
 * and panic.
 */
void
Inst_VOP3__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
    VecOperandU32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();
    // Only half of each destination dword is replaced below, so the
    // current destination contents are read first.
    vdst.read();

    panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
    panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);

    unsigned abs = instData.ABS;
    unsigned neg = extData.NEG;
    int opsel = instData.OPSEL;

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (!wf->execMask(lane)) {
            continue;
        }

        float tmp = src0[lane];

        // Input modifiers for source operand 0 (bit 0 of ABS / NEG).
        if ((abs & 1) && (tmp < 0)) tmp = -tmp;
        if (neg & 1) tmp = -tmp;

        tmp = omodModifier(tmp, extData.OMOD);

        // Clamp is an opt-in modifier; applying it unconditionally would
        // corrupt every value outside of [0, 1].
        if (instData.CLAMP) {
            tmp = std::clamp(tmp, 0.0f, 1.0f);
        }

        AMDGPU::mxfloat16 out(tmp);

        // If opsel[3] use upper 16-bits of dest, otherwise lower.
        if (opsel & 8) {
            replaceBits(vdst[lane], 31, 16, (out.data >> 16));
        } else {
            replaceBits(vdst[lane], 15, 0, (out.data >> 16));
        }
    }

    vdst.write();
} // execute
// --- Inst_VOP3__V_CVT_F32_F16 class methods ---
@@ -2811,7 +2846,34 @@ namespace VegaISA
/**
 * V_CVT_F32_F16 (VOP3 encoding): for every lane enabled in the exec mask,
 * interpret the raw SRC0 register value as a 16-bit float and write it to
 * VDST widened to a 32-bit float.
 *
 * Modifiers are applied in this order: abs, neg, output modifier (OMOD),
 * then clamp to [0.0, 1.0] only when the CLAMP bit is set. SDWA, DPP and
 * any non-zero OPSEL are not implemented and panic.
 */
void
Inst_VOP3__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();
    ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
    VecOperandF32 vdst(gpuDynInst, instData.VDST);

    src0.readSrc();

    panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
    panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
    panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode);

    unsigned abs = instData.ABS;
    unsigned neg = extData.NEG;

    for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
        if (!wf->execMask(lane)) {
            continue;
        }

        // NOTE(review): assumes constructing mxfloat16 from the U32
        // register value loads raw bits -- confirm against the mxfp
        // definition.
        AMDGPU::mxfloat16 tmp(src0[lane]);

        // Input modifiers for source operand 0 (bit 0 of ABS / NEG).
        if ((abs & 1) && (tmp < 0)) tmp = -tmp;
        if (neg & 1) tmp = -tmp;

        float out = omodModifier(float(tmp), extData.OMOD);

        // Clamp is an opt-in modifier; applying it unconditionally would
        // corrupt every value outside of [0, 1].
        if (instData.CLAMP) {
            out = std::clamp(out, 0.0f, 1.0f);
        }

        vdst[lane] = out;
    }

    vdst.write();
} // execute
// --- Inst_VOP3__V_CVT_RPI_I32_F32 class methods ---