arch-vega: Implement F32 <-> F16 conversions
These instructions are used in some of the F16 MFMA example applications to convert to/from floating-point types.

Change-Id: I7426ea663ce11a39fe8c60c8006d8cca11cfaf07
This commit is contained in:
@@ -455,6 +455,29 @@ namespace VegaISA
|
||||
// second instruction DWORD
|
||||
InFmt_VOP3_1 extData;
|
||||
|
||||
// Output modifier for VOP3 instructions. This 2-bit field can be set
|
||||
// to "0" to do nothing, "1" to multiply output value by 2, "2" to
|
||||
// multiply output value by 4, or "3" to divide output value by 2. If
|
||||
// the instruction supports clamping, this is applied *before* clamp
|
||||
// but after the abs and neg modifiers.
|
||||
// Apply the VOP3 output modifier (OMOD) to a result value.
//
// @param val  the instruction result before output modification
// @param omod 2-bit OMOD field: 0 = no change, 1 = multiply by 2,
//             2 = multiply by 4, 3 = divide by 2
// @return the modified value
//
// Per the field description above, when an instruction supports
// clamping, OMOD is applied *before* clamp but after the abs/neg
// input modifiers.
template<typename T>
T omodModifier(T val, unsigned omod)
{
    assert(omod < 4);

    if constexpr (std::is_floating_point_v<T>) {
        if (omod == 1) return val * T(2.0f);
        if (omod == 2) return val * T(4.0f);
        if (omod == 3) return val / T(2.0f);
    } else {
        // A non-floating-point T must be integral. Check this at
        // compile time instead of with a runtime assert, which would
        // be compiled out under NDEBUG and silently accept bad types.
        static_assert(std::is_integral_v<T>);
        if (omod == 1) return val * T(2);
        if (omod == 2) return val * T(4);
        if (omod == 3) return val / T(2);
    }

    return val;
}
|
||||
private:
|
||||
bool hasSecondDword(InFmt_VOP3A *);
|
||||
/**
|
||||
|
||||
@@ -433,7 +433,22 @@ namespace VegaISA
|
||||
void
|
||||
Inst_VOP1__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
ConstVecOperandF32 src(gpuDynInst, instData.SRC0);
|
||||
VecOperandU32 vdst(gpuDynInst, instData.VDST);
|
||||
|
||||
src.readSrc();
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (wf->execMask(lane)) {
|
||||
float tmp = src[lane];
|
||||
AMDGPU::mxfloat16 out(tmp);
|
||||
|
||||
vdst[lane] = (out.data >> 16);
|
||||
}
|
||||
}
|
||||
|
||||
vdst.write();
|
||||
} // execute
|
||||
// --- Inst_VOP1__V_CVT_F32_F16 class methods ---
|
||||
|
||||
@@ -454,7 +469,20 @@ namespace VegaISA
|
||||
void
|
||||
Inst_VOP1__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
ConstVecOperandU32 src(gpuDynInst, instData.SRC0);
|
||||
VecOperandF32 vdst(gpuDynInst, instData.VDST);
|
||||
|
||||
src.readSrc();
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (wf->execMask(lane)) {
|
||||
AMDGPU::mxfloat16 tmp(src[lane]);
|
||||
vdst[lane] = float(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
vdst.write();
|
||||
} // execute
|
||||
// --- Inst_VOP1__V_CVT_RPI_I32_F32 class methods ---
|
||||
|
||||
|
||||
@@ -2790,7 +2790,42 @@ namespace VegaISA
|
||||
void
|
||||
Inst_VOP3__V_CVT_F16_F32::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
ConstVecOperandF32 src0(gpuDynInst, extData.SRC0);
|
||||
VecOperandU32 vdst(gpuDynInst, instData.VDST);
|
||||
|
||||
src0.readSrc();
|
||||
vdst.read();
|
||||
|
||||
panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
|
||||
panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
|
||||
|
||||
unsigned abs = instData.ABS;
|
||||
unsigned neg = extData.NEG;
|
||||
int opsel = instData.OPSEL;
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (wf->execMask(lane)) {
|
||||
float tmp = src0[lane];
|
||||
|
||||
if ((abs & 1) && (tmp < 0)) tmp = -tmp;
|
||||
if (neg & 1) tmp = -tmp;
|
||||
|
||||
tmp = omodModifier(tmp, extData.OMOD);
|
||||
tmp = std::clamp(tmp, 0.0f, 1.0f);
|
||||
|
||||
AMDGPU::mxfloat16 out(tmp);
|
||||
|
||||
// If opsel[3] use upper 16-bits of dest, otherwise lower.
|
||||
if (opsel & 8) {
|
||||
replaceBits(vdst[lane], 31, 16, (out.data >> 16));
|
||||
} else {
|
||||
replaceBits(vdst[lane], 15, 0, (out.data >> 16));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vdst.write();
|
||||
} // execute
|
||||
// --- Inst_VOP3__V_CVT_F32_F16 class methods ---
|
||||
|
||||
@@ -2811,7 +2846,34 @@ namespace VegaISA
|
||||
void
|
||||
Inst_VOP3__V_CVT_F32_F16::execute(GPUDynInstPtr gpuDynInst)
|
||||
{
|
||||
panicUnimplemented();
|
||||
Wavefront *wf = gpuDynInst->wavefront();
|
||||
ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
|
||||
VecOperandF32 vdst(gpuDynInst, instData.VDST);
|
||||
|
||||
src0.readSrc();
|
||||
|
||||
panic_if(isSDWAInst(), "SDWA not implemented for %s", _opcode);
|
||||
panic_if(isDPPInst(), "DPP not implemented for %s", _opcode);
|
||||
panic_if(instData.OPSEL, "OPSEL not implemented for %s", _opcode);
|
||||
|
||||
unsigned abs = instData.ABS;
|
||||
unsigned neg = extData.NEG;
|
||||
|
||||
for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
|
||||
if (wf->execMask(lane)) {
|
||||
AMDGPU::mxfloat16 tmp(src0[lane]);
|
||||
|
||||
if ((abs & 1) && (tmp < 0)) tmp = -tmp;
|
||||
if (neg & 1) tmp = -tmp;
|
||||
|
||||
float out = omodModifier(float(tmp), extData.OMOD);
|
||||
out = std::clamp(out, 0.0f, 1.0f);
|
||||
|
||||
vdst[lane] = out;
|
||||
}
|
||||
}
|
||||
|
||||
vdst.write();
|
||||
} // execute
|
||||
// --- Inst_VOP3__V_CVT_RPI_I32_F32 class methods ---
|
||||
|
||||
|
||||
Reference in New Issue
Block a user