diff --git a/src/arch/arm/insts/vfp.hh b/src/arch/arm/insts/vfp.hh index 96db06388c..1fccba3253 100644 --- a/src/arch/arm/insts/vfp.hh +++ b/src/arch/arm/insts/vfp.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2013, 2019 ARM Limited + * Copyright (c) 2010-2013, 2019, 2024 Arm Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -435,6 +435,119 @@ vfpFpToFixed(T val, bool isSigned, uint8_t width, uint8_t imm, bool }; +template <typename T> +T +vfpFpRint(T val, bool exact, bool defaultNan, bool useRmode = true, + VfpRoundingMode roundMode = VfpRoundZero) +{ + int rmode; + bool roundAwayFix = false; + + if (!useRmode) { + rmode = fegetround(); + } else { + switch (roundMode) + { + case VfpRoundNearest: + rmode = FeRoundNearest; + break; + case VfpRoundUpward: + rmode = FeRoundUpward; + break; + case VfpRoundDown: + rmode = FeRoundDown; + break; + case VfpRoundZero: + rmode = FeRoundZero; + break; + case VfpRoundAway: + // No equivalent host mode for ties-away rounding: use round down + // here and fix the result up after rint() (see roundAwayFix) + rmode = FeRoundDown; + roundAwayFix = true; + break; + default: + panic("Unsupported roundMode %d\n", roundMode); + } + } + __asm__ __volatile__("" : "=m" (rmode) : "m" (rmode)); + __asm__ __volatile__("" : "=m" (val) : "m" (val)); + fesetround(rmode); + feclearexcept(FeAllExceptions); + __asm__ __volatile__("" : "=m" (val) : "m" (val)); + T origVal = val; + val = rint(val); + __asm__ __volatile__("" : "=m" (val) : "m" (val)); + + int exceptions = fetestexcept(FeAllExceptions); + if (!exact) { + exceptions &= ~FeInexact; + } + + int fpType = std::fpclassify(val); + if (fpType == FP_SUBNORMAL || fpType == FP_NAN) { + if (fpType == FP_NAN) { + if (isSnan(val)) { + exceptions |= FeInvalid; + } + if (defaultNan || !isSnan(val)) { + bool single = (sizeof(T) == sizeof(float)); + uint64_t qnan = single ? 
0x7fc00000 : 0x7ff8000000000000ULL; + val = bitsToFp(qnan, (T)0.0); + } + } else { + val = 0.0; + } + } else if (origVal != val) { + switch (rmode) { + case FeRoundNearest: + if (origVal - val > 0.5) + val += 1.0; + else if (val - origVal > 0.5) + val -= 1.0; + break; + case FeRoundDown: + if (roundAwayFix) { + // The ordering on the subtraction looks a bit odd in that we + // don't do the obvious origVal - val, instead we do + // -(val - origVal). This is required to get the correct bit + // exact behaviour when very close to the 0.5 threshold. + volatile T error = val; + error -= origVal; + error = -error; + if ( (error > 0.5) || + ((error == 0.5) && (val >= 0)) ) + val += 1.0; + } else { + if (origVal < val) + val -= 1.0; + } + break; + case FeRoundUpward: + if (origVal > val) + val += 1.0; + break; + } + if (exact) { + exceptions |= FeInexact; + } + } + // Fix sign of zero: a zero result takes the sign bit of origVal. + fpType = std::fpclassify(val); + if (fpType == FP_ZERO) { + bool single = (sizeof(T) == sizeof(float)); + uint64_t mask = single ? 
0x80000000 : 0x8000000000000000ULL; + val = bitsToFp((fpToBits(val) & (~mask)) | (fpToBits(origVal) & mask), + (T)0.0); + } + + // __asm__ __volatile__("" : "=m" (val) : "m" (val)); + setFPExceptions(exceptions); + + return val; +}; + + float vfpUFixedToFpS(bool flush, bool defaultNan, uint64_t val, uint8_t width, uint8_t imm); float vfpSFixedToFpS(bool flush, bool defaultNan, diff --git a/src/arch/arm/isa/formats/fp.isa b/src/arch/arm/isa/formats/fp.isa index 45b0985838..09d74efb69 100644 --- a/src/arch/arm/isa/formats/fp.isa +++ b/src/arch/arm/isa/formats/fp.isa @@ -1804,6 +1804,108 @@ let {{ } else { return new SHA1SU1(machInst, vd, vm); } + case 0x8: + switch (size) { + case 0b01: + if (q) { + return new NVrintnhpQ(machInst, vd, vm); + } else { + return new NVrintnhpD(machInst, vd, vm); + } + case 0b10: + if (q) { + return new NVrintnspQ(machInst, vd, vm); + } else { + return new NVrintnspD(machInst, vd, vm); + } + default: + return new Unknown64(machInst); + } + case 0x9: + switch (size) { + case 0b01: + if (q) { + return new NVrintxhpQ(machInst, vd, vm); + } else { + return new NVrintxhpD(machInst, vd, vm); + } + case 0b10: + if (q) { + return new NVrintxspQ(machInst, vd, vm); + } else { + return new NVrintxspD(machInst, vd, vm); + } + default: + return new Unknown64(machInst); + } + case 0xa: + switch (size) { + case 0b01: + if (q) { + return new NVrintahpQ(machInst, vd, vm); + } else { + return new NVrintahpD(machInst, vd, vm); + } + case 0b10: + if (q) { + return new NVrintaspQ(machInst, vd, vm); + } else { + return new NVrintaspD(machInst, vd, vm); + } + default: + return new Unknown64(machInst); + } + case 0xb: + switch (size) { + case 0b01: + if (q) { + return new NVrintzhpQ(machInst, vd, vm); + } else { + return new NVrintzhpD(machInst, vd, vm); + } + case 0b10: + if (q) { + return new NVrintzspQ(machInst, vd, vm); + } else { + return new NVrintzspD(machInst, vd, vm); + } + default: + return new Unknown64(machInst); + } + case 0xd: + switch (size) { + 
case 0b01: + if (q) { + return new NVrintmhpQ(machInst, vd, vm); + } else { + return new NVrintmhpD(machInst, vd, vm); + } + case 0b10: + if (q) { + return new NVrintmspQ(machInst, vd, vm); + } else { + return new NVrintmspD(machInst, vd, vm); + } + default: + return new Unknown64(machInst); + } + case 0xf: + switch (size) { + case 0b01: + if (q) { + return new NVrintphpQ(machInst, vd, vm); + } else { + return new NVrintphpD(machInst, vd, vm); + } + case 0b10: + if (q) { + return new NVrintpspQ(machInst, vd, vm); + } else { + return new NVrintpspD(machInst, vd, vm); + } + default: + return new Unknown64(machInst); + } case 0xc: case 0xe: if (b == 0x18) { diff --git a/src/arch/arm/isa/insts/neon.isa b/src/arch/arm/isa/insts/neon.isa index 04d6929ae0..7db7f2125d 100644 --- a/src/arch/arm/isa/insts/neon.isa +++ b/src/arch/arm/isa/insts/neon.isa @@ -3701,6 +3701,102 @@ let {{ twoRegMiscInst("vcvtm.s32.f32", "NVcvt2ssMQ", "SimdCvtOp", ("int32_t",), 4, vcvtmsp2ssCode) + vrinthpCode = ''' + FPSCR fpscr = (FPSCR) FpscrExc; + VfpSavedState state = prepFpState(fpscr.rMode); + __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1)); + float mid = vcvtFpHFpS(fpscr, fpscr.dn, fpscr.ahp, srcElem1); + if (flushToZero(mid)) + fpscr.idc = 1; + float mid2 = vfpFpRint(mid, %s, fpscr.dn, true, %s); + destElem = vcvtFpSFpH(fpscr, fpscr.fz, fpscr.dn, %s, fpscr.ahp, mid2); + __asm__ __volatile__("" :: "m" (destElem)); + finishVfp(fpscr, state, true); + FpscrExc = fpscr; + ''' + vrintnhpCode = vrinthpCode % ("false", + "VfpRoundNearest", "VfpRoundNearest") + twoRegMiscInst("vrintn.f16", "NVrintnhpD", "SimdCvtOp", + ("uint16_t",), 2, vrintnhpCode) + twoRegMiscInst("vrintn.f16", "NVrintnhpQ", "SimdCvtOp", + ("uint16_t",), 4, vrintnhpCode) + vrintxhpCode = vrinthpCode % ("true", + "VfpRoundNearest", "VfpRoundNearest") + twoRegMiscInst("vrintx.f16", "NVrintxhpD", "SimdCvtOp", + ("uint16_t",), 2, vrintxhpCode) + twoRegMiscInst("vrintx.f16", "NVrintxhpQ", "SimdCvtOp", + ("uint16_t",), 4, 
vrintxhpCode) + vrintahpCode = vrinthpCode % ("false", "VfpRoundAway", "VfpRoundAway") + twoRegMiscInst("vrinta.f16", "NVrintahpD", "SimdCvtOp", + ("uint16_t",), 2, vrintahpCode) + twoRegMiscInst("vrinta.f16", "NVrintahpQ", "SimdCvtOp", + ("uint16_t",), 4, vrintahpCode) + vrintzhpCode = vrinthpCode % ("false", "VfpRoundZero", "VfpRoundZero") + twoRegMiscInst("vrintz.f16", "NVrintzhpD", "SimdCvtOp", + ("uint16_t",), 2, vrintzhpCode) + twoRegMiscInst("vrintz.f16", "NVrintzhpQ", "SimdCvtOp", + ("uint16_t",), 4, vrintzhpCode) + vrintmhpCode = vrinthpCode % ("false", "VfpRoundDown", "VfpRoundDown") + twoRegMiscInst("vrintm.f16", "NVrintmhpD", "SimdCvtOp", + ("uint16_t",), 2, vrintmhpCode) + twoRegMiscInst("vrintm.f16", "NVrintmhpQ", "SimdCvtOp", + ("uint16_t",), 4, vrintmhpCode) + vrintphpCode = vrinthpCode % ("false", "VfpRoundUpward", "VfpRoundUpward") + twoRegMiscInst("vrintp.f16", "NVrintphpD", "SimdCvtOp", + ("uint16_t",), 2, vrintphpCode) + twoRegMiscInst("vrintp.f16", "NVrintphpQ", "SimdCvtOp", + ("uint16_t",), 4, vrintphpCode) + + vrintspCode = ''' + FPSCR fpscr = (FPSCR) FpscrExc; + VfpSavedState state = prepFpState(fpscr.rMode); + __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1)); + float mid = bitsToFp(srcElem1, (float)0.0); + if (flushToZero(mid)) + fpscr.idc = 1; + float mid2 = vfpFpRint(mid, %s, fpscr.dn, true, %s); + destElem = fpToBits(mid2); + __asm__ __volatile__("" :: "m" (destElem)); + finishVfp(fpscr, state, true); + FpscrExc = fpscr; + ''' + + vrintnspCode = vrintspCode % ("false", "VfpRoundNearest") + twoRegMiscInst("vrintn.f32", "NVrintnspD", "SimdCvtOp", + ("uint32_t",), 2, vrintnspCode) + twoRegMiscInst("vrintn.f32", "NVrintnspQ", "SimdCvtOp", + ("uint32_t",), 4, vrintnspCode) + + vrintxspCode = vrintspCode % ("true", "VfpRoundNearest") + twoRegMiscInst("vrintx.f32", "NVrintxspD", "SimdCvtOp", + ("uint32_t",), 2, vrintxspCode) + twoRegMiscInst("vrintx.f32", "NVrintxspQ", "SimdCvtOp", + ("uint32_t",), 4, vrintxspCode) + + vrintaspCode 
= vrintspCode % ("false", "VfpRoundAway") + twoRegMiscInst("vrinta.f32", "NVrintaspD", "SimdCvtOp", + ("uint32_t",), 2, vrintaspCode) + twoRegMiscInst("vrinta.f32", "NVrintaspQ", "SimdCvtOp", + ("uint32_t",), 4, vrintaspCode) + + vrintzspCode = vrintspCode % ("false", "VfpRoundZero") + twoRegMiscInst("vrintz.f32", "NVrintzspD", "SimdCvtOp", + ("uint32_t",), 2, vrintzspCode) + twoRegMiscInst("vrintz.f32", "NVrintzspQ", "SimdCvtOp", + ("uint32_t",), 4, vrintzspCode) + + vrintmspCode = vrintspCode % ("false", "VfpRoundDown") + twoRegMiscInst("vrintm.f32", "NVrintmspD", "SimdCvtOp", + ("uint32_t",), 2, vrintmspCode) + twoRegMiscInst("vrintm.f32", "NVrintmspQ", "SimdCvtOp", + ("uint32_t",), 4, vrintmspCode) + + vrintpspCode = vrintspCode % ("false", "VfpRoundUpward") + twoRegMiscInst("vrintp.f32", "NVrintpspD", "SimdCvtOp", + ("uint32_t",), 2, vrintpspCode) + twoRegMiscInst("vrintp.f32", "NVrintpspQ", "SimdCvtOp", + ("uint32_t",), 4, vrintpspCode) + vrsqrteCode = ''' destElem = unsignedRSqrtEstimate(srcElem1); '''