arch-arm: Add support of AArch32 VRINTN/X/A/Z/M/P instructions.

Add decoding and execution support for the AArch32 VRINTN, VRINTX, VRINTA, VRINTZ, VRINTM, and VRINTP (Advanced SIMD) instructions, in both their 16-bit and 32-bit variants. Add vfpFpRint in vfp.hh to implement the round-to-integer behavior. Only the A32 encoding is supported. Change-Id: Icb9b6f71edf16ea14a439e15c480351cd8e1eb88
2024-08-17 14:10:13 +08:00
parent 1c8ab47a54
commit 7df35187a0
3 changed files with 312 additions and 1 deletions
--- a/src/arch/arm/insts/vfp.hh
+++ b/src/arch/arm/insts/vfp.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2013, 2019 ARM Limited
+ * Copyright (c) 2010-2013, 2019, 2024 Arm Limited
 * All rights reserved
 *
 * The license below extends only to copyright in the software and shall
@@ -435,6 +435,119 @@ vfpFpToFixed(T val, bool isSigned, uint8_t width, uint8_t imm, bool
 };


+template <typename T>
+T
+vfpFpRint(T val, bool exact, bool defaultNan, bool useRmode = true,
+          VfpRoundingMode roundMode = VfpRoundZero)
+{   // Rounds val to an integral value of the same type T via rint().
+    int  rmode;
+    bool roundAwayFix = false;  // set when VfpRoundAway is emulated
+
+    if (!useRmode) {
+        rmode = fegetround();  // keep the host's current rounding mode
+    } else {
+        switch (roundMode)
+        {
+          case VfpRoundNearest:
+            rmode = FeRoundNearest;
+            break;
+          case VfpRoundUpward:
+            rmode = FeRoundUpward;
+            break;
+          case VfpRoundDown:
+            rmode = FeRoundDown;
+            break;
+          case VfpRoundZero:
+            rmode = FeRoundZero;
+            break;
+          case VfpRoundAway:
+            // There is no equivalent host rounding mode, so round down here
+            // and correct the result after rint() (see roundAwayFix below).
+            rmode        = FeRoundDown;
+            roundAwayFix = true;
+            break;
+          default:
+            panic("Unsupported roundMode %d\n", roundMode);
+        }
+    }
+    __asm__ __volatile__("" : "=m" (rmode) : "m" (rmode));
+    __asm__ __volatile__("" : "=m" (val) : "m" (val));
+    fesetround(rmode);
+    feclearexcept(FeAllExceptions);
+    __asm__ __volatile__("" : "=m" (val) : "m" (val));
+    T origVal = val;
+    val = rint(val);  // rounds using the mode installed above
+    __asm__ __volatile__("" : "=m" (val) : "m" (val));
+
+    int exceptions = fetestexcept(FeAllExceptions);  // raised since clear
+    if (!exact) {
+        exceptions &= ~FeInexact;  // inexact is reported only when exact
+    }
+
+    int fpType = std::fpclassify(val);
+    if (fpType == FP_SUBNORMAL || fpType == FP_NAN) {
+        if (fpType == FP_NAN) {
+            if (isSnan(val)) {
+                exceptions |= FeInvalid;
+            }
+            if (defaultNan || !isSnan(val)) {
+                bool single = (sizeof(T) == sizeof(float));
+                uint64_t qnan = single ? 0x7fc00000 : 0x7ff8000000000000ULL;
+                val = bitsToFp(qnan, (T)0.0);  // replace with qnan pattern
+            }
+        } else {
+            val = 0.0;  // a subnormal result is replaced by zero
+        }
+    } else if (origVal != val) {
+        switch (rmode) {
+          case FeRoundNearest:
+            if (origVal - val > 0.5)
+                val += 1.0;
+            else if (val - origVal > 0.5)
+                val -= 1.0;
+            break;
+          case FeRoundDown:
+            if (roundAwayFix) {
+                // The ordering on the subtraction looks a bit odd in that we
+                // don't do the obvious origVal - val, instead we do
+                // -(val - origVal). This is required to get the correct
+                // bit-exact behaviour when very close to the 0.5 threshold.
+                volatile T error = val;
+                error -= origVal;
+                error = -error;
+                if ( (error >  0.5) ||
+                    ((error == 0.5) && (val >= 0)) )
+                    val += 1.0;
+            } else {
+                if (origVal < val)
+                    val -= 1.0;
+            }
+            break;
+          case FeRoundUpward:
+            if (origVal > val)
+                val += 1.0;
+            break;
+        }
+        if (exact) {
+            exceptions |= FeInexact;  // the value changed, so it was inexact
+        }
+    }
+    // Fix the sign of zero: a zero result takes the sign bit of origVal.
+    fpType = std::fpclassify(val);
+    if (fpType == FP_ZERO) {
+        bool single = (sizeof(T) == sizeof(float));
+        uint64_t mask = single ? 0x80000000 : 0x8000000000000000ULL;
+        val = bitsToFp((fpToBits(val) & (~mask)) | (fpToBits(origVal) & mask),
+                       (T)0.0);
+    }
+
+    // __asm__ __volatile__("" : "=m" (val) : "m" (val));
+    setFPExceptions(exceptions);
+
+    return val;
+};
+
+
 float vfpUFixedToFpS(bool flush, bool defaultNan,
        uint64_t val, uint8_t width, uint8_t imm);
 float vfpSFixedToFpS(bool flush, bool defaultNan,
--- a/src/arch/arm/isa/formats/fp.isa
+++ b/src/arch/arm/isa/formats/fp.isa
@@ -1804,6 +1804,108 @@ let {{
                } else {
                    return new SHA1SU1(machInst, vd, vm);
                }
+              case 0x8:
+                switch (size) {
+                  case 0b01:
+                    if (q) {
+                        return new NVrintnhpQ<uint16_t>(machInst, vd, vm);
+                    } else {
+                        return new NVrintnhpD<uint16_t>(machInst, vd, vm);
+                    }
+                  case 0b10:
+                    if (q) {
+                        return new NVrintnspQ<uint32_t>(machInst, vd, vm);
+                    } else {
+                        return new NVrintnspD<uint32_t>(machInst, vd, vm);
+                    }
+                  default:
+                    return new Unknown64(machInst);
+                }
+              case 0x9:
+                switch (size) {
+                  case 0b01:
+                    if (q) {
+                        return new NVrintxhpQ<uint16_t>(machInst, vd, vm);
+                    } else {
+                        return new NVrintxhpD<uint16_t>(machInst, vd, vm);
+                    }
+                  case 0b10:
+                    if (q) {
+                        return new NVrintxspQ<uint32_t>(machInst, vd, vm);
+                    } else {
+                        return new NVrintxspD<uint32_t>(machInst, vd, vm);
+                    }
+                  default:
+                    return new Unknown64(machInst);
+                }
+              case 0xa:
+                switch (size) {
+                  case 0b01:
+                    if (q) {
+                        return new NVrintahpQ<uint16_t>(machInst, vd, vm);
+                    } else {
+                        return new NVrintahpD<uint16_t>(machInst, vd, vm);
+                    }
+                  case 0b10:
+                    if (q) {
+                        return new NVrintaspQ<uint32_t>(machInst, vd, vm);
+                    } else {
+                        return new NVrintaspD<uint32_t>(machInst, vd, vm);
+                    }
+                  default:
+                    return new Unknown64(machInst);
+                }
+              case 0xb:
+                switch (size) {
+                  case 0b01:
+                    if (q) {
+                        return new NVrintzhpQ<uint16_t>(machInst, vd, vm);
+                    } else {
+                        return new NVrintzhpD<uint16_t>(machInst, vd, vm);
+                    }
+                  case 0b10:
+                    if (q) {
+                        return new NVrintzspQ<uint32_t>(machInst, vd, vm);
+                    } else {
+                        return new NVrintzspD<uint32_t>(machInst, vd, vm);
+                    }
+                  default:
+                    return new Unknown64(machInst);
+                }
+              case 0xd:
+                switch (size) {
+                  case 0b01:
+                    if (q) {
+                        return new NVrintmhpQ<uint16_t>(machInst, vd, vm);
+                    } else {
+                        return new NVrintmhpD<uint16_t>(machInst, vd, vm);
+                    }
+                  case 0b10:
+                    if (q) {
+                        return new NVrintmspQ<uint32_t>(machInst, vd, vm);
+                    } else {
+                        return new NVrintmspD<uint32_t>(machInst, vd, vm);
+                    }
+                  default:
+                    return new Unknown64(machInst);
+                }
+              case 0xf:
+                switch (size) {
+                  case 0b01:
+                    if (q) {
+                        return new NVrintphpQ<uint16_t>(machInst, vd, vm);
+                    } else {
+                        return new NVrintphpD<uint16_t>(machInst, vd, vm);
+                    }
+                  case 0b10:
+                    if (q) {
+                        return new NVrintpspQ<uint32_t>(machInst, vd, vm);
+                    } else {
+                        return new NVrintpspD<uint32_t>(machInst, vd, vm);
+                    }
+                  default:
+                    return new Unknown64(machInst);
+                }
              case 0xc:
              case 0xe:
                if (b == 0x18) {
--- a/src/arch/arm/isa/insts/neon.isa
+++ b/src/arch/arm/isa/insts/neon.isa
@@ -3701,6 +3701,102 @@ let {{
    twoRegMiscInst("vcvtm.s32.f32", "NVcvt2ssMQ", "SimdCvtOp",
                   ("int32_t",), 4, vcvtmsp2ssCode)

+    vrinthpCode = '''
+        FPSCR fpscr = (FPSCR) FpscrExc;
+        VfpSavedState state = prepFpState(fpscr.rMode);
+        __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1));
+        float mid = vcvtFpHFpS(fpscr, fpscr.dn, fpscr.ahp, srcElem1);
+        if (flushToZero(mid))
+            fpscr.idc = 1;
+        float mid2 = vfpFpRint<float>(mid, %s, fpscr.dn, true, %s);
+        destElem = vcvtFpSFpH(fpscr, fpscr.fz, fpscr.dn, %s, fpscr.ahp, mid2);
+        __asm__ __volatile__("" :: "m" (destElem));
+        finishVfp(fpscr, state, true);
+        FpscrExc = fpscr;
+    '''
+    vrintnhpCode = vrinthpCode % ("false",
+                                  "VfpRoundNearest", "VfpRoundNearest")
+    twoRegMiscInst("vrintn.f16", "NVrintnhpD", "SimdCvtOp",
+                   ("uint16_t",), 2, vrintnhpCode)
+    twoRegMiscInst("vrintn.f16", "NVrintnhpQ", "SimdCvtOp",
+                   ("uint16_t",), 4, vrintnhpCode)
+    vrintxhpCode = vrinthpCode % ("true",
+                                  "VfpRoundNearest", "VfpRoundNearest")
+    twoRegMiscInst("vrintx.f16", "NVrintxhpD", "SimdCvtOp",
+                   ("uint16_t",), 2, vrintxhpCode)
+    twoRegMiscInst("vrintx.f16", "NVrintxhpQ", "SimdCvtOp",
+                   ("uint16_t",), 4, vrintxhpCode)
+    vrintahpCode = vrinthpCode % ("false", "VfpRoundAway", "VfpRoundAway")
+    twoRegMiscInst("vrinta.f16", "NVrintahpD", "SimdCvtOp",
+                   ("uint16_t",), 2, vrintahpCode)
+    twoRegMiscInst("vrinta.f16", "NVrintahpQ", "SimdCvtOp",
+                   ("uint16_t",), 4, vrintahpCode)
+    vrintzhpCode = vrinthpCode % ("false", "VfpRoundZero", "VfpRoundZero")
+    twoRegMiscInst("vrintz.f16", "NVrintzhpD", "SimdCvtOp",
+                   ("uint16_t",), 2, vrintzhpCode)
+    twoRegMiscInst("vrintz.f16", "NVrintzhpQ", "SimdCvtOp",
+                   ("uint16_t",), 4, vrintzhpCode)
+    vrintmhpCode = vrinthpCode % ("false", "VfpRoundDown", "VfpRoundDown")
+    twoRegMiscInst("vrintm.f16", "NVrintmhpD", "SimdCvtOp",
+                   ("uint16_t",), 2, vrintmhpCode)
+    twoRegMiscInst("vrintm.f16", "NVrintmhpQ", "SimdCvtOp",
+                   ("uint16_t",), 4, vrintmhpCode)
+    vrintphpCode = vrinthpCode % ("false", "VfpRoundUpward", "VfpRoundUpward")
+    twoRegMiscInst("vrintp.f16", "NVrintphpD", "SimdCvtOp",
+                   ("uint16_t",), 2, vrintphpCode)
+    twoRegMiscInst("vrintp.f16", "NVrintphpQ", "SimdCvtOp",
+                   ("uint16_t",), 4, vrintphpCode)
+
+    vrintspCode = '''
+        FPSCR fpscr = (FPSCR) FpscrExc;
+        VfpSavedState state = prepFpState(fpscr.rMode);
+        __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1));
+        float mid = bitsToFp(srcElem1, (float)0.0);
+        if (flushToZero(mid))
+            fpscr.idc = 1;
+        float mid2 = vfpFpRint<float>(mid, %s, fpscr.dn, true, %s);
+        destElem = fpToBits(mid2);
+        __asm__ __volatile__("" :: "m" (destElem));
+        finishVfp(fpscr, state, true);
+        FpscrExc = fpscr;
+    '''
+
+    vrintnspCode = vrintspCode % ("false", "VfpRoundNearest")
+    twoRegMiscInst("vrintn.f32", "NVrintnspD", "SimdCvtOp",
+                   ("uint32_t",), 2, vrintnspCode)
+    twoRegMiscInst("vrintn.f32", "NVrintnspQ", "SimdCvtOp",
+                   ("uint32_t",), 4, vrintnspCode)
+
+    vrintxspCode = vrintspCode % ("true", "VfpRoundNearest")
+    twoRegMiscInst("vrintx.f32", "NVrintxspD", "SimdCvtOp",
+                   ("uint32_t",), 2, vrintxspCode)
+    twoRegMiscInst("vrintx.f32", "NVrintxspQ", "SimdCvtOp",
+                   ("uint32_t",), 4, vrintxspCode)
+
+    vrintaspCode = vrintspCode % ("false", "VfpRoundAway")
+    twoRegMiscInst("vrinta.f32", "NVrintaspD", "SimdCvtOp",
+                   ("uint32_t",), 2, vrintaspCode)
+    twoRegMiscInst("vrinta.f32", "NVrintaspQ", "SimdCvtOp",
+                   ("uint32_t",), 4, vrintaspCode)
+
+    vrintzspCode = vrintspCode % ("false", "VfpRoundZero")
+    twoRegMiscInst("vrintz.f32", "NVrintzspD", "SimdCvtOp",
+                   ("uint32_t",), 2, vrintzspCode)
+    twoRegMiscInst("vrintz.f32", "NVrintzspQ", "SimdCvtOp",
+                   ("uint32_t",), 4, vrintzspCode)
+
+    vrintmspCode = vrintspCode % ("false", "VfpRoundDown")
+    twoRegMiscInst("vrintm.f32", "NVrintmspD", "SimdCvtOp",
+                   ("uint32_t",), 2, vrintmspCode)
+    twoRegMiscInst("vrintm.f32", "NVrintmspQ", "SimdCvtOp",
+                   ("uint32_t",), 4, vrintmspCode)
+
+    vrintpspCode = vrintspCode % ("false", "VfpRoundUpward")
+    twoRegMiscInst("vrintp.f32", "NVrintpspD", "SimdCvtOp",
+                   ("uint32_t",), 2, vrintpspCode)
+    twoRegMiscInst("vrintp.f32", "NVrintpspQ", "SimdCvtOp",
+                   ("uint32_t",), 4, vrintpspCode)
+
    vrsqrteCode = '''
        destElem = unsignedRSqrtEstimate(srcElem1);
    '''