arch-arm: implement floating point aarch32 VCVTA family

These instructions round floating point to integer, and were added to
aarch32 as an extension to ARMv7.

Change-Id: I62d1705badc95a4e8954a5ad62b2b6bc9e4ffe00
Reviewed-on: https://gem5-review.googlesource.com/c/16788
Reviewed-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
Maintainer: Andreas Sandberg <andreas.sandberg@arm.com>
This commit is contained in:
Ciro Santilli
2019-02-18 18:06:45 +00:00
parent b48e4a90bf
commit 2c242d665f
2 changed files with 200 additions and 105 deletions

View File

@@ -1,6 +1,6 @@
// -*- mode:c++ -*- // -*- mode:c++ -*-
// Copyright (c) 2010-2011, 2016-2018 ARM Limited // Copyright (c) 2010-2011, 2016-2019 ARM Limited
// All rights reserved // All rights reserved
// //
// The license below extends only to copyright in the software and shall // The license below extends only to copyright in the software and shall
@@ -2001,6 +2001,26 @@ let {{
decodeShortFpTransfer(ExtMachInst machInst); decodeShortFpTransfer(ExtMachInst machInst);
''' '''
decoder_output = ''' decoder_output = '''
// Build the destination register index from instruction bit 22 and the
// four-bit field in bits 15:12.
//
// The field in bits 15:12 always lands in bits 4:1 of the result. Bit 22
// is placed at bit 5 when the operand is a non-integer with size == 3
// (the case the decoder in this file pairs with the D-suffixed
// instruction classes), and at bit 0 in every other case, including
// whenever isInt is set.
IntRegIndex decodeFpVd(ExtMachInst machInst, uint32_t size, bool isInt)
{
    // Only the position of bit 22 differs between the two layouts.
    const int highBitShift = (!isInt && size == 3) ? 5 : 0;
    return (IntRegIndex)((bits(machInst, 22) << highBitShift) |
                         (bits(machInst, 15, 12) << 1));
}
// Build the source register index from instruction bit 5 and the
// four-bit field in bits 3:0.
//
// The field in bits 3:0 always lands in bits 4:1 of the result. Bit 5 is
// placed at bit 5 when the operand is a non-integer with size == 3 (the
// case the decoder in this file pairs with the D-suffixed instruction
// classes), and at bit 0 in every other case, including whenever isInt
// is set.
IntRegIndex decodeFpVm(ExtMachInst machInst, uint32_t size, bool isInt)
{
    // Only the position of bit 5 differs between the two layouts.
    const int highBitShift = (!isInt && size == 3) ? 5 : 0;
    return (IntRegIndex)((bits(machInst, 5) << highBitShift) |
                         (bits(machInst, 3, 0) << 1));
}
StaticInstPtr StaticInstPtr
decodeShortFpTransfer(ExtMachInst machInst) decodeShortFpTransfer(ExtMachInst machInst)
{ {
@@ -2008,67 +2028,143 @@ let {{
const uint32_t c = bits(machInst, 8); const uint32_t c = bits(machInst, 8);
const uint32_t a = bits(machInst, 23, 21); const uint32_t a = bits(machInst, 23, 21);
const uint32_t b = bits(machInst, 6, 5); const uint32_t b = bits(machInst, 6, 5);
const uint32_t o1 = bits(machInst, 18);
if ((machInst.thumb == 1 && bits(machInst, 28) == 1) || if ((machInst.thumb == 1 && bits(machInst, 28) == 1) ||
(machInst.thumb == 0 && machInst.condCode == 0xf)) { (machInst.thumb == 0 && machInst.condCode == 0xf)) {
// Determine if this is backported aarch64 FP instruction // Determine if this is backported aarch64 FP instruction
const bool b31_b24 = bits(machInst, 31, 24) == 0xFE; const bool b31_b24 = bits(machInst, 31, 24) == 0xFE;
const bool b23 = bits(machInst, 23); const bool b23 = bits(machInst, 23);
const bool b21_b18 = bits(machInst, 21, 18) == 0xE; const bool b21_b19 = bits(machInst, 21, 19) == 0x7;
const bool b11_b9 = bits(machInst, 11, 9) == 0x5; const bool b11_b9 = bits(machInst, 11, 9) == 0x5;
const bool sz = bits(machInst, 8); const uint32_t size = bits(machInst, 9, 8);
const bool b7_b6 = bits(machInst, 7, 6) == 0x1; const bool op3 = bits(machInst, 6);
const bool b6 = bits(machInst, 6) == 0x0;
const bool b4 = bits(machInst, 4) == 0x0; const bool b4 = bits(machInst, 4) == 0x0;
if (b31_b24 && b23 && b21_b18 && b11_b9 && b7_b6 && b4) { const uint32_t rm = bits(machInst, 17, 16);
// VINT* Integer Rounding Instructon IntRegIndex vd = decodeFpVd(machInst, size, false);
const uint32_t rm = bits(machInst, 17, 16); IntRegIndex vm = decodeFpVm(machInst, size, false);
IntRegIndex vdInt = decodeFpVd(machInst, size, true);
if (sz) { if (b31_b24 && b23 && b21_b19 && b11_b9 && op3 && b4) {
const IntRegIndex vd = if (o1 == 0) {
(IntRegIndex)((bits(machInst, 22) << 5) | // VINT* Integer Rounding Instruction
(bits(machInst, 15, 12) << 1)); if (size == 3) {
const IntRegIndex vm = switch(rm) {
(IntRegIndex)((bits(machInst, 5) << 5) | case 0x0:
(bits(machInst, 3, 0) << 1)); return decodeVfpRegRegOp<VRIntAD>(machInst, vd, vm,
switch(rm) { true);
case 0x0: case 0x1:
return decodeVfpRegRegOp<VRIntAD>(machInst, vd, vm, return decodeVfpRegRegOp<VRIntND>(machInst, vd, vm,
true); true);
case 0x1: case 0x2:
return decodeVfpRegRegOp<VRIntND>(machInst, vd, vm, return decodeVfpRegRegOp<VRIntPD>(machInst, vd, vm,
true); true);
case 0x2: case 0x3:
return decodeVfpRegRegOp<VRIntPD>(machInst, vd, vm, return decodeVfpRegRegOp<VRIntMD>(machInst, vd, vm,
true); true);
case 0x3: default: return new Unknown(machInst);
return decodeVfpRegRegOp<VRIntMD>(machInst, vd, vm, }
true); } else {
default: return new Unknown(machInst); switch(rm) {
} case 0x0:
} else { return decodeVfpRegRegOp<VRIntAS>(machInst, vd, vm,
const IntRegIndex vd = false);
(IntRegIndex)(bits(machInst, 22) | case 0x1:
(bits(machInst, 15, 12) << 1)); return decodeVfpRegRegOp<VRIntNS>(machInst, vd, vm,
const IntRegIndex vm = false);
(IntRegIndex)(bits(machInst, 5) | case 0x2:
(bits(machInst, 3, 0) << 1)); return decodeVfpRegRegOp<VRIntPS>(machInst, vd, vm,
switch(rm) { false);
case 0x0: case 0x3:
return decodeVfpRegRegOp<VRIntAS>(machInst, vd, vm, return decodeVfpRegRegOp<VRIntMS>(machInst, vd, vm,
false); false);
case 0x1: default: return new Unknown(machInst);
return decodeVfpRegRegOp<VRIntNS>(machInst, vd, vm, }
false); }
case 0x2: } else {
return decodeVfpRegRegOp<VRIntPS>(machInst, vd, vm, const bool op = bits(machInst, 7);
false); switch(rm) {
case 0x3: case 0x0:
return decodeVfpRegRegOp<VRIntMS>(machInst, vd, vm, switch(size) {
false); case 0x0:
default: return new Unknown(machInst); return new Unknown(machInst);
} case 0x1:
} return new FailUnimplemented(
} else if (b31_b24 && !b23 && b11_b9 && b6 && b4){ "vcvta.u32.f16", machInst);
case 0x2:
if (op) {
return new VcvtaFpSIntS(machInst, vdInt, vm);
} else {
return new VcvtaFpUIntS(machInst, vdInt, vm);
}
case 0x3:
if (op) {
return new VcvtaFpSIntD(machInst, vdInt, vm);
} else {
return new VcvtaFpUIntD(machInst, vdInt, vm);
}
}
case 0x1:
switch(size) {
case 0x0:
return new Unknown(machInst);
case 0x1:
return new FailUnimplemented(
"vcvtn.u32.f16", machInst);
case 0x2:
if (op) {
return new VcvtnFpSIntS(machInst, vdInt, vm);
} else {
return new VcvtnFpUIntS(machInst, vdInt, vm);
}
case 0x3:
if (op) {
return new VcvtnFpSIntD(machInst, vdInt, vm);
} else {
return new VcvtnFpUIntD(machInst, vdInt, vm);
}
}
case 0x2:
switch(size) {
case 0x0:
return new Unknown(machInst);
case 0x1:
return new FailUnimplemented(
"vcvtp.u32.f16", machInst);
case 0x2:
if (op) {
return new VcvtpFpSIntS(machInst, vdInt, vm);
} else {
return new VcvtpFpUIntS(machInst, vdInt, vm);
}
case 0x3:
if (op) {
return new VcvtpFpSIntD(machInst, vdInt, vm);
} else {
return new VcvtpFpUIntD(machInst, vdInt, vm);
}
}
case 0x3:
switch(size) {
case 0x0:
return new Unknown(machInst);
case 0x1:
return new FailUnimplemented(
"vcvtm.u32.f16", machInst);
case 0x2:
if (op) {
return new VcvtmFpSIntS(machInst, vdInt, vm);
} else {
return new VcvtmFpUIntS(machInst, vdInt, vm);
}
case 0x3:
if (op) {
return new VcvtmFpSIntD(machInst, vdInt, vm);
} else {
return new VcvtmFpUIntD(machInst, vdInt, vm);
}
}
}
}
} else if (b31_b24 && !b23 && b11_b9 && !op3 && b4){
// VSEL* floating point conditional select // VSEL* floating point conditional select
ConditionCode cond; ConditionCode cond;
@@ -2079,24 +2175,12 @@ let {{
case 0x3: cond = COND_GT; break; case 0x3: cond = COND_GT; break;
} }
if (sz) { if (size == 3) {
const IntRegIndex vd =
(IntRegIndex)((bits(machInst, 22) << 5) |
(bits(machInst, 15, 12) << 1));
const IntRegIndex vm =
(IntRegIndex)((bits(machInst, 5) << 5) |
(bits(machInst, 3, 0) << 1));
const IntRegIndex vn = const IntRegIndex vn =
(IntRegIndex)((bits(machInst, 7) << 5) | (IntRegIndex)((bits(machInst, 7) << 5) |
(bits(machInst, 19, 16) << 1)); (bits(machInst, 19, 16) << 1));
return new VselD(machInst, vd, vn, vm, cond); return new VselD(machInst, vd, vn, vm, cond);
} else { } else {
const IntRegIndex vd =
(IntRegIndex)(bits(machInst, 22) |
(bits(machInst, 15, 12) << 1));
const IntRegIndex vm =
(IntRegIndex)(bits(machInst, 5) |
(bits(machInst, 3, 0) << 1));
const IntRegIndex vn = const IntRegIndex vn =
(IntRegIndex)((bits(machInst, 19, 16) << 1) | (IntRegIndex)((bits(machInst, 19, 16) << 1) |
bits(machInst, 7)); bits(machInst, 7));

View File

@@ -1,6 +1,6 @@
// -*- mode:c++ -*- // -*- mode:c++ -*-
// Copyright (c) 2010-2013,2016 ARM Limited // Copyright (c) 2010-2013,2016,2018-2019 ARM Limited
// All rights reserved // All rights reserved
// //
// The license below extends only to copyright in the software and shall // The license below extends only to copyright in the software and shall
@@ -993,85 +993,96 @@ let {{
decoder_output += FpRegRegOpConstructor.subst(vcvtFpSIntDRIop); decoder_output += FpRegRegOpConstructor.subst(vcvtFpSIntDRIop);
exec_output += PredOpExecute.subst(vcvtFpSIntDRIop); exec_output += PredOpExecute.subst(vcvtFpSIntDRIop);
vcvtFpUIntSCode = vfpEnabledCheckCode + ''' round_mode_suffix_to_mode = {
'': 'VfpRoundZero',
'a': 'VfpRoundAway',
'm': 'VfpRoundDown',
'n': 'VfpRoundNearest',
'p': 'VfpRoundUpward',
}
def buildVcvt(code, className, roundModeSuffix):
    """Generate one vcvt<suffix> FpRegRegOp instruction definition.

    code            -- C++ snippet containing a ``{round_mode}`` placeholder;
                       it is prefixed with vfpEnabledCheckCode after the
                       placeholder is filled in.
    className       -- class-name pattern with a single ``{}`` slot that
                       receives roundModeSuffix.
    roundModeSuffix -- key into round_mode_suffix_to_mode; also appended to
                       the "vcvt" mnemonic.

    The generated declaration, constructor and execute code are appended
    to header_output, decoder_output and exec_output respectively.
    """
    global header_output, decoder_output, exec_output
    global vfpEnabledCheckCode, round_mode_suffix_to_mode

    rounding = round_mode_suffix_to_mode[roundModeSuffix]
    body = vfpEnabledCheckCode + code.format(round_mode=rounding)

    mnemonic = "vcvt{}".format(roundModeSuffix)
    generatedName = className.format(roundModeSuffix)
    snippets = {
        "code": body,
        "predicate_test": predicateTest,
        "op_class": "SimdFloatCvtOp",
    }
    iop = InstObjParams(mnemonic, generatedName, "FpRegRegOp", snippets, [])

    header_output += FpRegRegOpDeclare.subst(iop)
    decoder_output += FpRegRegOpConstructor.subst(iop)
    exec_output += PredOpExecute.subst(iop)
code = '''
FPSCR fpscr = (FPSCR) FpscrExc; FPSCR fpscr = (FPSCR) FpscrExc;
vfpFlushToZero(fpscr, FpOp1); vfpFlushToZero(fpscr, FpOp1);
VfpSavedState state = prepFpState(fpscr.rMode); VfpSavedState state = prepFpState(fpscr.rMode);
fesetround(FeRoundZero); fesetround(FeRoundZero);
__asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1));
FpDest_uw = vfpFpToFixed<float>(FpOp1, false, 32, 0); FpDest_uw = vfpFpToFixed<float>(
FpOp1, false, 32, 0, true, {round_mode});
__asm__ __volatile__("" :: "m" (FpDest_uw)); __asm__ __volatile__("" :: "m" (FpDest_uw));
finishVfp(fpscr, state, fpscr.fz); finishVfp(fpscr, state, fpscr.fz);
FpscrExc = fpscr; FpscrExc = fpscr;
''' '''
vcvtFpUIntSIop = InstObjParams("vcvt", "VcvtFpUIntS", "FpRegRegOp", for round_mode_suffix in round_mode_suffix_to_mode:
{ "code": vcvtFpUIntSCode, buildVcvt(code, "Vcvt{}FpUIntS", round_mode_suffix)
"predicate_test": predicateTest,
"op_class": "SimdFloatCvtOp" }, [])
header_output += FpRegRegOpDeclare.subst(vcvtFpUIntSIop);
decoder_output += FpRegRegOpConstructor.subst(vcvtFpUIntSIop);
exec_output += PredOpExecute.subst(vcvtFpUIntSIop);
vcvtFpUIntDCode = vfpEnabledCheckCode + ''' code = '''
FPSCR fpscr = (FPSCR) FpscrExc; FPSCR fpscr = (FPSCR) FpscrExc;
double cOp1 = dbl(FpOp1P0_uw, FpOp1P1_uw); double cOp1 = dbl(FpOp1P0_uw, FpOp1P1_uw);
vfpFlushToZero(fpscr, cOp1); vfpFlushToZero(fpscr, cOp1);
VfpSavedState state = prepFpState(fpscr.rMode); VfpSavedState state = prepFpState(fpscr.rMode);
fesetround(FeRoundZero); fesetround(FeRoundZero);
__asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1));
uint64_t result = vfpFpToFixed<double>(cOp1, false, 32, 0); uint64_t result = vfpFpToFixed<double>(
cOp1, false, 32, 0, true, {round_mode});
__asm__ __volatile__("" :: "m" (result)); __asm__ __volatile__("" :: "m" (result));
finishVfp(fpscr, state, fpscr.fz); finishVfp(fpscr, state, fpscr.fz);
FpDestP0_uw = result; FpDestP0_uw = result;
FpscrExc = fpscr; FpscrExc = fpscr;
''' '''
vcvtFpUIntDIop = InstObjParams("vcvt", "VcvtFpUIntD", "FpRegRegOp", for round_mode_suffix in round_mode_suffix_to_mode:
{ "code": vcvtFpUIntDCode, buildVcvt(code, "Vcvt{}FpUIntD", round_mode_suffix)
"predicate_test": predicateTest,
"op_class": "SimdFloatCvtOp" }, [])
header_output += FpRegRegOpDeclare.subst(vcvtFpUIntDIop);
decoder_output += FpRegRegOpConstructor.subst(vcvtFpUIntDIop);
exec_output += PredOpExecute.subst(vcvtFpUIntDIop);
vcvtFpSIntSCode = vfpEnabledCheckCode + ''' code = '''
FPSCR fpscr = (FPSCR) FpscrExc; FPSCR fpscr = (FPSCR) FpscrExc;
vfpFlushToZero(fpscr, FpOp1); vfpFlushToZero(fpscr, FpOp1);
VfpSavedState state = prepFpState(fpscr.rMode); VfpSavedState state = prepFpState(fpscr.rMode);
fesetround(FeRoundZero); fesetround(FeRoundZero);
__asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1));
FpDest_sw = vfpFpToFixed<float>(FpOp1, true, 32, 0); FpDest_sw = vfpFpToFixed<float>(
FpOp1, true, 32, 0, true, {round_mode});
__asm__ __volatile__("" :: "m" (FpDest_sw)); __asm__ __volatile__("" :: "m" (FpDest_sw));
finishVfp(fpscr, state, fpscr.fz); finishVfp(fpscr, state, fpscr.fz);
FpscrExc = fpscr; FpscrExc = fpscr;
''' '''
vcvtFpSIntSIop = InstObjParams("vcvt", "VcvtFpSIntS", "FpRegRegOp", for round_mode_suffix in round_mode_suffix_to_mode:
{ "code": vcvtFpSIntSCode, buildVcvt(code, "Vcvt{}FpSIntS", round_mode_suffix)
"predicate_test": predicateTest,
"op_class": "SimdFloatCvtOp" }, [])
header_output += FpRegRegOpDeclare.subst(vcvtFpSIntSIop);
decoder_output += FpRegRegOpConstructor.subst(vcvtFpSIntSIop);
exec_output += PredOpExecute.subst(vcvtFpSIntSIop);
vcvtFpSIntDCode = vfpEnabledCheckCode + ''' code = '''
FPSCR fpscr = (FPSCR) FpscrExc; FPSCR fpscr = (FPSCR) FpscrExc;
double cOp1 = dbl(FpOp1P0_uw, FpOp1P1_uw); double cOp1 = dbl(FpOp1P0_uw, FpOp1P1_uw);
vfpFlushToZero(fpscr, cOp1); vfpFlushToZero(fpscr, cOp1);
VfpSavedState state = prepFpState(fpscr.rMode); VfpSavedState state = prepFpState(fpscr.rMode);
fesetround(FeRoundZero); fesetround(FeRoundZero);
__asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1)); __asm__ __volatile__("" : "=m" (cOp1) : "m" (cOp1));
int64_t result = vfpFpToFixed<double>(cOp1, true, 32, 0); int64_t result = vfpFpToFixed<double>(
cOp1, true, 32, 0, true, {round_mode});
__asm__ __volatile__("" :: "m" (result)); __asm__ __volatile__("" :: "m" (result));
finishVfp(fpscr, state, fpscr.fz); finishVfp(fpscr, state, fpscr.fz);
FpDestP0_uw = result; FpDestP0_uw = result;
FpscrExc = fpscr; FpscrExc = fpscr;
''' '''
vcvtFpSIntDIop = InstObjParams("vcvt", "VcvtFpSIntD", "FpRegRegOp", for round_mode_suffix in round_mode_suffix_to_mode:
{ "code": vcvtFpSIntDCode, buildVcvt(code, "Vcvt{}FpSIntD", round_mode_suffix)
"predicate_test": predicateTest,
"op_class": "SimdFloatCvtOp" }, [])
header_output += FpRegRegOpDeclare.subst(vcvtFpSIntDIop);
decoder_output += FpRegRegOpConstructor.subst(vcvtFpSIntDIop);
exec_output += PredOpExecute.subst(vcvtFpSIntDIop);
vcvtFpSFpDCode = vfpEnabledCheckCode + ''' vcvtFpSFpDCode = vfpEnabledCheckCode + '''
FPSCR fpscr = (FPSCR) FpscrExc; FPSCR fpscr = (FPSCR) FpscrExc;