arch-arm: Add recursive reduce in Neon instruction.

FMAXV, FMINV, FMAXNMV, FMINNMV and ADDV instructions perform recursive
reduction. Different reduction methods lie to different result when
handle NaN values.

Reuse the template of `twoRegAcrossInstX`. Add one more option
`recursive` for recursive reduction.

Change-Id: I69e690ce7668baee818542d3ea463f7a5f269a69
Reviewed-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
This commit is contained in:
Junshi Wang
2024-09-06 13:32:45 +08:00
committed by Giacomo Travaglini
parent e17875b7c7
commit a25d9a126f

View File

@@ -1,6 +1,6 @@
// -*- mode: c++ -*-
// Copyright (c) 2012-2013, 2015-2018, 2020 ARM Limited
// Copyright (c) 2012-2013, 2015-2018, 2020, 2024 ARM Limited
// All rights reserved
//
// The license below extends only to copyright in the software and shall
@@ -577,7 +577,7 @@ let {{
exec_output += NeonXExecDeclare.subst(substDict)
def twoRegAcrossInstX(name, Name, opClass, types, rCount, op,
doubleDest=False, long=False):
doubleDest=False, long=False, recursive=False):
global header_output, exec_output
destPrefix = "Big" if long else ""
eWalkCode = simd64EnabledCheckCode + '''
@@ -588,7 +588,25 @@ let {{
eWalkCode += '''
srcReg1.regs[%(reg)d] = htole(AA64FpOp1P%(reg)d_uw);
''' % { "reg" : reg }
eWalkCode += '''
if recursive:
eWalkCode += '''
RegVect tmpReg = srcReg1;
destReg.regs[0] = 0;
for (unsigned gap = 1; gap < eCount; gap = gap * 2) {
for (unsigned i = 0; i < eCount; i = i + gap * 2) {
unsigned src_id0 = i;
unsigned src_id1 = i + gap;
unsigned dst_id = i;
%(destPrefix)sElement destElem = letoh(tmpReg.elements[src_id0]);
%(destPrefix)sElement srcElem1 = letoh(tmpReg.elements[src_id1]);
%(op)s
tmpReg.elements[dst_id] = destElem;
}
}
destReg.elements[0] = htole(tmpReg.elements[0]);
''' % { "op" : op, "destPrefix" : destPrefix }
else:
eWalkCode += '''
destReg.regs[0] = 0;
%(destPrefix)sElement destElem = 0;
for (unsigned i = 0; i < eCount; i++) {
@@ -934,9 +952,9 @@ let {{
# Note: SimdAddOp can be a bit optimistic here
addAcrossCode = "destElem += srcElem1;"
twoRegAcrossInstX("addv", "AddvDX", "SimdAddOp", ("uint8_t", "uint16_t"),
2, addAcrossCode)
2, addAcrossCode, False, False, True)
twoRegAcrossInstX("addv", "AddvQX", "SimdAddOp", smallUnsignedTypes, 4,
addAcrossCode)
addAcrossCode, False, False, True)
# AND
andCode = "destElem = srcElem1 & srcElem2;"
threeEqualRegInstX("and", "AndDX", "SimdAluOp", ("uint64_t",), 2, andCode)
@@ -1649,7 +1667,7 @@ let {{
fpAcrossOp = fpOp % "fplib%s<Element>(destElem, srcElem1, fpscr)"
fmaxnmAcrossCode = fpAcrossOp % "MaxNum"
twoRegAcrossInstX("fmaxnmv", "FmaxnmvQX", "SimdFloatCmpOp", ("uint32_t",),
4, fmaxnmAcrossCode)
4, fmaxnmAcrossCode, False, False, True)
# FMAXP (scalar)
twoRegPairwiseScInstX("fmaxp", "FmaxpScDX", "SimdFloatCmpOp",
("uint32_t",), 2, fmaxCode)
@@ -1664,7 +1682,7 @@ let {{
# Note: SimdFloatCmpOp can be a bit optimistic here
fmaxAcrossCode = fpAcrossOp % "Max"
twoRegAcrossInstX("fmaxv", "FmaxvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
fmaxAcrossCode)
fmaxAcrossCode, False, False, True)
# FMIN
fminCode = fpBinOp % "Min"
threeEqualRegInstX("fmin", "FminDX", "SimdFloatCmpOp", smallFloatTypes, 2,
@@ -1691,7 +1709,7 @@ let {{
# Note: SimdFloatCmpOp can be a bit optimistic here
fminnmAcrossCode = fpAcrossOp % "MinNum"
twoRegAcrossInstX("fminnmv", "FminnmvQX", "SimdFloatCmpOp", ("uint32_t",),
4, fminnmAcrossCode)
4, fminnmAcrossCode, False, False, True)
# FMINP (scalar)
twoRegPairwiseScInstX("fminp", "FminpScDX", "SimdFloatCmpOp",
("uint32_t",), 2, fminCode)
@@ -1706,7 +1724,7 @@ let {{
# Note: SimdFloatCmpOp can be a bit optimistic here
fminAcrossCode = fpAcrossOp % "Min"
twoRegAcrossInstX("fminv", "FminvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
fminAcrossCode)
fminAcrossCode, False, False, True)
# FMLA (by element)
fmlaCode = fpOp % ("fplibMulAdd<Element>("
"destElem, srcElem1, srcElem2, fpscr)")