arch-arm: Add recursive reduce in Neon instruction. (#1616)

The FMAXV, FMINV, FMAXNMV, FMINNMV and ADDV instructions perform a recursive
reduction. Different reduction orders lead to different results when
handling NaN values.

Reuse the `twoRegAcrossInstX` template, adding one more option,
`recursive`, which selects recursive reduction.

Change-Id: I69e690ce7668baee818542d3ea463f7a5f269a69
Reviewed-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
This commit is contained in:
Giacomo Travaglini
2024-10-02 12:41:53 +02:00
committed by GitHub

View File

@@ -1,6 +1,6 @@
// -*- mode: c++ -*-
// Copyright (c) 2012-2013, 2015-2018, 2020 ARM Limited
// Copyright (c) 2012-2013, 2015-2018, 2020, 2024 ARM Limited
// All rights reserved
//
// The license below extends only to copyright in the software and shall
@@ -577,7 +577,7 @@ let {{
exec_output += NeonXExecDeclare.subst(substDict)
def twoRegAcrossInstX(name, Name, opClass, types, rCount, op,
doubleDest=False, long=False):
doubleDest=False, long=False, recursive=False):
global header_output, exec_output
destPrefix = "Big" if long else ""
eWalkCode = simd64EnabledCheckCode + '''
@@ -588,7 +588,25 @@ let {{
eWalkCode += '''
srcReg1.regs[%(reg)d] = htole(AA64FpOp1P%(reg)d_uw);
''' % { "reg" : reg }
eWalkCode += '''
if recursive:
eWalkCode += '''
RegVect tmpReg = srcReg1;
destReg.regs[0] = 0;
for (unsigned gap = 1; gap < eCount; gap = gap * 2) {
for (unsigned i = 0; i < eCount; i = i + gap * 2) {
unsigned src_id0 = i;
unsigned src_id1 = i + gap;
unsigned dst_id = i;
%(destPrefix)sElement destElem = letoh(tmpReg.elements[src_id0]);
%(destPrefix)sElement srcElem1 = letoh(tmpReg.elements[src_id1]);
%(op)s
tmpReg.elements[dst_id] = destElem;
}
}
destReg.elements[0] = htole(tmpReg.elements[0]);
''' % { "op" : op, "destPrefix" : destPrefix }
else:
eWalkCode += '''
destReg.regs[0] = 0;
%(destPrefix)sElement destElem = 0;
for (unsigned i = 0; i < eCount; i++) {
@@ -934,9 +952,9 @@ let {{
# Note: SimdAddOp can be a bit optimistic here
addAcrossCode = "destElem += srcElem1;"
twoRegAcrossInstX("addv", "AddvDX", "SimdAddOp", ("uint8_t", "uint16_t"),
2, addAcrossCode)
2, addAcrossCode, False, False, True)
twoRegAcrossInstX("addv", "AddvQX", "SimdAddOp", smallUnsignedTypes, 4,
addAcrossCode)
addAcrossCode, False, False, True)
# AND
andCode = "destElem = srcElem1 & srcElem2;"
threeEqualRegInstX("and", "AndDX", "SimdAluOp", ("uint64_t",), 2, andCode)
@@ -1649,7 +1667,7 @@ let {{
fpAcrossOp = fpOp % "fplib%s<Element>(destElem, srcElem1, fpscr)"
fmaxnmAcrossCode = fpAcrossOp % "MaxNum"
twoRegAcrossInstX("fmaxnmv", "FmaxnmvQX", "SimdFloatCmpOp", ("uint32_t",),
4, fmaxnmAcrossCode)
4, fmaxnmAcrossCode, False, False, True)
# FMAXP (scalar)
twoRegPairwiseScInstX("fmaxp", "FmaxpScDX", "SimdFloatCmpOp",
("uint32_t",), 2, fmaxCode)
@@ -1664,7 +1682,7 @@ let {{
# Note: SimdFloatCmpOp can be a bit optimistic here
fmaxAcrossCode = fpAcrossOp % "Max"
twoRegAcrossInstX("fmaxv", "FmaxvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
fmaxAcrossCode)
fmaxAcrossCode, False, False, True)
# FMIN
fminCode = fpBinOp % "Min"
threeEqualRegInstX("fmin", "FminDX", "SimdFloatCmpOp", smallFloatTypes, 2,
@@ -1691,7 +1709,7 @@ let {{
# Note: SimdFloatCmpOp can be a bit optimistic here
fminnmAcrossCode = fpAcrossOp % "MinNum"
twoRegAcrossInstX("fminnmv", "FminnmvQX", "SimdFloatCmpOp", ("uint32_t",),
4, fminnmAcrossCode)
4, fminnmAcrossCode, False, False, True)
# FMINP (scalar)
twoRegPairwiseScInstX("fminp", "FminpScDX", "SimdFloatCmpOp",
("uint32_t",), 2, fminCode)
@@ -1706,7 +1724,7 @@ let {{
# Note: SimdFloatCmpOp can be a bit optimistic here
fminAcrossCode = fpAcrossOp % "Min"
twoRegAcrossInstX("fminv", "FminvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
fminAcrossCode)
fminAcrossCode, False, False, True)
# FMLA (by element)
fmlaCode = fpOp % ("fplibMulAdd<Element>("
"destElem, srcElem1, srcElem2, fpscr)")