arch-arm: Add support for Armv8.2-DotProd NEON extension.
Add support for the Armv8.2-DotProd NEON extension. This provides the SDOT and UDOT SIMD Dot Product instructions. For more information, please refer to the Arm Architecture Reference Manual (https://developer.arm.com/documentation/ddi0487/latest/).

Change-Id: I4caa3b97a74c65f32421487c55c3e36427194e61
Reviewed-by: Richard Cooper <richard.cooper@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70736
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com>
Maintainer: Andreas Sandberg <andreas.sandberg@arm.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
committed by
Bobby Bruce
parent
fab3d8a1c1
commit
eb4f83b178
@@ -57,6 +57,7 @@ class ArmDefaultSERelease(ArmRelease):
|
||||
"FEAT_F64MM",
|
||||
"FEAT_SVE",
|
||||
"FEAT_I8MM",
|
||||
"FEAT_DOTPROD",
|
||||
# Armv8.3
|
||||
"FEAT_FCMA",
|
||||
"FEAT_JSCVT",
|
||||
|
||||
@@ -81,6 +81,7 @@ class ArmExtension(ScopedEnum):
|
||||
"FEAT_F32MM", # Optional in Armv8.2
|
||||
"FEAT_F64MM", # Optional in Armv8.2
|
||||
"FEAT_I8MM", # Optional in Armv8.2
|
||||
"FEAT_DOTPROD", # Optional in Armv8.2
|
||||
# Armv8.3
|
||||
"FEAT_FCMA",
|
||||
"FEAT_JSCVT",
|
||||
@@ -169,6 +170,7 @@ class ArmDefaultRelease(Armv8):
|
||||
"FEAT_F32MM",
|
||||
"FEAT_F64MM",
|
||||
"FEAT_I8MM",
|
||||
"FEAT_DOTPROD",
|
||||
# Armv8.3
|
||||
"FEAT_FCMA",
|
||||
"FEAT_JSCVT",
|
||||
@@ -205,6 +207,7 @@ class Armv82(Armv81):
|
||||
"FEAT_F32MM",
|
||||
"FEAT_F64MM",
|
||||
"FEAT_I8MM",
|
||||
"FEAT_DOTPROD",
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -510,6 +510,7 @@ namespace Aarch64
|
||||
decodeNeon3RegExtension(ExtMachInst machInst)
|
||||
{
|
||||
uint8_t q = bits(machInst, 30);
|
||||
uint8_t qu = bits(machInst, 30, 29);
|
||||
uint8_t size = bits(machInst, 23, 22);
|
||||
uint8_t opcode = bits(machInst, 15, 11);
|
||||
|
||||
@@ -532,6 +533,19 @@ namespace Aarch64
|
||||
else
|
||||
return decodeNeonSThreeHAndWReg<SqrdmlshDX>(
|
||||
size, machInst, vd, vn, vm);
|
||||
case 0x12:
|
||||
switch (qu) {
|
||||
case 0b00:
|
||||
return new SdotDX<int32_t>(machInst, vd, vn, vm);
|
||||
case 0b01:
|
||||
return new UdotDX<uint32_t>(machInst, vd, vn, vm);
|
||||
case 0b10:
|
||||
return new SdotQX<int32_t>(machInst, vd, vn, vm);
|
||||
case 0b11:
|
||||
return new UdotQX<uint32_t>(machInst, vd, vn, vm);
|
||||
default:
|
||||
return new Unknown64(machInst);
|
||||
}
|
||||
case 0x18:
|
||||
case 0x19:
|
||||
case 0x1a:
|
||||
@@ -1351,6 +1365,7 @@ namespace Aarch64
|
||||
{
|
||||
uint8_t q = bits(machInst, 30);
|
||||
uint8_t u = bits(machInst, 29);
|
||||
uint8_t qu = bits(machInst, 30, 29);
|
||||
uint8_t size = bits(machInst, 23, 22);
|
||||
uint8_t L = bits(machInst, 21);
|
||||
uint8_t M = bits(machInst, 20);
|
||||
@@ -1387,6 +1402,11 @@ namespace Aarch64
|
||||
}
|
||||
RegIndex vm_fp = (RegIndex) (uint8_t) (vmh << 4 | vm_bf);
|
||||
|
||||
// Index and 2nd register operand for FEAT_DOTPROD and
|
||||
// FEAT_I8MM instructions
|
||||
uint8_t index_dp = (H << 1) | L;
|
||||
RegIndex vm_dp = (RegIndex) (uint8_t) (M << 4 | vm_bf);
|
||||
|
||||
switch (opcode) {
|
||||
case 0x0:
|
||||
if (!u || (size == 0x0 || size == 0x3))
|
||||
@@ -1573,6 +1593,23 @@ namespace Aarch64
|
||||
case 0xf:
|
||||
return decodeNeonSThreeImmHAndWReg<SqrdmlshElemDX, SqrdmlshElemQX>(
|
||||
q, size, machInst, vd, vn, vm, index);
|
||||
case 0xe:
|
||||
switch (qu) {
|
||||
case 0b00:
|
||||
return new SdotElemDX<int32_t>(machInst,
|
||||
vd, vn, vm_dp, index_dp);
|
||||
case 0b01:
|
||||
return new UdotElemDX<uint32_t>(machInst,
|
||||
vd, vn, vm_dp, index_dp);
|
||||
case 0b10:
|
||||
return new SdotElemQX<int32_t>(machInst,
|
||||
vd, vn, vm_dp, index_dp);
|
||||
case 0b11:
|
||||
return new UdotElemQX<uint32_t>(machInst,
|
||||
vd, vn, vm_dp, index_dp);
|
||||
default:
|
||||
return new Unknown64(machInst);
|
||||
}
|
||||
default:
|
||||
return new Unknown64(machInst);
|
||||
}
|
||||
|
||||
@@ -1082,6 +1082,71 @@ let {{
|
||||
complex=True)
|
||||
threeEqualRegInstX("fcmla", "FcmlaQX", "SimdFloatMultAccOp",
|
||||
floatTypes, 4, fcmla_vec, True, complex=True)
|
||||
|
||||
# Define one NEON integer dot-product instruction (used for the SDOT/UDOT
# variants instantiated after this helper).
#   name, Name, opClass: forwarded unchanged to threeEqualRegInstX.
#   destIsSigned: selects int32_t (True) or uint32_t (False) as the
#       destination element type.
#   src1IsSigned / src2IsSigned: select int8_t (True) or uint8_t (False)
#       as the type of each source operand's packed sub-elements.
#   rCount: forwarded to threeEqualRegInstX; the call sites in this file
#       pass 2 for the *DX classes and 4 for the *QX classes.
#   byElem: when True the generated code reads srcReg2.elements[imm] for
#       every output element; otherwise it reads srcReg2.elements[i].
def intDotInst(name, Name, opClass,
|
||||
destIsSigned, src1IsSigned, src2IsSigned,
|
||||
rCount, byElem):
|
||||
destType = "int32_t" if destIsSigned else "uint32_t"
|
||||
src1Type = "int8_t" if src1IsSigned else "uint8_t"
|
||||
src2Type = "int8_t" if src2IsSigned else "uint8_t"
|
||||
# C++ snippet used as the instruction body. For each of the eCount
# destination elements it views one packed element from each source as
# four 8-bit values, casts them to the extended element types, and adds
# the four products to the value already held in the destination element.
dotCode = '''
|
||||
using Src1Element = %(src1Type)s;
|
||||
using Src2Element = %(src2Type)s;
|
||||
|
||||
// Neon dot instructions always generate one output element
|
||||
// from 4 pairs of source elements.
|
||||
static_assert(sizeof(Element) == 4 * sizeof(Src1Element));
|
||||
static_assert(sizeof(Element) == 4 * sizeof(Src2Element));
|
||||
|
||||
// Extended source element types to avoid overflow of intermediate
|
||||
// calculations.
|
||||
using ExtendedSrc1Element =
|
||||
typename vector_element_traits::
|
||||
extend_element<Element, Src1Element>::type;
|
||||
using ExtendedSrc2Element =
|
||||
typename vector_element_traits::
|
||||
extend_element<Element, Src2Element>::type;
|
||||
|
||||
for (unsigned i = 0; i < eCount; ++i) {
|
||||
Element src1ElemsPacked = letoh(srcReg1.elements[i]);
|
||||
Element src2ElemsPacked = letoh(srcReg2.elements[%(src2Index)s]);
|
||||
|
||||
Src1Element *src1Elems =
|
||||
reinterpret_cast<Src1Element*>(&src1ElemsPacked);
|
||||
Src2Element *src2Elems =
|
||||
reinterpret_cast<Src2Element*>(&src2ElemsPacked);
|
||||
|
||||
// Dot instructions accumulate into the dest reg
|
||||
Element destElem = letoh(destReg.elements[i]);
|
||||
|
||||
for (unsigned j = 0; j < 4; ++j) {
|
||||
ExtendedSrc1Element src1Elem =
|
||||
static_cast<ExtendedSrc1Element>(src1Elems[j]);
|
||||
ExtendedSrc2Element src2Elem =
|
||||
static_cast<ExtendedSrc2Element>(src2Elems[j]);
|
||||
destElem += src1Elem * src2Elem;
|
||||
}
|
||||
destReg.elements[i] = htole(destElem);
|
||||
}
|
||||
''' % dict(src1Type=src1Type, src2Type=src2Type,
|
||||
src2Index="imm" if byElem else "i")
|
||||
# readDest=True is passed because the snippet above reads destReg before
# writing it (the dot product is accumulated into the destination).
threeEqualRegInstX(name, Name, opClass, (destType,), rCount,
|
||||
dotCode, readDest=True, byElem=byElem,
|
||||
complex=True)
|
||||
|
||||
# SDOT (vector)
|
||||
intDotInst('sdot', 'SdotDX', 'SimdAluOp', True, True, True, 2, False)
|
||||
intDotInst('sdot', 'SdotQX', 'SimdAluOp', True, True, True, 4, False)
|
||||
# SDOT (element)
|
||||
intDotInst('sdot', 'SdotElemDX', 'SimdAluOp', True, True, True, 2, True)
|
||||
intDotInst('sdot', 'SdotElemQX', 'SimdAluOp', True, True, True, 4, True)
|
||||
# UDOT (vector)
|
||||
intDotInst('udot', 'UdotDX', 'SimdAluOp', False, False, False, 2, False)
|
||||
intDotInst('udot', 'UdotQX', 'SimdAluOp', False, False, False, 4, False)
|
||||
# UDOT (element)
|
||||
intDotInst('udot', 'UdotElemDX', 'SimdAluOp', False, False, False, 2, True)
|
||||
intDotInst('udot', 'UdotElemQX', 'SimdAluOp', False, False, False, 4, True)
|
||||
|
||||
# CLS
|
||||
clsCode = '''
|
||||
unsigned count = 0;
|
||||
|
||||
@@ -3988,6 +3988,7 @@ ISA::initializeMiscRegMetadata()
|
||||
isar0_el1.sha1 = 0;
|
||||
isar0_el1.aes = 0;
|
||||
}
|
||||
isar0_el1.dp = release->has(ArmExtension::FEAT_DOTPROD) ? 0x1 : 0x0;
|
||||
isar0_el1.atomic = release->has(ArmExtension::FEAT_LSE) ? 0x2 : 0x0;
|
||||
isar0_el1.rdm = release->has(ArmExtension::FEAT_RDM) ? 0x1 : 0x0;
|
||||
isar0_el1.tme = release->has(ArmExtension::TME) ? 0x1 : 0x0;
|
||||
|
||||
Reference in New Issue
Block a user