arch-arm: ARMv8.3 CompNum, SIMD complex number support
This patch implements the CompNum SIMD instructions for Armv8.3. These instructions are Fcadd, Fcmla (vector and element) and Vcadd, Vcmla (vector and element). + isa/decoder/thumb.isa: Decoding changes for SIMD instructions in T32 + isa/formats/fp.isa: Decoding changes for SIMD instructions in A32 + isa/formats/uncond.isa: Decoding changes for SIMD instructions in A32 + isa/formats/aarch64.isa: Decoding changes for SIMD instructions in A64 + isa/formats/neon64.isa: Decoding changes for SIMD instructions in A64 + isa/insts/neon.isa: Vcadd, Vcmla instruction implementation + isa/insts/neon64.isa: Fcadd, Fcmla instruction implementation + isa/templates/neon.isa: Modify templates to add byElement support Change-Id: I7f11ce88137dad077d2cad698dcaa9a79a3f317b Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/27183 Tested-by: Gem5 Cloud Project GCB service account <345032938727@cloudbuild.gserviceaccount.com> Reviewed-by: Giacomo Travaglini <giacomo.travaglini@arm.com> Maintainer: Giacomo Travaglini <giacomo.travaglini@arm.com>
This commit is contained in:
@@ -79,7 +79,7 @@ class ArmISA(BaseISA):
|
||||
id_isar2 = Param.UInt32(0x21232141, "Instruction Set Attribute Register 2")
|
||||
id_isar3 = Param.UInt32(0x01112131, "Instruction Set Attribute Register 3")
|
||||
id_isar4 = Param.UInt32(0x10010142, "Instruction Set Attribute Register 4")
|
||||
id_isar5 = Param.UInt32(0x00000000, "Instruction Set Attribute Register 5")
|
||||
id_isar5 = Param.UInt32(0x10000000, "Instruction Set Attribute Register 5")
|
||||
|
||||
fpsid = Param.UInt32(0x410430a0, "Floating-point System ID Register")
|
||||
|
||||
@@ -101,8 +101,8 @@ class ArmISA(BaseISA):
|
||||
id_aa64isar0_el1 = Param.UInt64(0x0000000000000000,
|
||||
"AArch64 Instruction Set Attribute Register 0")
|
||||
|
||||
# GPI = 0x0 | GPA = 0x1| API=0x0 | APA=0x1
|
||||
id_aa64isar1_el1 = Param.UInt64(0x0000000001000010,
|
||||
# GPI = 0x0 | GPA = 0x1| API=0x0 | APA=0x1 | FCMA
|
||||
id_aa64isar1_el1 = Param.UInt64(0x0000000001010010,
|
||||
"AArch64 Instruction Set Attribute Register 1")
|
||||
|
||||
# 4K | 64K | !16K | !BigEndEL0 | !SNSMem | !BigEnd | 8b ASID | 40b PA
|
||||
|
||||
@@ -138,9 +138,11 @@ decode BIGTHUMB {
|
||||
0x3: Thumb32LongMulMulAccAndDiv::thumb32LongMulMulAccAndDiv();
|
||||
default: Thumb32DataProcReg::thumb32DataProcReg();
|
||||
}
|
||||
0x2: Thumb32NeonSIMD::thumb32NeonSIMD();
|
||||
default: decode HTOPCODE_9_8 {
|
||||
0x2: decode LTOPCODE_4 {
|
||||
0x0: decode LTCOPROC {
|
||||
0x8: Thumb32NeonSIMD::thumb32NeonSIMD();
|
||||
0xa, 0xb: VfpData::vfpData();
|
||||
default: WarnUnimpl::cdp(); // cdp2
|
||||
}
|
||||
|
||||
@@ -2313,10 +2313,8 @@ namespace Aarch64
|
||||
} else {
|
||||
return new Unknown64(machInst);
|
||||
}
|
||||
} else if (bits(machInst, 24) ||
|
||||
bits(machInst, 21) ||
|
||||
bits(machInst, 15)) {
|
||||
return new Unknown64(machInst);
|
||||
} else if (bits(machInst, 15) == 1) {
|
||||
return decodeNeon3SameExtra<DecoderFeatures>(machInst);
|
||||
} else if (bits(machInst, 10) == 1) {
|
||||
if (bits(machInst, 23, 22))
|
||||
return new Unknown64(machInst);
|
||||
|
||||
@@ -96,6 +96,9 @@ let {{
|
||||
|
||||
StaticInstPtr
|
||||
decodeNeonData(ExtMachInst machInst);
|
||||
|
||||
StaticInstPtr
|
||||
decodeAdvancedSIMD(ExtMachInst machInst);
|
||||
'''
|
||||
|
||||
decoder_output = '''
|
||||
@@ -333,6 +336,84 @@ let {{
|
||||
return new Unknown(machInst);
|
||||
}
|
||||
'''
|
||||
decoder_output += '''
|
||||
/**
 * Decode the AArch32 VCADD and VCMLA (vector and by-element) encodings.
 *
 * Bits 25 and 21 of the encoding form a two-bit selector:
 *   0 -> VCADD, 1 -> VCMLA (vector), 2 and 3 -> VCMLA (by element).
 * Bit 6 picks the Q or D flavour of the instruction class, and a size
 * bit (bit 20 for the vector forms, bit 23 for the by-element form)
 * picks 32-bit elements when set and 16-bit elements when clear.
 *
 * @param machInst Encoded instruction to decode.
 * @return Newly allocated static instruction for the decoded operation.
 */
StaticInstPtr
decodeAdvancedSIMD(ExtMachInst machInst)
{
    const uint8_t op_code = (bits(machInst, 25) << 1) | bits(machInst, 21);

    // Each register number is a 4-bit field extended with one extra high
    // bit, and the result is doubled before being used as a register index.
    const IntRegIndex vd = (IntRegIndex)(2 * (bits(machInst, 15, 12) |
                                              (bits(machInst, 22) << 4)));
    const IntRegIndex vn = (IntRegIndex)(2 * (bits(machInst, 19, 16) |
                                              (bits(machInst, 7) << 4)));
    IntRegIndex vm = (IntRegIndex)(2 * (bits(machInst, 3, 0) |
                                        (bits(machInst, 5) << 4)));
    const bool q = bits(machInst, 6);

    switch (op_code) {
      case 0x0:
        // VCADD
        if (bits(machInst, 20)) {
            if (q)
                return new VcaddQ<uint32_t>(machInst, vd, vn, vm);
            return new VcaddD<uint32_t>(machInst, vd, vn, vm);
        }
        if (q)
            return new VcaddQ<uint16_t>(machInst, vd, vn, vm);
        return new VcaddD<uint16_t>(machInst, vd, vn, vm);

      case 0x1:
        // VCMLA
        if (bits(machInst, 20)) {
            if (q)
                return new VcmlaQ<uint32_t>(machInst, vd, vn, vm);
            return new VcmlaD<uint32_t>(machInst, vd, vn, vm);
        }
        if (q)
            return new VcmlaQ<uint16_t>(machInst, vd, vn, vm);
        return new VcmlaD<uint16_t>(machInst, vd, vn, vm);

      case 0x2:
      case 0x3:
        // VCMLA by element
        if (bits(machInst, 23)) {
            // 32-bit elements: the element index is fixed at 0 and the
            // second source keeps its extended register number.
            const uint8_t index_fp = 0;
            if (q)
                return new VcmlaElemQ<uint32_t>(machInst, vd, vn, vm,
                                                index_fp);
            return new VcmlaElemD<uint32_t>(machInst, vd, vn, vm,
                                            index_fp);
        } else {
            // 16-bit elements: bit 5 is used as the element index, so the
            // second source register number is bits 3:0 only.
            vm = (IntRegIndex)(uint8_t)(2 * bits(machInst, 3, 0));
            const uint8_t index_fp = bits(machInst, 5);
            if (q)
                return new VcmlaElemQ<uint16_t>(machInst, vd, vn, vm,
                                                index_fp);
            return new VcmlaElemD<uint16_t>(machInst, vd, vn, vm,
                                            index_fp);
        }

      default:
        // Not reachable for a two-bit selector; kept so that every path
        // returns. Uses the AArch32 Unknown class, matching the
        // neighbouring AArch32 decoder, instead of the AArch64 Unknown64.
        return new Unknown(machInst);
    }
}
|
||||
'''
|
||||
|
||||
|
||||
decoder_output += '''
|
||||
static StaticInstPtr
|
||||
@@ -1869,6 +1950,12 @@ def format ThumbNeonData() {{
|
||||
'''
|
||||
}};
|
||||
|
||||
// Format whose decode block forwards the instruction to
// decodeAdvancedSIMD().
def format Thumb32NeonSIMD() {{
|
||||
decode_block = '''
|
||||
return decodeAdvancedSIMD(machInst);
|
||||
'''
|
||||
}};
|
||||
|
||||
let {{
|
||||
header_output = '''
|
||||
bool
|
||||
|
||||
@@ -39,6 +39,9 @@ namespace Aarch64
|
||||
// AdvSIMD three same
|
||||
template <typename DecoderFeatures>
|
||||
StaticInstPtr decodeNeon3Same(ExtMachInst machInst);
|
||||
// AdvSIMD three same Extra
|
||||
template <typename DecoderFeatures>
|
||||
StaticInstPtr decodeNeon3SameExtra(ExtMachInst machInst);
|
||||
// AdvSIMD three different
|
||||
inline StaticInstPtr decodeNeon3Diff(ExtMachInst machInst);
|
||||
// AdvSIMD two-reg misc
|
||||
@@ -500,6 +503,48 @@ namespace Aarch64
|
||||
}
|
||||
}
|
||||
|
||||
template <typename DecoderFeatures>
|
||||
StaticInstPtr
|
||||
decodeNeon3SameExtra(ExtMachInst machInst)
|
||||
{
|
||||
uint8_t q = bits(machInst, 30);
|
||||
uint8_t size = bits(machInst, 23, 22);
|
||||
uint8_t opcode = bits(machInst, 15, 11);
|
||||
|
||||
IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
|
||||
IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
|
||||
IntRegIndex vm = (IntRegIndex) (uint8_t) bits(machInst, 20, 16);
|
||||
|
||||
switch (opcode) {
|
||||
case 0x18:
|
||||
case 0x19:
|
||||
case 0x1a:
|
||||
case 0x1b:
|
||||
if (size == 0x1) {
|
||||
if (q)
|
||||
return new FcmlaQX<uint16_t>(machInst, vd, vn, vm);
|
||||
else
|
||||
return new FcmlaDX<uint16_t>(machInst, vd, vn, vm);
|
||||
} else
|
||||
return decodeNeonUThreeFpReg<FcmlaDX, FcmlaQX>(
|
||||
q, size & 0x1, machInst, vd, vn, vm);
|
||||
|
||||
case 0x1c:
|
||||
case 0x1e:
|
||||
if (size == 0x1) {
|
||||
if (q)
|
||||
return new FcaddQX<uint16_t>(machInst, vd, vn, vm);
|
||||
else
|
||||
return new FcaddDX<uint16_t>(machInst, vd, vn, vm);
|
||||
} else
|
||||
return decodeNeonUThreeFpReg<FcaddDX, FcaddQX>(
|
||||
q, size & 0x1, machInst, vd, vn, vm);
|
||||
|
||||
default:
|
||||
return new Unknown64(machInst);
|
||||
}
|
||||
}
|
||||
|
||||
StaticInstPtr
|
||||
decodeNeon3Diff(ExtMachInst machInst)
|
||||
{
|
||||
@@ -1324,7 +1369,27 @@ namespace Aarch64
|
||||
if (!u && size >= 2 && sz_q != 0x2 && sz_L != 0x3)
|
||||
return decodeNeonUThreeImmFpReg<FmlaElemDX, FmlaElemQX>(
|
||||
q, sz, machInst, vd, vn, vm_fp, index_fp);
|
||||
else
|
||||
else if (u && (size == 1 || size == 2)){
|
||||
// FCMLA by element
|
||||
if (size == 0x2) {
|
||||
index_fp = H;
|
||||
if (q)
|
||||
return new FcmlaElemQX<uint32_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
else
|
||||
return new FcmlaElemDX<uint32_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
} else {
|
||||
index_fp = (H << 1) | L;
|
||||
if (q)
|
||||
return new FcmlaElemQX<uint16_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
else
|
||||
return new FcmlaElemDX<uint16_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
}
|
||||
|
||||
} else
|
||||
return new Unknown64(machInst);
|
||||
case 0x2:
|
||||
if (size == 0x0 || size == 0x3)
|
||||
@@ -1336,7 +1401,26 @@ namespace Aarch64
|
||||
return decodeNeonSThreeImmHAndWReg<SmlalElemX, SmlalElem2X>(
|
||||
q, size, machInst, vd, vn, vm, index);
|
||||
case 0x3:
|
||||
if (u || (size == 0x0 || size == 0x3))
|
||||
if (u && (size == 1 || size == 2)){
|
||||
// FCMLA by element
|
||||
if (size == 0x2) {
|
||||
index_fp = H;
|
||||
if (q)
|
||||
return new FcmlaElemQX<uint32_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
else
|
||||
return new FcmlaElemDX<uint32_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
} else {
|
||||
index_fp = (H << 1) | L;
|
||||
if (q)
|
||||
return new FcmlaElemQX<uint16_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
else
|
||||
return new FcmlaElemDX<uint16_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
}
|
||||
} else if (u || (size == 0x0 || size == 0x3))
|
||||
return new Unknown64(machInst);
|
||||
else
|
||||
return decodeNeonSThreeImmHAndWReg<SqdmlalElemX,
|
||||
@@ -1352,7 +1436,26 @@ namespace Aarch64
|
||||
if (!u && size >= 0x2 && sz_L != 0x3 && sz_q != 0x2)
|
||||
return decodeNeonUThreeImmFpReg<FmlsElemDX, FmlsElemQX>(
|
||||
q, sz, machInst, vd, vn, vm_fp, index_fp);
|
||||
else
|
||||
else if (u && (size == 1 || size == 2)){
|
||||
// FCMLA by element
|
||||
if (size == 0x2) {
|
||||
index_fp = H;
|
||||
if (q)
|
||||
return new FcmlaElemQX<uint32_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
else
|
||||
return new FcmlaElemDX<uint32_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
} else {
|
||||
index_fp = (H << 1) | L;
|
||||
if (q)
|
||||
return new FcmlaElemQX<uint16_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
else
|
||||
return new FcmlaElemDX<uint16_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
}
|
||||
} else
|
||||
return new Unknown64(machInst);
|
||||
case 0x6:
|
||||
if (size == 0x0 || size == 0x3)
|
||||
@@ -1364,7 +1467,26 @@ namespace Aarch64
|
||||
return decodeNeonSThreeImmHAndWReg<SmlslElemX, SmlslElem2X>(
|
||||
q, size, machInst, vd, vn, vm, index);
|
||||
case 0x7:
|
||||
if (u || (size == 0x0 || size == 0x3))
|
||||
if (u && (size == 1 || size == 2)){
|
||||
// FCMLA by element
|
||||
if (size == 0x2) {
|
||||
index_fp = H;
|
||||
if (q)
|
||||
return new FcmlaElemQX<uint32_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
else
|
||||
return new FcmlaElemDX<uint32_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
} else {
|
||||
index_fp = (H << 1) | L;
|
||||
if (q)
|
||||
return new FcmlaElemQX<uint16_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
else
|
||||
return new FcmlaElemDX<uint16_t>(machInst, vd, vn,
|
||||
vm_fp, index_fp);
|
||||
}
|
||||
} else if (u || (size == 0x0 || size == 0x3))
|
||||
return new Unknown64(machInst);
|
||||
else
|
||||
return decodeNeonSThreeImmHAndWReg<SqdmlslElemX,
|
||||
|
||||
@@ -237,7 +237,9 @@ def format ArmUnconditional() {{
|
||||
return new BlxImm(machInst, imm, COND_UC);
|
||||
}
|
||||
case 0x2:
|
||||
if (bits(op1, 4, 0) != 0) {
|
||||
if (bits(machInst, 31, 25) == 0x7e){
|
||||
return decodeAdvancedSIMD(machInst);
|
||||
} else if (bits(op1, 4, 0) != 0) {
|
||||
if (CPNUM == 0xa || CPNUM == 0xb) {
|
||||
return decodeExtensionRegLoadStore(machInst);
|
||||
}
|
||||
@@ -262,7 +264,9 @@ def format ArmUnconditional() {{
|
||||
}
|
||||
break;
|
||||
case 0x3:
|
||||
if (bits(op1, 4) == 0) {
|
||||
if (bits(machInst, 31, 24) == 0xfe) {
|
||||
return decodeAdvancedSIMD(machInst);
|
||||
} else if (bits(op1, 4) == 0) {
|
||||
if (CPNUM == 0xa || CPNUM == 0xb) {
|
||||
return decodeShortFpTransfer(machInst);
|
||||
} else if (CPNUM == 0xe) {
|
||||
|
||||
@@ -1146,12 +1146,21 @@ let {{
|
||||
allTypes = unsignedTypes + signedTypes
|
||||
|
||||
def threeEqualRegInst(name, Name, opClass, types, rCount, op,
|
||||
readDest=False, pairwise=False,
|
||||
standardFpcsr=False):
|
||||
readDest=False, pairwise=False, byElem=False,
|
||||
standardFpcsr=False, complex=False):
|
||||
global header_output, exec_output
|
||||
eWalkCode = simdEnabledCheckCode + '''
|
||||
RegVect srcReg1, srcReg2, destReg;
|
||||
'''
|
||||
RegVect srcReg1, destReg;
|
||||
'''
|
||||
if byElem:
|
||||
# 2nd register operand has to be read fully
|
||||
eWalkCode += '''
|
||||
FullRegVect srcReg2;
|
||||
'''
|
||||
else:
|
||||
eWalkCode += '''
|
||||
RegVect srcReg2;
|
||||
'''
|
||||
for reg in range(rCount):
|
||||
eWalkCode += '''
|
||||
srcReg1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
|
||||
@@ -1161,6 +1170,13 @@ let {{
|
||||
eWalkCode += '''
|
||||
destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
|
||||
''' % { "reg" : reg }
|
||||
if byElem:
|
||||
# 2nd operand has to be read fully
|
||||
for reg in range(rCount, 4):
|
||||
eWalkCode += '''
|
||||
srcReg2.regs[%(reg)d] = htole(FpOp2P%(reg)d_uw);
|
||||
''' % { "reg" : reg }
|
||||
|
||||
readDestCode = ''
|
||||
if standardFpcsr:
|
||||
eWalkCode += '''
|
||||
@@ -1168,7 +1184,10 @@ let {{
|
||||
'''
|
||||
if readDest:
|
||||
readDestCode = 'destElem = letoh(destReg.elements[i]);'
|
||||
if pairwise:
|
||||
|
||||
if complex:
|
||||
eWalkCode += op
|
||||
elif pairwise:
|
||||
eWalkCode += '''
|
||||
for (unsigned i = 0; i < eCount; i++) {
|
||||
Element srcElem1 = letoh(2 * i < eCount ?
|
||||
@@ -1203,12 +1222,15 @@ let {{
|
||||
FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
|
||||
''' % { "reg" : reg }
|
||||
iop = InstObjParams(name, Name,
|
||||
"RegRegRegOp",
|
||||
{ "code": eWalkCode,
|
||||
"r_count": rCount,
|
||||
"predicate_test": predicateTest,
|
||||
"op_class": opClass }, [])
|
||||
header_output += NeonRegRegRegOpDeclare.subst(iop)
|
||||
"RegRegRegImmOp" if byElem else "RegRegRegOp",
|
||||
{ "code": eWalkCode,
|
||||
"r_count": rCount,
|
||||
"predicate_test": predicateTest,
|
||||
"op_class": opClass }, [])
|
||||
if byElem:
|
||||
header_output += NeonRegRegRegImmOpDeclare.subst(iop)
|
||||
else:
|
||||
header_output += NeonRegRegRegOpDeclare.subst(iop)
|
||||
exec_output += NeonEqualRegExecute.subst(iop)
|
||||
for type in types:
|
||||
substDict = { "targs" : type,
|
||||
@@ -2186,6 +2208,119 @@ let {{
|
||||
'''
|
||||
threeRegNarrowInst("vrsubhn", "Vrsubhn", "SimdAddOp", smallTypes, vrsubhnCode)
|
||||
|
||||
vcaddCode = '''
|
||||
bool rot = bits(machInst, 24);
|
||||
Element el1;
|
||||
Element el3;
|
||||
|
||||
for (int i = 0; i < eCount/2; ++i) {
|
||||
Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
|
||||
Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
|
||||
Element srcElem2_1 = letoh(srcReg2.elements[2*i]);
|
||||
Element srcElem2_2 = letoh(srcReg2.elements[2*i+1]);
|
||||
Element destElem_1;
|
||||
Element destElem_2;
|
||||
if (rot) {
|
||||
el1 = srcElem2_2;
|
||||
el3 = fplibNeg<Element>(srcElem2_1);
|
||||
} else {
|
||||
el1 = fplibNeg<Element>(srcElem2_2);
|
||||
el3 = srcElem2_1;
|
||||
}
|
||||
|
||||
destElem_1 = fplibAdd<Element>(srcElem1_1, el1, fpscr);
|
||||
destElem_2 = fplibAdd<Element>(srcElem1_2, el3, fpscr);
|
||||
destReg.elements[2*i] = htole(destElem_1);
|
||||
destReg.elements[2*i+1] = htole(destElem_2);
|
||||
}
|
||||
'''
|
||||
|
||||
# VCADD
|
||||
threeEqualRegInst("vcadd", "VcaddD", "SimdFloatAddOp",
|
||||
("uint16_t", "uint32_t"), 2, vcaddCode,
|
||||
standardFpcsr=True, complex=True)
|
||||
threeEqualRegInst("vcadd", "VcaddQ", "SimdFloatAddOp",
|
||||
("uint16_t", "uint32_t"), 4,
|
||||
vcaddCode, standardFpcsr=True, complex=True)
|
||||
|
||||
vcmlaCode = '''
|
||||
uint8_t rot = bits(machInst, %(rot)s);
|
||||
Element el1;
|
||||
Element el2;
|
||||
Element el3;
|
||||
Element el4;
|
||||
for (int i = 0; i < eCount/2; ++i) {
|
||||
|
||||
Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
|
||||
Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
|
||||
Element srcElem2_1 = letoh(srcReg2.elements[2*%(index)s]);
|
||||
Element srcElem2_2 = letoh(srcReg2.elements[2*%(index)s+1]);
|
||||
Element destElem_1 = letoh(destReg.elements[2*i]);
|
||||
Element destElem_2 = letoh(destReg.elements[2*i+1]);
|
||||
|
||||
switch (rot) {
|
||||
case 0x0:
|
||||
{
|
||||
el1 = srcElem2_1;
|
||||
el2 = srcElem1_1;
|
||||
el3 = srcElem2_2;
|
||||
el4 = srcElem1_1;
|
||||
break;
|
||||
}
|
||||
case 0x1:
|
||||
{
|
||||
el1 = fplibNeg<Element>(srcElem2_2);
|
||||
el2 = srcElem1_2;
|
||||
el3 = srcElem2_1;
|
||||
el4 = srcElem1_2;
|
||||
break;
|
||||
}
|
||||
case 0x2:
|
||||
{
|
||||
el1 = fplibNeg<Element>(srcElem2_1);
|
||||
el2 = srcElem1_1;
|
||||
el3 = fplibNeg<Element>(srcElem2_2);
|
||||
el4 = srcElem1_1;
|
||||
break;
|
||||
}
|
||||
case 0x3:
|
||||
{
|
||||
el1 = srcElem2_2;
|
||||
el2 = srcElem1_2;
|
||||
el3 = fplibNeg<Element>(srcElem2_1);
|
||||
el4 = srcElem1_2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
destElem_1 = fplibMulAdd<Element>(destElem_1, el2, el1, fpscr);
|
||||
destElem_2 = fplibMulAdd<Element>(destElem_2, el4, el3, fpscr);
|
||||
|
||||
destReg.elements[2*i] = htole(destElem_1);
|
||||
destReg.elements[2*i+1] = htole(destElem_2);
|
||||
}
|
||||
'''
|
||||
|
||||
# VCMLA (by element)
|
||||
vcmla_imm = vcmlaCode % {'rot': '21, 20', 'index': 'imm'}
|
||||
threeEqualRegInst("vcmla", "VcmlaElemD", "SimdFloatMultAccOp",
|
||||
("uint16_t", "uint32_t"), 2, vcmla_imm,
|
||||
readDest=True, byElem=True, standardFpcsr=True,
|
||||
complex=True)
|
||||
threeEqualRegInst("vcmla", "VcmlaElemQ", "SimdFloatMultAccOp",
|
||||
("uint16_t", "uint32_t"), 4, vcmla_imm,
|
||||
readDest=True, byElem=True, standardFpcsr=True,
|
||||
complex=True)
|
||||
|
||||
# VCMLA (vector)
|
||||
vcmla_vec = vcmlaCode % {'rot': '24, 23', 'index': 'i'}
|
||||
threeEqualRegInst("vcmla", "VcmlaD", "SimdFloatMultAccOp",
|
||||
("uint16_t", "uint32_t"), 2, vcmla_vec,
|
||||
readDest=True, standardFpcsr=True, complex=True)
|
||||
threeEqualRegInst("vcmla", "VcmlaQ", "SimdFloatMultAccOp",
|
||||
("uint16_t", "uint32_t"), 4, vcmla_vec,
|
||||
readDest=True, standardFpcsr=True, complex=True)
|
||||
|
||||
vqaddSCode = '''
|
||||
destElem = srcElem1 + srcElem2;
|
||||
FPSCR fpscr = (FPSCR) FpscrQc;
|
||||
|
||||
@@ -52,7 +52,7 @@ let {{
|
||||
|
||||
def threeEqualRegInstX(name, Name, opClass, types, rCount, op,
|
||||
readDest=False, pairwise=False, scalar=False,
|
||||
byElem=False, decoder='Generic'):
|
||||
byElem=False, decoder='Generic', complex=False):
|
||||
assert (not pairwise) or ((not byElem) and (not scalar))
|
||||
global header_output, exec_output, decoders
|
||||
eWalkCode = simd64EnabledCheckCode + '''
|
||||
@@ -85,7 +85,10 @@ let {{
|
||||
readDestCode = ''
|
||||
if readDest:
|
||||
readDestCode = 'destElem = letoh(destReg.elements[i]);'
|
||||
if pairwise:
|
||||
|
||||
if complex:
|
||||
eWalkCode += op
|
||||
elif pairwise:
|
||||
eWalkCode += '''
|
||||
for (unsigned i = 0; i < eCount; i++) {
|
||||
Element srcElem1 = letoh(2 * i < eCount ?
|
||||
@@ -975,6 +978,119 @@ let {{
|
||||
True)
|
||||
threeEqualRegInstX("bsl", "BslQX", "SimdAluOp", ("uint64_t",), 4, bslCode,
|
||||
True)
|
||||
|
||||
# FCADD
|
||||
fcaddCode = '''
|
||||
bool rot = bits(machInst, 12);
|
||||
Element el1;
|
||||
Element el3;
|
||||
for (int i = 0; i < eCount/2; ++i) {
|
||||
FPSCR fpscr = (FPSCR) FpscrExc;
|
||||
|
||||
Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
|
||||
Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
|
||||
Element srcElem2_1 = letoh(srcReg2.elements[2*i]);
|
||||
Element srcElem2_2 = letoh(srcReg2.elements[2*i+1]);
|
||||
Element destElem_1;
|
||||
Element destElem_2;
|
||||
if (rot) {
|
||||
el1 = srcElem2_2;
|
||||
el3 = fplibNeg<Element>(srcElem2_1);
|
||||
} else {
|
||||
el1 = fplibNeg<Element>(srcElem2_2);
|
||||
el3 = srcElem2_1;
|
||||
}
|
||||
|
||||
destElem_1 = fplibAdd<Element>(srcElem1_1, el1, fpscr);
|
||||
destElem_2 = fplibAdd<Element>(srcElem1_2, el3, fpscr);
|
||||
|
||||
FpscrExc = fpscr;
|
||||
|
||||
destReg.elements[2*i] = htole(destElem_1);
|
||||
destReg.elements[2*i+1] = htole(destElem_2);
|
||||
}
|
||||
'''
|
||||
|
||||
threeEqualRegInstX("fcadd", "FcaddDX", "SimdFloatAddOp",
|
||||
("uint16_t", "uint32_t"), 2,
|
||||
fcaddCode, complex=True)
|
||||
threeEqualRegInstX("fcadd", "FcaddQX", "SimdFloatAddOp", floatTypes, 4,
|
||||
fcaddCode, complex=True)
|
||||
|
||||
fcmlaCode = '''
|
||||
uint8_t rot = bits(machInst, %(rot)s);
|
||||
Element el1;
|
||||
Element el2;
|
||||
Element el3;
|
||||
Element el4;
|
||||
for (int i = 0; i < eCount/2; ++i) {
|
||||
FPSCR fpscr = (FPSCR) FpscrExc;
|
||||
|
||||
Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
|
||||
Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
|
||||
Element srcElem2_1 = letoh(srcReg2.elements[2* %(index)s]);
|
||||
Element srcElem2_2 = letoh(srcReg2.elements[2* %(index)s +1]);
|
||||
Element destElem_1 = letoh(destReg.elements[2*i]);
|
||||
Element destElem_2 = letoh(destReg.elements[2*i+1]);
|
||||
|
||||
switch (rot) {
|
||||
case 0x0:
|
||||
{
|
||||
el1 = srcElem2_1;
|
||||
el2 = srcElem1_1;
|
||||
el3 = srcElem2_2;
|
||||
el4 = srcElem1_1;
|
||||
break;
|
||||
}
|
||||
case 0x1:
|
||||
{
|
||||
el1 = fplibNeg<Element>(srcElem2_2);
|
||||
el2 = srcElem1_2;
|
||||
el3 = srcElem2_1;
|
||||
el4 = srcElem1_2;
|
||||
break;
|
||||
}
|
||||
case 0x2:
|
||||
{
|
||||
el1 = fplibNeg<Element>(srcElem2_1);
|
||||
el2 = srcElem1_1;
|
||||
el3 = fplibNeg<Element>(srcElem2_2);
|
||||
el4 = srcElem1_1;
|
||||
break;
|
||||
}
|
||||
case 0x3:
|
||||
{
|
||||
el1 = srcElem2_2;
|
||||
el2 = srcElem1_2;
|
||||
el3 = fplibNeg<Element>(srcElem2_1);
|
||||
el4 = srcElem1_2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
destElem_1 = fplibMulAdd<Element>(destElem_1, el2, el1, fpscr);
|
||||
destElem_2 = fplibMulAdd<Element>(destElem_2, el4, el3, fpscr);
|
||||
|
||||
FpscrExc = fpscr;
|
||||
|
||||
destReg.elements[2*i] = htole(destElem_1);
|
||||
destReg.elements[2*i+1] = htole(destElem_2);
|
||||
}
|
||||
'''
|
||||
# FCMLA (by element)
|
||||
fcmla_imm = fcmlaCode % {'rot': '14, 13', 'index': 'imm'}
|
||||
threeEqualRegInstX("fcmla", "FcmlaElemDX", "SimdFloatMultAccOp",
|
||||
("uint16_t", "uint32_t"), 2, fcmla_imm, True,
|
||||
byElem=True, complex=True)
|
||||
threeEqualRegInstX("fcmla", "FcmlaElemQX", "SimdFloatMultAccOp",
|
||||
floatTypes, 4, fcmla_imm, True, byElem=True,
|
||||
complex=True)
|
||||
# FCMLA (vector)
|
||||
fcmla_vec = fcmlaCode % {'rot': '12, 11', 'index': 'i'}
|
||||
threeEqualRegInstX("fcmla", "FcmlaDX", "SimdFloatMultAccOp",
|
||||
("uint16_t", "uint32_t"), 2, fcmla_vec, True,
|
||||
complex=True)
|
||||
threeEqualRegInstX("fcmla", "FcmlaQX", "SimdFloatMultAccOp",
|
||||
floatTypes, 4, fcmla_vec, True, complex=True)
|
||||
# CLS
|
||||
clsCode = '''
|
||||
unsigned count = 0;
|
||||
|
||||
@@ -215,12 +215,18 @@ def template NeonEqualRegExecute {{
|
||||
|
||||
const unsigned rCount = %(r_count)d;
|
||||
const unsigned eCount = rCount * sizeof(uint32_t) / sizeof(Element);
|
||||
const unsigned eCountFull = 4 * sizeof(uint32_t) / sizeof(Element);
|
||||
|
||||
union RegVect {
|
||||
uint32_t regs[rCount];
|
||||
Element elements[eCount];
|
||||
};
|
||||
|
||||
union FullRegVect {
|
||||
uint32_t regs[4];
|
||||
Element elements[eCountFull];
|
||||
};
|
||||
|
||||
if (%(predicate_test)s)
|
||||
{
|
||||
%(code)s;
|
||||
|
||||
Reference in New Issue
Block a user