arch-arm: Support Arm SVE Load-Broadcast Octaword instructions.
Add support for the Arm SVE Load-Broadcast Octaword (LD1RO{B,H,W,D})
instructions. These are similar to the Load-Broadcast
Quadword (LD1RQ{B,H,W,D}) instructions, but work on a 32-byte memory
segment rather than a 16-byte memory segment. Consequently, the LD1ROx
implementations build on the code for the LD1RQx implementations.
For more information please refer to the "ARM Architecture Reference
Manual Supplement - The Scalable Vector Extension (SVE), for ARMv8-A"
(https://developer.arm.com/architectures/cpu-architecture/a-profile/docs/arm-architecture-reference-manual-supplement-armv8-a)
Change-Id: I98ee4f56c8099bf40c9034baa488d318ae57d3aa
Reviewed-by: Richard Cooper <richard.cooper@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70727
Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com>
Maintainer: Andreas Sandberg <andreas.sandberg@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>
This commit is contained in:
committed by
Bobby Bruce
parent
94a629b527
commit
19e8023043
@@ -3219,66 +3219,96 @@ namespace Aarch64
|
||||
} // decodeSveMemGather32
|
||||
|
||||
StaticInstPtr
|
||||
decodeSveLoadBcastQuadSS(ExtMachInst machInst)
|
||||
decodeSveLoadBcastMultiSS(ExtMachInst machInst)
|
||||
{
|
||||
uint8_t num = bits(machInst, 22, 21);
|
||||
if (num != 0x00) {
|
||||
return new Unknown64(machInst);
|
||||
}
|
||||
|
||||
RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0);
|
||||
RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5));
|
||||
RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10);
|
||||
RegIndex rm = (RegIndex)(uint8_t) bits(machInst, 20, 16);
|
||||
uint8_t msz = bits(machInst, 24, 23);
|
||||
switch (msz) {
|
||||
case 0:
|
||||
|
||||
uint8_t msz_esz = bits(machInst, 24, 21);
|
||||
|
||||
switch (msz_esz) {
|
||||
// Load-Broadcast Quad-word Variants
|
||||
case 0b0000: // 0x0:
|
||||
return new SveLd1RqSS<uint8_t, uint8_t>("ld1rqb",
|
||||
machInst, zt, pg, rn, rm);
|
||||
case 1:
|
||||
return new SveLd1RqSS<uint16_t, uint16_t>("ld1rqh",
|
||||
case 0b0100: // 0x4:
|
||||
return new SveLd1RqSS<uint16_t, uint16_t>("ld1rqh",
|
||||
machInst, zt, pg, rn, rm);
|
||||
case 2:
|
||||
case 0b1000: // 0x8:
|
||||
return new SveLd1RqSS<uint32_t, uint32_t>("ld1rqw",
|
||||
machInst, zt, pg, rn, rm);
|
||||
case 3:
|
||||
case 0b1100: // 0xc:
|
||||
return new SveLd1RqSS<uint64_t, uint64_t>("ld1rqd",
|
||||
machInst, zt, pg, rn, rm);
|
||||
|
||||
// Load-Broadcast Octa-word Variants
|
||||
case 0b0001: // 0x1:
|
||||
return new SveLd1RoSS<uint8_t, uint8_t>("ld1rob",
|
||||
machInst, zt, pg, rn, rm);
|
||||
case 0b0101: // 0x5:
|
||||
return new SveLd1RoSS<uint16_t, uint16_t>("ld1roh",
|
||||
machInst, zt, pg, rn, rm);
|
||||
case 0b1001: // 0x9:
|
||||
return new SveLd1RoSS<uint32_t, uint32_t>("ld1row",
|
||||
machInst, zt, pg, rn, rm);
|
||||
case 0b1101: // 0xd:
|
||||
return new SveLd1RoSS<uint64_t, uint64_t>("ld1rod",
|
||||
machInst, zt, pg, rn, rm);
|
||||
|
||||
default:
|
||||
return new Unknown64(machInst);
|
||||
}
|
||||
|
||||
return new Unknown64(machInst);
|
||||
} // decodeSveLoadBcastQuadSS
|
||||
} // decodeSveLoadBcastMultiSS
|
||||
|
||||
StaticInstPtr
|
||||
decodeSveLoadBcastQuadSI(ExtMachInst machInst)
|
||||
decodeSveLoadBcastMultiSI(ExtMachInst machInst)
|
||||
{
|
||||
uint8_t num = bits(machInst, 22, 21);
|
||||
if (num != 0x00) {
|
||||
return new Unknown64(machInst);
|
||||
}
|
||||
|
||||
RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0);
|
||||
RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5));
|
||||
RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10);
|
||||
uint64_t imm = sext<4>(bits(machInst, 19, 16));
|
||||
uint8_t msz = bits(machInst, 24, 23);
|
||||
switch (msz) {
|
||||
case 0:
|
||||
|
||||
uint8_t msz_esz = bits(machInst, 24, 21);
|
||||
|
||||
switch (msz_esz) {
|
||||
// Load-Broadcast Quad-word Variants
|
||||
case 0b0000: // 0x0:
|
||||
return new SveLd1RqSI<uint8_t, uint8_t>("ld1rqb",
|
||||
machInst, zt, pg, rn, imm);
|
||||
case 1:
|
||||
case 0b0100: // 0x4:
|
||||
return new SveLd1RqSI<uint16_t, uint16_t>("ld1rqh",
|
||||
machInst, zt, pg, rn, imm);
|
||||
case 2:
|
||||
case 0b1000: // 0x8:
|
||||
return new SveLd1RqSI<uint32_t, uint32_t>("ld1rqw",
|
||||
machInst, zt, pg, rn, imm);
|
||||
case 3:
|
||||
case 0b1100: // 0xc:
|
||||
return new SveLd1RqSI<uint64_t, uint64_t>("ld1rqd",
|
||||
machInst, zt, pg, rn, imm);
|
||||
|
||||
// Load-Broadcast Octa-word Variants
|
||||
case 0b0001: // 0x1:
|
||||
return new SveLd1RoSI<uint8_t, uint8_t>("ld1rob",
|
||||
machInst, zt, pg, rn, imm);
|
||||
case 0b0101: // 0x5:
|
||||
return new SveLd1RoSI<uint16_t, uint16_t>("ld1roh",
|
||||
machInst, zt, pg, rn, imm);
|
||||
case 0b1001: // 0x9:
|
||||
return new SveLd1RoSI<uint32_t, uint32_t>("ld1row",
|
||||
machInst, zt, pg, rn, imm);
|
||||
case 0b1101: // 0xd:
|
||||
return new SveLd1RoSI<uint64_t, uint64_t>("ld1rod",
|
||||
machInst, zt, pg, rn, imm);
|
||||
|
||||
default:
|
||||
return new Unknown64(machInst);
|
||||
}
|
||||
|
||||
return new Unknown64(machInst);
|
||||
} // decodeSveLoadBcastQuadSI
|
||||
} // decodeSveLoadBcastMultiSI
|
||||
|
||||
StaticInstPtr
|
||||
decodeSveContigLoadSS(ExtMachInst machInst)
|
||||
@@ -3388,10 +3418,10 @@ namespace Aarch64
|
||||
{
|
||||
switch (bits(machInst, 15, 13)) {
|
||||
case 0x0:
|
||||
return decodeSveLoadBcastQuadSS(machInst);
|
||||
return decodeSveLoadBcastMultiSS(machInst);
|
||||
case 0x1:
|
||||
if (bits(machInst, 20) == 0x0) {
|
||||
return decodeSveLoadBcastQuadSI(machInst);
|
||||
return decodeSveLoadBcastMultiSI(machInst);
|
||||
}
|
||||
break;
|
||||
case 0x2:
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// Copyright (c) 2017-2019 ARM Limited
|
||||
// Copyright (c) 2017-2020 ARM Limited
|
||||
// All rights reserved
|
||||
//
|
||||
// The license below extends only to copyright in the software and shall
|
||||
@@ -1480,20 +1480,33 @@ let {{
|
||||
exec_output += SveStructMemExecDeclare.subst(substDict)
|
||||
|
||||
# Generates definitions for SVE load-and-replicate quadword and octaword instructions
|
||||
def emitSveLoadAndReplQuad(offsetIsImm):
|
||||
def emitSveLoadAndReplMulti(offsetIsImm, numQwordSegments):
|
||||
global header_output, exec_output, decoders
|
||||
assert(numQwordSegments in (1, 2)) # Quadword or Octaword
|
||||
from collections import namedtuple
|
||||
InstConfig = namedtuple("_InstConfig", "mnemonic classname baseclass")
|
||||
INST_CONFIGURATIONS = {
|
||||
# (offsetIsImm, numQwordSegments) -> InstConfig Records
|
||||
(True, 1): InstConfig("ld1rq", "SveLd1RqSI", "SveContigMemSI"),
|
||||
(False, 1): InstConfig("ld1rq", "SveLd1RqSS", "SveContigMemSS"),
|
||||
(True, 2): InstConfig("ld1ro", "SveLd1RoSI", "SveContigMemSI"),
|
||||
(False, 2): InstConfig("ld1ro", "SveLd1RoSS", "SveContigMemSS"),
|
||||
}
|
||||
inst_config = INST_CONFIGURATIONS[(offsetIsImm, numQwordSegments)]
|
||||
memAccessSize = numQwordSegments * 16;
|
||||
tplHeader = 'template <class RegElemType, class MemElemType>'
|
||||
tplArgs = '<RegElemType, MemElemType>'
|
||||
eaCode = SPAlignmentCheckCode + '''
|
||||
int memAccessSize = 16;
|
||||
EA = XBase + '''
|
||||
int memAccessSize = %(memAccessSize)d;
|
||||
EA = XBase + ''' % dict(memAccessSize=memAccessSize)
|
||||
if offsetIsImm:
|
||||
eaCode += '(((int64_t) this->imm) * 16);'
|
||||
eaCode += ('(((int64_t) this->imm) * %(memAccessSize)d);'
|
||||
% dict(memAccessSize=memAccessSize))
|
||||
else:
|
||||
eaCode += '(XOffset * sizeof(MemElemType));'
|
||||
loadRdEnableCode = '''
|
||||
eCount = 16/sizeof(RegElemType);
|
||||
auto rdEn = std::vector<bool>(16, true);
|
||||
eCount = %(memAccessSize)d/sizeof(RegElemType);
|
||||
auto rdEn = std::vector<bool>(%(memAccessSize)d, true);
|
||||
for (int i = 0; i < eCount; ++i) {
|
||||
if (!GpOp_x[i]) {
|
||||
for (int j = 0; j < sizeof(RegElemType); ++j) {
|
||||
@@ -1501,26 +1514,40 @@ let {{
|
||||
}
|
||||
}
|
||||
}
|
||||
'''
|
||||
''' % dict(memAccessSize=memAccessSize)
|
||||
memAccCode = '''
|
||||
__uint128_t qword;
|
||||
RegElemType* qp = reinterpret_cast<RegElemType*>(&qword);
|
||||
for (int i = 0; i < 16/sizeof(RegElemType); ++i) {
|
||||
// Copy active elements of the data from memory into a temporary
|
||||
// quadword/octaword
|
||||
__uint128_t qwords[%(numQwordSegments)d];
|
||||
eCount = %(memAccessSize)d/sizeof(RegElemType);
|
||||
RegElemType* qp = reinterpret_cast<RegElemType*>(&qwords);
|
||||
for (int i = 0; i < eCount; ++i) {
|
||||
if (GpOp_x[i]) {
|
||||
qp[i] = memDataView[i];
|
||||
} else {
|
||||
qp[i] = 0;
|
||||
}
|
||||
}
|
||||
eCount = ArmStaticInst::getCurSveVecLen<__uint128_t>(
|
||||
// Repeat the temporary quadword/octaword segment into the
|
||||
// vector register. Zero fill the remainder for non-full
|
||||
// octawords.
|
||||
unsigned numQwords = ArmStaticInst::getCurSveVecLen<__uint128_t>(
|
||||
xc->tcBase());
|
||||
for (int i = 0; i < eCount; ++i) {
|
||||
AA64FpDest_uq[i] = qword;
|
||||
unsigned numFullQwords = numQwords -
|
||||
(numQwords %% %(numQwordSegments)d);
|
||||
for (int i = 0; i < numQwords; ++i) {
|
||||
if (i < numFullQwords) {
|
||||
AA64FpDest_uq[i] = qwords[i %% %(numQwordSegments)d];
|
||||
} else {
|
||||
AA64FpDest_uq[i] = 0;
|
||||
}
|
||||
}
|
||||
'''
|
||||
iop = ArmInstObjParams('ld1rq',
|
||||
'SveLd1RqSI' if offsetIsImm else 'SveLd1RqSS',
|
||||
'SveContigMemSI' if offsetIsImm else 'SveContigMemSS',
|
||||
''' % dict(memAccessSize=memAccessSize,
|
||||
numQwordSegments=numQwordSegments)
|
||||
iop = ArmInstObjParams(
|
||||
inst_config.mnemonic,
|
||||
inst_config.classname,
|
||||
inst_config.baseclass,
|
||||
{'tpl_header': tplHeader,
|
||||
'tpl_args': tplArgs,
|
||||
'rden_code': loadRdEnableCode,
|
||||
@@ -1539,8 +1566,7 @@ let {{
|
||||
SveContigLoadCompleteAcc.subst(iop))
|
||||
for ttype in ('uint8_t', 'uint16_t', 'uint32_t', 'uint64_t'):
|
||||
substDict = {'tpl_args': '<%s, %s>' % (ttype, ttype),
|
||||
'class_name': 'SveLd1RqSI' if offsetIsImm
|
||||
else 'SveLd1RqSS'}
|
||||
'class_name': inst_config.classname}
|
||||
exec_output += SveContigMemExecDeclare.subst(substDict)
|
||||
|
||||
# LD1[S]{B,H,W,D} (scalar plus immediate)
|
||||
@@ -1556,9 +1582,14 @@ let {{
|
||||
emitSveLoadAndRepl()
|
||||
|
||||
# LD1RQ{B,H,W,D} (scalar plus immediate)
|
||||
emitSveLoadAndReplQuad(offsetIsImm = True)
|
||||
emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=1)
|
||||
# LD1RQ{B,H,W,D} (scalar plus scalar)
|
||||
emitSveLoadAndReplQuad(offsetIsImm = False)
|
||||
emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=1)
|
||||
|
||||
# LD1RO{B,H,W,D} (scalar plus immediate)
|
||||
emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=2)
|
||||
# LD1RO{B,H,W,D} (scalar plus scalar)
|
||||
emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=2)
|
||||
|
||||
# LD{2,3,4}{B,H,W,D} (scalar plus immediate)
|
||||
# ST{2,3,4}{B,H,W,D} (scalar plus immediate)
|
||||
|
||||
Reference in New Issue
Block a user