From 19e802304335d78956a2cdaa965ccf010c9152fb Mon Sep 17 00:00:00 2001 From: Richard Cooper Date: Mon, 14 Sep 2020 18:55:09 +0100 Subject: [PATCH] arch-arm: Support Arm SVE Load-Broadcast Octaword instructions. Add support for the Arm SVE Load-Broadcast Octaword (LD1RO{B,H,W,D}) instructions. These are similar to the Load-Broadcast Quadword (LD1RQ{B,H,W,D}) instructions, but work on a 32-byte memory segment rather than a 16-byte memory segment. Consequently, the LD1ROx implementations build on the code for the LD1RQx implementations. For more information please refer to the "ARM Architecture Reference Manual Supplement - The Scalable Vector Extension (SVE), for ARMv8-A" (https://developer.arm.com/architectures/cpu-architecture/a-profile/docs/arm-architecture-reference-manual-supplement-armv8-a) Change-Id: I98ee4f56c8099bf40c9034baa488d318ae57d3aa Reviewed-by: Richard Cooper Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70727 Reviewed-by: Andreas Sandberg Maintainer: Andreas Sandberg Tested-by: kokoro --- src/arch/arm/isa/formats/sve_2nd_level.isa | 88 +++++++++++++++------- src/arch/arm/isa/insts/sve_mem.isa | 75 ++++++++++++------ 2 files changed, 112 insertions(+), 51 deletions(-) diff --git a/src/arch/arm/isa/formats/sve_2nd_level.isa b/src/arch/arm/isa/formats/sve_2nd_level.isa index 440722ac72..f74181a062 100644 --- a/src/arch/arm/isa/formats/sve_2nd_level.isa +++ b/src/arch/arm/isa/formats/sve_2nd_level.isa @@ -3219,66 +3219,96 @@ namespace Aarch64 } // decodeSveMemGather32 StaticInstPtr - decodeSveLoadBcastQuadSS(ExtMachInst machInst) + decodeSveLoadBcastMultiSS(ExtMachInst machInst) { - uint8_t num = bits(machInst, 22, 21); - if (num != 0x00) { - return new Unknown64(machInst); - } - RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0); RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5)); RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10); RegIndex rm = (RegIndex)(uint8_t) bits(machInst, 20, 16); - uint8_t msz = 
bits(machInst, 24, 23); - switch (msz) { - case 0: + + uint8_t msz_esz = bits(machInst, 24, 21); + + switch (msz_esz) { + // Load-Broadcast Quad-word Variants + case 0b0000: // 0x0: return new SveLd1RqSS("ld1rqb", machInst, zt, pg, rn, rm); - case 1: - return new SveLd1RqSS("ld1rqh", + case 0b0100: // 0x4: + return new SveLd1RqSS("ld1rqh", machInst, zt, pg, rn, rm); - case 2: + case 0b1000: // 0x8: return new SveLd1RqSS("ld1rqw", machInst, zt, pg, rn, rm); - case 3: + case 0b1100: // 0xc: return new SveLd1RqSS("ld1rqd", machInst, zt, pg, rn, rm); + + // Load-Broadcast Octa-word Variants + case 0b0001: // 0x1: + return new SveLd1RoSS("ld1rob", + machInst, zt, pg, rn, rm); + case 0b0101: // 0x5: + return new SveLd1RoSS("ld1roh", + machInst, zt, pg, rn, rm); + case 0b1001: // 0x9: + return new SveLd1RoSS("ld1row", + machInst, zt, pg, rn, rm); + case 0b1101: // 0xd: + return new SveLd1RoSS("ld1rod", + machInst, zt, pg, rn, rm); + + default: + return new Unknown64(machInst); } return new Unknown64(machInst); - } // decodeSveLoadBcastQuadSS + } // decodeSveLoadBcastMultiSS StaticInstPtr - decodeSveLoadBcastQuadSI(ExtMachInst machInst) + decodeSveLoadBcastMultiSI(ExtMachInst machInst) { - uint8_t num = bits(machInst, 22, 21); - if (num != 0x00) { - return new Unknown64(machInst); - } - RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0); RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5)); RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10); uint64_t imm = sext<4>(bits(machInst, 19, 16)); - uint8_t msz = bits(machInst, 24, 23); - switch (msz) { - case 0: + + uint8_t msz_esz = bits(machInst, 24, 21); + + switch (msz_esz) { + // Load-Broadcast Quad-word Variants + case 0b0000: // 0x0: return new SveLd1RqSI("ld1rqb", machInst, zt, pg, rn, imm); - case 1: + case 0b0100: // 0x4: return new SveLd1RqSI("ld1rqh", machInst, zt, pg, rn, imm); - case 2: + case 0b1000: // 0x8: return new SveLd1RqSI("ld1rqw", machInst, zt, pg, rn, imm); - case 3: + case 0b1100: // 
0xc: return new SveLd1RqSI("ld1rqd", machInst, zt, pg, rn, imm); + + // Load-Broadcast Octa-word Variants + case 0b0001: // 0x1: + return new SveLd1RoSI("ld1rob", + machInst, zt, pg, rn, imm); + case 0b0101: // 0x5: + return new SveLd1RoSI("ld1roh", + machInst, zt, pg, rn, imm); + case 0b1001: // 0x9: + return new SveLd1RoSI("ld1row", + machInst, zt, pg, rn, imm); + case 0b1101: // 0xd: + return new SveLd1RoSI("ld1rod", + machInst, zt, pg, rn, imm); + + default: + return new Unknown64(machInst); } return new Unknown64(machInst); - } // decodeSveLoadBcastQuadSI + } // decodeSveLoadBcastMultiSI StaticInstPtr decodeSveContigLoadSS(ExtMachInst machInst) @@ -3388,10 +3418,10 @@ namespace Aarch64 { switch (bits(machInst, 15, 13)) { case 0x0: - return decodeSveLoadBcastQuadSS(machInst); + return decodeSveLoadBcastMultiSS(machInst); case 0x1: if (bits(machInst, 20) == 0x0) { - return decodeSveLoadBcastQuadSI(machInst); + return decodeSveLoadBcastMultiSI(machInst); } break; case 0x2: diff --git a/src/arch/arm/isa/insts/sve_mem.isa b/src/arch/arm/isa/insts/sve_mem.isa index 8a73d131ce..bece3689b4 100644 --- a/src/arch/arm/isa/insts/sve_mem.isa +++ b/src/arch/arm/isa/insts/sve_mem.isa @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2019 ARM Limited +// Copyright (c) 2017-2020 ARM Limited // All rights reserved // // The license below extends only to copyright in the software and shall @@ -1480,20 +1480,33 @@ let {{ exec_output += SveStructMemExecDeclare.subst(substDict) # Generates definitions for SVE load-and-replicate quadword instructions - def emitSveLoadAndReplQuad(offsetIsImm): + def emitSveLoadAndReplMulti(offsetIsImm, numQwordSegments): global header_output, exec_output, decoders + assert(numQwordSegments in (1, 2)) # Quadword or Octaword + from collections import namedtuple + InstConfig = namedtuple("_InstConfig", "mnemonic classname baseclass") + INST_CONFIGURATIONS = { + # (offsetIsImm, numQwordSegments) -> InstConfig Records + (True, 1): InstConfig("ld1rq", "SveLd1RqSI", 
"SveContigMemSI"), + (False, 1): InstConfig("ld1rq", "SveLd1RqSS", "SveContigMemSS"), + (True, 2): InstConfig("ld1ro", "SveLd1RoSI", "SveContigMemSI"), + (False, 2): InstConfig("ld1ro", "SveLd1RoSS", "SveContigMemSS"), + } + inst_config = INST_CONFIGURATIONS[(offsetIsImm, numQwordSegments)] + memAccessSize = numQwordSegments * 16; tplHeader = 'template <class RegElemType, class MemElemType>' tplArgs = '<RegElemType, MemElemType>' eaCode = SPAlignmentCheckCode + ''' - int memAccessSize = 16; - EA = XBase + ''' + int memAccessSize = %(memAccessSize)d; + EA = XBase + ''' % dict(memAccessSize=memAccessSize) if offsetIsImm: - eaCode += '(((int64_t) this->imm) * 16);' + eaCode += ('(((int64_t) this->imm) * %(memAccessSize)d);' + % dict(memAccessSize=memAccessSize)) else: eaCode += '(XOffset * sizeof(MemElemType));' loadRdEnableCode = ''' - eCount = 16/sizeof(RegElemType); - auto rdEn = std::vector<bool>(16, true); + eCount = %(memAccessSize)d/sizeof(RegElemType); + auto rdEn = std::vector<bool>(%(memAccessSize)d, true); for (int i = 0; i < eCount; ++i) { if (!GpOp_x[i]) { for (int j = 0; j < sizeof(RegElemType); ++j) { @@ -1501,26 +1514,40 @@ let {{ } } } - ''' + ''' % dict(memAccessSize=memAccessSize) memAccCode = ''' - __uint128_t qword; - RegElemType* qp = reinterpret_cast<RegElemType*>(&qword); - for (int i = 0; i < 16/sizeof(RegElemType); ++i) { + // Copy active elements of the data from memory into a temporary + // quadword/octaword + __uint128_t qwords[%(numQwordSegments)d]; + eCount = %(memAccessSize)d/sizeof(RegElemType); + RegElemType* qp = reinterpret_cast<RegElemType*>(&qwords); + for (int i = 0; i < eCount; ++i) { if (GpOp_x[i]) { qp[i] = memDataView[i]; } else { qp[i] = 0; } } - eCount = ArmStaticInst::getCurSveVecLen<__uint128_t>( + // Repeat the temporary quadword/octaword segment into the + // vector register. Zero fill the remainder for non-full + // octawords. 
+ unsigned numQwords = ArmStaticInst::getCurSveVecLen<__uint128_t>( xc->tcBase()); - for (int i = 0; i < eCount; ++i) { - AA64FpDest_uq[i] = qword; + unsigned numFullQwords = numQwords - + (numQwords %% %(numQwordSegments)d); + for (int i = 0; i < numQwords; ++i) { + if (i < numFullQwords) { + AA64FpDest_uq[i] = qwords[i %% %(numQwordSegments)d]; + } else { + AA64FpDest_uq[i] = 0; + } } - ''' - iop = ArmInstObjParams('ld1rq', - 'SveLd1RqSI' if offsetIsImm else 'SveLd1RqSS', - 'SveContigMemSI' if offsetIsImm else 'SveContigMemSS', + ''' % dict(memAccessSize=memAccessSize, + numQwordSegments=numQwordSegments) + iop = ArmInstObjParams( + inst_config.mnemonic, + inst_config.classname, + inst_config.baseclass, {'tpl_header': tplHeader, 'tpl_args': tplArgs, 'rden_code': loadRdEnableCode, @@ -1539,8 +1566,7 @@ let {{ SveContigLoadCompleteAcc.subst(iop)) for ttype in ('uint8_t', 'uint16_t', 'uint32_t', 'uint64_t'): substDict = {'tpl_args': '<%s, %s>' % (ttype, ttype), - 'class_name': 'SveLd1RqSI' if offsetIsImm - else 'SveLd1RqSS'} + 'class_name': inst_config.classname} exec_output += SveContigMemExecDeclare.subst(substDict) # LD1[S]{B,H,W,D} (scalar plus immediate) @@ -1556,9 +1582,14 @@ let {{ emitSveLoadAndRepl() # LD1RQ{B,H,W,D} (scalar plus immediate) - emitSveLoadAndReplQuad(offsetIsImm = True) + emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=1) # LD1RQ{B,H,W,D} (scalar plus scalar) - emitSveLoadAndReplQuad(offsetIsImm = False) + emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=1) + + # LD1RO{B,H,W,D} (scalar plus immediate) + emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=2) + # LD1RO{B,H,W,D} (scalar plus scalar) + emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=2) # LD{2,3,4}{B,H,W,D} (scalar plus immediate) # ST{2,3,4}{B,H,W,D} (scalar plus immediate)