From 19e802304335d78956a2cdaa965ccf010c9152fb Mon Sep 17 00:00:00 2001
From: Richard Cooper <richard.cooper@arm.com>
Date: Mon, 14 Sep 2020 18:55:09 +0100
Subject: [PATCH] arch-arm: Support Arm SVE Load-Broadcast Octaword
 instructions.

Add support for the Arm SVE Load-Broadcast Octaword (LD1RO{B,H,W,D})
instructions. These are similar to the Load-Broadcast
Quadword (LD1RQ{B,H,W,D}) instructions, but work on a 32-byte memory
segment rather than a 16-byte memory segment. Consequently, the LD1ROx
implementations build on the code for the LD1RQx implementations.

For more information please refer to the "ARM Architecture Reference
Manual Supplement - The Scalable Vector Extension (SVE), for ARMv8-A"
(https://developer.arm.com/architectures/cpu-architecture/a-profile/
docs/arm-architecture-reference-manual-supplement-armv8-a)

Change-Id: I98ee4f56c8099bf40c9034baa488d318ae57d3aa
Reviewed-by: Richard Cooper <richard.cooper@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70727
Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com>
Maintainer: Andreas Sandberg <andreas.sandberg@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 src/arch/arm/isa/formats/sve_2nd_level.isa | 88 +++++++++++++++-------
 src/arch/arm/isa/insts/sve_mem.isa         | 75 ++++++++++++------
 2 files changed, 112 insertions(+), 51 deletions(-)

diff --git a/src/arch/arm/isa/formats/sve_2nd_level.isa b/src/arch/arm/isa/formats/sve_2nd_level.isa
index 440722ac72..f74181a062 100644
--- a/src/arch/arm/isa/formats/sve_2nd_level.isa
+++ b/src/arch/arm/isa/formats/sve_2nd_level.isa
@@ -3219,66 +3219,96 @@ namespace Aarch64
     }  // decodeSveMemGather32
 
     StaticInstPtr
-    decodeSveLoadBcastQuadSS(ExtMachInst machInst)
+    decodeSveLoadBcastMultiSS(ExtMachInst machInst)
     {
-        uint8_t num = bits(machInst, 22, 21);
-        if (num != 0x00) {
-            return new Unknown64(machInst);
-        }
-
         RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0);
         RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5));
         RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10);
         RegIndex rm = (RegIndex)(uint8_t) bits(machInst, 20, 16);
-        uint8_t msz = bits(machInst, 24, 23);
-        switch (msz) {
-            case 0:
+
+        uint8_t msz_esz = bits(machInst, 24, 21);
+
+        switch (msz_esz) {
+            // Load-Broadcast Quad-word Variants
+            case 0b0000: // 0x0:
                 return new SveLd1RqSS<uint8_t, uint8_t>("ld1rqb",
                         machInst, zt, pg, rn, rm);
-            case 1:
-                return new SveLd1RqSS<uint16_t, uint16_t>("ld1rqh",
+            case 0b0100: // 0x4:
+                 return new SveLd1RqSS<uint16_t, uint16_t>("ld1rqh",
                         machInst, zt, pg, rn, rm);
-            case 2:
+            case 0b1000: // 0x8:
                 return new SveLd1RqSS<uint32_t, uint32_t>("ld1rqw",
                         machInst, zt, pg, rn, rm);
-            case 3:
+            case 0b1100: // 0xc:
                 return new SveLd1RqSS<uint64_t, uint64_t>("ld1rqd",
                         machInst, zt, pg, rn, rm);
+
+            // Load-Broadcast Octa-word Variants
+            case 0b0001: // 0x1:
+                return new SveLd1RoSS<uint8_t, uint8_t>("ld1rob",
+                        machInst, zt, pg, rn, rm);
+            case 0b0101: // 0x5:
+                return new SveLd1RoSS<uint16_t, uint16_t>("ld1roh",
+                        machInst, zt, pg, rn, rm);
+            case 0b1001: // 0x9:
+                return new SveLd1RoSS<uint32_t, uint32_t>("ld1row",
+                        machInst, zt, pg, rn, rm);
+            case 0b1101: // 0xd:
+                return new SveLd1RoSS<uint64_t, uint64_t>("ld1rod",
+                        machInst, zt, pg, rn, rm);
+
+            default:
+              return new Unknown64(machInst);
         }
 
         return new Unknown64(machInst);
-    }  // decodeSveLoadBcastQuadSS
+    }  // decodeSveLoadBcastMultiSS
 
     StaticInstPtr
-    decodeSveLoadBcastQuadSI(ExtMachInst machInst)
+    decodeSveLoadBcastMultiSI(ExtMachInst machInst)
     {
-        uint8_t num = bits(machInst, 22, 21);
-        if (num != 0x00) {
-            return new Unknown64(machInst);
-        }
-
         RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0);
         RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5));
         RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10);
         uint64_t imm = sext<4>(bits(machInst, 19, 16));
-        uint8_t msz = bits(machInst, 24, 23);
-        switch (msz) {
-            case 0:
+
+        uint8_t msz_esz = bits(machInst, 24, 21);
+
+        switch (msz_esz) {
+            // Load-Broadcast Quad-word Variants
+            case 0b0000: // 0x0:
                 return new SveLd1RqSI<uint8_t, uint8_t>("ld1rqb",
                         machInst, zt, pg, rn, imm);
-            case 1:
+            case 0b0100: // 0x4:
                 return new SveLd1RqSI<uint16_t, uint16_t>("ld1rqh",
                         machInst, zt, pg, rn, imm);
-            case 2:
+            case 0b1000: // 0x8:
                 return new SveLd1RqSI<uint32_t, uint32_t>("ld1rqw",
                         machInst, zt, pg, rn, imm);
-            case 3:
+            case 0b1100: // 0xc:
                 return new SveLd1RqSI<uint64_t, uint64_t>("ld1rqd",
                         machInst, zt, pg, rn, imm);
+
+            // Load-Broadcast Octa-word Variants
+            case 0b0001: // 0x1:
+                return new SveLd1RoSI<uint8_t, uint8_t>("ld1rob",
+                        machInst, zt, pg, rn, imm);
+            case 0b0101: // 0x5:
+                return new SveLd1RoSI<uint16_t, uint16_t>("ld1roh",
+                        machInst, zt, pg, rn, imm);
+            case 0b1001: // 0x9:
+                return new SveLd1RoSI<uint32_t, uint32_t>("ld1row",
+                        machInst, zt, pg, rn, imm);
+            case 0b1101: // 0xd:
+                return new SveLd1RoSI<uint64_t, uint64_t>("ld1rod",
+                        machInst, zt, pg, rn, imm);
+
+            default:
+              return new Unknown64(machInst);
         }
 
         return new Unknown64(machInst);
-    }  // decodeSveLoadBcastQuadSI
+    }  // decodeSveLoadBcastMultiSI
 
     StaticInstPtr
     decodeSveContigLoadSS(ExtMachInst machInst)
@@ -3388,10 +3418,10 @@ namespace Aarch64
     {
         switch (bits(machInst, 15, 13)) {
           case 0x0:
-            return decodeSveLoadBcastQuadSS(machInst);
+            return decodeSveLoadBcastMultiSS(machInst);
           case 0x1:
             if (bits(machInst, 20) == 0x0) {
-                return decodeSveLoadBcastQuadSI(machInst);
+                return decodeSveLoadBcastMultiSI(machInst);
             }
             break;
           case 0x2:
diff --git a/src/arch/arm/isa/insts/sve_mem.isa b/src/arch/arm/isa/insts/sve_mem.isa
index 8a73d131ce..bece3689b4 100644
--- a/src/arch/arm/isa/insts/sve_mem.isa
+++ b/src/arch/arm/isa/insts/sve_mem.isa
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2019 ARM Limited
+// Copyright (c) 2017-2020 ARM Limited
 // All rights reserved
 //
 // The license below extends only to copyright in the software and shall
@@ -1480,20 +1480,33 @@ let {{
             exec_output += SveStructMemExecDeclare.subst(substDict)
 
     # Generates definitions for SVE load-and-replicate quadword instructions
-    def emitSveLoadAndReplQuad(offsetIsImm):
+    def emitSveLoadAndReplMulti(offsetIsImm, numQwordSegments):
         global header_output, exec_output, decoders
+        assert(numQwordSegments in (1, 2)) # Quadword or Octaword
+        from collections import namedtuple
+        InstConfig = namedtuple("_InstConfig", "mnemonic classname baseclass")
+        INST_CONFIGURATIONS = {
+            # (offsetIsImm, numQwordSegments) -> InstConfig Recors
+            (True, 1): InstConfig("ld1rq", "SveLd1RqSI", "SveContigMemSI"),
+            (False, 1): InstConfig("ld1rq", "SveLd1RqSS", "SveContigMemSS"),
+            (True, 2): InstConfig("ld1ro", "SveLd1RoSI", "SveContigMemSI"),
+            (False, 2): InstConfig("ld1ro", "SveLd1RoSS", "SveContigMemSS"),
+        }
+        inst_config = INST_CONFIGURATIONS[(offsetIsImm, numQwordSegments)]
+        memAccessSize = numQwordSegments * 16;
         tplHeader = 'template <class RegElemType, class MemElemType>'
         tplArgs = '<RegElemType, MemElemType>'
         eaCode = SPAlignmentCheckCode + '''
-        int memAccessSize = 16;
-        EA = XBase + '''
+        int memAccessSize = %(memAccessSize)d;
+        EA = XBase + ''' % dict(memAccessSize=memAccessSize)
         if offsetIsImm:
-            eaCode += '(((int64_t) this->imm) * 16);'
+            eaCode += ('(((int64_t) this->imm) * %(memAccessSize)d);'
+                       % dict(memAccessSize=memAccessSize))
         else:
             eaCode += '(XOffset * sizeof(MemElemType));'
         loadRdEnableCode = '''
-        eCount = 16/sizeof(RegElemType);
-        auto rdEn = std::vector<bool>(16, true);
+        eCount = %(memAccessSize)d/sizeof(RegElemType);
+        auto rdEn = std::vector<bool>(%(memAccessSize)d, true);
         for (int i = 0; i < eCount; ++i) {
             if (!GpOp_x[i]) {
                 for (int j = 0; j < sizeof(RegElemType); ++j) {
@@ -1501,26 +1514,40 @@ let {{
                 }
             }
         }
-        '''
+        ''' % dict(memAccessSize=memAccessSize)
         memAccCode = '''
-        __uint128_t qword;
-        RegElemType* qp = reinterpret_cast<RegElemType*>(&qword);
-        for (int i = 0; i < 16/sizeof(RegElemType); ++i) {
+        // Copy active elements of the data from memory into a temporary
+        // quadword/octaword
+        __uint128_t qwords[%(numQwordSegments)d];
+        eCount = %(memAccessSize)d/sizeof(RegElemType);
+        RegElemType* qp = reinterpret_cast<RegElemType*>(&qwords);
+        for (int i = 0; i < eCount; ++i) {
             if (GpOp_x[i]) {
                 qp[i] = memDataView[i];
             } else {
                 qp[i] = 0;
             }
         }
-        eCount = ArmStaticInst::getCurSveVecLen<__uint128_t>(
+        // Repeat the temporary quadword/octaword segment into the
+        // vector register. Zero fill the remainder for non-full
+        // octawords.
+        unsigned numQwords = ArmStaticInst::getCurSveVecLen<__uint128_t>(
                 xc->tcBase());
-        for (int i = 0; i < eCount; ++i) {
-            AA64FpDest_uq[i] = qword;
+        unsigned numFullQwords = numQwords -
+                                 (numQwords %% %(numQwordSegments)d);
+        for (int i = 0; i < numQwords; ++i) {
+            if (i < numFullQwords) {
+                AA64FpDest_uq[i] = qwords[i %% %(numQwordSegments)d];
+            } else {
+                AA64FpDest_uq[i] = 0;
+            }
         }
-        '''
-        iop = ArmInstObjParams('ld1rq',
-                'SveLd1RqSI' if offsetIsImm else 'SveLd1RqSS',
-                'SveContigMemSI' if offsetIsImm else 'SveContigMemSS',
+        ''' % dict(memAccessSize=memAccessSize,
+                   numQwordSegments=numQwordSegments)
+        iop = ArmInstObjParams(
+                inst_config.mnemonic,
+                inst_config.classname,
+                inst_config.baseclass,
                 {'tpl_header': tplHeader,
                  'tpl_args': tplArgs,
                  'rden_code': loadRdEnableCode,
@@ -1539,8 +1566,7 @@ let {{
                 SveContigLoadCompleteAcc.subst(iop))
         for ttype in ('uint8_t', 'uint16_t', 'uint32_t', 'uint64_t'):
             substDict = {'tpl_args': '<%s, %s>' % (ttype, ttype),
-                    'class_name': 'SveLd1RqSI' if offsetIsImm
-                                  else 'SveLd1RqSS'}
+                         'class_name': inst_config.classname}
             exec_output += SveContigMemExecDeclare.subst(substDict)
 
     # LD1[S]{B,H,W,D} (scalar plus immediate)
@@ -1556,9 +1582,14 @@ let {{
     emitSveLoadAndRepl()
 
     # LD1RQ{B,H,W,D} (scalar plus immediate)
-    emitSveLoadAndReplQuad(offsetIsImm = True)
+    emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=1)
     # LD1RQ{B,H,W,D} (scalar plus scalar)
-    emitSveLoadAndReplQuad(offsetIsImm = False)
+    emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=1)
+
+    # LD1RO{B,H,W,D} (scalar plus immediate)
+    emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=2)
+    # LD1RO{B,H,W,D} (scalar plus scalar)
+    emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=2)
 
     # LD{2,3,4}{B,H,W,D} (scalar plus immediate)
     # ST{2,3,4}{B,H,W,D} (scalar plus immediate)