arch-arm: Added Armv8.2-I8MM SVE mixed-sign dot product instrs.

Add support for the SVE mixed-sign dot product instructions (USDOT, SUDOT) required by the Armv8.2 SVE Int8 matrix multiplication extension (ARMv8.2-I8MM). For more information, please refer to the "ARM Architecture Reference Manual Supplement - The Scalable Vector Extension (SVE), for ARMv8-A" (https://developer.arm.com/architectures/cpu-architecture/a-profile/docs/arm-architecture-reference-manual-supplement-armv8-a).

Change-Id: I83841654cee74b940f967b3a37b99d87c01bd92c
Reviewed-by: Richard Cooper <richard.cooper@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/70732
Maintainer: Jason Lowe-Power <power.jg@gmail.com>
Reviewed-by: Jason Lowe-Power <power.jg@gmail.com>
Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>
2020-10-01 18:31:49 +01:00
parent 9421a46d71
commit f8b60b7a1d
3 changed files with 98 additions and 61 deletions
--- a/src/arch/arm/isa/formats/sve_2nd_level.isa
+++ b/src/arch/arm/isa/formats/sve_2nd_level.isa
@@ -2256,19 +2256,19 @@ namespace Aarch64
        uint8_t usig = (uint8_t) bits(machInst, 10);
        if (size & 0x1) {
            if (usig) {
-                return new SveUdotv<uint16_t, uint64_t>(machInst,
-                                                        zda, zn, zm);
+                return new SveUdotv<uint16_t, uint16_t, uint64_t>
+                                        (machInst, zda, zn, zm);
            } else {
-                return new SveSdotv<int16_t, int64_t>(machInst,
-                                                        zda, zn, zm);
+                return new SveSdotv<int16_t, int16_t, int64_t>
+                                        (machInst, zda, zn, zm);
            }
        } else {
            if (usig) {
-                return new SveUdotv<uint8_t, uint32_t>(machInst,
-                                                        zda, zn, zm);
+                return new SveUdotv<uint8_t, uint8_t, uint32_t>
+                                        (machInst, zda, zn, zm);
            } else {
-                return new SveSdotv<int8_t, int32_t>(machInst,
-                                                        zda, zn, zm);
+                return new SveSdotv<int8_t, int8_t, int32_t>
+                                        (machInst, zda, zn, zm);
            }
        }

@@ -2292,21 +2292,21 @@ namespace Aarch64
            RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 19, 16);
            uint8_t i1 = (uint8_t) bits(machInst, 20);
            if (usig) {
-                return new SveUdoti<uint16_t, uint64_t>(machInst,
-                                                        zda, zn, zm, i1);
+                return new SveUdoti<uint16_t, uint16_t, uint64_t>
+                                       (machInst, zda, zn, zm, i1);
            } else {
-                return new SveSdoti<int16_t, int64_t>(machInst,
-                                                        zda, zn, zm, i1);
+                return new SveSdoti<int16_t, int16_t, int64_t>
+                                       (machInst, zda, zn, zm, i1);
            }
        } else {
            RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 18, 16);
            uint8_t i2 = (uint8_t) bits(machInst, 20, 19);
            if (usig) {
-                return new SveUdoti<uint8_t, uint32_t>(machInst,
-                                                        zda, zn, zm, i2);
+                return new SveUdoti<uint8_t, uint8_t, uint32_t>
+                                        (machInst, zda, zn, zm, i2);
            } else {
-                return new SveSdoti<int8_t, int32_t>(machInst,
-                                                        zda, zn, zm, i2);
+                return new SveSdoti<int8_t, int8_t, int32_t>
+                                        (machInst, zda, zn, zm, i2);
            }
        }
        return new Unknown64(machInst);
@@ -2320,16 +2320,12 @@ namespace Aarch64
            return new Unknown64(machInst);
        }

-        RegIndex zda M5_VAR_USED = (RegIndex)
-                                          (uint8_t) bits(machInst, 4, 0);
-        RegIndex zn M5_VAR_USED = (RegIndex)
-                                          (uint8_t) bits(machInst, 9, 5);
-        RegIndex zm M5_VAR_USED = (RegIndex)
-                                          (uint8_t) bits(machInst, 20, 16);
+        RegIndex zda = (RegIndex) (uint8_t) bits(machInst, 4, 0);
+        RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5);
+        RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16);

-        // Placeholder for SveUsdotv
-        //return SveUsdotv<int32_t, uint8_t, int8_t>(machInst, zda, zn, zm);
-        return new Unknown64(machInst);
+        return new SveUsdotv<uint8_t, int8_t, int32_t>
+                                 (machInst, zda, zn, zm);
    } // decodeSveMixedSignDotProduct

    StaticInstPtr
@@ -2340,26 +2336,18 @@ namespace Aarch64
            return new Unknown64(machInst);
        }

-        RegIndex zda M5_VAR_USED = (RegIndex)
-                                          (uint8_t) bits(machInst, 4, 0);
-        RegIndex zn M5_VAR_USED = (RegIndex)
-                                          (uint8_t) bits(machInst, 9, 5);
-        RegIndex zm M5_VAR_USED = (RegIndex)
-                                          (uint8_t) bits(machInst, 18, 16);
-        uint8_t i2 M5_VAR_USED = (uint8_t) bits(machInst, 20, 19);
-
+        RegIndex zda = (RegIndex) (uint8_t) bits(machInst, 4, 0);
+        RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5);
+        RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 18, 16);
+        uint8_t i2 = (uint8_t) bits(machInst, 20, 19);
        uint8_t usig = (uint8_t) bits(machInst, 10);

        if (usig) {
-            // Placeholder for SveSudoti
-            //return SveSudoti<int32_t, int8_t, uint8_t>
-            //                         (machInst, zda, zn, zm, i2);
-            return new Unknown64(machInst);
+            return new SveSudoti<int8_t, uint8_t, int32_t>
+                                     (machInst, zda, zn, zm, i2);
        } else {
-            // Placeholder for SveUsdoti
-            //return SveUsdoti<int32_t, uint8_t, int8_t>
-            //                         (machInst, zda, zn, zm, i2);
-            return new Unknown64(machInst);
+            return new SveUsdoti<uint8_t, int8_t, int32_t>
+                                     (machInst, zda, zn, zm, i2);
        }

    } // decodeSveMixedSignDotProductIndexed
--- a/src/arch/arm/isa/insts/sve.isa
+++ b/src/arch/arm/isa/insts/sve.isa
@@ -3098,6 +3098,15 @@ let {{
    def sveDotInst(name, Name, opClass, types, isIndexed = True):
        global header_output, exec_output, decoders
        code = sveEnabledCheckCode + '''
+        // Types of the extended versions of the source elements.
+        // Required to make sure the intermediate calculations don't overflow.
+        using ExtendedElementA = typename vector_element_traits::
+                                   extend_element<DElement,
+                                                  SElementA>::type;
+        using ExtendedElementB = typename vector_element_traits::
+                                   extend_element<DElement,
+                                                  SElementB>::type;
+
        unsigned eCount = ArmStaticInst::getCurSveVecLen<Element>(
                xc->tcBase());
        for (int i = 0; i < eCount; ++i) {'''
@@ -3107,17 +3116,21 @@ let {{
            int s = segbase + imm;'''
        code += '''
            DElement res = AA64FpDest_xd[i];
-            DElement srcElem1, srcElem2;
+            ExtendedElementA srcElemA;
+            ExtendedElementB srcElemB;
            for (int j = 0; j <= 3; ++j) {
-                srcElem1 = static_cast<DElement>(AA64FpOp1_xs[4 * i + j]);'''
+                srcElemA = static_cast<ExtendedElementA>
+                                          (AA64FpOp1_srcA[4 * i + j]);'''
        if isIndexed:
            code += '''
-                srcElem2 = static_cast<DElement>(AA64FpOp2_xs[4 * s + j]);'''
+                srcElemB = static_cast<ExtendedElementB>
+                                          (AA64FpOp2_srcB[4 * s + j]);'''
        else:
            code += '''
-                srcElem2 = static_cast<DElement>(AA64FpOp2_xs[4 * i + j]);'''
+                srcElemB = static_cast<ExtendedElementB>
+                                          (AA64FpOp2_srcB[4 * i + j]);'''
        code += '''
-                res += srcElem1 * srcElem2;
+                res += srcElemA * srcElemB;
            }
            AA64FpDestMerge_xd[i] = res;
        }'''
@@ -3129,7 +3142,7 @@ let {{
            header_output += SveWideningTerImmOpDeclare.subst(iop)
        else:
            header_output += SveWideningTerOpDeclare.subst(iop)
-        exec_output += SveWideningOpExecute.subst(iop)
+        exec_output += SveWideningTerOpExecute.subst(iop)
        for type in types:
            substDict = {'targs': type, 'class_name': 'Sve' + Name}
            exec_output += SveOpExecDeclare.subst(substDict)
@@ -4468,11 +4481,14 @@ let {{
    sveBinInst('sdivr', 'Sdivr', 'SimdDivOp', signedTypes, sdivrCode,
               PredType.MERGE, True)
    # SDOT (indexed)
-    sveDotInst('sdot', 'Sdoti', 'SimdAluOp', ['int8_t, int32_t',
-        'int16_t, int64_t'], isIndexed = True)
+    sveDotInst('sdot', 'Sdoti', 'SimdAluOp', ['int8_t, int8_t, int32_t',
+        'int16_t, int16_t, int64_t'], isIndexed = True)
    # SDOT (vectors)
-    sveDotInst('sdot', 'Sdotv', 'SimdAluOp', ['int8_t, int32_t',
-        'int16_t, int64_t'], isIndexed = False)
+    sveDotInst('sdot', 'Sdotv', 'SimdAluOp', ['int8_t, int8_t, int32_t',
+        'int16_t, int16_t, int64_t'], isIndexed = False)
+    # SUDOT (indexed)
+    sveDotInst('sudot', 'Sudoti', 'SimdAluOp', ['int8_t, uint8_t, int32_t'],
+               isIndexed = True)
    # SEL (predicates)
    selCode = 'destElem = srcElem1;'
    svePredLogicalInst('sel', 'PredSel', 'SimdPredAluOp', ('uint8_t',),
@@ -4857,11 +4873,17 @@ let {{
    sveBinInst('udivr', 'Udivr', 'SimdDivOp', unsignedTypes, udivrCode,
               PredType.MERGE, True)
    # UDOT (indexed)
-    sveDotInst('udot', 'Udoti', 'SimdAluOp', ['uint8_t, uint32_t',
-        'uint16_t, uint64_t'], isIndexed = True)
+    sveDotInst('udot', 'Udoti', 'SimdAluOp', ['uint8_t, uint8_t, uint32_t',
+        'uint16_t, uint16_t, uint64_t'], isIndexed = True)
    # UDOT (vectors)
-    sveDotInst('udot', 'Udotv', 'SimdAluOp', ['uint8_t, uint32_t',
-        'uint16_t, uint64_t'], isIndexed = False)
+    sveDotInst('udot', 'Udotv', 'SimdAluOp', ['uint8_t, uint8_t, uint32_t',
+        'uint16_t, uint16_t, uint64_t'], isIndexed = False)
+    # USDOT (indexed)
+    sveDotInst('usdot', 'Usdoti', 'SimdAluOp', ['uint8_t, int8_t, int32_t'],
+               isIndexed = True)
+    # USDOT (vectors)
+    sveDotInst('usdot', 'Usdotv', 'SimdAluOp', ['uint8_t, int8_t, int32_t'],
+               isIndexed = False)
    # UMAX (immediate)
    sveWideImmInst('umax', 'UmaxImm', 'SimdCmpOp', unsignedTypes, maxCode)
    # UMAX (vectors)
--- a/src/arch/arm/isa/templates/sve.isa
+++ b/src/arch/arm/isa/templates/sve.isa
@@ -1139,17 +1139,22 @@ class %(class_name)s : public %(base_class)s
 }};

 def template SveWideningTerImmOpDeclare {{
-template <class _SElement, class _DElement>
+template <class _SElementA, class _SElementB, class _DElement>
 class %(class_name)s : public %(base_class)s
 {
+  static_assert(sizeof(_SElementA) == sizeof(_SElementB),
+                "Source elements must have the same size.");
+
  private:
    %(reg_idx_arr_decl)s;

  protected:
    typedef _DElement Element;
-    typedef _SElement SElement;
+    typedef _SElementA SElementA;
+    typedef _SElementB SElementB;
    typedef _DElement DElement;
-    typedef _SElement TPSElem;
+    typedef _SElementA TPSrcAElem;
+    typedef _SElementB TPSrcBElem;
    typedef _DElement TPDElem;

  public:
@@ -1168,7 +1173,7 @@ class %(class_name)s : public %(base_class)s
 }};

 def template SveWideningTerOpDeclare {{
-template <class _SElement, class _DElement>
+template <class _SElementA, class _SElementB, class _DElement>
 class %(class_name)s : public %(base_class)s
 {
  private:
@@ -1176,9 +1181,11 @@ class %(class_name)s : public %(base_class)s

  protected:
    typedef _DElement Element;
-    typedef _SElement SElement;
+    typedef _SElementA SElementA;
+    typedef _SElementB SElementB;
    typedef _DElement DElement;
-    typedef _SElement TPSElem;
+    typedef _SElementA TPSrcAElem;
+    typedef _SElementB TPSrcBElem;
    typedef _DElement TPDElem;

  public:
@@ -1295,6 +1302,26 @@ def template SveWideningOpExecute {{
    }
 }};

+def template SveWideningTerOpExecute {{
+    template <class SElementA, class SElementB, class DElement>
+    Fault %(class_name)s<SElementA, SElementB, DElement>::execute
+           (ExecContext *xc,
+            trace::InstRecord *traceData) const
+    {
+        Fault fault = NoFault;
+        %(op_decl)s;
+        %(op_rd)s;
+
+        %(code)s;
+        if (fault == NoFault)
+        {
+            %(op_wb)s;
+        }
+
+        return fault;
+    }
+}};
+
 def template SveNonTemplatedOpExecute {{
    Fault
    %(class_name)s::execute(ExecContext *xc,