x86: Fix the multiplication microops.

If the operands were 64 bits wide, an intermediate calculation could lose a carry bit. This change rearranges that intermediate calculation when the operand width is large, and reworks the microop implementation in general in an attempt to make it easier to understand.

Change-Id: Ib36333f3f2695a33cd9623e43682de22ebd2e7ea
Reviewed-on: https://gem5-review.googlesource.com/3381
Reviewed-by: Jason Lowe-Power <jason@lowepower.com>
Reviewed-by: Anthony Gutierrez <anthony.gutierrez@amd.com>
Maintainer: Anthony Gutierrez <anthony.gutierrez@amd.com>
2017-05-15 19:39:51 -07:00
parent 05c486c5eb
commit c1ec4c4f8c
1 changed files with 58 additions and 22 deletions
--- a/src/arch/x86/isa/microops/regop.isa
+++ b/src/arch/x86/isa/microops/regop.isa
@@ -546,23 +546,42 @@ let {{
    class Mul1s(WrRegOp):
        op_class = 'IntMultOp'

+        # Multiply two values Aa = (A << p) + a and Bb = (B << p) + b, then
+        # correct for negative operands.
+        #   Aa * Bb
+        # = ((A << p) + a) * ((B << p) + b)
+        # = ((A * B) << 2p) + ((A * b + a * B) << p) + (a * b)
        code = '''
            ProdLow = psrc1 * op2;
-            int halfSize = (dataSize * 8) / 2;
-            uint64_t shifter = (ULL(1) << halfSize);
-            uint64_t hiResult;
-            uint64_t psrc1_h = psrc1 / shifter;
-            uint64_t psrc1_l = psrc1 & mask(halfSize);
-            uint64_t psrc2_h = (op2 / shifter) & mask(halfSize);
-            uint64_t psrc2_l = op2 & mask(halfSize);
-            hiResult = ((psrc1_l * psrc2_h + psrc1_h * psrc2_l +
-                        ((psrc1_l * psrc2_l) / shifter)) /shifter) +
-                       psrc1_h * psrc2_h;
+
+            int p = (dataSize * 8) / 2;
+            uint64_t A = bits(psrc1, 2 * p - 1, p);
+            uint64_t a = bits(psrc1, p - 1, 0);
+            uint64_t B = bits<uint64_t>(op2, 2 * p - 1, p);
+            uint64_t b = bits<uint64_t>(op2, p - 1, 0);
+
+            uint64_t c1, c2; // Carry between place values.
+            uint64_t ab = a * b, Ab = A * b, aB = a * B, AB = A * B;
+
+            c1 = ab >> p;
+
+            // Be careful to avoid overflow if p is large.
+            if (p == 32) {
+                c2 = (c1 >> 1) + (Ab >> 1) + (aB >> 1);
+                c2 += ((c1 & 0x1) + (Ab & 0x1) + (aB & 0x1)) >> 1;
+                c2 >>= (p - 1);
+            } else {
+                c2 = (c1 + Ab + aB) >> p;
+            }
+
+            uint64_t hi = AB + c2;
+
            if (bits(psrc1, dataSize * 8 - 1))
-                hiResult -= op2;
+                hi -= op2;
            if (bits(op2, dataSize * 8 - 1))
-                hiResult -= psrc1;
-            ProdHi = hiResult;
+                hi -= psrc1;
+
+            ProdHi = hi;
            '''
        flag_code = '''
            if ((-ProdHi & mask(dataSize * 8)) !=
@@ -578,17 +597,34 @@ let {{
    class Mul1u(WrRegOp):
        op_class = 'IntMultOp'

+        # Multiply two values Aa = (A << p) + a and Bb = (B << p) + b.
+        #   Aa * Bb
+        # = ((A << p) + a) * ((B << p) + b)
+        # = ((A * B) << 2p) + ((A * b + a * B) << p) + (a * b)
        code = '''
            ProdLow = psrc1 * op2;
-            int halfSize = (dataSize * 8) / 2;
-            uint64_t shifter = (ULL(1) << halfSize);
-            uint64_t psrc1_h = psrc1 / shifter;
-            uint64_t psrc1_l = psrc1 & mask(halfSize);
-            uint64_t psrc2_h = (op2 / shifter) & mask(halfSize);
-            uint64_t psrc2_l = op2 & mask(halfSize);
-            ProdHi = ((psrc1_l * psrc2_h + psrc1_h * psrc2_l +
-                      ((psrc1_l * psrc2_l) / shifter)) / shifter) +
-                     psrc1_h * psrc2_h;
+
+            int p = (dataSize * 8) / 2;
+            uint64_t A = bits(psrc1, 2 * p - 1, p);
+            uint64_t a = bits(psrc1, p - 1, 0);
+            uint64_t B = bits<uint64_t>(op2, 2 * p - 1, p);
+            uint64_t b = bits<uint64_t>(op2, p - 1, 0);
+
+            uint64_t c1, c2; // Carry between place values.
+            uint64_t ab = a * b, Ab = A * b, aB = a * B, AB = A * B;
+
+            c1 = ab >> p;
+
+            // Be careful to avoid overflow if p is large.
+            if (p == 32) {
+                c2 = (c1 >> 1) + (Ab >> 1) + (aB >> 1);
+                c2 += ((c1 & 0x1) + (Ab & 0x1) + (aB & 0x1)) >> 1;
+                c2 >>= (p - 1);
+            } else {
+                c2 = (c1 + Ab + aB) >> p;
+            }
+
+            ProdHi = AB + c2;
            '''
        flag_code = '''
            if (ProdHi) {