From f56459470a481c49d1c81992dc4ae34a13589f66 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Chang <rogerycchang@google.com>
Date: Fri, 19 Jan 2024 04:40:11 +0800
Subject: [PATCH] arch-riscv: Refactor the RISC-V multiplication utility (#780)

1. Add double_width specializations for int64_t and uint64_t, mapping to
   __int128_t and __uint128_t respectively
2. Use the double-width type to compute the upper half of the product in
   templated mulh/mulhsu/mulhu, replacing the separate _32/_64 helpers

Change-Id: Id6cfa6f274c65592b2b3e2b70c00f82954b41f1a
---
 src/arch/riscv/isa/decoder.isa | 24 +++++-----
 src/arch/riscv/utility.hh      | 84 +++++++++++-----------------------
 2 files changed, 39 insertions(+), 69 deletions(-)

diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa
index be4b621a37..11d1a64545 100644
--- a/src/arch/riscv/isa/decoder.isa
+++ b/src/arch/riscv/isa/decoder.isa
@@ -1467,9 +1467,9 @@ decode QUADRANT default Unknown::unknown() {
                     }});
                     0x1: mulh({{
                         if (machInst.rv_type == RV32) {
-                            Rd_sd = mulh_32(Rs1_sd, Rs2_sd);
+                            Rd_sd = mulh<int32_t>(Rs1_sd, Rs2_sd);
                         } else {
-                            Rd_sd = mulh_64(Rs1_sd, Rs2_sd);
+                            Rd_sd = mulh<int64_t>(Rs1_sd, Rs2_sd);
                         }
                     }}, IntMultOp);
                     0x5: clmul({{
@@ -1506,9 +1506,9 @@ decode QUADRANT default Unknown::unknown() {
                     }});
                     0x1: mulhsu({{
                         if (machInst.rv_type == RV32) {
-                            Rd_sd = mulhsu_32(Rs1_sd, Rs2);
+                            Rd_sd = mulhsu<int32_t>(Rs1_sd, Rs2);
                         } else {
-                            Rd_sd = mulhsu_64(Rs1_sd, Rs2);
+                            Rd_sd = mulhsu<int64_t>(Rs1_sd, Rs2);
                         }
                     }}, IntMultOp);
                     0x5: clmulr({{
@@ -1539,9 +1539,9 @@ decode QUADRANT default Unknown::unknown() {
                     }});
                     0x1: mulhu({{
                         if (machInst.rv_type == RV32) {
-                            Rd = (int32_t)mulhu_32(Rs1, Rs2);
+                            Rd = (int32_t)mulhu<uint32_t>(Rs1, Rs2);
                         } else {
-                            Rd = mulhu_64(Rs1, Rs2);
+                            Rd = mulhu<uint64_t>(Rs1, Rs2);
                         }
                     }}, IntMultOp);
                     0x5: clmulh({{
@@ -3292,7 +3292,7 @@ decode QUADRANT default Unknown::unknown() {
                             Vd_vu[i] = ((uint64_t)Vs2_vu[i] * Vs1_vu[i])
                                         >> sew;
                         } else {
-                            Vd_vu[i] = mulhu_64(Vs2_vu[i], Vs1_vu[i]);
+                            Vd_vu[i] = mulhu<uint64_t>(Vs2_vu[i], Vs1_vu[i]);
                         }
                     }}, OPMVV, VectorIntegerArithOp);
                     0x25: vmul_vv({{
@@ -3304,7 +3304,7 @@ decode QUADRANT default Unknown::unknown() {
                                         (uint64_t)Vs1_vu[i])
                                         >> sew;
                         } else {
-                            Vd_vi[i] = mulhsu_64(Vs2_vi[i], Vs1_vu[i]);
+                            Vd_vi[i] = mulhsu<int64_t>(Vs2_vi[i], Vs1_vu[i]);
                         }
                     }}, OPMVV, VectorIntegerArithOp);
                     0x27: vmulh_vv({{
@@ -3312,7 +3312,7 @@ decode QUADRANT default Unknown::unknown() {
                             Vd_vi[i] = ((int64_t)Vs2_vi[i] * Vs1_vi[i])
                                         >> sew;
                         } else {
-                            Vd_vi[i] = mulh_64(Vs2_vi[i], Vs1_vi[i]);
+                            Vd_vi[i] = mulh<int64_t>(Vs2_vi[i], Vs1_vi[i]);
                         }
                     }}, OPMVV, VectorIntegerArithOp);
                     0x29: vmadd_vv({{
@@ -4384,7 +4384,7 @@ decode QUADRANT default Unknown::unknown() {
                             Vd_vu[i] = ((uint64_t)Vs2_vu[i] * Rs1_vu)
                                         >> sew;
                         else
-                            Vd_vu[i] = mulhu_64(Vs2_vu[i], Rs1_vu);
+                            Vd_vu[i] = mulhu<uint64_t>(Vs2_vu[i], Rs1_vu);
                     }}, OPMVX, VectorIntegerArithOp);
                     0x25: vmul_vx({{
                         Vd_vi[i] = Vs2_vi[i] * Rs1_vi;
@@ -4395,14 +4395,14 @@ decode QUADRANT default Unknown::unknown() {
                                         (uint64_t)Rs1_vu)
                                         >> sew;
                         else
-                            Vd_vi[i] = mulhsu_64(Vs2_vi[i], Rs1_vu);
+                            Vd_vi[i] = mulhsu<int64_t>(Vs2_vi[i], Rs1_vu);
                     }}, OPMVX, VectorIntegerArithOp);
                     0x27: vmulh_vx({{
                         if (sew < 64)
                             Vd_vi[i] = ((int64_t)Vs2_vi[i] * Rs1_vi)
                                         >> sew;
                         else
-                            Vd_vi[i] = mulh_64(Vs2_vi[i], Rs1_vi);
+                            Vd_vi[i] = mulh<int64_t>(Vs2_vi[i], Rs1_vi);
                     }}, OPMVX, VectorIntegerArithOp);
                     0x29: vmadd_vx({{
                         Vd_vi[i] = Vs3_vi[i] * Rs1_vi + Vs2_vi[i];
diff --git a/src/arch/riscv/utility.hh b/src/arch/riscv/utility.hh
index bac499e523..cf5620d250 100644
--- a/src/arch/riscv/utility.hh
+++ b/src/arch/riscv/utility.hh
@@ -65,6 +65,21 @@ namespace gem5
 namespace RiscvISA
 {
 
+template<typename Type> struct double_width;
+template<> struct double_width<uint8_t>     { using type = uint16_t;};
+template<> struct double_width<uint16_t>    { using type = uint32_t;};
+template<> struct double_width<uint32_t>    { using type = uint64_t;};
+template<> struct double_width<uint64_t>    { using type = __uint128_t;};
+template<> struct double_width<int8_t>      { using type = int16_t; };
+template<> struct double_width<int16_t>     { using type = int32_t; };
+template<> struct double_width<int32_t>     { using type = int64_t; };
+template<> struct double_width<int64_t>     { using type = __int128_t; };
+template<> struct double_width<float32_t>   { using type = float64_t;};
+
+template<typename Type> struct double_widthf;
+template<> struct double_widthf<uint32_t>    { using type = float64_t;};
+template<> struct double_widthf<int32_t>     { using type = float64_t;};
+
 template<typename T> inline bool
 isquietnan(T val)
 {
@@ -146,57 +161,25 @@ registerName(RegId reg)
     }
 }
 
-inline uint32_t
-mulhu_32(uint32_t rs1, uint32_t rs2)
+template <typename T> inline std::make_unsigned_t<T>
+mulhu(std::make_unsigned_t<T> rs1, std::make_unsigned_t<T> rs2)
 {
-    return ((uint64_t)rs1 * rs2) >> 32;
+    using WideT = typename double_width<std::make_unsigned_t<T>>::type;
+    return ((WideT)rs1 * rs2) >> (sizeof(T) * 8);
 }
 
-inline uint64_t
-mulhu_64(uint64_t rs1, uint64_t rs2)
+template <typename T> inline std::make_signed_t<T>
+mulh(std::make_signed_t<T> rs1, std::make_signed_t<T> rs2)
 {
-    uint64_t rs1_lo = (uint32_t)rs1;
-    uint64_t rs1_hi = rs1 >> 32;
-    uint64_t rs2_lo = (uint32_t)rs2;
-    uint64_t rs2_hi = rs2 >> 32;
-
-    uint64_t hi = rs1_hi * rs2_hi;
-    uint64_t mid1 = rs1_hi * rs2_lo;
-    uint64_t mid2 = rs1_lo * rs2_hi;
-    uint64_t lo = rs1_lo * rs2_lo;
-    uint64_t carry = ((uint64_t)(uint32_t)mid1
-            + (uint64_t)(uint32_t)mid2
-            + (lo >> 32)) >> 32;
-
-    return hi + (mid1 >> 32) + (mid2 >> 32) + carry;
+    using WideT = typename double_width<std::make_signed_t<T>>::type;
+    return ((WideT)rs1 * rs2) >> (sizeof(T) * 8);
 }
 
-inline int32_t
-mulh_32(int32_t rs1, int32_t rs2)
+template <typename T> inline std::make_signed_t<T>
+mulhsu(std::make_signed_t<T> rs1, std::make_unsigned_t<T> rs2)
 {
-    return ((int64_t)rs1 * rs2) >> 32;
-}
-
-inline int64_t
-mulh_64(int64_t rs1, int64_t rs2)
-{
-    bool negate = (rs1 < 0) != (rs2 < 0);
-    uint64_t res = mulhu_64(std::abs(rs1), std::abs(rs2));
-    return negate ? ~res + (rs1 * rs2 == 0 ? 1 : 0) : res;
-}
-
-inline int32_t
-mulhsu_32(int32_t rs1, uint32_t rs2)
-{
-    return ((int64_t)rs1 * rs2) >> 32;
-}
-
-inline int64_t
-mulhsu_64(int64_t rs1, uint64_t rs2)
-{
-    bool negate = rs1 < 0;
-    uint64_t res = mulhu_64(std::abs(rs1), rs2);
-    return negate ? ~res + (rs1 * rs2 == 0 ? 1 : 0) : res;
+    using WideT = typename double_width<std::make_signed_t<T>>::type;
+    return ((WideT)rs1 * rs2) >> (sizeof(T) * 8);
 }
 
 template<typename T> inline T
@@ -323,19 +306,6 @@ elem_mask(const T* vs, const int index)
     return (vs[idx] >> pos) & 1;
 }
 
-template<typename Type> struct double_width;
-template<> struct double_width<uint8_t>     { using type = uint16_t;};
-template<> struct double_width<uint16_t>    { using type = uint32_t;};
-template<> struct double_width<uint32_t>    { using type = uint64_t;};
-template<> struct double_width<int8_t>      { using type = int16_t; };
-template<> struct double_width<int16_t>     { using type = int32_t; };
-template<> struct double_width<int32_t>     { using type = int64_t; };
-template<> struct double_width<float32_t>   { using type = float64_t;};
-
-template<typename Type> struct double_widthf;
-template<> struct double_widthf<uint32_t>    { using type = float64_t;};
-template<> struct double_widthf<int32_t>     { using type = float64_t;};
-
 template<typename FloatType, typename IntType = decltype(FloatType::v)> auto
 ftype(IntType a) -> FloatType
 {