diff --git a/src/arch/riscv/insts/vector.cc b/src/arch/riscv/insts/vector.cc
index f2bde629e9..a1ccf402c9 100644
--- a/src/arch/riscv/insts/vector.cc
+++ b/src/arch/riscv/insts/vector.cc
@@ -122,6 +122,93 @@ VConfOp::generateZimmDisassembly() const
     return s.str();
 }
 
+// Emits "<mnemonic> <dest0>, <src0>" and appends ", v0.t" when vm == 0.
+std::string
+VectorNonSplitInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream text;
+    text << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+         << registerName(srcRegIdx(0)) << (machInst.vm == 0 ? ", v0.t" : "");
+    return text.str();
+}
+
+// Emits "<mnemonic> <dest0>, <src0>, <imm>" when funct3 == 0x3 (OPIVI),
+// else "<mnemonic> <dest0>, <src1>, <src0>"; adds ", v0.t" when vm == 0.
+std::string VectorArithMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    const bool has_imm = (machInst.funct3 == 0x3);
+    std::stringstream text;
+    text << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+         << registerName(srcRegIdx(has_imm ? 0 : 1)) << ", ";
+    if (has_imm) text << machInst.vecimm;
+    else text << registerName(srcRegIdx(0));
+    if (machInst.vm == 0) text << ", v0.t";
+    return text.str();
+}
+
+// Emits "<mnemonic> <dest0>, <src0>, <imm>" when funct3 == 0x3 (OPIVI),
+// else "<mnemonic> <dest0>, <src1>, <src0>"; adds ", v0.t" when vm == 0.
+std::string VectorArithMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    const bool has_imm = (machInst.funct3 == 0x3);
+    std::stringstream text;
+    text << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+         << registerName(srcRegIdx(has_imm ? 0 : 1)) << ", ";
+    if (has_imm) text << machInst.vecimm;
+    else text << registerName(srcRegIdx(0));
+    if (machInst.vm == 0) text << ", v0.t";
+    return text.str();
+}
+
+std::string VectorVMUNARY0MicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{   // Result: "<mnemonic> <dest0>", plus ", v0.t" when the vm bit is 0.
+    std::stringstream text;
+    text << mnemonic << ' ' << registerName(destRegIdx(0))
+         << (machInst.vm == 0 ? ", v0.t" : "");
+    return text.str();
+}
+
+std::string VectorVMUNARY0MacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{   // Result: "<mnemonic> <dest0>", plus ", v0.t" when the vm bit is 0.
+    std::stringstream text;
+    text << mnemonic << ' ' << registerName(destRegIdx(0))
+         << (machInst.vm == 0 ? ", v0.t" : "");
+    return text.str();
+}
+
+// Operands: "<src0>, <imm>" when funct3 == 0x3, else "<src1>, <src0>".
+std::string VectorSlideMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    const bool has_imm = (machInst.funct3 == 0x3);
+    std::stringstream text;
+    text << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+         << registerName(srcRegIdx(has_imm ? 0 : 1)) << ", ";
+    if (has_imm) text << machInst.vecimm;
+    else text << registerName(srcRegIdx(0));
+    if (machInst.vm == 0) text << ", v0.t";
+    return text.str();
+}
+
+// Operands: "<src0>, <imm>" when funct3 == 0x3, else "<src1>, <src0>".
+std::string VectorSlideMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    const bool has_imm = (machInst.funct3 == 0x3);
+    std::stringstream text;
+    text << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+         << registerName(srcRegIdx(has_imm ? 0 : 1)) << ", ";
+    if (has_imm) text << machInst.vecimm;
+    else text << registerName(srcRegIdx(0));
+    if (machInst.vm == 0) text << ", v0.t";
+    return text.str();
+}
+
 std::string VleMicroInst::generateDisassembly(Addr pc,
         const loader::SymbolTable *symtab) const
 {
@@ -295,5 +382,25 @@ std::string VsIndexMicroInst::generateDisassembly(Addr pc,
     return ss.str();
 }
 
+// Emits "<mnemonic> <dest0>, <src1>"; no mask suffix is ever appended.
+std::string VMvWholeMacroInst::generateDisassembly(Addr pc,
+    const loader::SymbolTable *symtab) const
+{
+    std::stringstream text;
+    text << mnemonic << ' ' << registerName(destRegIdx(0));
+    text << ", " << registerName(srcRegIdx(1));
+    return text.str();
+}
+
+// Emits "<mnemonic> <dest0>, <src1>"; no mask suffix is ever appended.
+std::string VMvWholeMicroInst::generateDisassembly(Addr pc,
+    const loader::SymbolTable *symtab) const
+{
+    std::stringstream text;
+    text << mnemonic << ' ' << registerName(destRegIdx(0));
+    text << ", " << registerName(srcRegIdx(1));
+    return text.str();
+}
+
 } // namespace RiscvISA
 } // namespace gem5
diff --git a/src/arch/riscv/insts/vector.hh b/src/arch/riscv/insts/vector.hh
index f989d7ffbf..5d0874a994 100644
--- a/src/arch/riscv/insts/vector.hh
+++ b/src/arch/riscv/insts/vector.hh
@@ -89,6 +89,24 @@ inline uint8_t checked_vtype(bool vill, uint8_t vtype) {
     return vtype;
 }
 
+// Static instruction base that captures vl and the checked vtype from the
+// machine instruction and marks itself with the IsVector flag.
+class VectorNonSplitInst : public RiscvStaticInst
+{
+  protected:
+    uint32_t vl;
+    uint8_t vtype;
+
+    VectorNonSplitInst(const char* mnem, ExtMachInst _machInst,
+                       OpClass __opClass)
+        : RiscvStaticInst(mnem, _machInst, __opClass), vl(_machInst.vl),
+          vtype(checked_vtype(_machInst.vill, _machInst.vtype8))
+    { flags[IsVector] = true; }
+
+    std::string generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const override;
+};
+
 class VectorMacroInst : public RiscvMacroInst
 {
   protected:
@@ -170,6 +188,63 @@ class VectorArithMacroInst : public VectorMacroInst
             Addr pc, const loader::SymbolTable *symtab) const override;
 };
 
+// Micro-op base that forwards all constructor arguments to VectorMicroInst
+// and supplies its own disassembly formatter.
+class VectorVMUNARY0MicroInst : public VectorMicroInst
+{
+  protected:
+    VectorVMUNARY0MicroInst(const char *mnem, ExtMachInst _machInst,
+            OpClass __opClass, uint8_t _microVl, uint8_t _microIdx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {}
+    std::string generateDisassembly(Addr pc,
+            const loader::SymbolTable *symtab) const override;
+};
+
+// Macro-op base built on VectorMacroInst; sets the IsVector flag and
+// supplies its own disassembly formatter.
+class VectorVMUNARY0MacroInst : public VectorMacroInst
+{
+  protected:
+    VectorVMUNARY0MacroInst(const char* mnem, ExtMachInst _machInst,
+                            OpClass __opClass)
+        : VectorMacroInst(mnem, _machInst, __opClass)
+    { flags[IsVector] = true; }
+
+    std::string generateDisassembly(Addr pc,
+            const loader::SymbolTable *symtab) const override;
+};
+
+// Macro-op base built on VectorMacroInst; sets the IsVector flag and
+// supplies its own disassembly formatter.
+class VectorSlideMacroInst : public VectorMacroInst
+{
+  protected:
+    VectorSlideMacroInst(const char* mnem, ExtMachInst _machInst,
+                         OpClass __opClass)
+        : VectorMacroInst(mnem, _machInst, __opClass)
+    { flags[IsVector] = true; }
+
+    std::string generateDisassembly(Addr pc,
+            const loader::SymbolTable *symtab) const override;
+};
+
+// Micro-op base that additionally stores the vdIdx and vs2Idx it is given.
+class VectorSlideMicroInst : public VectorMicroInst
+{
+  protected:
+    uint8_t vdIdx;
+    uint8_t vs2Idx;
+    VectorSlideMicroInst(const char *mnem, ExtMachInst _machInst,
+            OpClass __opClass, uint8_t _microVl, uint8_t _microIdx,
+            uint8_t _vdIdx, uint8_t _vs2Idx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx),
+          vdIdx(_vdIdx), vs2Idx(_vs2Idx)
+    {}
+    std::string generateDisassembly(Addr pc,
+            const loader::SymbolTable *symtab) const override;
+};
+
 class VectorMemMicroInst : public VectorMicroInst
 {
   protected:
@@ -421,6 +496,131 @@ class VsIndexMicroInst : public VectorMemMicroInst
         Addr pc, const loader::SymbolTable *symtab) const override;
 };
 
+// Specialization of VectorArithMacroInst that overrides the disassembly.
+class VMvWholeMacroInst : public VectorArithMacroInst
+{
+  protected:
+    VMvWholeMacroInst(const char* mnem, ExtMachInst _machInst,
+                      OpClass __opClass)
+        : VectorArithMacroInst(mnem, _machInst, __opClass)
+    {}
+    std::string generateDisassembly(Addr pc,
+            const loader::SymbolTable *symtab) const override;
+};
+
+// Specialization of VectorArithMicroInst that overrides the disassembly.
+class VMvWholeMicroInst : public VectorArithMicroInst
+{
+  protected:
+    VMvWholeMicroInst(const char *mnem, ExtMachInst _machInst,
+            OpClass __opClass, uint8_t _microVl, uint8_t _microIdx)
+        : VectorArithMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {}
+
+    std::string generateDisassembly(Addr pc,
+            const loader::SymbolTable *symtab) const override;
+};
+
+template<typename ElemType>
+class VMaskMergeMicroInst : public VectorArithMicroInst
+{   // Micro-op that assembles one destination register from several sources.
+  private:
+    RegId srcRegIdxArr[NumVecInternalRegs]; // source operand ids
+    RegId destRegIdxArr[1];                 // single destination operand id
+
+  public:
+    VMaskMergeMicroInst(ExtMachInst extMachInst, uint8_t _dstReg,
+        uint8_t _numSrcs) // NOTE(review): not checked vs NumVecInternalRegs
+        : VectorArithMicroInst("vmask_mv_micro", extMachInst,
+          VectorIntegerArithOp, 0, 0)
+    {
+        setRegIdxArrays(
+            reinterpret_cast<RegIdArrayPtr>(
+                &std::remove_pointer_t<decltype(this)>::srcRegIdxArr),
+            reinterpret_cast<RegIdArrayPtr>(
+                &std::remove_pointer_t<decltype(this)>::destRegIdxArr));
+        // Start from empty operand lists, then register dest and sources.
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+
+        setDestRegIdx(_numDestRegs++, vecRegClass[_dstReg]);
+        _numTypedDestRegs[VecRegClass]++;
+        for (uint8_t i=0; i<_numSrcs; i++) { // VecMemInternalReg0 + i
+            setSrcRegIdx(_numSrcRegs++, vecRegClass[VecMemInternalReg0 + i]);
+        }
+    }
+    // Copies source 0 into vd, then overlays one slice per further source.
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)
+            const override {
+        vreg_t tmp_d0 = *(vreg_t *)xc->getWritableRegOperand(this, 0);
+        auto Vd = tmp_d0.as<uint8_t>();
+        constexpr uint8_t elems_per_vreg = VLENB / sizeof(ElemType);
+        size_t bit_cnt = elems_per_vreg; // bit position of source 1's slice
+        vreg_t tmp_s;
+        xc->getRegOperand(this, 0, &tmp_s);
+        auto s = tmp_s.as<uint8_t>();
+        // Copy all VLENB bytes of source 0 (its slice and everything after).
+        memcpy(Vd, s, VLENB);
+        for (uint8_t i = 1; i < this->_numSrcRegs; i++) {
+            xc->getRegOperand(this, i, &tmp_s);
+            s = tmp_s.as<uint8_t>();
+            if constexpr (elems_per_vreg < 8) { // slice narrower than a byte
+                constexpr uint8_t m = (1 << elems_per_vreg) - 1;
+                const uint8_t mask = m << (i * elems_per_vreg % 8);
+                // Clear the slice's bits in Vd, then take them from source i.
+                Vd[bit_cnt/8] ^= Vd[bit_cnt/8] & mask;
+                Vd[bit_cnt/8] |= s[bit_cnt/8] & mask;
+                bit_cnt += elems_per_vreg;
+            } else { // slice is byte_offset bytes, copied at i * byte_offset
+                constexpr uint8_t byte_offset = elems_per_vreg / 8;
+                memcpy(Vd + i * byte_offset, s + i * byte_offset, byte_offset);
+            }
+        }
+        xc->setRegOperand(this, 0, &tmp_d0); // write the merged copy back
+        if (traceData)
+            traceData->setData(vecRegClass, &tmp_d0);
+        return NoFault;
+    }
+    // Lists the mnemonic, destination, every source, then "offset:<count>".
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << registerName(destRegIdx(0));
+        for (uint8_t i = 0; i < this->_numSrcRegs; i++) {
+            ss << ", " << registerName(srcRegIdx(i));
+        }
+        ss << ", offset:" << VLENB / sizeof(ElemType);
+        return ss.str();
+    }
+};
+
+// Micro-op writing *vxsat to MISCREG_VXSAT and to bit 0 of MISCREG_VCSR.
+class VxsatMicroInst : public VectorArithMicroInst
+{
+  private:
+    bool* vxsat;
+  public:
+    VxsatMicroInst(bool* Vxsat, ExtMachInst extMachInst)
+        : VectorArithMicroInst("vxsat_micro", extMachInst,
+          VectorIntegerArithOp, 0, 0), vxsat(Vxsat)
+    {}
+    Fault
+    execute(ExecContext* xc, trace::InstRecord* traceData) const override
+    {
+        xc->setMiscReg(MISCREG_VXSAT, *vxsat);
+        const auto vcsr = xc->readMiscReg(MISCREG_VCSR);
+        xc->setMiscReg(MISCREG_VCSR, (vcsr & ~1) | *vxsat);
+        return NoFault;
+    }
+    std::string
+    generateDisassembly(Addr pc, const loader::SymbolTable *symtab)
+        const override
+    {
+        std::stringstream text;
+        text << mnemonic << ' ' << "VXSAT" << ", " << (*vxsat ? "0x1" : "0x0");
+        return text.str();
+    }
+};
 
 } // namespace RiscvISA
 } // namespace gem5
diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa
index 0288f37ad8..2b46752ffe 100644
--- a/src/arch/riscv/isa/decoder.isa
+++ b/src/arch/riscv/isa/decoder.isa
@@ -2281,6 +2281,2060 @@ decode QUADRANT default Unknown::unknown() {
         }
 
         0x15: decode FUNCT3 {
+            // OPIVV
+            0x0: decode VFUNCT6 {
+                format VectorIntFormat { // per element: vd = vs2 <op> vs1
+                    0x0: vadd_vv({{
+                        Vd_vu[i] = Vs2_vu[i] + Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x2: vsub_vv({{
+                        Vd_vu[i] = Vs2_vu[i] - Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x4: vminu_vv({{ // smaller operand, compared as _vu
+                        Vd_vu[i] = Vs2_vu[i] < Vs1_vu[i] ?
+                                Vs2_vu[i] : Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x5: vmin_vv({{ // smaller operand, compared as _vi
+                        Vd_vi[i] = Vs2_vi[i] < Vs1_vi[i] ?
+                                Vs2_vi[i] : Vs1_vi[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x6: vmaxu_vv({{ // larger operand, compared as _vu
+                        Vd_vu[i] = Vs2_vu[i] > Vs1_vu[i] ?
+                                Vs2_vu[i] : Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x7: vmax_vv({{ // larger operand, compared as _vi
+                        Vd_vi[i] = Vs2_vi[i] > Vs1_vi[i] ?
+                                Vs2_vi[i] : Vs1_vi[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x9: vand_vv({{
+                        Vd_vu[i] = Vs2_vu[i] & Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0xa: vor_vv({{
+                        Vd_vu[i] = Vs2_vu[i] | Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0xb: vxor_vv({{
+                        Vd_vu[i] = Vs2_vu[i] ^ Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                }
+                0x0c: VectorGatherFormat::vrgather_vv({{
+                    for (uint32_t i = 0; i < microVl; i++) {
+                        uint32_t ei = i + vs1_idx * vs1_elems + vs1_bias;
+                        if (this->vm || elem_mask(v0, ei)) { // else: skip i
+                            const uint64_t idx = Vs1_vu[i]
+                                - vs2_elems * vs2_idx;
+                            auto res = (Vs1_vu[i] >= vlmax) ? 0 // index >= vlmax
+                                : (idx < vs2_elems) ? Vs2_vu[idx]
+                                : Vs3_vu[i]; // presumably the old vd - confirm
+                            Vd_vu[i] = res;
+                        }
+                    }
+                }}, OPIVV, VectorMiscOp);
+                0x0e: VectorGatherFormat::vrgatherei16_vv({{
+                    for (uint32_t i = 0; i < microVl; i++) {
+                        uint32_t ei = i + vs1_idx * vs1_elems + vs1_bias;
+                        if (this->vm || elem_mask(v0, ei)) { // else: skip i
+                            const uint16_t idx = Vs1_uh[i + vs1_bias]
+                                - vs2_elems * vs2_idx; // kept as uint16_t
+                            auto res = (Vs1_uh[i + vs1_bias] >= vlmax) ? 0
+                                : (idx < vs2_elems) ? Vs2_vu[idx]
+                                : Vs3_vu[i + vd_bias]; // presumably old vd
+                            Vd_vu[i + vd_bias] = res;
+                        }
+                    }
+                }}, OPIVV, VectorMiscOp);
+                format VectorIntFormat { // entries that consult the v0 bit
+                    0x10: decode VM {
+                        0x0: vadc_vvm({{
+                            Vd_vi[i] = Vs2_vi[i] + Vs1_vi[i]
+                                    + elem_mask(v0, ei); // adds the v0 bit
+                        }}, OPIVV, VectorIntegerArithOp);
+                        // The unmasked encoding (vm=1) is reserved.
+                    }
+                    0x12: decode VM {
+                        0x0: vsbc_vvm({{
+                            Vd_vi[i] = Vs2_vi[i] - Vs1_vi[i]
+                                    - elem_mask(v0, ei); // subtracts the v0 bit
+                        }}, OPIVV, VectorIntegerArithOp);
+                        // The unmasked encoding (vm=1) is reserved.
+                    }
+                    0x17: decode VM {
+                        0x0: vmerge_vvm({{
+                            Vd_vu[i] = elem_mask(v0, ei)
+                                    ? Vs1_vu[i]
+                                    : Vs2_vu[i];
+                        }}, OPIVV, VectorIntegerArithOp);
+                        0x1: decode VS2 { // vm=1 decodes only for VS2 == 0
+                            0x0: vmv_v_v({{
+                                Vd_vu[i] = Vs1_vu[i];
+                            }}, OPIVV, VectorIntegerArithOp);
+                        }
+                    }
+                }
+                format VectorIntVxsatFormat{ // entries pass/write vxsatptr
+                    0x20: vsaddu_vv({{
+                        Vd_vu[i] = sat_addu<vu>(Vs2_vu[i], Vs1_vu[i],
+                            vxsatptr);
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x21: vsadd_vv({{
+                        Vd_vu[i] = sat_add<vi>(Vs2_vu[i], Vs1_vu[i],
+                            vxsatptr);
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x22: vssubu_vv({{
+                        Vd_vu[i] = sat_subu<vu>(Vs2_vu[i], Vs1_vu[i],
+                            vxsatptr);
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x23: vssub_vv({{
+                        Vd_vu[i] = sat_sub<vi>(Vs2_vu[i], Vs1_vu[i],
+                            vxsatptr);
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x27: vsmul_vv({{
+                        vi max = std::numeric_limits<vi>::max();
+                        vi min = std::numeric_limits<vi>::min();
+                        bool overflow = Vs1_vi[i] == Vs2_vi[i] &&
+                                        Vs1_vi[i] == min; // only min * min
+                        __int128_t result = (__int128_t)Vs1_vi[i] *
+                                            (__int128_t)Vs2_vi[i];
+                        result = int_rounding<__int128_t>(
+                            result, 0 /* TODO */, sew - 1); // NOTE(review): mode fixed to 0
+                        result = result >> (sew - 1);
+                        if (overflow) { // saturate to max and raise the flag
+                            result = max;
+                            *vxsatptr = true;
+                        }
+
+                        Vd_vi[i] = (vi)result;
+                    }}, OPIVV, VectorIntegerArithOp);
+                }
+                format VectorIntFormat { // shift amount = vs1 & (sew - 1)
+                    0x25: vsll_vv({{
+                        Vd_vu[i] = Vs2_vu[i] << (Vs1_vu[i] & (sew - 1));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x28: vsrl_vv({{
+                        Vd_vu[i] = Vs2_vu[i] >> (Vs1_vu[i] & (sew - 1));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x29: vsra_vv({{ // shifts the _vi view of vs2
+                        Vd_vi[i] = Vs2_vi[i] >> (Vs1_vu[i] & (sew - 1));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x2a: vssrl_vv({{
+                        int sh = Vs1_vu[i] & (sew - 1);
+                        __uint128_t val = Vs2_vu[i];
+
+                        val = int_rounding<__uint128_t>(val,
+                            xc->readMiscReg(MISCREG_VXRM), sh); // VXRM value
+                        Vd_vu[i] = val >> sh;
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x2b: vssra_vv({{
+                        int sh = Vs1_vi[i] & (sew - 1);
+                        __int128_t val = Vs2_vi[i];
+
+                        val = int_rounding<__int128_t>(val,
+                            xc->readMiscReg(MISCREG_VXRM), sh); // VXRM value
+                        Vd_vi[i] = val >> sh;
+                    }}, OPIVV, VectorIntegerArithOp);
+                }
+                format VectorReduceIntWideningFormat { // writes vd[0] only
+                    0x30: vwredsumu_vs({{
+                        Vd_vwu[0] = reduce_loop(std::plus<vwu>(),
+                            Vs1_vwu, Vs2_vu);
+                    }}, OPIVV, VectorIntegerReduceOp);
+                    0x31: vwredsum_vs({{
+                        Vd_vwu[0] = reduce_loop(std::plus<vwi>(),
+                            Vs1_vwi, Vs2_vi); // NOTE(review): vd uses _vwu view
+                    }}, OPIVV, VectorIntegerReduceOp);
+                }
+                format VectorIntMaskFormat { // rewrites vd byte (i+offset)/8
+                    0x11: decode VM {
+                        0x0: vmadc_vvm({{ // v0 bit is the third input
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vu[i], Vs1_vu[i],
+                                    elem_mask(v0, ei)));
+                        }}, OPIVV, VectorIntegerArithOp);
+                        0x1: vmadc_vv({{ // two-input form
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vu[i], Vs1_vu[i]));
+                        }}, OPIVV, VectorIntegerArithOp);
+                    }
+                    0x13: decode VM {
+                        0x0: vmsbc_vvm({{ // v0 bit is the third input
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                borrow_out(Vs2_vi[i], Vs1_vi[i],
+                                    elem_mask(v0, ei)));
+                        }}, OPIVV, VectorIntegerArithOp);
+                        0x1: vmsbc_vv({{ // two-input form
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                borrow_out(Vs2_vi[i], Vs1_vi[i]));
+                        }}, OPIVV, VectorIntegerArithOp);
+                    }
+                    0x18: vmseq_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] == Vs1_vu[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x19: vmsne_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] != Vs1_vu[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x1a: vmsltu_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] < Vs1_vu[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x1b: vmslt_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] < Vs1_vi[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x1c: vmsleu_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] <= Vs1_vu[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x1d: vmsle_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] <= Vs1_vi[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                }
+                format VectorIntNarrowingFormat { // vd written at i + offset
+                    0x2c: vnsrl_wv({{
+                        Vd_vu[i + offset] = (vu)(Vs2_vwu[i] >>
+                            ((vwu)Vs1_vu[i + offset] & (sew * 2 - 1)));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x2d: vnsra_wv({{
+                        Vd_vi[i + offset] = (vi)(Vs2_vwi[i] >>
+                            ((vwu)Vs1_vu[i + offset] & (sew * 2 - 1)));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x2e: vnclipu_wv({{
+                        vu max = std::numeric_limits<vu>::max();
+                        uint64_t sign_mask =
+                            std::numeric_limits<uint64_t>::max() << sew;
+                        __uint128_t res = Vs2_vwu[i];
+                        unsigned shift = Vs1_vu[i + offset] & ((sew * 2) - 1);
+
+                        res = int_rounding<__uint128_t>(
+                            res, 0 /* TODO */, shift) >> shift;
+
+                        if (res & sign_mask) { // a bit at position >= sew is set
+                            res = max;
+                            // TODO: vxsat
+                        }
+
+                        Vd_vu[i + offset] = (vu)res;
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x2f: vnclip_wv({{
+                        vi max = std::numeric_limits<vi>::max();
+                        vi min = std::numeric_limits<vi>::min();
+                        __int128_t res = Vs2_vwi[i];
+                        unsigned shift = Vs1_vi[i + offset] & ((sew * 2) - 1);
+
+                        res = int_rounding<__int128_t>(
+                            res, 0 /* TODO */, shift) >> shift;
+
+                        if (res < min) { // clamp to the range of vi
+                            res = min;
+                            // TODO: vxsat
+                        } else if (res > max) {
+                            res = max;
+                            // TODO: vxsat
+                        }
+
+                        Vd_vi[i + offset] = (vi)res;
+                    }}, OPIVV, VectorIntegerArithOp);
+                }
+            }
+            // OPFVV
+            0x1: decode VFUNCT6 {
+                0x00: VectorFloatFormat::vfadd_vv({{
+                    auto fd = fadd<et>(ftype<et>(Vs2_vu[i]),
+                                       ftype<et>(Vs1_vu[i]));
+                    Vd_vu[i] = fd.v; // store the .v member of the result
+                }}, OPFVV, VectorFloatArithOp);
+                0x01: VectorReduceFloatFormat::vfredusum_vs({{
+                    Vd_vu[0] = reduce_loop([](const vu& src1, const vu& src2) {
+                        return fadd<et>(ftype<et>(src1), ftype<et>(src2));
+                    }, Vs1_vu, Vs2_vu);
+                }}, OPFVV, VectorFloatReduceOp);
+                0x02: VectorFloatFormat::vfsub_vv({{
+                    auto fd = fsub<et>(ftype<et>(Vs2_vu[i]),
+                                       ftype<et>(Vs1_vu[i]));
+                    Vd_vu[i] = fd.v;
+                }}, OPFVV, VectorFloatArithOp);
+                0x03: VectorReduceFloatFormat::vfredosum_vs({{
+                    Vd_vu[0] = reduce_loop([](const vu& src1, const vu& src2) {
+                        return fadd<et>(ftype<et>(src1), ftype<et>(src2));
+                    }, Vs1_vu, Vs2_vu);
+                }}, OPFVV, VectorFloatReduceOp); // NOTE(review): same body as vfredusum_vs
+                0x04: VectorFloatFormat::vfmin_vv({{
+                    auto fd = fmin<et>(ftype<et>(Vs2_vu[i]),
+                                       ftype<et>(Vs1_vu[i]));
+                    Vd_vu[i] = fd.v;
+                }}, OPFVV, VectorFloatArithOp);
+                0x05: VectorReduceFloatFormat::vfredmin_vs({{
+                    Vd_vu[0] = reduce_loop([](const vu& src1, const vu& src2) {
+                        return fmin<et>(ftype<et>(src1), ftype<et>(src2));
+                    }, Vs1_vu, Vs2_vu);
+                }}, OPFVV, VectorFloatReduceOp);
+                0x06: VectorFloatFormat::vfmax_vv({{
+                    auto fd = fmax<et>(ftype<et>(Vs2_vu[i]),
+                                       ftype<et>(Vs1_vu[i]));
+                    Vd_vu[i] = fd.v;
+                }}, OPFVV, VectorFloatArithOp);
+                0x07: VectorReduceFloatFormat::vfredmax_vs({{
+                    Vd_vu[0] = reduce_loop([](const vu& src1, const vu& src2) {
+                        return fmax<et>(ftype<et>(src1), ftype<et>(src2));
+                    }, Vs1_vu, Vs2_vu);
+                }}, OPFVV, VectorFloatReduceOp);
+                0x08: VectorFloatFormat::vfsgnj_vv({{
+                    Vd_vu[i] = fsgnj<et>(ftype<et>(Vs2_vu[i]),
+                                         ftype<et>(Vs1_vu[i]),
+                                         false, false).v;
+                }}, OPFVV, VectorFloatArithOp);
+                0x09: VectorFloatFormat::vfsgnjn_vv({{
+                    Vd_vu[i] = fsgnj<et>(ftype<et>(Vs2_vu[i]),
+                                         ftype<et>(Vs1_vu[i]),
+                                         true, false).v; // 1st flag set
+                }}, OPFVV, VectorFloatArithOp);
+                0x0a: VectorFloatFormat::vfsgnjx_vv({{
+                    Vd_vu[i] = fsgnj<et>(ftype<et>(Vs2_vu[i]),
+                                         ftype<et>(Vs1_vu[i]),
+                                         false, true).v; // 2nd flag set
+                }}, OPFVV, VectorFloatArithOp);
+                // VWFUNARY0
+                0x10: decode VS1 { // only VS1 == 0 is decoded here
+                    0x00: decode VM {
+                        // The encodings corresponding to the masked versions
+                        // (vm=0) of vfmv.f.s are reserved
+                        0x1: VectorNonSplitFormat::vfmv_f_s({{
+                            freg_t fd = freg(Vs2_vu[0]); // element 0 of vs2
+                            Fd_bits = fd.v;
+                        }}, OPFVV, VectorMiscOp);
+                    }
+                }
+                0x12: decode VS1 {
+                    format VectorFloatCvtFormat { // vd[i] from vs2[i]
+                        0x00: vfcvt_xu_f_v({{
+                            Vd_vu[i] = f_to_ui<et>(ftype<et>(Vs2_vu[i]),
+                                                   softfloat_roundingMode);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x01: vfcvt_x_f_v({{
+                            Vd_vu[i] = f_to_i<et>(ftype<et>(Vs2_vu[i]),
+                                                  softfloat_roundingMode);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x02: vfcvt_f_xu_v({{
+                            auto fd = ui_to_f<et>(Vs2_vu[i]);
+                            Vd_vu[i] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x03: vfcvt_f_x_v({{
+                            auto fd = i_to_f<et>(Vs2_vu[i]);
+                            Vd_vu[i] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x06: vfcvt_rtz_xu_f_v({{
+                            Vd_vu[i] = f_to_ui<et>(ftype<et>(Vs2_vu[i]),
+                                                   softfloat_round_minMag); // fixed mode
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x07: vfcvt_rtz_x_f_v({{
+                            Vd_vu[i] = f_to_i<et>(ftype<et>(Vs2_vu[i]),
+                                                  softfloat_round_minMag); // fixed mode
+                        }}, OPFVV, VectorFloatConvertOp);
+                    }
+                    format VectorFloatWideningCvtFormat { // vs2 read at i + offset
+                        0x08: vfwcvt_xu_f_v({{
+                            Vd_vwu[i] = f_to_wui<et>(
+                                ftype<et>(Vs2_vu[i + offset]),
+                                softfloat_roundingMode);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x09: vfwcvt_x_f_v({{
+                            Vd_vwu[i] = f_to_wi<et>(
+                                ftype<et>(Vs2_vu[i + offset]),
+                                softfloat_roundingMode);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x0a: vfwcvt_f_xu_v({{
+                            auto fd = ui_to_wf<vu>(Vs2_vu[i + offset]);
+                            Vd_vwu[i] = fd.v; // NOTE(review): helper uses <vu>, not <et>
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x0b: vfwcvt_f_x_v({{
+                            auto fd = i_to_wf<vu>(Vs2_vu[i + offset]);
+                            Vd_vwu[i] = fd.v; // NOTE(review): helper uses <vu>, not <et>
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x0c: vfwcvt_f_f_v({{
+                            auto fd = f_to_wf<et>(
+                                ftype<et>(Vs2_vu[i + offset]));
+                            Vd_vwu[i] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x0e: vfwcvt_rtz_xu_f_v({{
+                            Vd_vwu[i] = f_to_wui<et>(
+                                ftype<et>(Vs2_vu[i + offset]),
+                                softfloat_round_minMag); // fixed mode
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x0f: vfwcvt_rtz_x_f_v({{
+                            Vd_vwu[i] = f_to_wi<et>(
+                                ftype<et>(Vs2_vu[i + offset]),
+                                softfloat_round_minMag); // fixed mode
+                        }}, OPFVV, VectorFloatConvertOp);
+                    }
+                    format VectorFloatNarrowingCvtFormat { // narrowing converts: wide source elem i (Vs2_vwu) -> destination elem i + offset (Vd_vu)
+                        0x10: vfncvt_xu_f_w({{ // Vd = f_to_nui(wide Vs2); by its name a wide float -> narrower unsigned int convert
+                            Vd_vu[i + offset] = f_to_nui<vu>(
+                                ftype<ewt>(Vs2_vwu[i]),
+                                softfloat_roundingMode); // rounds with the current softfloat rounding mode
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x11: vfncvt_x_f_w({{ // Vd = f_to_ni(wide Vs2); by its name a wide float -> narrower signed int convert
+                            Vd_vu[i + offset] = f_to_ni<vu>(
+                                ftype<ewt>(Vs2_vwu[i]),
+                                softfloat_roundingMode); // rounds with the current softfloat rounding mode
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x12: vfncvt_f_xu_w({{ // Vd = raw bits of ui_to_nf(wide Vs2); presumably wide unsigned int -> narrower float
+                            auto fd = ui_to_nf<et>(Vs2_vwu[i]);
+                            Vd_vu[i + offset] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x13: vfncvt_f_x_w({{ // Vd = raw bits of i_to_nf(wide Vs2); presumably wide signed int -> narrower float
+                            auto fd = i_to_nf<et>(Vs2_vwu[i]); // NOTE(review): source is passed through the unsigned view; confirm the helper reinterprets it as signed
+                            Vd_vu[i + offset] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x14: vfncvt_f_f_w({{ // Vd = raw bits of f_to_nf(wide Vs2); by its name a wide float -> narrower float convert
+                            auto fd = f_to_nf<et>(ftype<ewt>(Vs2_vwu[i]));
+                            Vd_vu[i + offset] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x15: vfncvt_rod_f_f_w({{ // same conversion as vfncvt_f_f_w but forced to round-to-odd
+                            softfloat_roundingMode = softfloat_round_odd; // NOTE(review): overwrites the global mode and nothing in this snippet restores it; confirm the enclosing template resets it
+                            auto fd = f_to_nf<et>(ftype<ewt>(Vs2_vwu[i]));
+                            Vd_vu[i + offset] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x16: vfncvt_rtz_xu_f_w({{ // Vd = f_to_nui(wide Vs2) with a fixed rounding mode
+                            Vd_vu[i + offset] = f_to_nui<vu>(
+                                ftype<ewt>(Vs2_vwu[i]),
+                                softfloat_round_minMag); // always round toward zero, independent of softfloat_roundingMode
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x17: vfncvt_rtz_x_f_w({{ // Vd = f_to_ni(wide Vs2) with a fixed rounding mode
+                            Vd_vu[i + offset] = f_to_ni<vu>(
+                                ftype<ewt>(Vs2_vwu[i]),
+                                softfloat_round_minMag); // always round toward zero, independent of softfloat_roundingMode
+                        }}, OPFVV, VectorFloatConvertOp);
+                    }
+                }
+                0x13: decode VS1 { // funct6 0x13: single-source float ops, selected by the VS1 field
+                    format VectorFloatCvtFormat {
+                        0x00: vfsqrt_v({{ // Vd[i] = raw bits of fsqrt(Vs2[i])
+                            auto fd = fsqrt<et>(ftype<et>(Vs2_vu[i]));
+                            Vd_vu[i] = fd.v;
+                        }}, OPFVV, VectorFloatArithOp);
+                        0x04: vfrsqrt7_v({{ // Vd[i] = raw bits of frsqrte7(Vs2[i]); presumably the 7-bit reciprocal-square-root estimate
+                            auto fd = frsqrte7<et>(ftype<et>(Vs2_vu[i]));
+                            Vd_vu[i] = fd.v;
+                        }}, OPFVV, VectorFloatArithOp);
+                        0x05: vfrec7_v({{ // Vd[i] = raw bits of frecip7(Vs2[i]); presumably the 7-bit reciprocal estimate
+                            auto fd = frecip7<et>(ftype<et>(Vs2_vu[i]));
+                            Vd_vu[i] = fd.v;
+                        }}, OPFVV, VectorFloatArithOp);
+                        0x10: vfclass_v({{ // Vd[i] = .v of fclassify(Vs2[i]); helper semantics are defined outside this chunk
+                            auto fd = fclassify<et>(ftype<et>(Vs2_vu[i]));
+                            Vd_vu[i] = fd.v;
+                        }}, OPFVV, VectorFloatArithOp);
+                    }
+                }
+
+                format VectorFloatMaskFormat { // float compares: each element yields one bit, stored in the Vd_ub byte that holds bit i + offset
+                    0x18: vmfeq_vv({{ // result bit = feq(Vs2[i], Vs1[i])
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset, // ASSIGN_VD_BIT is a macro defined outside this chunk
+                            feq<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype<et>(Vs1_vu[i])));
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x19: vmfle_vv({{ // result bit = fle(Vs2[i], Vs1[i])
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            fle<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype<et>(Vs1_vu[i])));
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x1b: vmflt_vv({{ // result bit = flt(Vs2[i], Vs1[i])
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            flt<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype<et>(Vs1_vu[i])));
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x1c: vmfne_vv({{ // result bit = logical negation of feq(Vs2[i], Vs1[i])
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            !feq<et>(ftype<et>(Vs2_vu[i]), // so any input for which feq is false (presumably including NaN) compares as "not equal"
+                                    ftype<et>(Vs1_vu[i])));
+                    }}, OPFVV, VectorFloatArithOp);
+                }
+                format VectorFloatFormat { // element-wise float arithmetic; results are stored as raw bits (.v) in Vd_vu[i]
+                    0x20: vfdiv_vv({{ // Vd[i] = fdiv(Vs2[i], Vs1[i])
+                        auto fd = fdiv<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype<et>(Vs1_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x24: vfmul_vv({{ // Vd[i] = fmul(Vs2[i], Vs1[i])
+                        auto fd = fmul<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype<et>(Vs1_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x28: vfmadd_vv({{ // Vd = fmadd(Vs3, Vs1, Vs2); assuming fmadd(a, b, c) = a*b + c and Vs3 is the old destination value (TODO confirm)
+                        auto fd = fmadd<et>(ftype<et>(Vs3_vu[i]),
+                                            ftype<et>(Vs1_vu[i]),
+                                            ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x29: vfnmadd_vv({{ // Vd = fmadd(-Vs3, Vs1, -Vs2)
+                        auto fd = fmadd<et>(fneg(ftype<et>(Vs3_vu[i])),
+                                            ftype<et>(Vs1_vu[i]),
+                                            fneg(ftype<et>(Vs2_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2a: vfmsub_vv({{ // Vd = fmadd(Vs3, Vs1, -Vs2)
+                        auto fd = fmadd<et>(ftype<et>(Vs3_vu[i]),
+                                            ftype<et>(Vs1_vu[i]),
+                                            fneg(ftype<et>(Vs2_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2b: vfnmsub_vv({{ // Vd = fmadd(-Vs3, Vs1, Vs2)
+                        auto fd = fmadd<et>(fneg(ftype<et>(Vs3_vu[i])),
+                                            ftype<et>(Vs1_vu[i]),
+                                            ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2c: vfmacc_vv({{ // Vd = fmadd(Vs1, Vs2, Vs3): here Vs3 is the addend rather than a multiplicand
+                        auto fd = fmadd<et>(ftype<et>(Vs1_vu[i]),
+                                            ftype<et>(Vs2_vu[i]),
+                                            ftype<et>(Vs3_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2d: vfnmacc_vv({{ // Vd = fmadd(-Vs1, Vs2, -Vs3)
+                        auto fd = fmadd<et>(fneg(ftype<et>(Vs1_vu[i])),
+                                            ftype<et>(Vs2_vu[i]),
+                                            fneg(ftype<et>(Vs3_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2e: vfmsac_vv({{ // Vd = fmadd(Vs1, Vs2, -Vs3)
+                        auto fd = fmadd<et>(ftype<et>(Vs1_vu[i]),
+                                            ftype<et>(Vs2_vu[i]),
+                                            fneg(ftype<et>(Vs3_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2f: vfnmsac_vv({{ // Vd = fmadd(-Vs1, Vs2, Vs3)
+                        auto fd = fmadd<et>(fneg(ftype<et>(Vs1_vu[i])),
+                                            ftype<et>(Vs2_vu[i]),
+                                            ftype<et>(Vs3_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x31: VectorReduceFloatWideningFormat::vfwredusum_vs({{ // widening float sum reduction; overrides the enclosing format
+                        Vd_vwu[0] = reduce_loop( // only element 0 of the wide destination is written; reduce_loop comes from the format (not visible here)
+                            [](const vwu& src1, const vu& src2) {
+                                return fadd<ewt>(
+                                    ftype<ewt>(src1), // wide operand used as-is
+                                    f_to_wf<et>(ftype<et>(src2)) // narrow operand converted with f_to_wf before the add
+                                );
+                            }, Vs1_vwu, Vs2_vu);
+                    }}, OPFVV, VectorFloatReduceOp);
+                    0x33: VectorReduceFloatWideningFormat::vfwredosum_vs({{ // body is identical to vfwredusum_vs above
+                        Vd_vwu[0] = reduce_loop(
+                            [](const vwu& src1, const vu& src2) {
+                                return fadd<ewt>(
+                                    ftype<ewt>(src1),
+                                    f_to_wf<et>(ftype<et>(src2))
+                                );
+                            }, Vs1_vwu, Vs2_vu);
+                    }}, OPFVV, VectorFloatReduceOp);
+                }
+                format VectorFloatWideningFormat { // widening float arithmetic: narrow sources read at i + offset, wide result written at Vd_vwu[i]
+                    0x30: vfwadd_vv({{ // Vd = fadd(widen(Vs2), widen(Vs1))
+                        auto fd = fadd<ewt>(
+                            fwiden(ftype<et>(Vs2_vu[i + offset])), // fwiden is defined outside this chunk; presumably converts to the wider float type
+                            fwiden(ftype<et>(Vs1_vu[i + offset])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x32: vfwsub_vv({{ // Vd = fsub(widen(Vs2), widen(Vs1))
+                        auto fd = fsub<ewt>(
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fwiden(ftype<et>(Vs1_vu[i + offset])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x34: vfwadd_wv({{ // Vd = fadd(Vs2 (already wide), widen(Vs1))
+                        auto fd = fadd<ewt>(
+                            ftype<ewt>(Vs2_vwu[i]), // wide source is indexed with i, not i + offset
+                            fwiden(ftype<et>(Vs1_vu[i + offset])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x36: vfwsub_wv({{ // Vd = fsub(Vs2 (already wide), widen(Vs1))
+                        auto fd = fsub<ewt>(
+                            ftype<ewt>(Vs2_vwu[i]),
+                            fwiden(ftype<et>(Vs1_vu[i + offset])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x38: vfwmul_vv({{ // Vd = fmul(widen(Vs2), widen(Vs1))
+                        auto fd = fmul<ewt>(
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fwiden(ftype<et>(Vs1_vu[i + offset])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x3c: vfwmacc_vv({{ // Vd = fmadd(widen(Vs1), widen(Vs2), Vs3); assuming fmadd(a, b, c) = a*b + c (TODO confirm)
+                        auto fd = fmadd<ewt>(
+                            fwiden(ftype<et>(Vs1_vu[i + offset])),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            ftype<ewt>(Vs3_vwu[i])); // Vs3 is read as a wide element; presumably the old destination value
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x3d: vfwnmacc_vv({{ // Vd = fmadd(widen(-Vs1), widen(Vs2), -Vs3)
+                        auto fd = fmadd<ewt>(
+                            fwiden(fneg(ftype<et>(Vs1_vu[i + offset]))), // negation is applied to the narrow value before widening
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fneg(ftype<ewt>(Vs3_vwu[i])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x3e: vfwmsac_vv({{ // Vd = fmadd(widen(Vs1), widen(Vs2), -Vs3)
+                        auto fd = fmadd<ewt>(
+                            fwiden(ftype<et>(Vs1_vu[i + offset])),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fneg(ftype<ewt>(Vs3_vwu[i])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x3f: vfwnmsac_vv({{ // Vd = fmadd(widen(-Vs1), widen(Vs2), Vs3)
+                        auto fd = fmadd<ewt>(
+                            fwiden(fneg(ftype<et>(Vs1_vu[i + offset]))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            ftype<ewt>(Vs3_vwu[i]));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                }
+            }
+            // OPMVV
+            0x2: decode VFUNCT6 {
+                format VectorReduceIntFormat { // integer reductions: the result of reduce_loop(op, Vs1, Vs2) is written to element 0 of Vd only
+                    0x0: vredsum_vs({{ // reduction with std::plus on the signed view
+                        Vd_vi[0] =
+                            reduce_loop(std::plus<vi>(), Vs1_vi, Vs2_vi); // reduce_loop is supplied by the format (not visible in this chunk)
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x1: vredand_vs({{ // reduction with bitwise AND
+                        Vd_vi[0] =
+                            reduce_loop(std::bit_and<vi>(), Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x2: vredor_vs({{ // reduction with bitwise OR
+                        Vd_vi[0] =
+                            reduce_loop(std::bit_or<vi>(), Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x3: vredxor_vs({{ // reduction with bitwise XOR
+                        Vd_vi[0] =
+                            reduce_loop(std::bit_xor<vi>(), Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x4: vredminu_vs({{ // minimum over the unsigned view
+                        Vd_vu[0] =
+                            reduce_loop([](const vu& src1, const vu& src2) { // lambda pins std::min to the vu overload
+                                return std::min<vu>(src1, src2);
+                            }, Vs1_vu, Vs2_vu);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x5: vredmin_vs({{ // minimum over the signed view
+                        Vd_vi[0] =
+                            reduce_loop([](const vi& src1, const vi& src2) {
+                                return std::min<vi>(src1, src2);
+                            }, Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x6: vredmaxu_vs({{ // maximum over the unsigned view
+                        Vd_vu[0] =
+                            reduce_loop([](const vu& src1, const vu& src2) {
+                                return std::max<vu>(src1, src2);
+                            }, Vs1_vu, Vs2_vu);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x7: vredmax_vs({{ // maximum over the signed view
+                        Vd_vi[0] =
+                            reduce_loop([](const vi& src1, const vi& src2) {
+                                return std::max<vi>(src1, src2);
+                            }, Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                }
+                format VectorIntFormat { // averaging add/subtract: compute in 128 bits, apply int_rounding, then halve with >> 1
+                    0x8: vaaddu_vv({{ // Vd = (Vs2 + Vs1) >> 1 on the unsigned view
+                        __uint128_t res = (__uint128_t)Vs2_vu[i] + Vs1_vu[i]; // 128-bit accumulator so the carry out of the sum is not lost
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1); // NOTE(review): second argument is hard-coded to 0; presumably the rounding mode (vxrm) that is still to be wired in
+                        Vd_vu[i] = res >> 1;
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x9: vaadd_vv({{ // Vd = (Vs2 + Vs1) >> 1 on the signed view
+                        __uint128_t res = (__uint128_t)Vs2_vi[i] + Vs1_vi[i]; // signed operands are converted to the unsigned 128-bit type before adding
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1); // same hard-coded 0 as above
+                        Vd_vi[i] = res >> 1; // result is truncated to the element width on assignment
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0xa: vasubu_vv({{ // Vd = (Vs2 - Vs1) >> 1 on the unsigned view
+                        __uint128_t res = (__uint128_t)Vs2_vu[i] - Vs1_vu[i]; // a negative difference wraps modulo 2^128
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1); // same hard-coded 0 as above
+                        Vd_vu[i] = res >> 1;
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0xb: vasub_vv({{ // Vd = (Vs2 - Vs1) >> 1 on the signed view
+                        __uint128_t res = (__uint128_t)Vs2_vi[i] - Vs1_vi[i];
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1); // same hard-coded 0 as above
+                        Vd_vi[i] = res >> 1;
+                    }}, OPMVV, VectorIntegerArithOp);
+                }
+                // VWXUNARY0
+                0x10: decode VS1 {
+                    0x00: decode VM {
+                        // Only vm=1 is decoded here: the masked (vm=0)
+                        // encodings of vmv.x.s are reserved.
+                        0x1: VectorNonSplitFormat::vmv_x_s({{
+                            // Copy element 0 of Vs2 (signed view) into Rd.
+                            Rd_ud = Vs2_vi[0];
+                        }}, OPMVV, VectorMiscOp);
+                    }
+                    0x10: Vector1Vs1RdMaskFormat::vcpop_m({{
+                        // Count the set Vs2 mask bits among the first vl
+                        // elements; when masked (vm == 0) only elements
+                        // whose v0 bit is set contribute.
+                        const uint32_t num_elems = (uint32_t)machInst.vl;
+                        uint64_t set_bits = 0;
+                        for (uint32_t idx = 0; idx < num_elems; idx++) {
+                            const bool src_bit = elem_mask(Vs2_vu, idx);
+                            const bool active =
+                                this->vm || elem_mask(v0, idx);
+                            if (src_bit && active) {
+                                set_bits++;
+                            }
+                        }
+                        Rd_vu = set_bits;
+                    }}, OPMVV, VectorMiscOp);
+                    0x11: Vector1Vs1RdMaskFormat::vfirst_m({{
+                        // Index of the first active element whose Vs2 mask
+                        // bit is set, or -1 when there is none.
+                        const uint32_t num_elems = (uint32_t)machInst.vl;
+                        int64_t first_set = -1;
+                        for (uint32_t idx = 0; idx < num_elems; idx++) {
+                            const bool active =
+                                this->vm || elem_mask(v0, idx);
+                            if (active && elem_mask(Vs2_vu, idx)) {
+                                first_set = idx;
+                                break;
+                            }
+                        }
+                        Rd_vu = first_set;
+                    }}, OPMVV, VectorMiscOp);
+                }
+                0x12: decode VS1 { // funct6 0x12: integer extension ops, selected by the VS1 field
+                    format VectorIntExtFormat { // all bodies copy a narrow source element (i + offset) into destination element i; the vf2/vf4/vf8 ratio must come from how the format defines vext/vextu (not visible here)
+                        0x02: vzext_vf8({{ // unsigned views on both sides, so the widening assignment zero-extends
+                            Vd_vu[i] = Vs2_vextu[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                        0x03: vsext_vf8({{ // signed views on both sides, so the widening assignment sign-extends
+                            Vd_vi[i] = Vs2_vext[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                        0x04: vzext_vf4({{ // same body as vzext_vf8
+                            Vd_vu[i] = Vs2_vextu[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                        0x05: vsext_vf4({{ // same body as vsext_vf8
+                            Vd_vi[i] = Vs2_vext[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                        0x06: vzext_vf2({{ // same body as vzext_vf8
+                            Vd_vu[i] = Vs2_vextu[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                        0x07: vsext_vf2({{ // same body as vsext_vf8
+                            Vd_vi[i] = Vs2_vext[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                    }
+                }
+                0x14: decode VS1 { // funct6 0x14: mask-producing / index-generating ops, selected by the VS1 field
+                    0x01: Vector1Vs1VdMaskFormat::vmsbf_m({{ // set-before-first: writes 1 for active elements before the first set Vs2 bit, 0 from that bit onwards
+                        bool has_one = false; // becomes true once an active element with a set Vs2 bit has been seen
+                        for (uint32_t i = 0; i < (uint32_t)machInst.vl; i++) {
+                            bool vs2_lsb = elem_mask(Vs2_vu, i);
+                            bool do_mask = elem_mask(v0, i);
+                            if(this->vm||(this->vm == 0&&do_mask)){ // active element: unmasked instruction, or v0 bit set
+                                uint64_t res = 0;
+                                if (!has_one && !vs2_lsb) {
+                                    res = 1;
+                                } else if(!has_one && vs2_lsb) {
+                                    has_one = true; // the first set bit itself gets 0
+                                }
+                                Vd_ub[i/8] = ASSIGN_VD_BIT(i, res); // inactive elements are skipped, leaving their Vd bit untouched
+                            }
+                        }
+                    }}, OPMVV, VectorMiscOp);
+                    0x02: Vector1Vs1VdMaskFormat::vmsof_m({{ // set-only-first: writes 1 only at the first active element with a set Vs2 bit
+                        bool has_one = false;
+                        for (uint32_t i = 0; i < (uint32_t)machInst.vl; i++) {
+                            bool vs2_lsb = elem_mask(Vs2_vu, i);
+                            bool do_mask = elem_mask(v0, i);
+                            if(this->vm||(this->vm == 0&&do_mask)){ // active element
+                                uint64_t res = 0;
+                                if(!has_one && vs2_lsb) {
+                                    has_one = true;
+                                    res = 1;
+                                }
+                                Vd_ub[i/8] = ASSIGN_VD_BIT(i, res);
+                            }
+                        }
+                    }}, OPMVV, VectorMiscOp);
+                    0x03: Vector1Vs1VdMaskFormat::vmsif_m({{ // set-including-first: writes 1 up to and including the first active set Vs2 bit, 0 afterwards
+                        bool has_one = false;
+                        for (uint32_t i = 0; i < (uint32_t)machInst.vl; i++) {
+                            bool vs2_lsb = elem_mask(Vs2_vu, i);
+                            bool do_mask = elem_mask(v0, i);
+                            if(this->vm||(this->vm == 0&&do_mask)){ // active element
+                                uint64_t res = 0;
+                                if (!has_one && !vs2_lsb) {
+                                    res = 1;
+                                } else if(!has_one && vs2_lsb) {
+                                    has_one = true;
+                                    res = 1; // unlike vmsbf_m, the first set bit itself gets 1
+                                }
+                                Vd_ub[i/8] = ASSIGN_VD_BIT(i, res);
+                            }
+                        }
+                    }}, OPMVV, VectorMiscOp);
+                    0x10: ViotaFormat::viota_m({{ // each active element receives the running count of set Vs2 mask bits seen before it
+                        RiscvISAInst::VecRegContainer tmp_s2;
+                        xc->getRegOperand(this, 2, // source operand 2 is read into a local container instead of using an operand view
+                            &tmp_s2);
+                        auto Vs2bit = tmp_s2.as<vu>();
+                        for (uint32_t i = 0; i < this->microVl; i++) {
+                            uint32_t ei = i + // element index across micro-ops: local index plus microIdx * per-micro-op VLMAX
+                                vtype_VLMAX(vtype, true) * this->microIdx;
+                            bool vs2_lsb = elem_mask(Vs2bit, ei);
+                            bool do_mask = elem_mask(v0, ei);
+                            bool has_one = false;
+                            if (this->vm || (do_mask && !this->vm)) { // active element
+                                if (vs2_lsb) {
+                                    has_one = true;
+                                }
+                            }
+                            bool use_ori = (!this->vm) && !do_mask; // masked-off element: keep the original destination value
+                            if(use_ori == false){
+                                Vd_vu[i] = *cnt; // cnt is a counter pointer supplied outside this snippet; presumably shared across micro-ops
+                            }
+                            if (has_one) {
+                                *cnt = *cnt+1; // count this element only after its own result has been written
+                            }
+                        }
+                    }}, OPMVV, VectorMiscOp);
+                    0x11: VectorIntFormat::vid_v({{ // Vd[i] = ei; ei is supplied by the format, presumably the element's index within the whole vector
+                        Vd_vu[i] = ei;
+                    }}, OPMVV, VectorMiscOp);
+                }
+                format VectorMaskFormat { // mask-register logical ops: bit i of Vd is computed from bit i of Vs2 and Vs1
+                    0x18: vmandn_mm({{ // Vd bit = Vs2 bit AND NOT Vs1 bit
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i, // ASSIGN_VD_BIT is a macro defined outside this chunk
+                            elem_mask(Vs2_vu, i) & !elem_mask(Vs1_vu, i)); // logical ! keeps the operand in {0, 1} before the bitwise &
+                    }}, OPMVV, VectorMiscOp);
+                    0x19: vmand_mm({{ // Vd bit = Vs2 bit AND Vs1 bit
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            elem_mask(Vs2_vu, i) & elem_mask(Vs1_vu, i));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1a: vmor_mm({{ // Vd bit = Vs2 bit OR Vs1 bit
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            elem_mask(Vs2_vu, i) | elem_mask(Vs1_vu, i));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1b: vmxor_mm({{ // Vd bit = Vs2 bit XOR Vs1 bit
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            elem_mask(Vs2_vu, i) ^ elem_mask(Vs1_vu, i));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1c: vmorn_mm({{ // Vd bit = Vs2 bit OR NOT Vs1 bit
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            elem_mask(Vs2_vu, i) | !elem_mask(Vs1_vu, i));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1d: vmnand_mm({{ // Vd bit = NOT (Vs2 bit AND Vs1 bit)
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            !(elem_mask(Vs2_vu, i) & elem_mask(Vs1_vu, i)));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1e: vmnor_mm({{ // Vd bit = NOT (Vs2 bit OR Vs1 bit)
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            !(elem_mask(Vs2_vu, i) | elem_mask(Vs1_vu, i)));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1f: vmxnor_mm({{ // Vd bit = NOT (Vs2 bit XOR Vs1 bit)
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            !(elem_mask(Vs2_vu, i) ^ elem_mask(Vs1_vu, i)));
+                    }}, OPMVV, VectorMiscOp);
+                }
+                format VectorIntFormat { // element-wise integer divide, remainder, multiply and multiply-add
+                    0x20: vdivu_vv({{ // unsigned Vs2 / Vs1
+                        if (Vs1_vu[i] == 0)
+                            Vd_vu[i] = (vu)-1; // divide by zero yields all ones
+                        else
+                            Vd_vu[i] = Vs2_vu[i] / Vs1_vu[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x21: vdiv_vv({{ // signed Vs2 / Vs1
+                        if (Vs1_vi[i] == 0)
+                            Vd_vi[i] = -1; // divide by zero yields -1
+                        else if (Vs2_vi[i] == std::numeric_limits<vi>::min()
+                                && Vs1_vi[i] == -1)
+                            Vd_vi[i] = Vs2_vi[i]; // min / -1 would overflow: return the dividend instead of dividing
+                        else
+                            Vd_vi[i] = Vs2_vi[i] / Vs1_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x22: vremu_vv({{ // unsigned Vs2 % Vs1
+                        if (Vs1_vu[i] == 0) {
+                            Vd_vu[i] = Vs2_vu[i]; // remainder by zero yields the dividend
+                        } else {
+                            Vd_vu[i] = Vs2_vu[i] % Vs1_vu[i];
+                        }
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x23: vrem_vv({{ // signed Vs2 % Vs1
+                        if (Vs1_vi[i] == 0) {
+                            Vd_vi[i] = Vs2_vi[i]; // remainder by zero yields the dividend
+                        } else if (Vs2_vi[i] == std::numeric_limits<vi>::min()
+                                && Vs1_vi[i] == -1) {
+                            Vd_vi[i] = 0; // min % -1 would overflow: the remainder is defined as 0
+                        } else {
+                            Vd_vi[i] = Vs2_vi[i] % Vs1_vi[i];
+                        }
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x24: vmulhu_vv({{ // high half of the unsigned x unsigned product
+                        if (sew < 64) {
+                            Vd_vu[i] = ((uint64_t)Vs2_vu[i] * Vs1_vu[i]) // full product fits in 64 bits when sew < 64
+                                        >> sew;
+                        } else {
+                            Vd_vu[i] = mulhu_64(Vs2_vu[i], Vs1_vu[i]); // 64-bit elements need the dedicated helper (defined outside this chunk)
+                        }
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x25: vmul_vv({{ // low half of the product (truncated on assignment)
+                        Vd_vi[i] = Vs2_vi[i] * Vs1_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x26: vmulhsu_vv({{ // high half of the signed (Vs2) x unsigned (Vs1) product
+                        if (sew < 64) {
+                            Vd_vi[i] = ((int64_t)Vs2_vi[i] * // int64_t * uint64_t is evaluated as unsigned 64-bit, i.e. modulo 2^64
+                                        (uint64_t)Vs1_vu[i])
+                                        >> sew;
+                        } else {
+                            Vd_vi[i] = mulhsu_64(Vs2_vi[i], Vs1_vu[i]);
+                        }
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x27: vmulh_vv({{ // high half of the signed x signed product
+                        if (sew < 64) {
+                            Vd_vi[i] = ((int64_t)Vs2_vi[i] * Vs1_vi[i])
+                                        >> sew;
+                        } else {
+                            Vd_vi[i] = mulh_64(Vs2_vi[i], Vs1_vi[i]);
+                        }
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x29: vmadd_vv({{ // Vd = Vs3 * Vs1 + Vs2; Vs3 is presumably the old destination value
+                        Vd_vi[i] = Vs3_vi[i] * Vs1_vi[i] + Vs2_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x2b: vnmsub_vv({{ // Vd = -(Vs3 * Vs1) + Vs2
+                        Vd_vi[i] = -(Vs3_vi[i] * Vs1_vi[i]) + Vs2_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x2d: vmacc_vv({{ // Vd = Vs2 * Vs1 + Vs3
+                        Vd_vi[i] = Vs2_vi[i] * Vs1_vi[i] + Vs3_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x2f: vnmsac_vv({{ // Vd = -(Vs2 * Vs1) + Vs3
+                        Vd_vi[i] = -(Vs2_vi[i] * Vs1_vi[i]) + Vs3_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                }
+                format VectorIntWideningFormat { // widening integer ops: narrow sources read at i + offset are cast to the wide type (vwu/vwi) before the operation
+                    0x30: vwaddu_vv({{ // wide Vd = zext(Vs2) + zext(Vs1)
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset])
+                                + vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x31: vwadd_vv({{ // wide Vd = sext(Vs2) + sext(Vs1)
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset])
+                                + vwi(Vs1_vi[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x32: vwsubu_vv({{ // wide Vd = zext(Vs2) - zext(Vs1)
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset])
+                                - vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x33: vwsub_vv({{ // wide Vd = sext(Vs2) - sext(Vs1)
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset])
+                                - vwi(Vs1_vi[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x34: vwaddu_wv({{ // wide Vd = Vs2 (already wide, indexed with i) + zext(Vs1)
+                        Vd_vwu[i] = Vs2_vwu[i] + vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x35: vwadd_wv({{ // wide Vd = Vs2 (already wide) + sext(Vs1)
+                        Vd_vwi[i] = Vs2_vwi[i] + vwi(Vs1_vi[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x36: vwsubu_wv({{ // wide Vd = Vs2 (already wide) - zext(Vs1)
+                        Vd_vwu[i] = Vs2_vwu[i] - vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x37: vwsub_wv({{ // wide Vd = Vs2 (already wide) - sext(Vs1)
+                        Vd_vwi[i] = Vs2_vwi[i] - vwi(Vs1_vi[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x38: vwmulu_vv({{ // wide Vd = zext(Vs2) * zext(Vs1)
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset])
+                                * vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x3a: vwmulsu_vv({{ // wide Vd = sext(Vs2) * zext(Vs1): Vs2 is the signed operand
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset])
+                                * vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x3b: vwmul_vv({{ // wide Vd = sext(Vs2) * sext(Vs1)
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset])
+                                * vwi(Vs1_vi[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x3c: vwmaccu_vv({{ // wide Vd = zext(Vs1) * zext(Vs2) + Vs3 (wide; presumably the old destination value)
+                        Vd_vwu[i] = vwu(Vs1_vu[i + offset])
+                                * vwu(Vs2_vu[i + offset])
+                                + Vs3_vwu[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x3d: vwmacc_vv({{ // wide Vd = sext(Vs1) * sext(Vs2) + Vs3
+                        Vd_vwi[i] = vwi(Vs1_vi[i + offset])
+                                * vwi(Vs2_vi[i + offset])
+                                + Vs3_vwi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x3f: vwmaccsu_vv({{ // wide Vd = sext(Vs1) * zext(Vs2) + Vs3: here Vs1 is the signed operand
+                        Vd_vwi[i] = vwi(Vs1_vi[i + offset])
+                                * vwu(Vs2_vu[i + offset])
+                                + Vs3_vwi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                }
+            }
+            // OPIVI
+            0x3: decode VFUNCT6 {
+                format VectorIntFormat {
+                    // Vector-immediate integer ops. In every body the 5-bit
+                    // SIMM5 field is sign-extended and cast to the signed
+                    // element type before being combined with Vs2.
+                    0x00: vadd_vi({{
+                        const vi immediate = (vi)sext<5>(SIMM5);
+                        Vd_vi[i] = Vs2_vi[i] + immediate;
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // Reverse subtract: the immediate is the minuend.
+                    0x03: vrsub_vi({{
+                        const vi immediate = (vi)sext<5>(SIMM5);
+                        Vd_vi[i] = immediate - Vs2_vi[i];
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x09: vand_vi({{
+                        const vi immediate = (vi)sext<5>(SIMM5);
+                        Vd_vi[i] = Vs2_vi[i] & immediate;
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x0a: vor_vi({{
+                        const vi immediate = (vi)sext<5>(SIMM5);
+                        Vd_vi[i] = Vs2_vi[i] | immediate;
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x0b: vxor_vi({{
+                        const vi immediate = (vi)sext<5>(SIMM5);
+                        Vd_vi[i] = Vs2_vi[i] ^ immediate;
+                    }}, OPIVI, VectorIntegerArithOp);
+                }
+                0x0c: VectorGatherFormat::vrgather_vi({{
+                    // vrgather.vi: every active destination element receives
+                    // the Vs2 element selected by the immediate, or 0 when
+                    // the immediate is >= vlmax.
+                    //
+                    // The RVV spec defines this immediate as a zero-extended
+                    // 5-bit uimm (0..31). It must not be sign-extended:
+                    // doing so turns 16..31 into huge unsigned values that
+                    // are always >= vlmax and therefore always gather 0,
+                    // even when vlmax is larger than 16.
+                    const uint64_t gather_idx = (uint64_t)SIMM5;
+                    for (uint32_t i = 0; i < microVl; i++) {
+                        uint32_t ei = i + vs1_idx * vs1_elems + vs1_bias;
+                        if (this->vm || elem_mask(v0, ei)) {
+                            // Index relative to the Vs2 register handled by
+                            // this micro-op. If gather_idx lies below that
+                            // register the subtraction wraps to a large
+                            // value and fails the range check below.
+                            const uint64_t idx =
+                                gather_idx - vs2_elems * vs2_idx;
+                            // Out of this micro-op's range: keep Vs3_vu[i]
+                            // (presumably the current destination value).
+                            Vd_vu[i] = (gather_idx >= vlmax) ? 0
+                                : (idx < vs2_elems) ? Vs2_vu[idx]
+                                : Vs3_vu[i];
+                        }
+                    }
+                }}, OPIVI, VectorMiscOp);
+                // vslideup.vi: body run for one (source register vs2Idx,
+                // destination register vdIdx) pair. It copies the source
+                // elements that land in this destination register once
+                // they are moved up by the immediate offset.
+                // microVlmax is presumably the element count of a single
+                // register -- TODO confirm against vtype_VLMAX.
+                0x0e: VectorSlideUpFormat::vslideup_vi({{
+                    // Slide amount: the raw 5-bit immediate, not
+                    // sign-extended.
+                    const int offset = (int)(uint64_t)(SIMM5);
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    // Register distance from source to destination.
+                    const int vregOffset = vdIdx - vs2Idx;
+                    // Slide amount left over once the register distance is
+                    // accounted for; may be negative.
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    // Do nothing unless the shifted source overlaps this
+                    // destination register.
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        // Number of overlapping elements:
+                        // microVlmax - |offsetInVreg|.
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        // First destination index that is written.
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        // First source index that is read.
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        // Group-wide index of the first written element,
+                        // used for the mask lookup.
+                        const int elemOffset = vdOffset + vdIdx * microVlmax;
+                        // Copy the overlap, stopping at microVl and
+                        // honouring the mask when vm is clear.
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            if (this->vm || elem_mask(v0, i + elemOffset)) {
+                                Vd_vu[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                            }
+                        }
+                    }
+                }}, OPIVI, VectorMiscOp);
+                // vslidedown.vi: body run for one (source register vs2Idx,
+                // destination register vdIdx) pair. It stages the source
+                // elements that land in this destination register once
+                // they are moved down by the immediate offset, then commits
+                // them under the mask.
+                // microVlmax is presumably the element count of a single
+                // register -- TODO confirm against vtype_VLMAX.
+                0x0f: VectorSlideDownFormat::vslidedown_vi({{
+                    // Slide amount: the raw 5-bit immediate, not
+                    // sign-extended.
+                    const int offset = (int)(uint64_t)(SIMM5);
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    // Register distance from destination to source.
+                    const int vregOffset = vs2Idx - vdIdx;
+                    // Slide amount left over once the register distance is
+                    // accounted for; may be negative.
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    const int numVs2s = vtype_regs_per_group(vtype);
+                    // Do nothing unless the shifted source overlaps this
+                    // destination register.
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        // True when this is the last source register of
+                        // the group: positions past its data are zeroed.
+                        const bool needZeroTail = numVs2s == vs2Idx + 1;
+                        // Number of overlapping elements:
+                        // microVlmax - |offsetInVreg|.
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        // First destination index that is written.
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        // First source index that is read.
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        // Group-wide index of destination element 0, used
+                        // for the mask lookup.
+                        const int elemIdxBase = vdIdx * microVlmax;
+                        // Staging register for the unmasked result.
+                        vreg_t resVreg;
+                        auto res = resVreg.as<vu>();
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            res[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                        }
+                        if (needZeroTail) {
+                            for (int i = upperBound + vdOffset;
+                                i < microVlmax; i++) {
+                                res[i] = 0;
+                            }
+                        }
+                        // Commit staged elements, honouring the mask when
+                        // vm is clear.
+                        for (int i = vdOffset; i < microVl ; i++) {
+                            if (vm || elem_mask(v0, i + elemIdxBase)) {
+                                Vd_vu[i] = res[i];
+                            }
+                        }
+                    }
+                }}, OPIVI, VectorMiscOp);
+                // Immediate forms that consume the v0 mask bit as data
+                // (add-with-carry, merge) plus the plain immediate splat.
+                // The immediate is sign-extended and converted to vi.
+                format VectorIntFormat {
+                    0x10: decode VM {
+                        // Sum of the element, the immediate and the v0 bit
+                        // for this element.
+                        0x0: vadc_vim({{
+                            const vi simmVal = (vi)sext<5>(SIMM5);
+                            Vd_vi[i] = Vs2_vi[i] + simmVal
+                                + elem_mask(v0, ei);
+                        }}, OPIVI, VectorIntegerArithOp);
+                        // the unmasked versions (vm=1) are reserved
+                    }
+                    0x17: decode VM {
+                        // Pick the immediate where the v0 bit is set,
+                        // otherwise keep the Vs2 element.
+                        0x0: vmerge_vim({{
+                            const vi simmVal = (vi)sext<5>(SIMM5);
+                            Vd_vi[i] = elem_mask(v0, ei)
+                                    ? simmVal
+                                    : Vs2_vi[i];
+                        }}, OPIVI, VectorIntegerArithOp);
+                        // Write the immediate to every processed element.
+                        0x1: vmv_v_i({{
+                            const vi simmVal = (vi)sext<5>(SIMM5);
+                            Vd_vi[i] = simmVal;
+                        }}, OPIVI, VectorIntegerArithOp);
+                    }
+                }
+                // Saturating adds of a Vs2 element and the 5-bit immediate;
+                // saturation is reported through vxsatptr.
+                // The immediate is sign-extended before being converted to
+                // the element type, as in the other immediate bodies of
+                // this decode block (including the unsigned compares).
+                // Using the raw field would treat negative immediates as
+                // 16..31.
+                format VectorIntVxsatFormat{
+                    0x20: vsaddu_vi({{
+                        Vd_vu[i] = sat_addu<vu>(Vs2_vu[i],
+                            (vu)sext<5>(SIMM5), vxsatptr);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x21: vsadd_vi({{
+                        Vd_vu[i] = sat_add<vi>(Vs2_vu[i],
+                            (vu)sext<5>(SIMM5), vxsatptr);
+                    }}, OPIVI, VectorIntegerArithOp);
+                }
+                // Shifts by the 5-bit immediate. The shift amount is the
+                // raw immediate masked to sew - 1. The scaling shifts
+                // (vssrl/vssra) round the value with int_rounding before
+                // shifting, using the mode read from MISCREG_VXRM.
+                format VectorIntFormat {
+                    0x25: vsll_vi({{
+                        Vd_vu[i] = Vs2_vu[i] << ((vu)SIMM5 & (sew - 1) & 0x1f);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x28: vsrl_vi({{
+                        Vd_vu[i] = Vs2_vu[i] >> ((vu)SIMM5 & (sew - 1) & 0x1f);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // Previously rounded with a hard-coded mode of 0; now
+                    // uses the VXRM value like vssra_vi and vssrl_vx.
+                    0x2a: vssrl_vi({{
+                        int sh = SIMM5 & (vtype_SEW(vtype) - 1);
+                        __uint128_t res = Vs2_vu[i];
+
+                        res = int_rounding<__uint128_t>(
+                            res, xc->readMiscReg(MISCREG_VXRM), sh) >> sh;
+
+                        Vd_vu[i] = res;
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x29: vsra_vi({{
+                        Vd_vi[i] = Vs2_vi[i] >> ((vu)SIMM5 & (sew - 1) & 0x1f);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x2b: vssra_vi({{
+                        int sh = SIMM5 & (sew - 1);
+                        __int128_t val = Vs2_vi[i];
+
+                        val = int_rounding<__int128_t>(val,
+                            xc->readMiscReg(MISCREG_VXRM), sh);
+                        Vd_vi[i] = val >> sh;
+                    }}, OPIVI, VectorIntegerArithOp);
+                }
+                // According to Spec Section 16.6,
+                // vm must be 1 (unmasked) in vmv<nr>r.v instructions.
+                // Only VM == 1 is decoded here; SIMM3 then selects the
+                // variant (0x0, 0x1, 0x3, 0x7 -> vmv1r, vmv2r, vmv4r,
+                // vmv8r). All four share the same per-element copy body;
+                // how many registers get copied is presumably decided by
+                // VMvWholeFormat from the instruction -- TODO confirm.
+                0x27: decode VM { 0x1: decode SIMM3 {
+                    format VMvWholeFormat {
+                        0x0: vmv1r_v({{
+                            Vd_ud[i] = Vs2_ud[i];
+                        }}, OPIVI, VectorMiscOp);
+                        0x1: vmv2r_v({{
+                            Vd_ud[i] = Vs2_ud[i];
+                        }}, OPIVI, VectorMiscOp);
+                        0x3: vmv4r_v({{
+                            Vd_ud[i] = Vs2_ud[i];
+                        }}, OPIVI, VectorMiscOp);
+                        0x7: vmv8r_v({{
+                            Vd_ud[i] = Vs2_ud[i];
+                        }}, OPIVI, VectorMiscOp);
+                    }
+                }}
+                // Mask-producing immediate forms. Every body updates the
+                // destination byte at index (i + offset)/8 with the value
+                // ASSIGN_VD_BIT builds for bit position i + offset from the
+                // given condition (macro defined elsewhere; presumably it
+                // returns the byte with that one bit replaced -- TODO
+                // confirm). The immediate is sign-extended and then
+                // converted to vi or vu to match the comparison.
+                format VectorIntMaskFormat {
+                    0x11: decode VM {
+                        // Carry-out of element + immediate + v0 bit.
+                        0x0: vmadc_vim({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vi[i], (vi)sext<5>(SIMM5),
+                                    elem_mask(v0, ei)));
+                        }}, OPIVI, VectorIntegerArithOp);
+                        // Carry-out of element + immediate, no carry-in.
+                        0x1: vmadc_vi({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vi[i], (vi)sext<5>(SIMM5)));
+                        }}, OPIVI, VectorIntegerArithOp);
+                    }
+                    // Equality / inequality against the immediate.
+                    0x18: vmseq_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] == (vi)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x19: vmsne_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] != (vi)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // Ordered compares; the *u forms compare as vu, the
+                    // others as vi.
+                    0x1c: vmsleu_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] <= (vu)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x1d: vmsle_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] <= (vi)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x1e: vmsgtu_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] > (vu)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x1f: vmsgt_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] > (vi)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                }
+                // Narrowing immediate forms: read a wide Vs2 element
+                // (vwu/vwi), shift it right by the immediate masked to
+                // 2 * sew - 1, and store the result converted to vu/vi at
+                // destination index i + offset.
+                format VectorIntNarrowingFormat {
+                    0x2c: vnsrl_wi({{
+                        Vd_vu[i + offset] = (vu)(Vs2_vwu[i] >>
+                                            ((vwu)SIMM5 & (sew * 2 - 1)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x2d: vnsra_wi({{
+                        Vd_vi[i + offset] = (vi)(Vs2_vwi[i] >>
+                                            ((vwu)SIMM5 & (sew * 2 - 1)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // Narrowing clip on vu: round, shift, then clamp to
+                    // the vu maximum when any bit at position sew or above
+                    // is still set. The rounding mode is hard-coded to 0
+                    // and vxsat is not updated (both marked TODO below).
+                    // NOTE(review): the shift amount is read through VS1
+                    // while vnsrl_wi/vnsra_wi use SIMM5; presumably both
+                    // name the same instruction bits -- confirm.
+                    0x2e: vnclipu_wi({{
+                        vu max = std::numeric_limits<vu>::max();
+                        uint64_t sign_mask =
+                            std::numeric_limits<uint64_t>::max() << sew;
+                        __uint128_t res = Vs2_vwu[i];
+                        unsigned shift = VS1 & ((sew * 2) - 1);
+
+                        res = int_rounding<__uint128_t>(
+                            res, 0 /* TODO */, shift) >> shift;
+
+                        if (res & sign_mask) {
+                            // TODO: vxsat
+                            res = max;
+                        }
+
+                        Vd_vu[i + offset] = (vu)res;
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // Narrowing clip on vi: round, shift, then clamp into
+                    // the [min, max] range of vi. The rounding mode is
+                    // hard-coded to 0 and vxsat is not updated (TODO).
+                    0x2f: vnclip_wi({{
+                        vi max = std::numeric_limits<vi>::max();
+                        vi min = std::numeric_limits<vi>::min();
+                        __int128_t res = Vs2_vwi[i];
+                        unsigned shift = VS1 & ((sew * 2) - 1);
+
+                        res = int_rounding<__int128_t>(
+                            res, 0 /* TODO */, shift) >> shift;
+
+                        if (res < min) {
+                            res = min;
+                            // TODO: vxsat
+                        } else if (res > max) {
+                            res = max;
+                            // TODO: vxsat
+                        }
+
+                        Vd_vi[i + offset] = (vi)res;
+                    }}, OPIVI, VectorIntegerArithOp);
+                }
+            }
+            // OPIVX
+            0x4: decode VFUNCT6 {
+                // Element-wise integer operations between a Vs2 element and
+                // the scalar operand Rs1, accessed through its _vu or _vi
+                // view to match the operation.
+                format VectorIntFormat {
+                    0x0: vadd_vx({{
+                        const auto scalarOp = Rs1_vu;
+                        Vd_vu[i] = Vs2_vu[i] + scalarOp;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x2: vsub_vx({{
+                        const auto scalarOp = Rs1_vu;
+                        Vd_vu[i] = Vs2_vu[i] - scalarOp;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // Reverse subtract: scalar minus vector element.
+                    0x3: vrsub_vx({{
+                        const auto scalarOp = Rs1_vu;
+                        Vd_vu[i] = scalarOp - Vs2_vu[i];
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x4: vminu_vx({{
+                        const auto scalarOp = Rs1_vu;
+                        Vd_vu[i] = std::min(Vs2_vu[i], scalarOp);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x5: vmin_vx({{
+                        const auto scalarOp = Rs1_vi;
+                        Vd_vi[i] = std::min(Vs2_vi[i], scalarOp);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x6: vmaxu_vx({{
+                        const auto scalarOp = Rs1_vu;
+                        Vd_vu[i] = std::max(Vs2_vu[i], scalarOp);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x7: vmax_vx({{
+                        const auto scalarOp = Rs1_vi;
+                        Vd_vi[i] = std::max(Vs2_vi[i], scalarOp);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x9: vand_vx({{
+                        const auto scalarOp = Rs1_vu;
+                        Vd_vu[i] = Vs2_vu[i] & scalarOp;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0xa: vor_vx({{
+                        const auto scalarOp = Rs1_vu;
+                        Vd_vu[i] = Vs2_vu[i] | scalarOp;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0xb: vxor_vx({{
+                        const auto scalarOp = Rs1_vu;
+                        Vd_vu[i] = Vs2_vu[i] ^ scalarOp;
+                    }}, OPIVX, VectorIntegerArithOp);
+                }
+                // vslideup.vx: same body as the immediate form, with the
+                // slide amount taken from the scalar operand Rs1. It runs
+                // for one (source register vs2Idx, destination register
+                // vdIdx) pair and copies the elements that land in this
+                // destination register.
+                // microVlmax is presumably the element count of a single
+                // register -- TODO confirm against vtype_VLMAX.
+                0x0e: VectorSlideUpFormat::vslideup_vx({{
+                    // Slide amount: scalar operand converted to int.
+                    const int offset = (int)Rs1_vu;
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    // Register distance from source to destination.
+                    const int vregOffset = vdIdx - vs2Idx;
+                    // Slide amount left over once the register distance is
+                    // accounted for; may be negative.
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    // Do nothing unless the shifted source overlaps this
+                    // destination register.
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        // Number of overlapping elements:
+                        // microVlmax - |offsetInVreg|.
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        // First destination index that is written.
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        // First source index that is read.
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        // Group-wide index of the first written element,
+                        // used for the mask lookup.
+                        const int elemOffset = vdOffset + vdIdx * microVlmax;
+                        // Copy the overlap, stopping at microVl and
+                        // honouring the mask when vm is clear.
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            if (this->vm || elem_mask(v0, i + elemOffset)) {
+                                Vd_vu[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                            }
+                        }
+                    }
+                }}, OPIVX, VectorMiscOp);
+                // vslidedown.vx: same body as the immediate form, with the
+                // slide amount taken from the scalar operand Rs1. It runs
+                // for one (source register vs2Idx, destination register
+                // vdIdx) pair, stages the elements that land in this
+                // destination register, then commits them under the mask.
+                // microVlmax is presumably the element count of a single
+                // register -- TODO confirm against vtype_VLMAX.
+                0x0f: VectorSlideDownFormat::vslidedown_vx({{
+                    // Slide amount: scalar operand converted to int.
+                    const int offset = (int)Rs1_vu;
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    // Register distance from destination to source.
+                    const int vregOffset = vs2Idx - vdIdx;
+                    // Slide amount left over once the register distance is
+                    // accounted for; may be negative.
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    const int numVs2s = vtype_regs_per_group(vtype);
+                    // Do nothing unless the shifted source overlaps this
+                    // destination register.
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        // True when this is the last source register of
+                        // the group: positions past its data are zeroed.
+                        const bool needZeroTail = numVs2s == vs2Idx + 1;
+                        // Number of overlapping elements:
+                        // microVlmax - |offsetInVreg|.
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        // First destination index that is written.
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        // First source index that is read.
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        // Group-wide index of destination element 0, used
+                        // for the mask lookup.
+                        const int elemIdxBase = vdIdx * microVlmax;
+                        // Staging register for the unmasked result.
+                        vreg_t resVreg;
+                        auto res = resVreg.as<vu>();
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            res[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                        }
+                        if (needZeroTail) {
+                            for (int i = upperBound + vdOffset;
+                                i < microVlmax; i++) {
+                                res[i] = 0;
+                            }
+                        }
+                        // Commit staged elements, honouring the mask when
+                        // vm is clear.
+                        for (int i = vdOffset; i < microVl ; i++) {
+                            if (vm || elem_mask(v0, i + elemIdxBase)) {
+                                Vd_vu[i] = res[i];
+                            }
+                        }
+                    }
+                }}, OPIVX, VectorMiscOp);
+                // vrgather.vx: each active destination element is loaded
+                // from the source element selected by the scalar operand
+                // Rs1. The result is 0 when the index is >= vlmax,
+                // Vs2_vu[idx] when the index falls inside the source
+                // register handled by this micro-op, and the Vs3 value
+                // otherwise.
+                0x0c: VectorGatherFormat::vrgather_vx({{
+                    for (uint32_t i = 0; i < microVl; i++) {
+                        uint32_t ei = i + vs1_idx * vs1_elems + vs1_bias;
+                        if (!(this->vm || elem_mask(v0, ei))) {
+                            continue;
+                        }
+                        const uint64_t idx = Rs1_vu - vs2_elems * vs2_idx;
+                        if (Rs1_vu >= vlmax) {
+                            Vd_vu[i] = 0;
+                        } else if (idx < vs2_elems) {
+                            Vd_vu[i] = Vs2_vu[idx];
+                        } else {
+                            Vd_vu[i] = Vs3_vu[i];
+                        }
+                    }
+                }}, OPIVX, VectorMiscOp);
+                // Scalar forms that consume the v0 mask bit as data
+                // (add-with-carry, subtract-with-borrow, merge) plus the
+                // plain scalar splat.
+                format VectorIntFormat {
+                    0x10: decode VM {
+                        // Sum of the element, the scalar and the v0 bit.
+                        0x0: vadc_vxm({{
+                            const auto carryIn = elem_mask(v0, ei);
+                            Vd_vi[i] = Vs2_vi[i] + Rs1_vi + carryIn;
+                        }}, OPIVX, VectorIntegerArithOp);
+                        // the unmasked versions (vm=1) are reserved
+                    }
+                    0x12: decode VM {
+                        // Element minus the scalar minus the v0 bit.
+                        0x0: vsbc_vxm({{
+                            const auto borrowIn = elem_mask(v0, ei);
+                            Vd_vi[i] = Vs2_vi[i] - Rs1_vi - borrowIn;
+                        }}, OPIVX, VectorIntegerArithOp);
+                        // the unmasked versions (vm=1) are reserved
+                    }
+                    0x17: decode VM {
+                        // Pick the scalar where the v0 bit is set,
+                        // otherwise keep the Vs2 element.
+                        0x0: vmerge_vxm({{
+                            if (elem_mask(v0, ei)) {
+                                Vd_vu[i] = Rs1_vu;
+                            } else {
+                                Vd_vu[i] = Vs2_vu[i];
+                            }
+                        }}, OPIVX, VectorIntegerArithOp);
+                        // Only the VS2 == 0 encoding is decoded as a move.
+                        0x1: decode VS2 {
+                            0x0: vmv_v_x({{
+                                const auto scalarOp = Rs1_vu;
+                                Vd_vu[i] = scalarOp;
+                            }}, OPIVX, VectorIntegerArithOp);
+                        }
+                    }
+                }
+                // Saturating scalar forms. The add/sub bodies delegate to
+                // the sat_* helpers (defined elsewhere), passing vxsatptr
+                // so the helper can report saturation.
+                format VectorIntVxsatFormat{
+                    0x20: vsaddu_vx({{
+                        Vd_vu[i] = sat_addu<vu>(Vs2_vu[i], Rs1_vu,
+                            vxsatptr);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x21: vsadd_vx({{
+                        Vd_vu[i] = sat_add<vi>(Vs2_vu[i], Rs1_vu,
+                            vxsatptr);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x22: vssubu_vx({{
+                        Vd_vu[i] = sat_subu<vu>(Vs2_vu[i], Rs1_vu,
+                            vxsatptr);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x23: vssub_vx({{
+                        Vd_vu[i] = sat_sub<vi>(Vs2_vu[i], Rs1_vu,
+                            vxsatptr);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // Fractional multiply: the 128-bit product is rounded
+                    // and shifted right by sew - 1. The only overflow case
+                    // handled is both operands equal to the vi minimum;
+                    // then the result is forced to the vi maximum and
+                    // *vxsatptr is set. The rounding mode is hard-coded
+                    // to 0 (TODO below).
+                    // NOTE(review): int_rounding is instantiated with
+                    // __uint128_t although result is __int128_t --
+                    // confirm this is intended.
+                    0x27: vsmul_vx({{
+                        vi max = std::numeric_limits<vi>::max();
+                        vi min = std::numeric_limits<vi>::min();
+                        bool overflow = Rs1_vi == Vs2_vi[i] && Rs1_vi == min;
+                        __int128_t result =
+                            (__int128_t)Rs1_vi * (__int128_t)Vs2_vi[i];
+                        result = int_rounding<__uint128_t>(
+                            result, 0 /* TODO */, sew - 1);
+                        result = result >> (sew - 1);
+                        if (overflow) {
+                            result = max;
+                            *vxsatptr = true;
+                        }
+
+                        Vd_vi[i] = (vi)result;
+                    }}, OPIVX, VectorIntegerArithOp);
+                }
+                // Shifts by the scalar operand Rs1. The shift amount is
+                // Rs1 masked to sew - 1. The scaling shifts (vssrl/vssra)
+                // first round a 128-bit copy of the element with the mode
+                // read from MISCREG_VXRM, then shift.
+                format VectorIntFormat {
+                    0x25: vsll_vx({{
+                        const auto shamt = Rs1_vu & (sew - 1);
+                        Vd_vu[i] = Vs2_vu[i] << shamt;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x28: vsrl_vx({{
+                        const auto shamt = Rs1_vu & (sew - 1);
+                        Vd_vu[i] = Vs2_vu[i] >> shamt;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x29: vsra_vx({{
+                        const auto shamt = Rs1_vu & (sew - 1);
+                        Vd_vi[i] = Vs2_vi[i] >> shamt;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x2a: vssrl_vx({{
+                        const int shamt = Rs1_vu & (sew - 1);
+                        __uint128_t wide = Vs2_vu[i];
+
+                        wide = int_rounding<__uint128_t>(
+                            wide, xc->readMiscReg(MISCREG_VXRM), shamt);
+                        Vd_vu[i] = wide >> shamt;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x2b: vssra_vx({{
+                        const int shamt = Rs1_vu & (sew - 1);
+                        __int128_t wide = Vs2_vi[i];
+
+                        wide = int_rounding<__int128_t>(
+                            wide, xc->readMiscReg(MISCREG_VXRM), shamt);
+                        Vd_vi[i] = wide >> shamt;
+                    }}, OPIVX, VectorIntegerArithOp);
+                }
+                // Narrowing scalar forms: read a wide Vs2 element
+                // (vwu/vwi), shift it right by Rs1 masked to 2 * sew - 1,
+                // and store the result converted to vu/vi at destination
+                // index i + offset.
+                format VectorIntNarrowingFormat {
+                    0x2c: vnsrl_wx({{
+                        Vd_vu[i + offset] = (vu)(Vs2_vwu[i] >>
+                                            ((vwu)Rs1_vu & (sew * 2 - 1)));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x2d: vnsra_wx({{
+                        Vd_vi[i + offset] = (vi)(Vs2_vwi[i] >>
+                                            ((vwu)Rs1_vu & (sew * 2 - 1)));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // Narrowing clip on vu: round, shift, then clamp to
+                    // the vu maximum when any bit at position sew or above
+                    // is still set. The rounding mode is hard-coded to 0
+                    // and vxsat is not updated (both marked TODO below).
+                    0x2e: vnclipu_wx({{
+                        vu max = std::numeric_limits<vu>::max();
+                        uint64_t sign_mask =
+                            std::numeric_limits<uint64_t>::max() << sew;
+                        __uint128_t res = Vs2_vwu[i];
+                        unsigned shift = Rs1_vu & ((sew * 2) - 1);
+
+                        res = int_rounding<__uint128_t>(
+                            res, 0 /* TODO */, shift) >> shift;
+
+                        if (res & sign_mask) {
+                            // TODO: vxsat
+                            res = max;
+                        }
+
+                        Vd_vu[i + offset] = (vu)res;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // Narrowing clip on vi: round, shift, then clamp into
+                    // the [min, max] range of vi. The rounding mode is
+                    // hard-coded to 0 and vxsat is not updated (TODO).
+                    // The shift amount is read through Rs1_vi here,
+                    // unlike the Rs1_vu used by vnclipu_wx.
+                    0x2f: vnclip_wx({{
+                        vi max = std::numeric_limits<vi>::max();
+                        vi min = std::numeric_limits<vi>::min();
+                        __int128_t res = Vs2_vwi[i];
+                        unsigned shift = Rs1_vi & ((sew * 2) - 1);
+
+                        res = int_rounding<__int128_t>(
+                            res, 0 /* TODO */, shift) >> shift;
+
+                        if (res < min) {
+                            res = min;
+                            // TODO: vxsat
+                        } else if (res > max) {
+                            res = max;
+                            // TODO: vxsat
+                        }
+
+                        Vd_vi[i + offset] = (vi)res;
+                    }}, OPIVX, VectorIntegerArithOp);
+                }
+
+                // Mask-producing scalar forms. Every body updates the
+                // destination byte at index (i + offset)/8 with the value
+                // ASSIGN_VD_BIT builds for bit position i + offset from the
+                // given condition (macro defined elsewhere; presumably it
+                // returns the byte with that one bit replaced -- TODO
+                // confirm).
+                format VectorIntMaskFormat {
+                    0x11: decode VM {
+                        // Carry-out of element + scalar + v0 bit.
+                        0x0: vmadc_vxm({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vi[i], Rs1_vi,
+                                    elem_mask(v0, ei)));
+                        }}, OPIVX, VectorIntegerArithOp);
+                        // Carry-out of element + scalar, no carry-in.
+                        0x1: vmadc_vx({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vi[i], Rs1_vi));
+                        }}, OPIVX, VectorIntegerArithOp);
+                    }
+                    0x13: decode VM {
+                        // Borrow-out of element - scalar - v0 bit.
+                        0x0: vmsbc_vxm({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                borrow_out(Vs2_vi[i], Rs1_vi,
+                                    elem_mask(v0, ei)));
+                        }}, OPIVX, VectorIntegerArithOp);
+                        // Borrow-out of element - scalar, no borrow-in.
+                        0x1: vmsbc_vx({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                borrow_out(Vs2_vi[i], Rs1_vi));
+                        }}, OPIVX, VectorIntegerArithOp);
+                    }
+                    // Equality / inequality against the scalar.
+                    0x18: vmseq_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] == Rs1_vu));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x19: vmsne_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] != Rs1_vu));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // Ordered compares; the *u forms compare the _vu
+                    // views, the others the _vi views.
+                    0x1a: vmsltu_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] < Rs1_vu));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1b: vmslt_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] < Rs1_vi));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1c: vmsleu_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] <= Rs1_vu));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1d: vmsle_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] <= Rs1_vi));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1e: vmsgtu_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] > Rs1_vu));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1f: vmsgt_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] > Rs1_vi));
+                    }}, OPIVX, VectorIntegerArithOp);
+                }
+            }
+            // OPFVF
+            0x5: decode VFUNCT6 {
+                // Element-wise floating-point operations between a Vs2
+                // element (wrapped by ftype<et>) and the scalar taken from
+                // Fs1_bits (wrapped by ftype_freg<et>). The .v member of
+                // the helper's result is stored in the destination.
+                format VectorFloatFormat{
+                    0x00: vfadd_vf({{
+                        const auto scalarOp = ftype_freg<et>(freg(Fs1_bits));
+                        Vd_vu[i] = fadd<et>(ftype<et>(Vs2_vu[i]), scalarOp).v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x02: vfsub_vf({{
+                        const auto scalarOp = ftype_freg<et>(freg(Fs1_bits));
+                        Vd_vu[i] = fsub<et>(ftype<et>(Vs2_vu[i]), scalarOp).v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x04: vfmin_vf({{
+                        const auto scalarOp = ftype_freg<et>(freg(Fs1_bits));
+                        Vd_vu[i] = fmin<et>(ftype<et>(Vs2_vu[i]), scalarOp).v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x06: vfmax_vf({{
+                        const auto scalarOp = ftype_freg<et>(freg(Fs1_bits));
+                        Vd_vu[i] = fmax<et>(ftype<et>(Vs2_vu[i]), scalarOp).v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    // The three fsgnj variants differ only in the two
+                    // trailing boolean arguments passed to fsgnj<et>.
+                    0x08: vfsgnj_vf({{
+                        const auto scalarOp = ftype_freg<et>(freg(Fs1_bits));
+                        auto fd = fsgnj<et>(ftype<et>(Vs2_vu[i]), scalarOp,
+                                            false, false);
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x09: vfsgnjn_vf({{
+                        const auto scalarOp = ftype_freg<et>(freg(Fs1_bits));
+                        auto fd = fsgnj<et>(ftype<et>(Vs2_vu[i]), scalarOp,
+                                            true, false);
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x0a: vfsgnjx_vf({{
+                        const auto scalarOp = ftype_freg<et>(freg(Fs1_bits));
+                        auto fd = fsgnj<et>(ftype<et>(Vs2_vu[i]), scalarOp,
+                                            false, true);
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                }
+                // vfslide1up.vf: slide-up by a fixed offset of one. It
+                // runs for one (source register vs2Idx, destination
+                // register vdIdx) pair and additionally places the scalar
+                // into element 0 for the first register pair.
+                // microVlmax is presumably the element count of a single
+                // register -- TODO confirm against vtype_VLMAX.
+                // NOTE(review): the scalar is read through Rs1_vu, while
+                // other OPFVF bodies in this file use Fs1_bits; presumably
+                // the format maps it to the right register -- confirm.
+                0x0e: VectorFloatSlideUpFormat::vfslide1up_vf({{
+                    const int offset = 1;
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    // Register distance from source to destination.
+                    const int vregOffset = vdIdx - vs2Idx;
+                    // Slide amount left over once the register distance is
+                    // accounted for; may be negative.
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    // Do nothing unless the shifted source overlaps this
+                    // destination register.
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        // Number of overlapping elements:
+                        // microVlmax - |offsetInVreg|.
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        // First destination index that is written.
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        // First source index that is read.
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        // Group-wide index of the first written element,
+                        // used for the mask lookup.
+                        const int elemOffset = vdOffset + vdIdx * microVlmax;
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            if (this->vm || elem_mask(v0, i + elemOffset)) {
+                                Vd_vu[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                            }
+                        }
+                        // TODO: dirty code
+                        // For the first register pair, element 0 takes
+                        // the scalar (subject to the mask). It is stored
+                        // through tmp_d0, which is declared outside this
+                        // body.
+                        if (vdIdx == 0 && vs2Idx == 0 &&
+                                (this->vm || elem_mask(v0, 0))) {
+                            tmp_d0.as<vu>()[0] = Rs1_vu;
+                        }
+                    }
+                }}, OPFVF, VectorMiscOp);
+                // vfslide1down.vf: slide-down by a fixed offset of one. It
+                // runs for one (source register vs2Idx, destination
+                // register vdIdx) pair; the element whose group-wide index
+                // is machInst.vl - 1 takes the scalar in place of the slid
+                // value.
+                // microVlmax is presumably the element count of a single
+                // register -- TODO confirm against vtype_VLMAX.
+                // NOTE(review): the scalar is read through Rs1_vu, while
+                // other OPFVF bodies in this file use Fs1_bits; presumably
+                // the format maps it to the right register -- confirm.
+                0x0f: VectorFloatSlideDownFormat::vfslide1down_vf({{
+                    const int offset = 1;
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    // Register distance from destination to source.
+                    const int vregOffset = vs2Idx - vdIdx;
+                    // Slide amount left over once the register distance is
+                    // accounted for; may be negative.
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    const int numVs2s = vtype_regs_per_group(vtype);
+                    // Do nothing unless the shifted source overlaps this
+                    // destination register.
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        // True when this is the last source register of
+                        // the group: positions past its data are zeroed.
+                        const bool needZeroTail = numVs2s == vs2Idx + 1;
+                        // Number of overlapping elements:
+                        // microVlmax - |offsetInVreg|.
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        // First destination index that is written.
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        // First source index that is read.
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        // Group-wide index of destination element 0, used
+                        // for the mask lookup and the last-element test.
+                        const int elemIdxBase = vdIdx * microVlmax;
+                        // Staging register for the unmasked result.
+                        vreg_t resVreg;
+                        auto res = resVreg.as<vu>();
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            res[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                        }
+                        if (needZeroTail) {
+                            for (int i = upperBound + vdOffset;
+                                i < microVlmax; i++) {
+                                res[i] = 0;
+                            }
+                        }
+                        // Commit staged elements under the mask; the
+                        // element at group index vl - 1 gets the scalar.
+                        for (int i = vdOffset; i < microVl ; i++) {
+                            if (vm || elem_mask(v0, i + elemIdxBase)) {
+                                Vd_vu[i] = (i + elemIdxBase != machInst.vl - 1)
+                                    ? res[i]
+                                    : Rs1_vu;
+                            }
+                        }
+                    }
+                }}, OPFVF, VectorMiscOp);
+                // VRFUNARY0: vfmv.s.f is the vs2 == 0, vm == 1 encoding
+                0x10: decode VS2 {
+                    0x00: decode VM {
+                        // The encodings corresponding to the masked versions
+                        // (vm=0) of vfmv.s.f are reserved
+                        0x1: VectorNonSplitFormat::vfmv_s_f({{
+                            auto fd = ftype_freg<et>(freg(Fs1_bits));
+                            Vd_vu[0] = fd.v; // only index 0 is assigned
+                        }}, OPFVV, VectorMiscOp); // NOTE(review): why OPFVV?
+                    }
+                }
+                format VectorFloatFormat{
+                    0x17: decode VM { // vm = 0: vfmerge.vfm, vm = 1: vfmv.v.f
+                        0x0: vfmerge_vfm({{
+                            Vd_vu[i] = elem_mask(v0, ei)
+                                    ? ftype_freg<et>(freg(Fs1_bits)).v
+                                    : Vs2_vu[i];
+                        }}, OPFVF, VectorFloatArithOp); // Fs1 where v0 bit set
+                        0x1: vfmv_v_f({{
+                            auto fd = ftype_freg<et>(freg(Fs1_bits));
+                            Vd_vu[i] = fd.v;
+                        }}, OPFVF, VectorFloatArithOp); // Fs1 in every slot
+                    }
+                }
+                format VectorFloatMaskFormat { // sets vd bit (i + offset)
+                    0x18: vmfeq_vf({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            feq<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype_freg<et>(freg(Fs1_bits))));
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x19: vmfle_vf({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            fle<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype_freg<et>(freg(Fs1_bits))));
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x1b: vmflt_vf({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            flt<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype_freg<et>(freg(Fs1_bits))));
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x1c: vmfne_vf({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            !feq<et>(ftype<et>(Vs2_vu[i]),
+                                     ftype_freg<et>(freg(Fs1_bits))));
+                    }}, OPFVF, VectorFloatArithOp); // negated feq
+                    0x1d: vmfgt_vf({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            flt<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                    ftype<et>(Vs2_vu[i])));
+                    }}, OPFVF, VectorFloatArithOp); // as flt(Fs1, vs2)
+                    0x1f: vmfge_vf({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            fle<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                    ftype<et>(Vs2_vu[i])));
+                    }}, OPFVF, VectorFloatArithOp); // as fle(Fs1, vs2)
+                }
+                format VectorFloatFormat{ // f = Fs1; fmadd(a, b, c) ~ a*b + c
+                    0x20: vfdiv_vf({{
+                        auto fd = fdiv<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype_freg<et>(freg(Fs1_bits)));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // vs2 / f
+                    0x21: vfrdiv_vf({{
+                        auto fd = fdiv<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                           ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // f / vs2
+                    0x24: vfmul_vf({{
+                        auto fd = fmul<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype_freg<et>(freg(Fs1_bits)));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x27: vfrsub_vf({{
+                        auto fd = fsub<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                           ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // f - vs2
+                    0x28: vfmadd_vf({{
+                        auto fd = fmadd<et>(ftype<et>(Vs3_vu[i]),
+                                            ftype_freg<et>(freg(Fs1_bits)),
+                                            ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // vs3 * f + vs2
+                    0x29: vfnmadd_vf({{
+                        auto fd = fmadd<et>(fneg(ftype<et>(Vs3_vu[i])),
+                                            ftype_freg<et>(freg(Fs1_bits)),
+                                            fneg(ftype<et>(Vs2_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // -(vs3 * f) - vs2
+                    0x2a: vfmsub_vf({{
+                        auto fd = fmadd<et>(ftype<et>(Vs3_vu[i]),
+                                            ftype_freg<et>(freg(Fs1_bits)),
+                                            fneg(ftype<et>(Vs2_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // vs3 * f - vs2
+                    0x2b: vfnmsub_vf({{
+                        auto fd = fmadd<et>(fneg(ftype<et>(Vs3_vu[i])),
+                                            ftype_freg<et>(freg(Fs1_bits)),
+                                            ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // -(vs3 * f) + vs2
+                    0x2c: vfmacc_vf({{
+                        auto fd = fmadd<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                            ftype<et>(Vs2_vu[i]),
+                                            ftype<et>(Vs3_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // f * vs2 + vs3
+                    0x2d: vfnmacc_vf({{
+                        auto fd = fmadd<et>(
+                            fneg(ftype_freg<et>(freg(Fs1_bits))),
+                            ftype<et>(Vs2_vu[i]),
+                            fneg(ftype<et>(Vs3_vu[i]))
+                        );
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // -(f * vs2) - vs3
+                    0x2e: vfmsac_vf({{
+                        auto fd = fmadd<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                            ftype<et>(Vs2_vu[i]),
+                                            fneg(ftype<et>(Vs3_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // f * vs2 - vs3
+                    0x2f: vfnmsac_vf({{
+                        auto fd = fmadd<et>(
+                            fneg(ftype_freg<et>(freg(Fs1_bits))),
+                            ftype<et>(Vs2_vu[i]),
+                            ftype<et>(Vs3_vu[i])
+                        );
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // -(f * vs2) + vs3
+                }
+                format VectorFloatWideningFormat { // fwiden lifts et to ewt
+                    0x30: vfwadd_vf({{
+                        auto fd = fadd<ewt>(
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x32: vfwsub_vf({{
+                        auto fd = fsub<ewt>(
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x34: vfwadd_wf({{
+                        auto fd = fadd<ewt>(
+                            ftype<ewt>(Vs2_vwu[i]),
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // vs2 is read as ewt
+                    0x36: vfwsub_wf({{
+                        auto fd = fsub<ewt>(
+                            ftype<ewt>(Vs2_vwu[i]),
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // vs2 is read as ewt
+                    0x38: vfwmul_vf({{
+                        auto fd = fmul<ewt>(
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x3c: vfwmacc_vf({{
+                        auto fd = fmadd<ewt>(
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            ftype<ewt>(Vs3_vwu[i]));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // f * vs2 + vs3
+                    0x3d: vfwnmacc_vf({{
+                        auto fd = fmadd<ewt>(
+                            fwiden(fneg(ftype_freg<et>(freg(Fs1_bits)))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fneg(ftype<ewt>(Vs3_vwu[i])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // -(f * vs2) - vs3
+                    0x3e: vfwmsac_vf({{
+                        auto fd = fmadd<ewt>(
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fneg(ftype<ewt>(Vs3_vwu[i])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // f * vs2 - vs3
+                    0x3f: vfwnmsac_vf({{
+                        auto fd = fmadd<ewt>(
+                            fwiden(fneg(ftype_freg<et>(freg(Fs1_bits)))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            ftype<ewt>(Vs3_vwu[i]));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // -(f * vs2) + vs3
+                }
+            }
+            // OPMVX
+            0x6: decode VFUNCT6 {
+                format VectorIntFormat { // averaging add: (vs2 + rs1) >> 1
+                    0x08: vaaddu_vx({{
+                        __uint128_t res = (__uint128_t)Vs2_vu[i] + Rs1_vu;
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vu[i] = res >> 1;
+                    }}, OPMVX, VectorIntegerArithOp); // unsigned operands
+                    0x09: vaadd_vx({{
+                        __uint128_t res = (__uint128_t)Vs2_vi[i] + Rs1_vi;
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vi[i] = res >> 1;
+                    }}, OPMVX, VectorIntegerArithOp); // signed operands
+                }
+                0x0e: VectorSlideUpFormat::vslide1up_vx({{
+                    const int offset = 1; // fixed slide amount
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    const int vregOffset = vdIdx - vs2Idx;
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        const int elemOffset = vdOffset + vdIdx * microVlmax;
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            if (this->vm || elem_mask(v0, i + elemOffset)) {
+                                Vd_vu[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                            }
+                        }
+                        // TODO: dirty code; scalar goes in via tmp_d0
+                        if (vdIdx == 0 && vs2Idx == 0 &&
+                                (this->vm || elem_mask(v0, 0))) {
+                            tmp_d0.as<vu>()[0] = Rs1_vu;
+                        }
+                    }
+                }}, OPIVX, VectorMiscOp); // NOTE(review): OPIVX under OPMVX?
+                0x0f: VectorSlideDownFormat::vslide1down_vx({{
+                    const int offset = 1; // fixed slide amount
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    const int vregOffset = vs2Idx - vdIdx;
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    const int numVs2s = vtype_regs_per_group(vtype);
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        const bool needZeroTail = numVs2s == vs2Idx + 1;
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        const int elemIdxBase = vdIdx * microVlmax;
+                        vreg_t resVreg; // staging copy; vd is written below
+                        auto res = resVreg.as<vu>();
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            res[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                        }
+                        if (needZeroTail) { // vs2Idx is the last source register
+                            for (int i = upperBound + vdOffset;
+                                i < microVlmax; i++) {
+                                res[i] = 0;
+                            }
+                        }
+                        for (int i = vdOffset; i < microVl ; i++) {
+                            if (vm || elem_mask(v0, i + elemIdxBase)) {
+                                Vd_vu[i] = (i + elemIdxBase != machInst.vl - 1)
+                                    ? res[i]
+                                    : Rs1_vu; // index vl - 1 takes the scalar
+                            }
+                        }
+                    }
+                }}, OPIVX, VectorMiscOp); // NOTE(review): OPIVX under OPMVX?
+                // VRXUNARY0: vmv.s.x is the vs2 == 0, vm == 1 encoding
+                0x10: decode VS2 {
+                    0x00: decode VM {
+                        // The encodings corresponding to the masked versions
+                        // (vm=0) of vmv.s.x are reserved.
+                        0x1: VectorNonSplitFormat::vmv_s_x({{
+                            Vd_vu[0] = Rs1_vu; // only index 0 is assigned
+                        }}, OPMVX, VectorMiscOp);
+                    }
+                }
+                format VectorIntFormat {
+                    0x0a: vasubu_vx({{
+                        __uint128_t res = (__uint128_t)Vs2_vu[i] - Rs1_vu;
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vu[i] = res >> 1;
+                    }}, OPMVX, VectorIntegerArithOp); // (vs2 - rs1) >> 1
+                    0x0b: vasub_vx({{
+                        __uint128_t res = (__uint128_t)Vs2_vi[i] - Rs1_vi;
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vi[i] = res >> 1;
+                    }}, OPMVX, VectorIntegerArithOp); // signed variant
+                    0x20: vdivu_vx({{
+                        if (Rs1_vu == 0)
+                            Vd_vu[i] = (vu)-1;
+                        else
+                            Vd_vu[i] = Vs2_vu[i] / Rs1_vu;
+                    }}, OPMVX, VectorIntegerArithOp); // x / 0 -> all ones
+                    0x21: vdiv_vx({{
+                        if (Rs1_vi == 0)
+                            Vd_vi[i] = -1;
+                        else if (Vs2_vi[i] == std::numeric_limits<vi>::min()
+                                && Rs1_vi == -1)
+                            Vd_vi[i] = Vs2_vi[i];
+                        else
+                            Vd_vi[i] = Vs2_vi[i] / Rs1_vi;
+                    }}, OPMVX, VectorIntegerArithOp); // /0 -> -1; min/-1 -> min
+                    0x22: vremu_vx({{
+                        if (Rs1_vu == 0)
+                            Vd_vu[i] = Vs2_vu[i];
+                        else
+                            Vd_vu[i] = Vs2_vu[i] % Rs1_vu;
+                    }}, OPMVX, VectorIntegerArithOp); // x % 0 -> x
+                    0x23: vrem_vx({{
+                        if (Rs1_vi == 0)
+                            Vd_vi[i] = Vs2_vi[i];
+                        else if (Vs2_vi[i] == std::numeric_limits<vi>::min()
+                                && Rs1_vi == -1)
+                            Vd_vi[i] = 0;
+                        else
+                            Vd_vi[i] = Vs2_vi[i] % Rs1_vi;
+                    }}, OPMVX, VectorIntegerArithOp); // %0 -> x; min % -1 -> 0
+                    0x24: vmulhu_vx({{
+                        if (sew < 64)
+                            Vd_vu[i] = ((uint64_t)Vs2_vu[i] * Rs1_vu)
+                                        >> sew;
+                        else
+                            Vd_vu[i] = mulhu_64(Vs2_vu[i], Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp); // upper half of product
+                    0x25: vmul_vx({{
+                        Vd_vi[i] = Vs2_vi[i] * Rs1_vi;
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x26: vmulhsu_vx({{
+                        if (sew < 64)
+                            Vd_vi[i] = ((int64_t)Vs2_vi[i] *
+                                        (uint64_t)Rs1_vu)
+                                        >> sew;
+                        else
+                            Vd_vi[i] = mulhsu_64(Vs2_vi[i], Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp); // signed vs2 * unsigned rs1
+                    0x27: vmulh_vx({{
+                        if (sew < 64)
+                            Vd_vi[i] = ((int64_t)Vs2_vi[i] * Rs1_vi)
+                                        >> sew;
+                        else
+                            Vd_vi[i] = mulh_64(Vs2_vi[i], Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp); // upper half, signed
+                    0x29: vmadd_vx({{
+                        Vd_vi[i] = Vs3_vi[i] * Rs1_vi + Vs2_vi[i];
+                    }}, OPMVX, VectorIntegerArithOp); // vs3 * rs1 + vs2
+                    0x2b: vnmsub_vx({{
+                        Vd_vi[i] = -(Vs3_vi[i] * Rs1_vi) + Vs2_vi[i];
+                    }}, OPMVX, VectorIntegerArithOp); // -(vs3 * rs1) + vs2
+                    0x2d: vmacc_vx({{
+                        Vd_vi[i] = Vs2_vi[i] * Rs1_vi + Vs3_vi[i];
+                    }}, OPMVX, VectorIntegerArithOp); // vs2 * rs1 + vs3
+                    0x2f: vnmsac_vx({{
+                        Vd_vi[i] = -(Vs2_vi[i] * Rs1_vi) + Vs3_vi[i];
+                    }}, OPMVX, VectorIntegerArithOp); // -(vs2 * rs1) + vs3
+                }
+                format VectorIntWideningFormat { // vwu()/vwi() casts widen first
+                    0x30: vwaddu_vx({{
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset]) + vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x31: vwadd_vx({{
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset]) + vwi(Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x32: vwsubu_vx({{
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset]) - vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x33: vwsub_vx({{
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset]) - vwi(Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x34: vwaddu_wx({{
+                        Vd_vwu[i] = Vs2_vwu[i] + vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp); // vs2 read as wide
+                    0x35: vwadd_wx({{
+                        Vd_vwi[i] = Vs2_vwi[i] + vwi(Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp); // vs2 read as wide
+                    0x36: vwsubu_wx({{
+                        Vd_vwu[i] = Vs2_vwu[i] - vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp); // vs2 read as wide
+                    0x37: vwsub_wx({{
+                        Vd_vwi[i] = Vs2_vwi[i] - vwi(Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp); // vs2 read as wide
+                    0x38: vwmulu_vx({{
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset]) * vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x3a: vwmulsu_vx({{
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset]) * vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp); // signed vs2 * unsigned rs1
+                    0x3b: vwmul_vx({{
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset]) * vwi(Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x3c: vwmaccu_vx({{
+                        Vd_vwu[i] = vwu(Rs1_vu) * vwu(Vs2_vu[i + offset])
+                                + Vs3_vwu[i];
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x3d: vwmacc_vx({{
+                        Vd_vwi[i] = vwi(Rs1_vi) * vwi(Vs2_vi[i + offset])
+                                + Vs3_vwi[i];
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x3e: vwmaccus_vx({{
+                        Vd_vwi[i] = vwu(Rs1_vu) * vwi(Vs2_vi[i + offset])
+                                + Vs3_vwi[i];
+                    }}, OPMVX, VectorIntegerArithOp); // unsigned rs1 * signed vs2
+                    0x3f: vwmaccsu_vx({{
+                        Vd_vwi[i] = vwi(Rs1_vi) * vwu(Vs2_vu[i + offset])
+                                + Vs3_vwi[i];
+                    }}, OPMVX, VectorIntegerArithOp); // signed rs1 * unsigned vs2
+                }
+            }
             0x7: decode BIT31 {
                 format VConfOp {
                     0x0: vsetvli({{
diff --git a/src/arch/riscv/isa/formats/formats.isa b/src/arch/riscv/isa/formats/formats.isa
index 4bdc3021d5..0102df17d7 100644
--- a/src/arch/riscv/isa/formats/formats.isa
+++ b/src/arch/riscv/isa/formats/formats.isa
@@ -38,6 +38,7 @@
 ##include "amo.isa"
 ##include "bs.isa"
 ##include "vector_conf.isa"
+##include "vector_arith.isa"
 ##include "vector_mem.isa"
 
 // Include formats for nonstandard extensions
diff --git a/src/arch/riscv/isa/formats/vector_arith.isa b/src/arch/riscv/isa/formats/vector_arith.isa
new file mode 100644
index 0000000000..c462e6c8d4
--- /dev/null
+++ b/src/arch/riscv/isa/formats/vector_arith.isa
@@ -0,0 +1,1319 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+let {{
+    def setDestWrapper(destRegId):  # C++: add a vector dest, bump its count
+        return ("setDestRegIdx(_numDestRegs++, %s);\n" % destRegId +
+                "_numTypedDestRegs[VecRegClass]++;\n")
+    def setSrcWrapper(srcRegId):  # C++: append srcRegId as the next source
+        return "setSrcRegIdx(_numSrcRegs++, %s);\n" % srcRegId
+    def setSrcVm():  # C++: when vm is clear, vecRegClass[0] is a source too
+        return "\n".join(("if (!this->vm)",
+                          "    setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);"))
+    def vmDeclAndReadData():  # C++: v0 = bytes of the last source if !vm
+        return '''
+            [[maybe_unused]] RiscvISA::vreg_t tmp_v0;
+            [[maybe_unused]] uint8_t* v0;
+            if(!machInst.vm) {
+                xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0);
+                v0 = tmp_v0.as<uint8_t>();
+            }
+        '''
+    def copyOldVd(vd_idx):  # C++: COPY_OLD_VD call for source slot vd_idx
+        return 'COPY_OLD_VD(' + ('%d' % vd_idx) + ');'
+    def loopWrapper(code, micro_inst = True):
+        # Wrap code in a C++ loop over i; micro-ops stop at this->microVl,
+        # everything else at machInst.vl.
+        limit = "this->microVl" if micro_inst else "(uint32_t)machInst.vl"
+        loop_template = '''
+            for (uint32_t i = 0; i < %s; i++) {
+                %s
+            }
+        '''
+        return loop_template % (limit, code)
+    def maskCondWrapper(code):  # C++: run code if vm or mask bit ei is set
+        guard = "if (this->vm || elem_mask(v0, ei)) {\n"
+        return guard + code + "}\n"
+    def eiDeclarePrefix(code, widening = False):
+        # Prepend the C++ declaration ei = i + <vlmax> * this->microIdx.
+        # <vlmax> is micro_vlmax for widening ops and
+        # vtype_VLMAX(vtype, true) otherwise.
+        vlmax = "micro_vlmax" if widening else "vtype_VLMAX(vtype, true)"
+        decl = '''
+            uint32_t ei = i + %s * this->microIdx;
+            ''' % vlmax
+        return decl + code
+    # NOTE(review): at LMUL=2, vd=v4/vs2=v2 trips the overlap test; intended?
+    def wideningOpRegisterConstraintChecks(code):  # prepends C++ fault checks
+        return '''
+            const uint32_t num_microops = 1 << std::max<int64_t>(0, vtype_vlmul(machInst.vtype8) + 1);
+            if ((machInst.vd % alignToPowerOfTwo(num_microops)) != 0) {
+                std::string error =
+                    csprintf("Unaligned Vd group in Widening op");
+                return std::make_shared<IllegalInstFault>(error, machInst);
+            }
+            if ((machInst.vs2 <= machInst.vd) && (machInst.vd < (machInst.vs2 + num_microops - 1))) {
+                // A destination vector register group can overlap a source vector
+                // register group if The destination EEW is greater than the source
+                // EEW, the source EMUL is at least 1, and the overlap is in the
+                // highest- numbered part of the destination register group.
+                std::string error =
+                    csprintf("Unsupported overlap in Vs2 and Vd for Widening op");
+                return std::make_shared<IllegalInstFault>(error, machInst);
+            }
+            ''' + code
+    # Prepends vs2-alignment and vs2/vd-overlap fault checks (narrowing ops).
+    def narrowingOpRegisterConstraintChecks(code):  # mixes VS2, machInst.vs2
+        return '''
+            const uint32_t num_microops = 1 << std::max<int64_t>(0, vtype_vlmul(machInst.vtype8) + 1);
+            if ((machInst.vs2 % alignToPowerOfTwo(num_microops)) != 0) {
+                std::string error =
+                    csprintf("Unaligned VS2 group in Narrowing op");
+                return std::make_shared<IllegalInstFault>(error, machInst);
+            }
+            if ((machInst.vs2 < machInst.vd) && (machInst.vd <= (VS2 + num_microops - 1))) {
+                // A destination vector register group can overlap a source vector
+                // register group The destination EEW is smaller than the source EEW
+                // and the overlap is in the lowest-numbered part of the source
+                // register group
+                std::string error =
+                    csprintf("Unsupported overlap in Vs2 and Vd for Narrowing op");
+                return std::make_shared<IllegalInstFault>(error, machInst);
+            }
+        ''' + code
+    # Wraps code so softfloat exception flags are ORed into MISCREG_FFLAGS.
+    def fflags_wrapper(code):
+        return '''
+        RegVal FFLAGS = xc->readMiscReg(MISCREG_FFLAGS);
+        std::feclearexcept(FE_ALL_EXCEPT);
+        ''' + code + '''
+        FFLAGS |= softfloat_exceptionFlags;
+        softfloat_exceptionFlags = 0;
+        xc->setMiscReg(MISCREG_FFLAGS, FFLAGS);
+        '''
+}};
+
+
+def format VectorIntFormat(code, category, *flags) {{
+    # Emits the macro-op class, its micro-op class and the decode block.
+    macroop_class_name = 'VectorArithMacroInst'
+    microop_class_name = 'VectorArithMicroInst'
+    if name == "vid_v" :  # its disassembly lists no source registers
+        macroop_class_name = 'VectorVMUNARY0MacroInst'
+        microop_class_name = 'VectorVMUNARY0MicroInst'
+
+    iop = InstObjParams(name, Name, macroop_class_name, {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    v0_required = inst_name not in ["vmv"]
+    mask_cond = v0_required and (inst_suffix not in ['vvm', 'vxm', 'vim'])
+    need_elem_idx = mask_cond or code.find("ei") != -1
+    # (*vvm/*vxm/*vim forms still read v0 but get no per-element mask guard.)
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+
+    num_src_regs = 0
+
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    num_src_regs += 1
+
+    src1_reg_id = ""
+    if category in ["OPIVV", "OPMVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx]"
+        num_src_regs += 1
+    elif category in ["OPIVX", "OPMVX"]:
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+        num_src_regs += 1
+    elif category == "OPIVI":
+        pass
+    else:
+        error("not supported category for VectorIntFormat: %s" % category)
+    # src1 (if any) and vs2 come first, so old vd lands at this source index.
+    old_vd_idx = num_src_regs
+    src3_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    if category != "OPIVI":
+        set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    if v0_required:
+        set_src_reg_idx += setSrcVm()
+
+    # Wrap the body: mask guard, then the ei declaration, then the loop.
+    if mask_cond:
+        code = maskCondWrapper(code)
+    if need_elem_idx:
+        code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+
+    vm_decl_rd = ""
+    if v0_required:
+        vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        microop_class_name,
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntMicroDeclare.subst(microiop) + \
+        VectorIntMicroConstructor.subst(microiop) + \
+        VectorIntMicroExecute.subst(microiop) + \
+        VectorIntMacroDeclare.subst(iop) + \
+        VectorIntMacroConstructor.subst(iop)
+
+    decode_block = VectorIntDecodeBlock.subst(iop)
+}};
+
+
+def format VectorIntExtFormat(code, category, *flags) {{
+    # Builds the macro-op and micro-op classes for one integer-extension
+    # instruction, plus the decode block that selects the macro-op.
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst',
+                        {'code': code}, flags)
+    # The last character of the mnemonic suffix is the extension divisor.
+    _mnemonic, suffix = name.split("_", maxsplit=1)
+    ext_div = int(suffix[-1])
+
+    # One vd register per micro-op; the vs2 register index is
+    # vs2 + _microIdx / ext_div.  COPY_OLD_VD targets source slot 1 (vs3).
+    vs2_reg = "vecRegClass[_machInst.vs2 + _microIdx / %d]" % ext_div
+    dest_setup = setDestWrapper("vecRegClass[_machInst.vd + _microIdx]")
+    src_setup = "".join((
+        setSrcWrapper(vs2_reg),
+        setSrcWrapper("vecRegClass[_machInst.vs3 + _microIdx]"),
+        setSrcVm(),
+    ))
+
+    # Element body: mask guard, wrapped by the ei declaration, wrapped by
+    # the per-element loop.
+    micro_code = loopWrapper(eiDeclarePrefix(maskCondWrapper(code)))
+
+    micro_snippets = {
+        'code': micro_code,
+        'set_dest_reg_idx': dest_setup,
+        'set_src_reg_idx': src_setup,
+        'vm_decl_rd': vmDeclAndReadData(),
+        'copy_old_vd': copyOldVd(1),
+        'ext_div': ext_div,
+    }
+    microiop = InstObjParams(name + "_micro", Name + "Micro",
+                             'VectorArithMicroInst', micro_snippets, flags)
+
+    # All pieces go into header_output: the generated code uses templates,
+    # so it has to live in the header.
+    header_output = "".join((
+        VectorIntExtMicroDeclare.subst(microiop),
+        VectorIntMicroConstructor.subst(microiop),
+        VectorIntExtMicroExecute.subst(microiop),
+        VectorIntExtMacroDeclare.subst(iop),
+        VectorIntMacroConstructor.subst(iop),
+    ))
+
+    decode_block = VectorIntDecodeBlock.subst(iop)
+}};
+
+def format VectorIntWideningFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    v0_required = True
+    mask_cond = v0_required
+    need_elem_idx = mask_cond or code.find("ei") != -1
+    old_vd_idx = 2
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src1_reg_id = ""
+    if category in ["OPIVV", "OPMVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx / 2]"
+    elif category in ["OPIVX", "OPMVX"]:
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+    else:
+        error("not supported category for VectorIntFormat: %s" % category)
+    src2_reg_id = ""
+    if inst_suffix in ["vv", "vx"]:
+        src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx / 2]"
+    elif inst_suffix in ["wv", "wx"]:
+        src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    if v0_required:
+        set_src_reg_idx += setSrcVm()
+
+    # code
+    if mask_cond:
+        code = maskCondWrapper(code)
+    if need_elem_idx:
+        code = eiDeclarePrefix(code, widening=True)
+    code = loopWrapper(code)
+
+    code = wideningOpRegisterConstraintChecks(code)
+
+    vm_decl_rd = ""
+    if v0_required:
+        vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntWideningMicroDeclare.subst(microiop) + \
+        VectorIntWideningMicroConstructor.subst(microiop) + \
+        VectorIntWideningMicroExecute.subst(microiop) + \
+        VectorIntWideningMacroDeclare.subst(iop) + \
+        VectorIntWideningMacroConstructor.subst(iop)
+
+    decode_block = VectorIntWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorIntNarrowingFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    mask_cond = True
+    need_elem_idx = True
+
+    old_vd_idx = 2
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx / 2]"
+    if category in ["OPIVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx / 2]"
+    elif category in ["OPIVX"]:
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+    elif category == "OPIVI":
+        old_vd_idx = 1
+    else:
+        error("not supported category for VectorIntFormat: %s" % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vs3 + _microIdx / 2]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = ""
+    if category != "OPIVI":
+        set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    # code
+    code = maskCondWrapper(code)
+    code = eiDeclarePrefix(code, widening=True)
+    code = loopWrapper(code)
+    code = narrowingOpRegisterConstraintChecks(code)
+    vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         },
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntWideningMicroDeclare.subst(microiop) + \
+        VectorIntWideningMicroConstructor.subst(microiop) + \
+        VectorIntNarrowingMicroExecute.subst(microiop) + \
+        VectorIntWideningMacroDeclare.subst(iop) + \
+        VectorIntWideningMacroConstructor.subst(iop)
+
+    decode_block = VectorIntWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorIntMaskFormat(code, category, *flags) {{
+    iop = InstObjParams(name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code},
+        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    v0_required = not (inst_name in ["vmadc", "vmsbc"] \
+        and inst_suffix in ["vv", "vx", "vi"])
+    mask_cond = inst_name not in ['vmadc', 'vmsbc']
+    need_elem_idx = mask_cond or code.find("ei") != -1
+
+    old_vd_idx = 2
+    dest_reg_id = "vecRegClass[VecMemInternalReg0 + _microIdx]"
+    src1_reg_id = ""
+    if category == "OPIVV":
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx]"
+    elif category == "OPIVX":
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+    elif category == "OPIVI":
+        old_vd_idx = 1
+    else:
+        error("not supported category for VectorIntFormat: %s" % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = ""
+    if category != "OPIVI":
+        set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    if v0_required:
+        set_src_reg_idx += setSrcVm()
+
+    #code
+    if mask_cond:
+        code = maskCondWrapper(code)
+    if need_elem_idx:
+        code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+
+    vm_decl_rd = ""
+    if v0_required:
+        vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntMaskMicroDeclare.subst(microiop) + \
+        VectorIntMaskMicroConstructor.subst(microiop) + \
+        VectorIntMaskMicroExecute.subst(microiop) + \
+        VectorIntMaskMacroDeclare.subst(iop) + \
+        VectorIntMaskMacroConstructor.subst(iop)
+    decode_block = VectorIntDecodeBlock.subst(iop)
+}};
+
+def format VectorGatherFormat(code, category, *flags) {{
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    if inst_name == "vrgatherei16":
+        idx_type = "uint16_t"
+    else:
+        idx_type = "elem_type"
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst',
+        {'idx_type': idx_type,
+         'code': code},
+        flags)
+    old_vd_idx = 2
+    dest_reg_id = "vecRegClass[_machInst.vd + vd_idx]"
+    src1_reg_id = ""
+    if category in ["OPIVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + vs1_idx]"
+    elif category in ["OPIVX"]:
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+    elif category == "OPIVI":
+        old_vd_idx = 1
+    else:
+        error("not supported category for VectorIntFormat: %s" % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + vs2_idx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + vd_idx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    if category != "OPIVI":
+        set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    set_src_reg_idx += setSrcVm()
+
+    # code
+
+    vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         'idx_type': idx_type},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorGatherMicroDeclare.subst(microiop) + \
+        VectorGatherMicroConstructor.subst(microiop) + \
+        VectorGatherMicroExecute.subst(microiop) + \
+        VectorGatherMacroDeclare.subst(iop) + \
+        VectorGatherMacroConstructor.subst(iop)
+
+    decode_block = VectorGatherDecodeBlock.subst(iop)
+
+}};
+
+def format VectorFloatFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    v0_required = inst_name not in ["vfmv"]
+    mask_cond = v0_required and (inst_suffix not in ['vvm', 'vfm'])
+    need_elem_idx = mask_cond or code.find("ei") != -1
+
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src1_reg_id = ""
+    if category == "OPFVV":
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx]"
+    elif category == "OPFVF":
+        src1_reg_id = "floatRegClass[_machInst.rs1]"
+    else:
+        error("not supported category for VectorFloatFormat: %s" % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    if v0_required:
+        set_src_reg_idx += setSrcVm()
+    # code
+    if mask_cond:
+        code = maskCondWrapper(code)
+    if need_elem_idx:
+        code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+    code = fflags_wrapper(code)
+
+    vm_decl_rd = ""
+    if v0_required:
+        vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(2)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorFloatMicroDeclare.subst(microiop) + \
+        VectorFloatMicroConstructor.subst(microiop) + \
+        VectorFloatMicroExecute.subst(microiop) + \
+        VectorFloatMacroDeclare.subst(iop) + \
+        VectorFloatMacroConstructor.subst(iop)
+
+    decode_block = VectorFloatDecodeBlock.subst(iop)
+}};
+
+def format VectorFloatCvtFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+
+    old_vd_idx = 1
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    set_src_reg_idx += setSrcVm()
+    code = maskCondWrapper(code)
+    code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+    code = fflags_wrapper(code)
+
+    vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorFloatCvtMicroDeclare.subst(microiop) + \
+        VectorFloatMicroConstructor.subst(microiop) + \
+        VectorFloatMicroExecute.subst(microiop) + \
+        VectorFloatCvtMacroDeclare.subst(iop) + \
+        VectorFloatMacroConstructor.subst(iop)
+
+    decode_block = VectorFloatDecodeBlock.subst(iop)
+}};
+
+def format VectorFloatWideningFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    v0_required = True
+    mask_cond = v0_required
+    need_elem_idx = mask_cond or code.find("ei") != -1
+
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src1_reg_id = ""
+    if category in ["OPFVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx / 2]"
+    elif category in ["OPFVF"]:
+        src1_reg_id = "floatRegClass[_machInst.rs1]"
+    else:
+        error("not supported category for VectorFloatFormat: %s" % category)
+    src2_reg_id = ""
+    if inst_suffix in ["vv", "vf"]:
+        src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx / 2]"
+    elif inst_suffix in ["wv", "wf"]:
+        src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    if v0_required:
+        set_src_reg_idx += setSrcVm()
+
+    # code
+    if mask_cond:
+        code = maskCondWrapper(code)
+    if need_elem_idx:
+        code = eiDeclarePrefix(code, widening=True)
+    code = loopWrapper(code)
+    code = fflags_wrapper(code)
+
+    code = wideningOpRegisterConstraintChecks(code)
+
+    vm_decl_rd = ""
+    if v0_required:
+        vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(2)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntWideningMicroDeclare.subst(microiop) + \
+        VectorIntWideningMicroConstructor.subst(microiop) + \
+        VectorFloatWideningMicroExecute.subst(microiop) + \
+        VectorIntWideningMacroDeclare.subst(iop) + \
+        VectorIntWideningMacroConstructor.subst(iop)
+
+    decode_block = VectorFloatWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorFloatWideningCvtFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+
+    old_vd_idx = 1
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx / 2]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    set_src_reg_idx += setSrcVm()
+    code = maskCondWrapper(code)
+    code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+    code = fflags_wrapper(code)
+
+    vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorFloatCvtMicroDeclare.subst(microiop) + \
+        VectorFloatMicroConstructor.subst(microiop) + \
+        VectorFloatWideningMicroExecute.subst(microiop) + \
+        VectorFloatCvtMacroDeclare.subst(iop) + \
+        VectorIntWideningMacroConstructor.subst(iop)
+
+    decode_block = VectorFloatWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorFloatNarrowingCvtFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+
+    old_vd_idx = 1
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx / 2]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx / 2]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    set_src_reg_idx += setSrcVm()
+    code = maskCondWrapper(code)
+    code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+    code = fflags_wrapper(code)
+    code = narrowingOpRegisterConstraintChecks(code)
+
+    vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorFloatCvtMicroDeclare.subst(microiop) + \
+        VectorFloatMicroConstructor.subst(microiop) + \
+        VectorFloatNarrowingMicroExecute.subst(microiop) + \
+        VectorFloatCvtMacroDeclare.subst(iop) + \
+        VectorIntWideningMacroConstructor.subst(iop)
+
+    decode_block = VectorFloatWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorFloatMaskFormat(code, category, *flags) {{
+    iop = InstObjParams(name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code},
+        flags)
+    dest_reg_id = "vecRegClass[VecMemInternalReg0 + _microIdx]"
+    src1_reg_id = ""
+    if category == "OPFVV":
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx]"
+    elif category == "OPFVF":
+        src1_reg_id = "floatRegClass[_machInst.rs1]"
+    else:
+        error("not supported category for VectorFloatFormat: %s" % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+
+    code = maskCondWrapper(code)
+    code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+    code = fflags_wrapper(code)
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(2)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorFloatMaskMicroDeclare.subst(microiop) + \
+        VectorFloatMaskMicroConstructor.subst(microiop) + \
+        VectorFloatMaskMicroExecute.subst(microiop) + \
+        VectorFloatMaskMacroDeclare.subst(iop) + \
+        VectorFloatMaskMacroConstructor.subst(iop)
+    decode_block = VectorFloatDecodeBlock.subst(iop)
+}};
+
+def format VMvWholeFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VMvWholeMacroInst', {'code': code}, flags)
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VMvWholeMicroInst',
+        {'code': code},
+        flags)
+
+    header_output = \
+        VMvWholeMacroDeclare.subst(iop) + \
+        VMvWholeMicroDeclare.subst(microiop)
+    decoder_output = \
+        VMvWholeMacroConstructor.subst(iop) + \
+        VMvWholeMicroConstructor.subst(microiop)
+    exec_output = VMvWholeMicroExecute.subst(microiop)
+    decode_block = BasicDecode.subst(iop)
+}};
+
+def format ViotaFormat(code, category, *flags){{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    # The tail of vector mask inst should be treated as tail-agnostic.
+    # We treat it with tail-undisturbed policy, since
+    # the test suits only support undisturbed policy.
+    old_dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    vm_decl_rd = vmDeclAndReadData()
+    set_vm_idx = setSrcVm()
+
+    microiop = InstObjParams(name+"_micro",
+        Name+"Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'set_vm_idx': set_vm_idx,
+         'copy_old_vd': copyOldVd(1)},
+        flags)
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        ViotaMicroDeclare.subst(microiop) + \
+        ViotaMicroConstructor.subst(microiop) + \
+        ViotaMicroExecute.subst(microiop)+\
+        ViotaMacroDeclare.subst(iop) + \
+        ViotaMacroConstructor.subst(iop)
+
+    decode_block = VectorIntDecodeBlock.subst(iop)
+
+}};
+
+def format Vector1Vs1VdMaskFormat(code, category, *flags){{
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src2_reg_id = "vecRegClass[_machInst.vs2]"
+    # The tail of vector mask inst should be treated as tail-agnostic.
+    # We treat it with tail-undisturbed policy, since
+    # the test suits only support undisturbed policy.
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    vm_decl_rd = vmDeclAndReadData()
+    set_vm_idx = setSrcVm()
+    iop = InstObjParams(name,
+        Name,
+        'VectorNonSplitInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'set_vm_idx': set_vm_idx,
+         'copy_old_vd': copyOldVd(1)},
+        flags)
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        Vector1Vs1RdMaskDeclare.subst(iop) + \
+        Vector1Vs1VdMaskConstructor.subst(iop) + \
+        Vector1Vs1VdMaskExecute.subst(iop)
+
+    decode_block = VectorMaskDecodeBlock.subst(iop)
+}};
+
+def format Vector1Vs1RdMaskFormat(code, category, *flags){{
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    vm_decl_rd = vmDeclAndReadData()
+    set_vm_idx = setSrcVm()
+    iop = InstObjParams(name,
+        Name,
+        'VectorNonSplitInst',
+        {'code': code,
+         'vm_decl_rd': vm_decl_rd,
+         'set_vm_idx': set_vm_idx},
+        flags)
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        Vector1Vs1RdMaskDeclare.subst(iop) + \
+        Vector1Vs1RdMaskConstructor.subst(iop) + \
+        Vector1Vs1RdMaskExecute.subst(iop)
+
+    decode_block = VectorMaskDecodeBlock.subst(iop)
+}};
+
+def format VectorNonSplitFormat(code, category, *flags) {{
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    vm_decl_rd = ""
+
+    set_vm_idx = ""
+
+    if inst_name == "vfmv" :
+        code = fflags_wrapper(code)
+
+    iop = InstObjParams(name,
+        Name,
+        'VectorNonSplitInst',
+        {'code': code,
+         'vm_decl_rd': vm_decl_rd,
+         'set_vm_idx': set_vm_idx},
+        flags)
+
+
+    if inst_name == "vfmv" :
+        execute_block = VectorFloatNonSplitExecute.subst(iop)
+        decode_block = VectorFloatDecodeBlock.subst(iop)
+    elif inst_name == "vmv" :
+        execute_block = VectorIntNonSplitExecute.subst(iop)
+        decode_block = VectorIntDecodeBlock.subst(iop)
+    else :
+        error("Unsupported inst for VectorNonSplitFormat: %s" % inst_name)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorNonSplitDeclare.subst(iop) + \
+        VectorNonSplitConstructor.subst(iop) + \
+        execute_block
+
+}};
+
+def format VectorMaskFormat(code, category, *flags) {{
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    old_vd_idx = 2
+    if category not in ["OPMVV"]:
+        error("not supported category for VectorIntFormat: %s" % category)
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src1_reg_id = "vecRegClass[_machInst.vs1]"
+    src2_reg_id = "vecRegClass[_machInst.vs2]"
+
+    # The tail of vector mask inst should be treated as tail-agnostic.
+    # We treat it with tail-undisturbed policy, since
+    # the test suits only support undisturbed policy.
+    # TODO: remove it
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    code = loopWrapper(code, micro_inst = False)
+
+    iop = InstObjParams(name,
+        Name,
+        'VectorNonSplitInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorMaskDeclare.subst(iop) + \
+        VectorMaskConstructor.subst(iop) + \
+        VectorMaskExecute.subst(iop)
+
+    decode_block = VectorMaskDecodeBlock.subst(iop)
+}};
+
+def format VectorReduceIntFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src1_reg_id = "vecRegClass[_machInst.vs1]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    # Treat tail undisturbed/agnostic as the same
+    # We always need old rd as src vreg
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+    type_def = '''
+        using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+        using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    '''
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'type_def': type_def,
+         'copy_old_vd': copyOldVd(2)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorReduceMicroDeclare.subst(microiop) + \
+        VectorReduceMicroConstructor.subst(microiop) + \
+        VectorReduceIntMicroExecute.subst(microiop) + \
+        VectorReduceMacroDeclare.subst(iop) + \
+        VectorReduceMacroConstructor.subst(iop)
+    decode_block = VectorIntDecodeBlock.subst(iop)
+}};
+
+def format VectorReduceFloatFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src1_reg_id = "vecRegClass[_machInst.vs1]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    # Treat tail undisturbed/agnostic as the same
+    # We always need old rd as src vreg
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+    type_def = '''
+        using et = ElemType;
+        using vu = decltype(et::v);
+    '''
+
+    code = fflags_wrapper(code)
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'type_def': type_def,
+         'copy_old_vd': copyOldVd(2)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorReduceMicroDeclare.subst(microiop) + \
+        VectorReduceMicroConstructor.subst(microiop) + \
+        VectorReduceFloatMicroExecute.subst(microiop) + \
+        VectorReduceMacroDeclare.subst(iop) + \
+        VectorReduceMacroConstructor.subst(iop)
+    decode_block = VectorFloatDecodeBlock.subst(iop)
+}};
+
+def format VectorReduceFloatWideningFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src1_reg_id = "vecRegClass[_machInst.vs1]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    # Treat tail undisturbed/agnostic as the same
+    # We always need old rd as src vreg
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+    type_def = '''
+        using et = ElemType;
+        using vu [[maybe_unused]] = decltype(et::v);
+        using ewt = typename double_width<et>::type;
+        using vwu = decltype(ewt::v);
+    '''
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'type_def': type_def,
+         'copy_old_vd': copyOldVd(2)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorReduceMicroDeclare.subst(microiop) + \
+        VectorReduceMicroConstructor.subst(microiop) + \
+        VectorReduceFloatWideningMicroExecute.subst(microiop) + \
+        VectorReduceMacroDeclare.subst(iop) + \
+        VectorReduceMacroConstructor.subst(iop)
+    decode_block = VectorFloatWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorIntVxsatFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    old_vd_idx = 2
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src1_reg_id = ""
+    if category in ["OPIVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx]"
+    elif category in ["OPIVX"]:
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+    elif category == "OPIVI":
+        old_vd_idx = 1
+    else:
+        error("not supported category for VectorIntVxsatFormat: %s" % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    if category != "OPIVI":
+        set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+
+    code = maskCondWrapper(code)
+    code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntVxsatMicroDeclare.subst(microiop) + \
+        VectorIntVxsatMicroConstructor.subst(microiop) + \
+        VectorIntMicroExecute.subst(microiop) + \
+        VectorIntVxsatMacroDeclare.subst(iop) + \
+        VectorIntVxsatMacroConstructor.subst(iop)
+
+    decode_block = VectorIntDecodeBlock.subst(iop)
+}};
+
+def format VectorReduceIntWideningFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src1_reg_id = "vecRegClass[_machInst.vs1]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    # Treat tail undisturbed/agnostic as the same
+    # We always need old rd as src vreg
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(2)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorReduceMicroDeclare.subst(microiop) + \
+        VectorReduceMicroConstructor.subst(microiop) + \
+        VectorReduceIntWideningMicroExecute.subst(microiop) + \
+        VectorReduceMacroDeclare.subst(iop) + \
+        VectorReduceMacroConstructor.subst(iop)
+    decode_block = VectorIntWideningDecodeBlock.subst(iop)
+}};
+
+let {{
+
+def VectorSlideBase(name, Name, category, code, flags, macro_construtor,
+        decode_template, micro_execute_template):
+    macroop_class_name = 'VectorSlideMacroInst'
+    microop_class_name = 'VectorSlideMicroInst'
+    # Make sure flags are in lists (convert to lists if not).
+    flags = makeList(flags)
+    iop = InstObjParams(name, Name, macroop_class_name, {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    dest_reg_id = "vecRegClass[_machInst.vd + vdIdx]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + vs2Idx]"
+    src1_ireg_id = "intRegClass[_machInst.rs1]"
+    src1_freg_id = "floatRegClass[_machInst.rs1]"
+
+    # The tail of vector mask inst should be treated as tail-agnostic.
+    # We treat it with tail-undisturbed policy, since
+    # the test suits only support undisturbed policy.
+    num_src_regs = 0
+
+    old_dest_reg_id = "vecRegClass[_machInst.vd + vdIdx]"
+    set_src_reg_idx = ""
+    if category in ["OPIVX", "OPMVX"]:
+        set_src_reg_idx += setSrcWrapper(src1_ireg_id)
+        num_src_regs += 1
+    elif category in ["OPFVF"]:
+        set_src_reg_idx += setSrcWrapper(src1_freg_id)
+        num_src_regs += 1
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    num_src_regs += 1
+    old_vd_idx = num_src_regs
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    vm_decl_rd = vmDeclAndReadData()
+    set_src_reg_idx += setSrcVm()
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        microop_class_name,
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorSlideMicroDeclare.subst(microiop) + \
+        VectorSlideMicroConstructor.subst(microiop) + \
+        micro_execute_template.subst(microiop) + \
+        VectorSlideMacroDeclare.subst(iop) + \
+        macro_construtor.subst(iop)
+
+    decode_block = decode_template.subst(iop)
+    return (header_output, decode_block)
+
+}};
+
+def format VectorSlideUpFormat(code, category, *flags) {{
+    (header_output, decode_block) = VectorSlideBase(name, Name, category, code,
+        flags,
+        macro_construtor = VectorSlideUpMacroConstructor,
+        decode_template = VectorIntDecodeBlock,
+        micro_execute_template = VectorSlideMicroExecute)
+}};
+
+def format VectorSlideDownFormat(code, category, *flags) {{
+    (header_output, decode_block) = VectorSlideBase(name, Name, category, code,
+        flags,
+        macro_construtor = VectorSlideDownMacroConstructor,
+        decode_template = VectorIntDecodeBlock,
+        micro_execute_template = VectorSlideMicroExecute)
+}};
+
+def format VectorFloatSlideUpFormat(code, category, *flags) {{
+    (header_output, decode_block) = VectorSlideBase(name, Name, category, code,
+        flags,
+        macro_construtor = VectorSlideUpMacroConstructor,
+        decode_template = VectorFloatDecodeBlock,
+        micro_execute_template = VectorFloatSlideMicroExecute)
+}};
+
+def format VectorFloatSlideDownFormat(code, category, *flags) {{
+    (header_output, decode_block) = VectorSlideBase(name, Name, category, code,
+        flags,
+        macro_construtor = VectorSlideDownMacroConstructor,
+        decode_template = VectorFloatDecodeBlock,
+        micro_execute_template = VectorFloatSlideMicroExecute)
+}};
diff --git a/src/arch/riscv/isa/templates/templates.isa b/src/arch/riscv/isa/templates/templates.isa
index b4de46d846..ed3f5287c0 100644
--- a/src/arch/riscv/isa/templates/templates.isa
+++ b/src/arch/riscv/isa/templates/templates.isa
@@ -1,2 +1,32 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
 // Include
 ##include "vector_mem.isa"
+##include "vector_arith.isa"
diff --git a/src/arch/riscv/isa/templates/vector_arith.isa b/src/arch/riscv/isa/templates/vector_arith.isa
new file mode 100644
index 0000000000..d15ab70f20
--- /dev/null
+++ b/src/arch/riscv/isa/templates/vector_arith.isa
@@ -0,0 +1,1989 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+output header {{
+// Value of byte idx/8 of Vd with bit (idx mod 8) replaced by `bit`.
+#define ASSIGN_VD_BIT(idx, bit) \
+    ((Vd[(idx)/8] & ~(1 << (idx)%8)) | ((bit) << (idx)%8))
+// Declares old_vd/old_Vd, reads source operand `idx`, copies it over Vd.
+#define COPY_OLD_VD(idx)                                             \
+    [[maybe_unused]] RiscvISA::vreg_t old_vd;                        \
+    [[maybe_unused]] decltype(Vd) old_Vd = nullptr;                  \
+    xc->getRegOperand(this, (idx), &old_vd);                           \
+    old_Vd = old_vd.as<std::remove_reference_t<decltype(Vd[0])> >(); \
+    memcpy(Vd, old_Vd, VLENB);
+// Loads FRM into softfloat_roundingMode; values above 4 return a fault.
+#define VRM_REQUIRED                                                         \
+        uint_fast8_t frm = xc->readMiscReg(MISCREG_FRM);                     \
+        if (frm > 4)                                                         \
+            return std::make_shared<IllegalInstFault>("RM fault", machInst); \
+        softfloat_roundingMode = frm;
+
+template<typename Type>
+bool inline
+carry_out(Type a, Type b, bool carry_in = false) {
+    // Unsigned wrap-around test: the truncated sum is below `a` (or equal
+    // to it when a carry-in was added) exactly when the addition carried.
+    using UType = std::make_unsigned_t<Type>;
+    const UType ua = static_cast<UType>(a);
+    const UType sum = ua + static_cast<UType>(b) + carry_in;
+    return carry_in ? (sum <= ua) : (sum < ua);
+}
+
+template<typename Type>
+bool inline
+borrow_out(Type a, Type b, bool borrow_in = false) {
+    // Compared as unsigned: borrows when a < b, or a == b with a borrow-in.
+    using UType = std::make_unsigned_t<Type>;
+    const UType ua = static_cast<UType>(a), ub = static_cast<UType>(b);
+    return borrow_in ? (ua <= ub) : (ua < ub);
+}
+
+}};
+
+def template VectorIntMacroDeclare {{
+// Macro-op class declaration; disassembly comes from the base class.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntMacroConstructor {{
+// Builds up to vtype_regs_per_group() micro-ops, stopping once vl is covered.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t max_uops = vtype_regs_per_group(vtype);
+    const int32_t uop_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t vl_left = this->vl;
+    int32_t uop_vl = std::min(vl_left, uop_vlmax);
+    StaticInstPtr uop;
+
+    // No elements to process: a lone no-op micro-op stands in.
+    if (uop_vl == 0)
+        this->microops.push_back(new VectorNopMicroInst(_machInst));
+    for (int i = 0; i < max_uops && uop_vl > 0; ++i) {
+        uop = new %(class_name)sMicro<ElemType>(_machInst, uop_vl, i);
+        uop->setDelayedCommit();
+        this->microops.push_back(uop);
+        vl_left -= uop_vlmax;
+        uop_vl = std::min(vl_left, uop_vlmax);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+}};
+
+def template VectorIntMicroDeclare {{
+// Micro-op class declaration; built from a micro vl and a micro-op index.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // Sources for *.vv and *.vx: vs1, vs2, vs3(old_vd), vm (4 entries).
+    // Sources for *.vi: vs2, (old_vd), vm (3 entries).
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntMicroConstructor {{
+// Micro-op constructor: stores vm, then expands the dest/source setup code.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+                                         uint8_t _microVl, uint8_t _microIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorIntMicroExecute {{
+// Execute body: faults when vill is set, otherwise runs the snippets in order.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    [[maybe_unused]] constexpr size_t sew = sizeof(vu) * 8;
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
+
+def template VectorIntExtMacroDeclare {{
+// Macro-op declaration; disassembles as "mnemonic dest, src0[, v0.t]".
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    std::string generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const override
+    {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+            << registerName(srcRegIdx(0));
+        if (machInst.vm == 0) ss << ", v0.t";
+        return ss.str();
+    }
+};
+
+}};
+
+def template VectorIntExtMicroDeclare {{
+// Micro-op declaration; disassembles as "mnemonic dest, src0[, v0.t]".
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[3];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    std::string generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const override
+    {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+            << registerName(srcRegIdx(0));
+        if (machInst.vm == 0) ss << ", v0.t";
+        return ss.str();
+    }
+};
+
+}};
+
+def template VectorIntExtMicroExecute {{
+// Integer-extension execute body; an unsupported source width faults.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    auto SEW = vtype_SEW(vtype);
+    auto offset = (VLEN / SEW) * (microIdx % %(ext_div)d);
+    switch (SEW / %(ext_div)d) {
+      case 8: {
+        using vext  [[maybe_unused]] = int8_t;
+        using vextu [[maybe_unused]] = uint8_t;
+        %(op_decl)s;
+        %(op_rd)s;
+        %(vm_decl_rd)s;
+        %(copy_old_vd)s;
+        %(code)s;
+        %(op_wb)s;
+        break;
+      }
+      case 16: {
+        using vext  [[maybe_unused]] = int16_t;
+        using vextu [[maybe_unused]] = uint16_t;
+        %(op_decl)s;
+        %(op_rd)s;
+        %(vm_decl_rd)s;
+        %(copy_old_vd)s;
+        %(code)s;
+        %(op_wb)s;
+        break;
+      }
+      case 32: {
+        using vext  [[maybe_unused]] = int32_t;
+        using vextu [[maybe_unused]] = uint32_t;
+        %(op_decl)s;
+        %(op_rd)s;
+        %(vm_decl_rd)s;
+        %(copy_old_vd)s;
+        %(code)s;
+        %(op_wb)s;
+        break;
+      }
+      default:  // source width other than 8/16/32: reserved encoding
+        return std::make_shared<IllegalInstFault>("Illegal ext EEW", machInst);
+    }
+    return NoFault;
+}
+
+}};
+
+def template VectorIntDecodeBlock {{
+// Picks the element type from vsew: 8, 16, 32 or 64-bit unsigned integers.
+switch(machInst.vtype8.vsew) {
+case 0b000: return new %(class_name)s<uint8_t>(machInst);
+case 0b001: return new %(class_name)s<uint16_t>(machInst);
+case 0b010: return new %(class_name)s<uint32_t>(machInst);
+case 0b011: return new %(class_name)s<uint64_t>(machInst);
+default: GEM5_UNREACHABLE;
+}
+
+}};
+
+def template VectorIntWideningMacroDeclare {{
+// Macro-op class declaration; disassembly comes from the base class.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntWideningMacroConstructor {{
+// Builds the micro-op sequence of a widening integer macro-op.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const int64_t lmul_exp = vtype_vlmul(_machInst.vtype8);
+    // TODO: move this check to the decode template.
+    panic_if(lmul_exp == 3, "LMUL=8 is illegal for widening inst");
+    // vlmul >= 0 gives 2^(vlmul + 1) micro-ops (two for m1); negative gives 1.
+    const uint32_t max_uops = 1 << std::max<int64_t>(0, lmul_exp + 1);
+
+    const int32_t full_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    const int32_t uop_vlmax = lmul_exp < 0 ? full_vlmax : full_vlmax / 2;
+    int32_t vl_left = this->vl;
+    int32_t uop_vl = std::min(vl_left, uop_vlmax);
+    StaticInstPtr uop;
+
+    // No elements to process: a lone no-op micro-op stands in.
+    if (uop_vl == 0)
+        this->microops.push_back(new VectorNopMicroInst(_machInst));
+    for (int i = 0; i < max_uops && uop_vl > 0; ++i) {
+        uop = new %(class_name)sMicro<ElemType>(_machInst, uop_vl, i);
+        uop->setDelayedCommit();
+        this->microops.push_back(uop);
+        vl_left -= uop_vlmax;
+        uop_vl = std::min(vl_left, uop_vlmax);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorIntWideningMicroDeclare {{
+// Micro-op class declaration; built from a micro vl and a micro-op index.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // Sources for *.vv and *.vx: vs1, vs2, vs3(old_vd), vm (4 entries).
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntWideningMicroConstructor {{
+// Micro-op constructor: stores vm, then expands the dest/source setup code.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+        uint8_t _microVl, uint8_t _microIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorIntWideningMicroExecute {{
+// Widening execute body; offset is 0 for even micro-ops, micro_vlmax for odd.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    using vwu [[maybe_unused]] = typename double_width<vu>::type;
+    using vwi [[maybe_unused]] = typename double_width<vi>::type;
+    [[maybe_unused]] constexpr size_t sew = sizeof(vu) * 8;
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    const int64_t vlmul = vtype_vlmul(machInst.vtype8);
+    const int32_t t_micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const int32_t micro_vlmax = vlmul < 0 ? t_micro_vlmax : t_micro_vlmax / 2;
+    [[maybe_unused]] const size_t offset =
+        (this->microIdx % 2 == 0) ? 0 : micro_vlmax;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorIntNarrowingMicroExecute {{
+// Narrowing execute body; offset is 0 for even micro-ops, micro_vlmax for odd.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    using vwu [[maybe_unused]] = typename double_width<vu>::type;
+    using vwi [[maybe_unused]] = typename double_width<vi>::type;
+    [[maybe_unused]] constexpr size_t sew = sizeof(vu) * 8;
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    const int64_t vlmul = vtype_vlmul(machInst.vtype8);
+    const int32_t t_micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const int32_t micro_vlmax = vlmul < 0 ? t_micro_vlmax : t_micro_vlmax / 2;
+    [[maybe_unused]] const size_t offset =
+        (this->microIdx % 2 == 0) ? 0 : micro_vlmax;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorIntWideningDecodeBlock {{
+// vsew 8/16/32-bit only; other values are assumed to be filtered out earlier.
+switch(machInst.vtype8.vsew) {
+case 0b000: return new %(class_name)s<uint8_t>(machInst);
+case 0b001: return new %(class_name)s<uint16_t>(machInst);
+case 0b010: return new %(class_name)s<uint32_t>(machInst);
+default: GEM5_UNREACHABLE;
+}
+
+}};
+
+def template VectorFloatMacroDeclare {{
+// Macro-op class declaration; disassembly comes from the base class.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorFloatMacroConstructor {{
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t max_uops = vtype_regs_per_group(vtype);
+    const int32_t uop_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t vl_left = this->vl;
+    int32_t uop_vl = std::min(vl_left, uop_vlmax);
+    StaticInstPtr uop;
+
+    // No elements to process: a lone no-op micro-op stands in.
+    if (uop_vl == 0)
+        this->microops.push_back(new VectorNopMicroInst(_machInst));
+    for (int i = 0; i < max_uops && uop_vl > 0; ++i) {
+        uop = new %(class_name)sMicro<ElemType>(_machInst, uop_vl, i);
+        uop->setDelayedCommit();
+        this->microops.push_back(uop);
+        vl_left -= uop_vlmax;
+        uop_vl = std::min(vl_left, uop_vlmax);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+}};
+
+def template VectorFloatMicroDeclare {{
+// Micro-op class declaration; built from a micro vl and a micro-op index.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // Sources: vs1, vs2, vs3(old_vd), vm (4 entries).
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst,
+        uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorFloatMicroConstructor {{
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+                                         uint8_t _microVl, uint8_t _microIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;  // keep a copy of the instruction's vm field
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;  // counts are zeroed before the setup snippets below
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorFloatMicroExecute {{
+// Float execute body: vill check, rounding-mode setup, then the snippets.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu [[maybe_unused]] = decltype(et::v);
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    VRM_REQUIRED;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
+
+def template VectorFloatDecodeBlock {{
+// vsew 0b010 selects float32_t, 0b011 float64_t; other values are unreachable.
+switch(machInst.vtype8.vsew) {
+case 0b010: return new %(class_name)s<float32_t>(machInst);
+case 0b011: return new %(class_name)s<float64_t>(machInst);
+default: GEM5_UNREACHABLE;
+}
+
+}};
+
+def template VectorFloatCvtMacroDeclare {{
+// Macro-op declaration; disassembles as "mnemonic dest, src0[, v0.t]".
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    std::string generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const override
+    {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+            << registerName(srcRegIdx(0));
+        if (machInst.vm == 0) ss << ", v0.t";
+        return ss.str();
+    }
+};
+
+}};
+
+def template VectorFloatCvtMicroDeclare {{
+// Micro-op declaration; disassembles as "mnemonic dest, src0[, v0.t]".
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[3];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst,
+        uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    std::string generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const override
+    {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+            << registerName(srcRegIdx(0));
+        if (machInst.vm == 0) ss << ", v0.t";
+        return ss.str();
+    }
+};
+
+}};
+
+
+def template VectorFloatWideningMicroExecute {{
+// Widening float execute body; sets the rounding mode via VRM_REQUIRED.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu [[maybe_unused]] = decltype(et::v);
+    using ewt = typename double_width<et>::type;
+    using vwu = decltype(ewt::v);
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    VRM_REQUIRED;
+
+    const int64_t vlmul = vtype_vlmul(machInst.vtype8);
+    const int32_t t_micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const int32_t micro_vlmax = vlmul < 0 ? t_micro_vlmax : t_micro_vlmax / 2;
+    [[maybe_unused]] const size_t offset =
+        (this->microIdx % 2 == 0) ? 0 : micro_vlmax;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorFloatNarrowingMicroExecute {{
+// Narrowing float execute body; sets the rounding mode via VRM_REQUIRED.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu [[maybe_unused]] = decltype(et::v);
+    using ewt = typename double_width<et>::type;
+    using vwu = decltype(ewt::v);
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    VRM_REQUIRED;
+
+    const int64_t vlmul = vtype_vlmul(machInst.vtype8);
+    const int32_t t_micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const int32_t micro_vlmax = vlmul < 0 ? t_micro_vlmax : t_micro_vlmax / 2;
+    [[maybe_unused]] const size_t offset =
+        (this->microIdx % 2 == 0) ? 0 : micro_vlmax;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorFloatWideningDecodeBlock {{
+// Only vsew 0b010 (float32_t) is handled; other values are unreachable.
+switch(machInst.vtype8.vsew) {
+case 0b010: return new %(class_name)s<float32_t>(machInst);
+default: GEM5_UNREACHABLE;
+}
+
+}};
+
+def template ViotaMacroDeclare {{
+// Macro-op declaration; its constructor passes &cnt to every micro-op.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    int cnt = 0;
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+
+def template ViotaMacroConstructor {{
+// Builds the micro-op list; every micro-op receives &cnt from this macro-op.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+
+    StaticInstPtr microop;
+    // The loop tests micro_vl >= 0 (not > 0) and stops once it turns negative.
+    // Allow one empty micro op to hold IsLastMicroop flag
+    for (int i = 0; i < num_microops && micro_vl >= 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst, micro_vl, i,
+            &cnt);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template ViotaMicroDeclare {{
+// Micro-op class declaration: 4 source RegIds, 1 destination, vm flag, cnt.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+    int* cnt;  // set from the constructor argument of the same name
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx, int* cnt);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template ViotaMicroConstructor {{
+// Stores vm and cnt, zeroes the register counts, then registers the operands.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+    uint8_t _microVl, uint8_t _microIdx, int* cnt)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    this->cnt = cnt;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;  // counts restart so the snippets below fill from slot 0
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+    setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2]);  // extra vs2
+}
+
+}};
+
+def template ViotaMicroExecute {{
+// execute(): VILL check, then the operand, mask, old-vd, code and wb snippets.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    if (machInst.vill)  // fault before any operand is touched
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+
+def template Vector1Vs1VdMaskConstructor {{
+// Constructor: records vm, then expands the register-index setup snippets.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    this->vm = _machInst.vm;  // copied from the decoded instruction
+    %(set_reg_idx_arr)s;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+    %(set_vm_idx)s;
+}
+
+}};
+
+def template Vector1Vs1VdMaskExecute {{
+// execute(): VILL check, then the operand, mask, old-vd, code and wb snippets.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu = uint8_t;  // vu is fixed to uint8_t, independent of ElemType
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template Vector1Vs1RdMaskDeclare {{
+// Class declaration: 2 source RegIds, 1 destination RegId and the vm flag.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    RegId srcRegIdxArr[2];
+    RegId destRegIdxArr[1];
+    bool vm;  // assigned from _machInst.vm in Vector1Vs1RdMaskConstructor
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template Vector1Vs1RdMaskConstructor {{
+// Constructor: records vm, then expands the reg-array, ctor and vm snippets.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    this->vm = _machInst.vm;  // copied from the decoded instruction
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    %(set_vm_idx)s;
+}
+
+}};
+
+def template Vector1Vs1RdMaskExecute {{
+// execute(): VILL check, operand read, local Rd, mask read, code, writeback.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    %(op_rd)s;
+    uint64_t Rd = 0;  // declared here; this template has no op_decl snippet
+    %(vm_decl_rd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorIntMaskMacroDeclare {{
+// Macro-op class declaration; it declares no execute() of its own.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntMaskMacroConstructor {{
+// Emits up to num_microops micro-ops, then one trailing VMaskMergeMicroInst.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // micro_vl == 0: a single VectorNopMicroInst replaces the loop's micro-ops.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst, micro_vl, i);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    microop = new VMaskMergeMicroInst<ElemType>(_machInst, _machInst.vd,
+        this->microops.size());
+    this->microops.push_back(microop);
+    // The merge micro-op is appended last, so it is the one marked LastMicroop.
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorIntMaskMicroDeclare {{
+// Micro-op class declaration; the source array is sized for the widest form.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs1(rs1), vs2, old_vd, v0 for *.vv[m] or *.vx[m]
+    // vs2, old_vd, v0 for *.vi[m]
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;  // assigned from _machInst.vm in the micro-op constructor
+public:
+    %(class_name)s(ExtMachInst _machInst,
+                   uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntMaskMicroConstructor {{
+// Stores vm, zeroes the register counts, then expands the operand snippets.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+                                         uint8_t _microVl, uint8_t _microIdx)
+: %(base_class)s("%(mnemonic)s", _machInst,
+                 %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;  // counts restart so the snippets below fill from slot 0
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorIntMaskMicroExecute {{
+// execute() for integer mask-producing micro-ops; exposes offset to the code.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    // offset grows by VLENB / sizeof(ElemType) for each step of microIdx.
+    constexpr uint16_t bit_offset = VLENB / sizeof(ElemType);
+    const uint16_t offset = bit_offset * microIdx;
+
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorFloatMaskMacroDeclare {{
+// Macro-op class declaration; it declares no execute() of its own.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorFloatMaskMacroConstructor {{
+// Emits up to num_microops micro-ops, then one trailing VMaskMergeMicroInst.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // micro_vl == 0: a single VectorNopMicroInst replaces the loop's micro-ops.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst, micro_vl, i);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    microop = new VMaskMergeMicroInst<ElemType>(_machInst, _machInst.vd,
+        this->microops.size());
+    this->microops.push_back(microop);
+    // The merge micro-op is appended last, so it is the one marked LastMicroop.
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorFloatMaskMicroDeclare {{
+// Micro-op class declaration for the float mask-producing micro-ops.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs1(rs1), vs2, old_vd, v0 for *.vv or *.vf
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;  // assigned from _machInst.vm in the micro-op constructor
+public:
+    %(class_name)s(ExtMachInst _machInst,
+                   uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorFloatMaskMicroConstructor {{
+// Stores vm, zeroes the register counts, then expands the operand snippets.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+                                         uint8_t _microVl, uint8_t _microIdx)
+: %(base_class)s("%(mnemonic)s", _machInst,
+                 %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;  // counts restart so the snippets below fill from slot 0
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorFloatMaskMicroExecute {{
+// execute() for float mask-producing micro-ops; exposes offset to the code.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu = decltype(et::v);  // type of the v member of ElemType
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    // offset grows by VLENB / sizeof(ElemType) for each step of microIdx.
+    constexpr uint16_t bit_offset = VLENB / sizeof(ElemType);
+    const uint16_t offset = bit_offset * microIdx;
+
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VMvWholeMacroDeclare {{
+// Macro-op class declaration; unlike its neighbours it is not a template.
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VMvWholeMacroConstructor {{
+// Creates simm3 + 1 micro-ops, each built with _microVl 0 and _microIdx i.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = _machInst.simm3 + 1;
+    StaticInstPtr microop;
+
+    for (int i = 0; i < num_microops; ++i) {
+        microop = new %(class_name)sMicro(_machInst, 0, i);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VMvWholeMicroDeclare {{
+// Micro-op class declaration: one source RegId and one destination RegId.
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[1];
+    RegId destRegIdxArr[1];
+    bool vm;  // NOTE(review): not assigned by VMvWholeMicroConstructor
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VMvWholeMicroConstructor {{
+// Registers vd + _microIdx as destination and vs2 + _microIdx as source.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst,
+                               uint8_t _microVl, uint8_t _microIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _microIdx]);
+    _numTypedDestRegs[VecRegClass]++;  // one vector destination was added
+    setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2 + _microIdx]);
+}
+
+}};
+
+def template VMvWholeMicroExecute {{
+// execute(): runs the code snippet VLEN / 64 times with loop index i.
+Fault
+%(class_name)s::execute(ExecContext* xc, trace::InstRecord* traceData) const
+{
+    // TODO: Check register alignment.
+    // TODO: If vd is equal to vs2 the instruction is an architectural NOP.
+    %(op_decl)s;
+    %(op_rd)s;
+    for (size_t i = 0; i < (VLEN / 64); i++) {  // no VILL check in this one
+        %(code)s;
+    }
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorMaskDeclare {{
+// Class declaration: 3 source RegIds and 1 destination RegId, no vm member.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    RegId srcRegIdxArr[3];
+    RegId destRegIdxArr[1];
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorMaskConstructor {{
+// Constructor: only expands the register-index setup snippets.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorMaskExecute {{
+// execute(): VILL check, then the operand, old-vd, code and wb snippets.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu = uint8_t;  // vu is fixed to uint8_t, independent of ElemType
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    // TODO: remove it
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
+
+def template VectorMaskDecodeBlock {{
+// Always instantiates the uint8_t specialization; vsew is not consulted.
+return new %(class_name)s<uint8_t>(machInst);
+
+}};
+
+def template VectorNonSplitDeclare {{
+// Class declaration: 2 source RegIds and 1 destination RegId.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    RegId srcRegIdxArr[2];
+    RegId destRegIdxArr[1];
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorNonSplitConstructor {{
+// Constructor: expands the reg-array, constructor and vm-index snippets.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    %(set_vm_idx)s;
+}
+
+}};
+
+def template VectorIntNonSplitExecute {{
+// execute(): VILL check, then the operand, mask, code and writeback snippets.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                    trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    if (machInst.vill)  // fault before any operand is touched
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorFloatNonSplitExecute {{
+// execute(): VILL check, then the operand, mask, code and writeback snippets.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                    trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu = decltype(et::v);  // type of the v member of ElemType
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorReduceMacroDeclare {{
+// Macro-op class declaration; it declares no execute() of its own.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorReduceMacroConstructor {{
+// Emits up to num_microops micro-ops, or one VectorNopMicroInst if none run.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // micro_vl == 0: a single VectorNopMicroInst replaces the loop's micro-ops.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst, micro_vl, i);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorReduceMicroDeclare {{
+// Micro-op class declaration for the reduction micro-ops.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs2, vs1, vd, vm
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;  // assigned from _machInst.vm in the micro-op constructor
+public:
+    %(class_name)s(ExtMachInst _machInst,
+                   uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorReduceMicroConstructor {{
+// Stores vm, zeroes the register counts, then expands the operand snippets.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+                                         uint8_t _microVl, uint8_t _microIdx)
+: %(base_class)s("%(mnemonic)s", _machInst,
+                 %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;  // counts restart so the snippets below fill from slot 0
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorReduceIntMicroExecute {{
+// execute() for integer reduction micro-ops; the code snippet is expected to
+// call reduce_loop, which folds the active elements of Vs2 into one ElemType.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    %(type_def)s;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+
+    // f combines the running value with one element. The second and third
+    // parameters are accepted but unused: elements come from the captured Vs2.
+    auto reduce_loop =
+        [&, this](const auto& f, const auto* _, const auto* vs2) {
+            // Seed: Vs1[0] on micro-op 0, old_Vd[0] on every later micro-op.
+            ElemType microop_result = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
+            // Mask index of this micro-op's element 0; it does not change
+            // inside the loop, so it is computed once.
+            const uint32_t ei_base =
+                vtype_VLMAX(vtype, true) * this->microIdx;
+            for (uint32_t i = 0; i < this->microVl; i++) {
+                // Unmasked instruction, or mask bit set for this element.
+                if (this->vm || elem_mask(v0, ei_base + i)) {
+                    microop_result = f(microop_result, Vs2[i]);
+                }
+            }
+            return microop_result;
+        };
+
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorReduceFloatMicroExecute {{
+// execute() for float reduction micro-ops; the code snippet is expected to
+// call reduce_loop, which folds the active elements of Vs2 into one vu value.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    %(type_def)s;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+
+    // Seed: Vs1[0] on micro-op 0, old_Vd[0] on every later micro-op.
+    Vd[0] = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
+
+    // f combines the running value with one element. The second and third
+    // parameters are accepted but unused: elements come from the captured Vs2.
+    auto reduce_loop =
+        [&, this](const auto& f, const auto* _, const auto* vs2) {
+            vu tmp_val = Vd[0];
+            // Mask index of this micro-op's element 0; it does not change
+            // inside the loop, so it is computed once.
+            const uint32_t ei_base =
+                vtype_VLMAX(vtype, true) * this->microIdx;
+            for (uint32_t i = 0; i < this->microVl; i++) {
+                // Unmasked instruction, or mask bit set for this element.
+                if (this->vm || elem_mask(v0, ei_base + i)) {
+                    tmp_val = f(tmp_val, Vs2[i]).v;
+                }
+            }
+            return tmp_val;
+        };
+
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorReduceFloatWideningMicroExecute {{
+// execute() for widening float reduction micro-ops; the code snippet is
+// expected to call reduce_loop, which accumulates into a vwu value.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    %(type_def)s;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+
+    // Seed: Vs1[0] on micro-op 0, old_Vd[0] on every later micro-op.
+    Vd[0] = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
+
+    // f combines the running value with one element. The second and third
+    // parameters are accepted but unused: elements come from the captured Vs2.
+    auto reduce_loop =
+        [&, this](const auto& f, const auto* _, const auto* vs2) {
+            vwu tmp_val = Vd[0];
+            // Mask index of this micro-op's element 0; it does not change
+            // inside the loop, so it is computed once.
+            const uint32_t ei_base =
+                vtype_VLMAX(vtype, true) * this->microIdx;
+            for (uint32_t i = 0; i < this->microVl; i++) {
+                // Unmasked instruction, or mask bit set for this element.
+                if (this->vm || elem_mask(v0, ei_base + i)) {
+                    tmp_val = f(tmp_val, Vs2[i]).v;
+                }
+            }
+            return tmp_val;
+        };
+
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorGatherMacroDeclare {{
+// Macro-op class declaration, templated on both element and index type.
+template<typename ElemType, typename IndexType>
+class %(class_name)s : public %(base_class)s{
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorGatherMacroConstructor {{
+// Builds the gather micro-op list: one outer pass per micro_vlmax elements of
+// vl, and inside each pass one micro-op per vs2 register.
+template<typename ElemType, typename IndexType>
+%(class_name)s<ElemType, IndexType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    constexpr uint32_t vd_eewb = sizeof(ElemType);
+    constexpr uint32_t vs2_eewb = sizeof(ElemType);
+    constexpr uint32_t vs1_eewb = sizeof(IndexType);
+    // log2 of a power-of-two byte width (1 -> 0, 2 -> 1, 4 -> 2, 8 -> 3).
+    const auto log2_width = [](uint32_t bytes) {
+        int8_t shift = 0;
+        for (; bytes > 1; bytes >>= 1)
+            ++shift;
+        return shift;
+    };
+    const int8_t lmul = vtype_vlmul(vtype);
+    // lmul and vs1_emul are log2 exponents (they feed the shifts below), so
+    // the index/data width ratio has to be applied as a difference of
+    // logarithms. Adding the raw ratio over- or under-states vs1_vregs.
+    const int8_t vs1_emul =
+        lmul + log2_width(vs1_eewb) - log2_width(vs2_eewb);
+    const uint8_t vs2_vregs = lmul < 0 ? 1 : 1 << lmul;
+    const uint8_t vs1_vregs = vs1_emul < 0 ? 1 : 1 << vs1_emul;
+    const uint8_t vd_vregs = vs2_vregs;
+    // Elements handled per outer pass, limited by the wider of vd and vs1.
+    const int32_t micro_vlmax = VLENB / std::max(vd_eewb, vs1_eewb);
+    int32_t remaining_vl = this->vl;
+    int32_t micro_vl = std::min(remaining_vl, micro_vlmax);
+    StaticInstPtr microop;
+
+    // micro_vl == 0: a single VectorNopMicroInst replaces the loop's micro-ops.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (uint8_t i = 0; i < std::max(vs1_vregs, vd_vregs) && micro_vl > 0;
+            i++) {
+        for (uint8_t j = 0; j < vs2_vregs; j++) {
+            microop = new %(class_name)sMicro<ElemType, IndexType>(
+                _machInst, micro_vl, i * vs2_vregs + j);
+            microop->setDelayedCommit();
+            this->microops.push_back(microop);
+        }
+        micro_vl = std::min(remaining_vl -= micro_vlmax, micro_vlmax);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorGatherMicroDeclare {{
+// Micro-op class declaration, templated on both element and index type.
+template<typename ElemType, typename IndexType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs2, vs1, vd, vm
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;  // assigned from _machInst.vm in the micro-op constructor
+public:
+    %(class_name)s(ExtMachInst _machInst,
+                   uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorGatherMicroConstructor {{
+// Derives vs2_idx / vs1_idx / vd_idx from _microIdx ahead of the snippets.
+template<typename ElemType, typename IndexType>
+%(class_name)s<ElemType, IndexType>::%(class_name)s(ExtMachInst _machInst,
+    uint8_t _microVl, uint8_t _microIdx)
+: %(base_class)s("%(mnemonic)s", _machInst,
+                 %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    [[maybe_unused]] constexpr uint32_t vd_eewb = sizeof(ElemType);
+    [[maybe_unused]] constexpr uint32_t vs2_eewb = sizeof(ElemType);
+    [[maybe_unused]] constexpr uint32_t vs1_eewb = sizeof(IndexType);
+    constexpr uint8_t vs1_split_num = (vd_eewb + vs1_eewb - 1) / vs1_eewb;
+    constexpr uint8_t vd_split_num = (vs1_eewb + vd_eewb - 1) / vd_eewb;
+    const int8_t lmul = vtype_vlmul(vtype);
+    const uint8_t vs2_vregs = lmul < 0 ? 1 : 1 << lmul;
+    [[maybe_unused]] const uint8_t vs2_idx = _microIdx % vs2_vregs;
+    [[maybe_unused]] const uint8_t vs1_idx =
+        _microIdx / vs2_vregs / vs1_split_num;  // split counts: ceil divisions
+    [[maybe_unused]] const uint8_t vd_idx =
+        _microIdx / vs2_vregs / vd_split_num;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorGatherMicroExecute {{
+// execute() for gather micro-ops; precomputes indices and biases for the code.
+template <typename ElemType, typename IndexType>
+Fault
+%(class_name)s<ElemType, IndexType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    [[maybe_unused]] constexpr size_t sew = sizeof(vu) * 8;  // width in bits
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    const uint32_t vlmax = vtype_VLMAX(vtype);
+    constexpr uint8_t vd_eewb = sizeof(ElemType);
+    constexpr uint8_t vs1_eewb = sizeof(IndexType);
+    constexpr uint8_t vs2_eewb = sizeof(ElemType);
+    constexpr uint8_t vs1_split_num = (vd_eewb + vs1_eewb - 1) / vs1_eewb;
+    constexpr uint8_t vd_split_num = (vs1_eewb + vd_eewb - 1) / vd_eewb;
+    [[maybe_unused]] constexpr uint16_t vd_elems = VLENB / vd_eewb;
+    [[maybe_unused]] constexpr uint16_t vs1_elems = VLENB / vs1_eewb;
+    [[maybe_unused]] constexpr uint16_t vs2_elems = VLENB / vs2_eewb;
+    [[maybe_unused]] const int8_t lmul = vtype_vlmul(vtype);
+    [[maybe_unused]] const uint8_t vs2_vregs = lmul < 0 ? 1 : 1 << lmul;
+    [[maybe_unused]] const uint8_t vs2_idx = microIdx % vs2_vregs;
+    [[maybe_unused]] const uint8_t vs1_idx =
+        microIdx / vs2_vregs / vs1_split_num;
+    [[maybe_unused]] const uint8_t vd_idx =
+        microIdx / vs2_vregs / vd_split_num;
+    [[maybe_unused]] const uint16_t vs1_bias =
+        vs1_elems * (vd_idx % vs1_split_num) / vs1_split_num;
+    [[maybe_unused]] const uint16_t vd_bias =
+        vd_elems * (vs1_idx % vd_split_num) / vd_split_num;
+    // The index and bias values above mirror the micro-op constructor's math.
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
+
+def template VectorGatherDecodeBlock {{
+// Picks ElemType from vsew (uint8_t .. uint64_t); IndexType is idx_type.
+switch(machInst.vtype8.vsew) {
+    case 0b000: {
+        using elem_type [[maybe_unused]] = uint8_t;
+        return new %(class_name)s<uint8_t, %(idx_type)s>(machInst);
+    }
+    case 0b001: {
+        using elem_type [[maybe_unused]] = uint16_t;
+        return new %(class_name)s<uint16_t, %(idx_type)s>(machInst);
+    }
+    case 0b010: {
+        using elem_type [[maybe_unused]] = uint32_t;
+        return new %(class_name)s<uint32_t, %(idx_type)s>(machInst);
+    }
+    case 0b011: {
+        using elem_type [[maybe_unused]] = uint64_t;
+        return new %(class_name)s<uint64_t, %(idx_type)s>(machInst);
+    }
+    default: GEM5_UNREACHABLE;
+}
+
+}};
+
+def template VectorIntVxsatMacroDeclare {{
+// Macro-op class declaration; vxsat is handed to the micro-ops by address.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s{
+private:
+    %(reg_idx_arr_decl)s;
+    bool vxsat = false;  // passed as &vxsat by VectorIntVxsatMacroConstructor
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntVxsatMacroConstructor {{
+// Emits up to num_microops micro-ops sharing &vxsat, then a VxsatMicroInst.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // micro_vl == 0: a single VectorNopMicroInst replaces the loop's micro-ops.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst,
+            micro_vl, i, &vxsat);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    // Trailing micro-op: gets &vxsat, serialize-after and non-speculative.
+    microop = new VxsatMicroInst(&vxsat, _machInst);
+    microop->setFlag(StaticInst::IsSerializeAfter);
+    microop->setFlag(StaticInst::IsNonSpeculative);
+    this->microops.push_back(microop);
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+}};
+
+def template VectorIntVxsatMicroDeclare {{
+// Micro-op class declaration: operands, vm flag and a pointer to vxsat.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+    bool* vxsatptr;  // set from the constructor argument of the same name
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx, bool* vxsatptr);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntVxsatMicroConstructor {{
+// Stores vm and vxsatptr, zeroes the register counts, registers the operands.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+    uint8_t _microVl, uint8_t _microIdx, bool* vxsatptr)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    this->vxsatptr = vxsatptr;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;  // counts restart so the snippets below fill from slot 0
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorReduceIntWideningMicroExecute {{
+// execute() for widening integer reduction micro-ops; the code snippet is
+// expected to call reduce_loop, which accumulates into a vwu value.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    using vwu [[maybe_unused]] = typename double_width<vu>::type;
+    using vwi [[maybe_unused]] = typename double_width<vi>::type;
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+
+    // Seed: Vs1[0] on micro-op 0, old_Vd[0] on every later micro-op.
+    Vd[0] = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
+
+    // f combines the running value with one element. The second and third
+    // parameters are accepted but unused: elements come from the captured Vs2.
+    auto reduce_loop =
+        [&, this](const auto& f, const auto* _, const auto* vs2) {
+            vwu tmp_val = Vd[0];
+            // Mask index of this micro-op's element 0; it does not change
+            // inside the loop, so it is computed once.
+            const uint32_t ei_base =
+                vtype_VLMAX(vtype, true) * this->microIdx;
+            for (uint32_t i = 0; i < this->microVl; i++) {
+                // Unmasked instruction, or mask bit set for this element.
+                if (this->vm || elem_mask(v0, ei_base + i)) {
+                    tmp_val = f(tmp_val, Vs2[i]);
+                }
+            }
+            return tmp_val;
+        };
+
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorSlideMacroDeclare {{
+// Macro-op class declaration; it declares no execute() of its own.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorSlideUpMacroConstructor {{
+// Outer pass i emits i + 1 micro-ops (j = 0..i), numbered by micro_idx.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // micro_vl == 0: a single VectorNopMicroInst replaces the loop's micro-ops.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    // Todo static filter out useless uop
+    int micro_idx = 0;
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        for (int j = 0; j <= i; ++j) {
+            microop = new %(class_name)sMicro<ElemType>(
+                _machInst, micro_vl, micro_idx++, i, j);
+            microop->setDelayedCommit();
+            this->microops.push_back(microop);
+        }
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorSlideDownMacroConstructor {{
+// Slide-down macro-op ctor: one micro-op per index pair (i, j) with j >= i.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;  // vl still to be covered by outer iterations
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+
+    if (micro_vl == 0) {  // no elements at all: emit a single nop micro-op
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    // TODO: statically filter out useless micro-ops.
+    int micro_idx = 0;  // running index over every emitted micro-op
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        for (int j = i; j < num_microops; ++j) {  // presumably vdIdx, vs2Idx
+            microop = new %(class_name)sMicro<ElemType>(
+                _machInst, micro_vl, micro_idx++, i, j);
+            microop->setDelayedCommit();
+            this->microops.push_back(microop);
+        }  // then shrink tmp_vl by one register's worth and clamp again
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorSlideMicroDeclare {{
+// Class declaration for a single vector slide micro-op.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs2, vs1, vs3(old_vd), vm for *.vv, *.vx
+    // vs2, (old_vd), vm for *.vi
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;  // copy of the instruction's vm field
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+        uint8_t _microIdx, uint8_t _vdIdx, uint8_t _vs2Idx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;  // inherit base overloads
+};
+
+}};
+
+def template VectorSlideMicroConstructor {{
+// Slide micro-op ctor: forwards its arguments to the base class.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+        uint8_t _microVl, uint8_t _microIdx, uint8_t _vdIdx, uint8_t _vs2Idx)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s, _microVl,
+        _microIdx, _vdIdx, _vs2Idx)
+{
+    this->vm = _machInst.vm;  // keep the instruction's vm field
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;  // zero both counts ahead of the register setup below
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorSlideMicroExecute {{
+
+// execute() for one integer vector slide micro-op.  Operand declarations,
+// operand reads, the instruction-specific code and the write-back are all
+// substituted in when the template is instantiated.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+
+    // An instruction decoded with vill set is reported as illegal.
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    // VLMAX for the current vtype; presumably consumed by the substituted
+    // code, which is why it is tagged maybe_unused.
+    [[maybe_unused]]const uint32_t vlmax = vtype_VLMAX(vtype);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
+
+def template VectorFloatSlideMicroExecute {{
+
+// execute() for one floating-point vector slide micro-op.  Operand
+// declarations, operand reads, the instruction-specific code and the
+// write-back are all substituted in when the template is instantiated.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    // et is the element type itself; vu is the type of its `v` member.
+    using et = ElemType;
+    using vu = decltype(et::v);
+
+    // An instruction decoded with vill set is reported as illegal.
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    // VLMAX for the current vtype; presumably consumed by the substituted
+    // code, which is why it is tagged maybe_unused.
+    [[maybe_unused]]const uint32_t vlmax = vtype_VLMAX(vtype);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
diff --git a/src/arch/riscv/isa/templates/vector_mem.isa b/src/arch/riscv/isa/templates/vector_mem.isa
index d54243ad7d..f8be1e555b 100644
--- a/src/arch/riscv/isa/templates/vector_mem.isa
+++ b/src/arch/riscv/isa/templates/vector_mem.isa
@@ -1,3 +1,31 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 def template VMemMacroDeclare {{
 
 class %(class_name)s : public %(base_class)s
diff --git a/src/arch/riscv/regs/float.hh b/src/arch/riscv/regs/float.hh
index 4809372070..cca9e1be2f 100644
--- a/src/arch/riscv/regs/float.hh
+++ b/src/arch/riscv/regs/float.hh
@@ -211,6 +211,20 @@ const std::vector<std::string> RegNames = {
 
 } // namespace float_reg
 
+/**
+ * Sign-injection helper for 32-bit float encodings.
+ *
+ * A sign source is chosen first: ~b when n is set, a ^ b when only x is
+ * set, b otherwise.  insertBits then replaces bits [30:0] of that value
+ * with a's bits, so bit 31 is the only one kept from the sign source.
+ */
+inline float32_t
+fsgnj32(float32_t a, float32_t b, bool n, bool x) {
+    auto sign_src = b.v;
+    if (n) {
+        sign_src = ~b.v;
+    } else if (x) {
+        sign_src = a.v ^ b.v;
+    }
+    return f32(insertBits(sign_src, 30, 0, a.v));
+}
+
+/**
+ * Sign-injection helper for 64-bit float encodings.
+ *
+ * A sign source is chosen first: ~b when n is set, a ^ b when only x is
+ * set, b otherwise.  insertBits then replaces bits [62:0] of that value
+ * with a's bits, so bit 63 is the only one kept from the sign source.
+ */
+inline float64_t
+fsgnj64(float64_t a, float64_t b, bool n, bool x) {
+    auto sign_src = b.v;
+    if (n) {
+        sign_src = ~b.v;
+    } else if (x) {
+        sign_src = a.v ^ b.v;
+    }
+    return f64(insertBits(sign_src, 62, 0, a.v));
+}
+
 } // namespace RiscvISA
 } // namespace gem5
 
diff --git a/src/arch/riscv/utility.hh b/src/arch/riscv/utility.hh
index 1db6d6df3b..40054aec0f 100644
--- a/src/arch/riscv/utility.hh
+++ b/src/arch/riscv/utility.hh
@@ -241,6 +241,13 @@ remu(T rs1, T rs2)
     return (rs2 == 0) ? rs1 : rs1 % rs2;
 }
 
+// Vector extension functions
+/**
+ * Decode the element width from a vtype value: 8 shifted left by the
+ * 3-bit field held in vtype[5:3].
+ */
+inline uint64_t
+vtype_SEW(const uint64_t vtype)
+{
+    const auto vsew = bits(vtype, 5, 3);
+    return 8 << vsew;
+}
+
 /*
 * Encode LMUL to lmul as follows:
 *     LMUL    vlmul    lmul
@@ -269,6 +276,25 @@ vtype_VLMAX(const uint64_t vtype, const bool per_reg = false)
     return gem5::RiscvISA::VLEN >> (vsew + 3 - lmul);
 }
 
+/**
+ * Return vtype[2:0] sign-extended from 3 bits, as a signed 64-bit value.
+ */
+inline int64_t
+vtype_vlmul(const uint64_t vtype)
+{
+    const auto vlmul_field = bits(vtype, 2, 0);
+    return static_cast<int64_t>(sext<3>(vlmul_field));
+}
+
+/**
+ * Number of registers in a group for the given vtype: 2^lmul, where lmul
+ * is the sign-extended vtype[2:0] field and negative values count as 0
+ * (so the result is never below 1).
+ */
+inline uint64_t
+vtype_regs_per_group(const uint64_t vtype)
+{
+    const auto signed_lmul =
+        static_cast<int64_t>(sext<3>(bits(vtype, 2, 0)));
+    const int64_t shift = signed_lmul > 0 ? signed_lmul : 0;
+    return 1 << shift;
+}
+
+/**
+ * Overwrite vtype with a value whose only set bit is the most significant
+ * bit of a RegVal-sized word; every other field is cleared.
+ *
+ * @param vtype Value to overwrite in place.
+ */
+inline void
+vtype_set_vill(uint64_t& vtype)
+{
+    // Shift a 64-bit one rather than `1UL`: unsigned long may be only 32
+    // bits wide, and shifting a 32-bit value by 63 is undefined behaviour.
+    vtype = uint64_t{1} << (sizeof(RegVal) * 8 - 1);
+}
+
 inline uint64_t
 width_EEW(uint64_t width)
 {
@@ -296,6 +322,461 @@ elem_mask(const T* vs, const int index)
     return (vs[idx] >> pos) & 1;
 }
 
+/**
+ * Maps an element type to the type with twice its width.  Only the
+ * specializations listed here are defined.
+ */
+template<typename Type>
+struct double_width;
+
+// Unsigned integers.
+template<>
+struct double_width<uint8_t> { using type = uint16_t; };
+template<>
+struct double_width<uint16_t> { using type = uint32_t; };
+template<>
+struct double_width<uint32_t> { using type = uint64_t; };
+
+// Signed integers.
+template<>
+struct double_width<int8_t> { using type = int16_t; };
+template<>
+struct double_width<int16_t> { using type = int32_t; };
+template<>
+struct double_width<int32_t> { using type = int64_t; };
+
+// Soft-float.
+template<>
+struct double_width<float32_t> { using type = float64_t; };
+
+/**
+ * Maps a 32-bit integer type (signed or unsigned) to float64_t.  Only the
+ * two specializations listed here are defined.
+ */
+template<typename Type>
+struct double_widthf;
+
+template<>
+struct double_widthf<uint32_t> { using type = float64_t; };
+template<>
+struct double_widthf<int32_t> { using type = float64_t; };
+
+/**
+ * Wrap an integer value as a FloatType.  Dispatch is on IntType (by
+ * default the type of FloatType::v): uint32_t goes through f32(),
+ * uint64_t through f64().  Any other IntType reaches GEM5_UNREACHABLE.
+ */
+template<typename FloatType, typename IntType = decltype(FloatType::v)> auto
+ftype(IntType a) -> FloatType
+{
+    if constexpr (std::is_same_v<IntType, uint32_t>) {
+        return f32(a);
+    } else if constexpr (std::is_same_v<IntType, uint64_t>) {
+        return f64(a);
+    }
+    GEM5_UNREACHABLE;
+}
+
+// TODO: Consolidate ftype_freg(freg_t a) and ftype(IntType a) into a
+// single function
+/**
+ * Wrap an freg_t as a FloatType.  Dispatch is on IntType (by default the
+ * type of FloatType::v): uint32_t goes through f32(), uint64_t through
+ * f64().  Any other IntType reaches GEM5_UNREACHABLE.
+ */
+template<typename FloatType, typename IntType = decltype(FloatType::v)> auto
+ftype_freg(freg_t a) -> FloatType
+{
+    if constexpr (std::is_same_v<IntType, uint32_t>) {
+        return f32(a);
+    } else if constexpr (std::is_same_v<IntType, uint64_t>) {
+        return f64(a);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic addition: forwards to f32_add for float32_t operands and
+ * to f64_add for float64_t operands.
+ */
+template<typename FloatType> FloatType
+fadd(FloatType a, FloatType b)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_add(a, b);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_add(a, b);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic subtraction: forwards to f32_sub for float32_t operands
+ * and to f64_sub for float64_t operands.
+ */
+template<typename FloatType> FloatType
+fsub(FloatType a, FloatType b)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_sub(a, b);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_sub(a, b);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic minimum: forwards to f32_min for float32_t operands and
+ * to f64_min for float64_t operands.
+ */
+template<typename FloatType> FloatType
+fmin(FloatType a, FloatType b)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_min(a, b);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_min(a, b);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic maximum: forwards to f32_max for float32_t operands and
+ * to f64_max for float64_t operands.
+ */
+template<typename FloatType> FloatType
+fmax(FloatType a, FloatType b)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_max(a, b);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_max(a, b);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic division: forwards to f32_div for float32_t operands and
+ * to f64_div for float64_t operands.
+ */
+template<typename FloatType> FloatType
+fdiv(FloatType a, FloatType b)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_div(a, b);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_div(a, b);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic multiplication: forwards to f32_mul for float32_t
+ * operands and to f64_mul for float64_t operands.
+ */
+template<typename FloatType> FloatType
+fmul(FloatType a, FloatType b)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_mul(a, b);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_mul(a, b);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic square root: forwards to f32_sqrt for float32_t and to
+ * f64_sqrt for float64_t.
+ */
+template<typename FloatType> FloatType
+fsqrt(FloatType a)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_sqrt(a);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_sqrt(a);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic wrapper that forwards to f32_rsqrte7 for float32_t and to
+ * f64_rsqrte7 for float64_t.
+ */
+template<typename FloatType> FloatType
+frsqrte7(FloatType a)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_rsqrte7(a);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_rsqrte7(a);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic wrapper that forwards to f32_recip7 for float32_t and to
+ * f64_recip7 for float64_t.
+ */
+template<typename FloatType> FloatType
+frecip7(FloatType a)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_recip7(a);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_recip7(a);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic classify: the result of f32_classify / f64_classify is
+ * re-wrapped with f32() / f64() so it comes back as a FloatType.
+ */
+template<typename FloatType> FloatType
+fclassify(FloatType a)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32(f32_classify(a));
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64(f64_classify(a));
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic sign injection: forwards all four arguments unchanged to
+ * fsgnj32 for float32_t and to fsgnj64 for float64_t.
+ */
+template<typename FloatType> FloatType
+fsgnj(FloatType a, FloatType b, bool n, bool x)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return fsgnj32(a, b, n, x);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return fsgnj64(a, b, n, x);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic "less than or equal" comparison: forwards to f32_le for
+ * float32_t operands and to f64_le for float64_t operands.
+ */
+template<typename FloatType> bool
+fle(FloatType a, FloatType b)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_le(a, b);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_le(a, b);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic equality comparison: forwards to f32_eq for float32_t
+ * operands and to f64_eq for float64_t operands.
+ */
+template<typename FloatType> bool
+feq(FloatType a, FloatType b)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_eq(a, b);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_eq(a, b);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic "less than" comparison: forwards to f32_lt for float32_t
+ * operands and to f64_lt for float64_t operands.
+ */
+template<typename FloatType> bool
+flt(FloatType a, FloatType b)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_lt(a, b);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_lt(a, b);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic multiply-add: forwards (a, b, c) in that order to
+ * f32_mulAdd for float32_t and to f64_mulAdd for float64_t.
+ */
+template<typename FloatType> FloatType
+fmadd(FloatType a, FloatType b, FloatType c)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_mulAdd(a, b, c);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_mulAdd(a, b, c);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Width-generic negation done on the raw encoding: a.v is XORed with
+ * mask(31, 31) for float32_t or mask(63, 63) for float64_t (presumably
+ * the sign bit) and the result is re-wrapped with f32() / f64().
+ */
+template<typename FloatType> FloatType
+fneg(FloatType a)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32(a.v ^ uint32_t(mask(31, 31)));
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64(a.v ^ mask(63, 63));
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Widen a float32_t to its double-width type via f32_to_f64.  float32_t
+ * is the only FT handled; anything else reaches GEM5_UNREACHABLE.
+ */
+template<typename FT, typename WFT = typename double_width<FT>::type> WFT
+fwiden(FT a)
+{
+    if constexpr (std::is_same_v<FT, float32_t>) {
+        return f32_to_f64(a);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Float to same-width unsigned integer: forwards a and mode to
+ * f32_to_ui32 / f64_to_ui64, always passing `true` as the last argument
+ * (presumably the conversion routine's "exact" flag).
+ */
+template<typename FloatType, typename IntType = decltype(FloatType::v)> IntType
+f_to_ui(FloatType a, uint_fast8_t mode)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_to_ui32(a, mode, true);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_to_ui64(a, mode, true);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Float to double-width unsigned integer: float32_t goes through
+ * f32_to_ui64 with mode and a trailing `true`.  No other FloatType is
+ * handled.
+ */
+template<
+    typename FloatType,
+    typename IntType = decltype(double_width<FloatType>::type::v)
+> IntType
+f_to_wui(FloatType a, uint_fast8_t mode)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_to_ui64(a, mode, true);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Float to half-width unsigned integer: a float64_t goes through
+ * f64_to_ui32 with mode and a trailing `true`.  No other FloatType is
+ * handled.
+ */
+template<
+    typename IntType,
+    typename FloatType = typename double_widthf<IntType>::type
+> IntType
+f_to_nui(FloatType a, uint_fast8_t mode)
+{
+    if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_to_ui32(a, mode, true);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Float to same-width signed integer, handed back as unsigned bits: the
+ * result of f32_to_i32 / f64_to_i64 (called with mode and a trailing
+ * `true`) is cast to uint32_t / uint64_t before being returned.
+ */
+template<typename FloatType, typename IntType = decltype(FloatType::v)> IntType
+f_to_i(FloatType a, uint_fast8_t mode)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return static_cast<uint32_t>(f32_to_i32(a, mode, true));
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return static_cast<uint64_t>(f64_to_i64(a, mode, true));
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Float to double-width signed integer, handed back as unsigned bits:
+ * float32_t goes through f32_to_i64 (mode, trailing `true`) and the
+ * result is cast to uint64_t.  No other FloatType is handled.
+ */
+template<
+    typename FloatType,
+    typename IntType = decltype(double_width<FloatType>::type::v)
+> IntType
+f_to_wi(FloatType a, uint_fast8_t mode)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return static_cast<uint64_t>(f32_to_i64(a, mode, true));
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Float to half-width signed integer, handed back as unsigned bits: a
+ * float64_t goes through f64_to_i32 (mode, trailing `true`) and the
+ * result is cast to uint32_t.  No other FloatType is handled.
+ */
+template<
+    typename IntType,
+    typename FloatType = typename double_widthf<IntType>::type
+> IntType
+f_to_ni(FloatType a, uint_fast8_t mode)
+{
+    if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return static_cast<uint32_t>(f64_to_i32(a, mode, true));
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Unsigned integer to same-width float: forwards to ui32_to_f32 for
+ * float32_t and to ui64_to_f64 for float64_t.
+ */
+template<typename FloatType, typename IntType = decltype(FloatType::v)>
+FloatType
+ui_to_f(IntType a)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return ui32_to_f32(a);
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return ui64_to_f64(a);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Unsigned integer to double-width float: when the target is float64_t
+ * the value goes through ui32_to_f64.  No other FloatType is handled.
+ */
+template<
+    typename IntType,
+    typename FloatType = typename double_widthf<IntType>::type
+> FloatType
+ui_to_wf(IntType a)
+{
+    if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return ui32_to_f64(a);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Unsigned integer to half-width float: when the target is float32_t the
+ * value goes through ui64_to_f32.  No other FloatType is handled.
+ */
+template<
+    typename FloatType,
+    typename IntType = decltype(double_width<FloatType>::type::v)
+> FloatType
+ui_to_nf(IntType a)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return ui64_to_f32(a);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Signed integer to same-width float: the argument is cast to int32_t /
+ * int64_t and passed to i32_to_f32 / i64_to_f64.
+ */
+template<typename FloatType, typename IntType = decltype(FloatType::v)>
+FloatType
+i_to_f(IntType a)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return i32_to_f32(static_cast<int32_t>(a));
+    } else if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return i64_to_f64(static_cast<int64_t>(a));
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Signed integer to double-width float: when the target is float64_t the
+ * argument is cast to int32_t and passed to i32_to_f64.  No other
+ * FloatType is handled.
+ */
+template<
+    typename IntType,
+    typename FloatType = typename double_widthf<IntType>::type
+> FloatType
+i_to_wf(IntType a)
+{
+    if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return i32_to_f64(static_cast<int32_t>(a));
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Signed integer to half-width float: when the target is float32_t the
+ * value goes through i64_to_f32.  No other FloatType is handled.
+ */
+template<
+    typename FloatType,
+    typename IntType = std::make_signed_t<
+        decltype(double_width<FloatType>::type::v)
+    >
+> FloatType
+i_to_nf(IntType a)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return i64_to_f32(a);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Float to double-width float: float32_t goes through f32_to_f64.  No
+ * other FloatType is handled.
+ */
+template<
+    typename FloatType,
+    typename FloatWType = typename double_width<FloatType>::type
+> FloatWType
+f_to_wf(FloatType a)
+{
+    if constexpr (std::is_same_v<FloatType, float32_t>) {
+        return f32_to_f64(a);
+    }
+    GEM5_UNREACHABLE;
+}
+
+/**
+ * Float to half-width float: a float64_t argument goes through
+ * f64_to_f32.  No other FloatType is handled.
+ */
+template<
+    typename FloatNType,
+    typename FloatType = typename double_width<FloatNType>::type
+> FloatNType
+f_to_nf(FloatType a)
+{
+    if constexpr (std::is_same_v<FloatType, float64_t>) {
+        return f64_to_f32(a);
+    }
+    GEM5_UNREACHABLE;
+}
+
+//ref:  https://locklessinc.com/articles/sat_arithmetic/
+/**
+ * Signed saturating addition.
+ *
+ * Returns x + y, or the limit matching x's sign (the largest T value for
+ * non-negative x, the smallest otherwise) when the sum does not fit.
+ * *sat is set to true on saturation and left untouched otherwise.
+ */
+template<typename T> T
+sat_add(T x, T y, bool* sat)
+{
+    using UT = std::make_unsigned_t<T>;
+    const int sign_pos = sizeof(T) * 8 - 1;
+
+    const UT lhs = x;
+    const UT rhs = y;
+    UT sum = lhs + rhs;
+
+    // Limit picked by x's sign bit: 0 + max for x >= 0, 1 + max otherwise.
+    const UT limit = (lhs >> sign_pos) + (((UT)0x1 << sign_pos) - 1);
+
+    const bool overflowed = (T) ((limit ^ rhs) | ~(rhs ^ sum)) >= 0;
+    if (overflowed) {
+        sum = limit;
+        *sat = true;
+    }
+    return sum;
+}
+
+/**
+ * Signed saturating subtraction.
+ *
+ * Returns x - y, or the limit matching x's sign (the largest T value for
+ * non-negative x, the smallest otherwise) when the difference does not
+ * fit.  *sat is set to true on saturation and left untouched otherwise.
+ */
+template<typename T> T
+sat_sub(T x, T y, bool* sat)
+{
+    using UT = std::make_unsigned_t<T>;
+    const int sign_pos = sizeof(T) * 8 - 1;
+
+    const UT lhs = x;
+    const UT rhs = y;
+    UT diff = lhs - rhs;
+
+    // Limit picked by x's sign bit: 0 + max for x >= 0, 1 + max otherwise.
+    const UT limit = (lhs >> sign_pos) + (((UT)0x1 << sign_pos) - 1);
+
+    const bool overflowed = (T) ((limit ^ rhs) & (limit ^ diff)) < 0;
+    if (overflowed) {
+        diff = limit;
+        *sat = true;
+    }
+    return diff;
+}
+
+/**
+ * Unsigned saturating addition.
+ *
+ * Returns x + y, or a value with every bit set when the sum wrapped
+ * around.  *sat is only written while it is still false, so an earlier
+ * saturation is never cleared.
+ */
+template<typename T> T
+sat_addu(T x, T y, bool* sat)
+{
+    T sum = x + y;
+    const bool wrapped = sum < x;
+
+    if (!*sat) {
+        *sat = wrapped;
+    }
+    if (wrapped) {
+        sum |= -1;
+    }
+    return sum;
+}
+
+/**
+ * Unsigned saturating subtraction.
+ *
+ * Returns x - y, or 0 when the difference wrapped around.  *sat is only
+ * written while it is still false, so an earlier saturation is never
+ * cleared.
+ */
+template<typename T> T
+sat_subu(T x, T y, bool* sat)
+{
+    T diff = x - y;
+    const bool in_range = diff <= x;
+
+    if (!*sat) {
+        *sat = !in_range;
+    }
+    if (!in_range) {
+        diff = 0;
+    }
+    return diff;
+}
+
+/**
+ * Adjust a value according to a rounding mode, treating bit gb as the
+ * least significant bit that is kept.  The value itself is not shifted
+ * here (presumably the caller shifts right by gb afterwards).
+ *
+ * Ref:
+ * https://github.com/riscv-software-src/riscv-isa-sim
+ *
+ * @param result Value to adjust.
+ * @param xrm Rounding mode: 0 RNU, 1 RNE, 2 RDN, 3 ROD; any other value
+ *            panics.
+ * @param gb Bit position of the kept least significant bit.
+ * @return The adjusted value.
+ */
+template<typename T> T
+int_rounding(T result, uint8_t xrm, unsigned gb) {
+    // Build the masks from a 64-bit one rather than `1UL`: unsigned long
+    // may be only 32 bits wide, where a shift by gb >= 32 is undefined.
+    const uint64_t lsb = uint64_t{1} << gb;
+    const uint64_t lsb_half = lsb >> 1;
+    switch (xrm) {
+    case 0 /* RNU */:
+        // Add half of the kept lsb.
+        result += lsb_half;
+        break;
+    case 1 /* RNE */:
+        // Bump only when the half bit is set and either a lower bit or
+        // the kept lsb itself is set.
+        if ((result & lsb_half) &&
+            ((result & (lsb_half - 1)) || (result & lsb)))
+            result += lsb;
+        break;
+    case 2 /* RDN */:
+        // Leave the value as it is.
+        break;
+    case 3 /* ROD */:
+        // Force the kept lsb to 1 when any lower bit is set.
+        if (result & (lsb - 1))
+            result |= lsb;
+        break;
+    default:
+        panic("Invalid xrm value %d", (int)xrm);
+    }
+
+    return result;
+}
+
 } // namespace RiscvISA
 } // namespace gem5