From d96df4025323e0173d32b5bba9086fea28c8444a Mon Sep 17 00:00:00 2001
From: Harshil Patel <harshilp2107@gmail.com>
Date: Tue, 1 Aug 2023 16:22:44 -0700
Subject: [PATCH 01/10] stdlib: Added support for JSON via env variables.

Change-Id: I5791e6d51b3b9f68eb212a46c4cd0add23668340
Co-authored-by: Kunal Pai <kunpai@ucdavis.edu>
---
 src/python/gem5/resources/client.py | 33 ++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/python/gem5/resources/client.py b/src/python/gem5/resources/client.py
index ab8262bf92..c43dd76ac5 100644
--- a/src/python/gem5/resources/client.py
+++ b/src/python/gem5/resources/client.py
@@ -30,7 +30,7 @@ import os
 from typing import Optional, Dict, List
 from .client_api.client_wrapper import ClientWrapper
 from gem5.gem5_default_config import config
-from m5.util import inform
+from m5.util import inform, warn
 from _m5 import core
 
 
@@ -53,6 +53,14 @@ clientwrapper = None
 def _get_clientwrapper():
     global clientwrapper
     if clientwrapper is None:
+        if (
+            "GEM5_RESOURCE_JSON" in os.environ
+            and "GEM5_RESOURCE_JSON_APPEND" in os.environ
+        ):
+            raise Exception(
+                "Both GEM5_RESOURCE_JSON and GEM5_RESOURCE_JSON_APPEND are set. Please set only one of them."
+            )
+
         # First check if the config file path is provided in the environment variable
         if "GEM5_CONFIG" in os.environ:
             config_file_path = Path(os.environ["GEM5_CONFIG"])
@@ -68,6 +76,29 @@ def _get_clientwrapper():
         else:
             gem5_config = config
             inform("Using default config")
+
+        # If the GEM5_RESOURCE_JSON_APPEND is set, append the resources to the gem5_config
+        if "GEM5_RESOURCE_JSON_APPEND" in os.environ:
+            json_source = {
+                "url": os.environ["GEM5_RESOURCE_JSON_APPEND"],
+                "isMongo": False,
+            }
+            gem5_config["sources"].update(
+                {"GEM5_RESOURCE_JSON_APPEND": json_source}
+            )
+            inform(
+                f"Appending resources from {os.environ['GEM5_RESOURCE_JSON_APPEND']}"
+            )
+        # If the GEM5_RESOURCE_JSON is set, use it as the only source
+        elif "GEM5_RESOURCE_JSON" in os.environ:
+            json_source = {
+                "url": os.environ["GEM5_RESOURCE_JSON"],
+                "isMongo": False,
+            }
+            gem5_config["sources"] = {"GEM5_RESOURCE_JSON": json_source}
+            warn(
+                f"No config sources are used, Using resources from {os.environ['GEM5_RESOURCE_JSON']}"
+            )
         clientwrapper = ClientWrapper(gem5_config)
     return clientwrapper
 

From 32b7ffc4546fd53bc50487d1899188844177bb1d Mon Sep 17 00:00:00 2001
From: Harshil Patel <harshilp2107@gmail.com>
Date: Tue, 1 Aug 2023 17:22:35 -0700
Subject: [PATCH 02/10] stdlib: fixed warning message

Change-Id: I04ef23529d7afc5d46fbba7558279ec08acd629a
Co-authored-by: paikunal <kunpai@ucdavis.edu>
---
 src/python/gem5/resources/client.py | 37 +++++++++++++++++++----------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/src/python/gem5/resources/client.py b/src/python/gem5/resources/client.py
index c43dd76ac5..571a8254e0 100644
--- a/src/python/gem5/resources/client.py
+++ b/src/python/gem5/resources/client.py
@@ -60,9 +60,31 @@ def _get_clientwrapper():
             raise Exception(
                 "Both GEM5_RESOURCE_JSON and GEM5_RESOURCE_JSON_APPEND are set. Please set only one of them."
             )
-
+        gem5_config = {}
+        # If the GEM5_RESOURCE_JSON is set, use it as the only source
+        if "GEM5_RESOURCE_JSON" in os.environ:
+            json_source = {
+                "url": os.environ["GEM5_RESOURCE_JSON"],
+                "isMongo": False,
+            }
+            gem5_config["sources"] = {"GEM5_RESOURCE_JSON": json_source}
+            if "GEM5_CONFIG" in os.environ:
+                warn(
+                    f"Both GEM5_CONFIG and GEM5_RESOURCE_JSON are set.\n"
+                    f"GEM5_CONFIG will be ignored in favor of the GEM5_RESOURCE_JSON environment variable."
+                )
+            elif (Path().cwd().resolve() / "gem5-config.json").exists():
+                warn(
+                    f"Both gem5-config.json and GEM5_RESOURCE_JSON are set.\n"
+                    f"gem5-config.json will be ignored in favor of the GEM5_RESOURCE_JSON environment variable."
+                )
+            else:
+                warn(
+                    f"GEM5_RESOURCE_JSON is set.\n"
+                    f"gem5-default-config will be ignored in favor of the GEM5_RESOURCE_JSON environment variable."
+                )
         # First check if the config file path is provided in the environment variable
-        if "GEM5_CONFIG" in os.environ:
+        elif "GEM5_CONFIG" in os.environ:
             config_file_path = Path(os.environ["GEM5_CONFIG"])
             gem5_config = getFileContent(config_file_path)
             inform("Using config file specified by $GEM5_CONFIG")
@@ -89,16 +111,7 @@ def _get_clientwrapper():
             inform(
                 f"Appending resources from {os.environ['GEM5_RESOURCE_JSON_APPEND']}"
             )
-        # If the GEM5_RESOURCE_JSON is set, use it as the only source
-        elif "GEM5_RESOURCE_JSON" in os.environ:
-            json_source = {
-                "url": os.environ["GEM5_RESOURCE_JSON"],
-                "isMongo": False,
-            }
-            gem5_config["sources"] = {"GEM5_RESOURCE_JSON": json_source}
-            warn(
-                f"No config sources are used, Using resources from {os.environ['GEM5_RESOURCE_JSON']}"
-            )
+
         clientwrapper = ClientWrapper(gem5_config)
     return clientwrapper
 

From 73892c9b47984b1a1e4441ce4141aa0576147bf6 Mon Sep 17 00:00:00 2001
From: Xuan Hu <huxuan@bosc.ac.cn>
Date: Tue, 21 Mar 2023 13:11:01 +0800
Subject: [PATCH 03/10] arch-riscv: Add risc-v vector regs and configs

This commit adds registers and configs for the vector extension.

* Add 32 vector arch regs as spec defined and 8 internal regs for
  uop-based vector implementation.
* Add default vector configs (VLEN = 256, ELEN = 64). These cannot
  be changed yet, since the vector implementation has only been tested
  with these configs.
* Add disassembly register names v0~v31 and vtmp0~vtmp7.
* Add CSR registers defined in RISCV Vector Spec v1.0.
* Add vector bitfields.
* Add vector operand_types and operands.

Change-Id: I7bbab1ee9e0aa804d6f15ef7b77fac22d4f7212a
Co-authored-by: Yang Liu <numbksco@gmail.com>
Co-authored-by: Fan Yang <1209202421@qq.com>
Co-authored-by: Jerin Joy <joy@rivosinc.com>

arch-riscv: enable rvv flags only for RV64

Change-Id: I6586e322dfd562b598f63a18964d17326c14d4cf
---
 src/arch/riscv/faults.hh         |  2 +-
 src/arch/riscv/isa.cc            | 54 +++++++++++++++++--
 src/arch/riscv/isa.hh            |  2 +
 src/arch/riscv/isa/bitfields.isa | 24 +++++++++
 src/arch/riscv/isa/includes.isa  |  1 +
 src/arch/riscv/isa/operands.isa  | 15 +++++-
 src/arch/riscv/regs/misc.hh      | 31 +++++++++--
 src/arch/riscv/regs/vector.hh    | 90 ++++++++++++++++++++++++++++++++
 src/arch/riscv/utility.hh        | 10 +++-
 9 files changed, 219 insertions(+), 10 deletions(-)
 create mode 100644 src/arch/riscv/regs/vector.hh

diff --git a/src/arch/riscv/faults.hh b/src/arch/riscv/faults.hh
index f687fd6f20..fa67e3b34c 100644
--- a/src/arch/riscv/faults.hh
+++ b/src/arch/riscv/faults.hh
@@ -173,7 +173,7 @@ class InstFault : public RiscvFault
         : RiscvFault(n, FaultType::OTHERS, INST_ILLEGAL), _inst(inst)
     {}
 
-    RegVal trap_value() const override { return bits(_inst, 31, 0); }
+    RegVal trap_value() const override { return _inst.instBits; }
 };
 
 class UnknownInstFault : public InstFault
diff --git a/src/arch/riscv/isa.cc b/src/arch/riscv/isa.cc
index 94a8239bac..2f9d52e1b2 100644
--- a/src/arch/riscv/isa.cc
+++ b/src/arch/riscv/isa.cc
@@ -43,6 +43,7 @@
 #include "arch/riscv/regs/float.hh"
 #include "arch/riscv/regs/int.hh"
 #include "arch/riscv/regs/misc.hh"
+#include "arch/riscv/regs/vector.hh"
 #include "base/bitfield.hh"
 #include "base/compiler.hh"
 #include "base/logging.hh"
@@ -52,6 +53,7 @@
 #include "debug/LLSC.hh"
 #include "debug/MatRegs.hh"
 #include "debug/RiscvMisc.hh"
+#include "debug/VecRegs.hh"
 #include "mem/packet.hh"
 #include "mem/request.hh"
 #include "params/RiscvISA.hh"
@@ -189,6 +191,14 @@ namespace RiscvISA
     [MISCREG_FFLAGS]        = "FFLAGS",
     [MISCREG_FRM]           = "FRM",
 
+    [MISCREG_VSTART]        = "VSTART",
+    [MISCREG_VXSAT]         = "VXSAT",
+    [MISCREG_VXRM]          = "VXRM",
+    [MISCREG_VCSR]          = "VCSR",
+    [MISCREG_VL]            = "VL",
+    [MISCREG_VTYPE]         = "VTYPE",
+    [MISCREG_VLENB]         = "VLENB",
+
     [MISCREG_NMIVEC]        = "NMIVEC",
     [MISCREG_NMIE]          = "NMIE",
     [MISCREG_NMIP]          = "NMIP",
@@ -234,11 +244,10 @@ namespace
 {
 
 /* Not applicable to RISCV */
-RegClass vecRegClass(VecRegClass, VecRegClassName, 1, debug::IntRegs);
-RegClass vecElemClass(VecElemClass, VecElemClassName, 2, debug::IntRegs);
-RegClass vecPredRegClass(VecPredRegClass, VecPredRegClassName, 1,
+RegClass vecElemClass(VecElemClass, VecElemClassName, 0, debug::IntRegs);
+RegClass vecPredRegClass(VecPredRegClass, VecPredRegClassName, 0,
         debug::IntRegs);
-RegClass matRegClass(MatRegClass, MatRegClassName, 1, debug::MatRegs);
+RegClass matRegClass(MatRegClass, MatRegClassName, 0, debug::MatRegs);
 RegClass ccRegClass(CCRegClass, CCRegClassName, 0, debug::IntRegs);
 
 } // anonymous namespace
@@ -275,6 +284,13 @@ ISA::copyRegsFrom(ThreadContext *src)
     for (auto &id: floatRegClass)
         tc->setReg(id, src->getReg(id));
 
+    // Third loop through the vector registers.
+    RiscvISA::VecRegContainer vc;
+    for (auto &id: vecRegClass) {
+        src->getReg(id, &vc);
+        tc->setReg(id, &vc);
+    }
+
     // Lastly copy PC/NPC
     tc->pcState(src->pcState());
 }
@@ -299,6 +315,7 @@ void ISA::clear()
     // mark FS is initial
     status.fs = INITIAL;
 
+
     // rv_type dependent init.
     switch (rv_type) {
         case RV32:
@@ -307,6 +324,8 @@ void ISA::clear()
         case RV64:
           misa.rv64_mxl = 2;
           status.uxl = status.sxl = 2;
+          status.vs = VPUStatus::INITIAL;
+          misa.rvv = 1;
           break;
         default:
           panic("%s: Unknown rv_type: %d", name(), (int)rv_type);
@@ -479,6 +498,17 @@ ISA::readMiscReg(RegIndex idx)
 
             return readMiscRegNoEffect(idx);
         }
+      case MISCREG_VLENB:
+        {
+            return VLENB;
+        }
+        break;
+      case MISCREG_VCSR: // vcsr[0] = vxsat, vcsr[2:1] = vxrm
+        {
+            return readMiscRegNoEffect(MISCREG_VXSAT) |
+                  (readMiscRegNoEffect(MISCREG_VXRM) << 1);
+        }
+        break;
       default:
         // Try reading HPM counters
         // As a placeholder, all HPM counters are just cycle counters
@@ -652,6 +682,22 @@ ISA::setMiscReg(RegIndex idx, RegVal val)
                 setMiscRegNoEffect(idx, val);
             }
             break;
+          case MISCREG_VXSAT:
+            {
+                setMiscRegNoEffect(idx, val & 0x1);
+            }
+            break;
+          case MISCREG_VXRM:
+            {
+                setMiscRegNoEffect(idx, val & 0x3);
+            }
+            break;
+          case MISCREG_VCSR:
+            {
+                setMiscRegNoEffect(MISCREG_VXSAT, val & 0x1);
+                setMiscRegNoEffect(MISCREG_VXRM, (val & 0x6) >> 1);
+            }
+            break;
           default:
             setMiscRegNoEffect(idx, val);
         }
diff --git a/src/arch/riscv/isa.hh b/src/arch/riscv/isa.hh
index 31001c04b4..d7b0a21a1f 100644
--- a/src/arch/riscv/isa.hh
+++ b/src/arch/riscv/isa.hh
@@ -67,6 +67,8 @@ enum FPUStatus
     DIRTY = 3,
 };
 
+using VPUStatus = FPUStatus;
+
 class ISA : public BaseISA
 {
   protected:
diff --git a/src/arch/riscv/isa/bitfields.isa b/src/arch/riscv/isa/bitfields.isa
index 8589269949..280bcbab22 100644
--- a/src/arch/riscv/isa/bitfields.isa
+++ b/src/arch/riscv/isa/bitfields.isa
@@ -133,3 +133,27 @@ def bitfield BIT25         <25>;
 def bitfield RNUM       <23:20>;
 def bitfield KFUNCT5    <29:25>;
 def bitfield BS         <31:30>;
+
+// Vector instructions
+def bitfield VFUNCT6    vfunct6;
+def bitfield VFUNCT5    vfunct5;
+def bitfield VFUNCT3    vfunct3;
+def bitfield VFUNCT2    vfunct2;
+
+def bitfield VS3        vs3;
+def bitfield VS2        vs2;
+def bitfield VS1        vs1;
+def bitfield VD         vd;
+
+def bitfield NF         nf;
+def bitfield MEW        mew;
+def bitfield MOP        mop;
+def bitfield VM         vm;
+def bitfield LUMOP      lumop;
+def bitfield SUMOP      sumop;
+def bitfield WIDTH      width;
+
+def bitfield BIT31      bit31;
+def bitfield BIT30      bit30;
+def bitfield SIMM5      uimm_vsetivli;
+def bitfield SIMM3      simm3;
diff --git a/src/arch/riscv/isa/includes.isa b/src/arch/riscv/isa/includes.isa
index 8dddc2fb59..cb95f58f7e 100644
--- a/src/arch/riscv/isa/includes.isa
+++ b/src/arch/riscv/isa/includes.isa
@@ -95,6 +95,7 @@ output exec {{
 #include "arch/riscv/reg_abi.hh"
 #include "arch/riscv/regs/float.hh"
 #include "arch/riscv/regs/misc.hh"
+#include "arch/riscv/regs/vector.hh"
 #include "arch/riscv/utility.hh"
 #include "base/condcodes.hh"
 #include "cpu/base.hh"
diff --git a/src/arch/riscv/isa/operands.isa b/src/arch/riscv/isa/operands.isa
index 72d8f81bca..a81b28df57 100644
--- a/src/arch/riscv/isa/operands.isa
+++ b/src/arch/riscv/isa/operands.isa
@@ -38,7 +38,15 @@ def operand_types {{
     'sd' : 'int64_t',
     'ud' : 'uint64_t',
     'sf' : 'float',
-    'df' : 'double'
+    'df' : 'double',
+
+    'vi'    : 'vi',
+    'vu'    : 'vu',
+    'vwi'   : 'vwi',
+    'vwu'   : 'vwu',
+    'vext'  : 'vext',
+    'vextu' : 'vextu',
+    'vc'    : 'RiscvISA::VecRegContainer'
 }};
 
 let {{
@@ -79,6 +87,11 @@ def operands {{
     'Fp2': FloatRegOp('df', 'FP2 + 8', 'IsFloating', 2),
     'Fp2_bits': FloatRegOp('ud', 'FP2 + 8', 'IsFloating', 2),
 
+    'Vd':  VecRegOp('vc', 'VD', 'IsVector', 1),
+    'Vs1': VecRegOp('vc', 'VS1', 'IsVector', 2),
+    'Vs2': VecRegOp('vc', 'VS2', 'IsVector', 3),
+    'Vs3': VecRegOp('vc', 'VS3', 'IsVector', 4),
+
 #Memory Operand
     'Mem': MemOp('ud', None, (None, 'IsLoad', 'IsStore'), 5),
 
diff --git a/src/arch/riscv/regs/misc.hh b/src/arch/riscv/regs/misc.hh
index 5ea3536141..64072c97e2 100644
--- a/src/arch/riscv/regs/misc.hh
+++ b/src/arch/riscv/regs/misc.hh
@@ -191,6 +191,14 @@ enum MiscRegIndex
     MISCREG_FFLAGS,
     MISCREG_FRM,
 
+    MISCREG_VSTART,
+    MISCREG_VXSAT,
+    MISCREG_VXRM,
+    MISCREG_VCSR,
+    MISCREG_VL,
+    MISCREG_VTYPE,
+    MISCREG_VLENB,
+
     // These registers are not in the standard, hence does not exist in the
     // CSRData map. These are mainly used to provide a minimal implementation
     // for non-maskable-interrupt in our simple cpu.
@@ -476,7 +484,15 @@ enum CSRIndex
     CSR_TDATA3 = 0x7A3,
     CSR_DCSR = 0x7B0,
     CSR_DPC = 0x7B1,
-    CSR_DSCRATCH = 0x7B2
+    CSR_DSCRATCH = 0x7B2,
+
+    CSR_VSTART       = 0x008,
+    CSR_VXSAT        = 0x009,
+    CSR_VXRM         = 0x00A,
+    CSR_VCSR         = 0x00F,
+    CSR_VL           = 0xC20,
+    CSR_VTYPE        = 0xC21,
+    CSR_VLENB        = 0xC22
 };
 
 struct CSRMetadata
@@ -718,7 +734,15 @@ const std::unordered_map<int, CSRMetadata> CSRData = {
     {CSR_TDATA3, {"tdata3", MISCREG_TDATA3, rvTypeFlags(RV64, RV32)}},
     {CSR_DCSR, {"dcsr", MISCREG_DCSR, rvTypeFlags(RV64, RV32)}},
     {CSR_DPC, {"dpc", MISCREG_DPC, rvTypeFlags(RV64, RV32)}},
-    {CSR_DSCRATCH, {"dscratch", MISCREG_DSCRATCH, rvTypeFlags(RV64, RV32)}}
+    {CSR_DSCRATCH, {"dscratch", MISCREG_DSCRATCH, rvTypeFlags(RV64, RV32)}},
+
+    {CSR_VSTART, {"vstart", MISCREG_VSTART, rvTypeFlags(RV64, RV32)}},
+    {CSR_VXSAT,  {"vxsat" , MISCREG_VXSAT, rvTypeFlags(RV64, RV32)}},
+    {CSR_VXRM,   {"vxrm"  , MISCREG_VXRM, rvTypeFlags(RV64, RV32)}},
+    {CSR_VCSR,   {"vcsr"  , MISCREG_VCSR, rvTypeFlags(RV64, RV32)}},
+    {CSR_VL,     {"vl"    , MISCREG_VL, rvTypeFlags(RV64, RV32)}},
+    {CSR_VTYPE,  {"vtype" , MISCREG_VTYPE, rvTypeFlags(RV64, RV32)}},
+    {CSR_VLENB,  {"vlenb" , MISCREG_VLENB, rvTypeFlags(RV64, RV32)}}
 };
 
 /**
@@ -816,6 +840,7 @@ const off_t SBE_OFFSET[enums::Num_RiscvType] = {
 const off_t SXL_OFFSET = 34;
 const off_t UXL_OFFSET = 32;
 const off_t FS_OFFSET = 13;
+const off_t VS_OFFSET = 9;
 const off_t FRM_OFFSET = 5;
 
 const RegVal ISA_MXL_MASKS[enums::Num_RiscvType] = {
@@ -853,7 +878,7 @@ const RegVal STATUS_MPRV_MASK = 1ULL << 17;
 const RegVal STATUS_XS_MASK = 3ULL << 15;
 const RegVal STATUS_FS_MASK = 3ULL << FS_OFFSET;
 const RegVal STATUS_MPP_MASK = 3ULL << 11;
-const RegVal STATUS_VS_MASK = 3ULL << 9;
+const RegVal STATUS_VS_MASK = 3ULL << VS_OFFSET;
 const RegVal STATUS_SPP_MASK = 1ULL << 8;
 const RegVal STATUS_MPIE_MASK = 1ULL << 7;
 const RegVal STATUS_SPIE_MASK = 1ULL << 5;
diff --git a/src/arch/riscv/regs/vector.hh b/src/arch/riscv/regs/vector.hh
new file mode 100644
index 0000000000..d722c2d03a
--- /dev/null
+++ b/src/arch/riscv/regs/vector.hh
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 PLCT Lab
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef __ARCH_RISCV_REGS_VECTOR_HH__
+#define __ARCH_RISCV_REGS_VECTOR_HH__
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "arch/generic/vec_pred_reg.hh"
+#include "arch/generic/vec_reg.hh"
+#include "base/bitunion.hh"
+#include "cpu/reg_class.hh"
+#include "debug/VecRegs.hh"
+
+namespace gem5
+{
+
+namespace RiscvISA
+{
+
+constexpr unsigned ELEN = 64;
+constexpr unsigned VLEN = 256;
+constexpr unsigned VLENB = VLEN / 8;
+
+using VecRegContainer = gem5::VecRegContainer<VLENB>;
+using vreg_t = VecRegContainer;
+
+const int NumVecStandardRegs = 32;
+const int NumVecInternalRegs = 8; // Used by vector uop
+const int NumVecRegs = NumVecStandardRegs + NumVecInternalRegs;
+
+const std::vector<std::string> VecRegNames = {
+    "v0",   "v1",   "v2",   "v3",   "v4",   "v5",   "v6",   "v7",
+    "v8",   "v9",   "v10",  "v11",  "v12",  "v13",  "v14",  "v15",
+    "v16",  "v17",  "v18",  "v19",  "v20",  "v21",  "v22",  "v23",
+    "v24",  "v25",  "v26",  "v27",  "v28",  "v29",  "v30",  "v31",
+    "vtmp0", "vtmp1", "vtmp2", "vtmp3", "vtmp4", "vtmp5", "vtmp6", "vtmp7"
+};
+
+// vector index
+const int VecMemInternalReg0 = NumVecStandardRegs;
+
+static inline TypedRegClassOps<RiscvISA::VecRegContainer> vecRegClassOps;
+
+inline constexpr RegClass vecRegClass =
+    RegClass(VecRegClass, VecRegClassName, NumVecRegs, debug::VecRegs).
+        ops(vecRegClassOps).
+        regType<VecRegContainer>();
+
+BitUnion32(VTYPE)
+    Bitfield<31> vill;
+    Bitfield<7, 0> vtype8;
+    Bitfield<7> vma;
+    Bitfield<6> vta;
+    Bitfield<5, 3> vsew;
+    Bitfield<2, 0> vlmul;
+EndBitUnion(VTYPE)
+
+} // namespace RiscvISA
+} // namespace gem5
+
+#endif // __ARCH_RISCV_REGS_VECTOR_HH__
diff --git a/src/arch/riscv/utility.hh b/src/arch/riscv/utility.hh
index 5fccc84c79..e0a8494ece 100644
--- a/src/arch/riscv/utility.hh
+++ b/src/arch/riscv/utility.hh
@@ -51,6 +51,7 @@
 
 #include "arch/riscv/regs/float.hh"
 #include "arch/riscv/regs/int.hh"
+#include "arch/riscv/regs/vector.hh"
 #include "base/types.hh"
 #include "cpu/reg_class.hh"
 #include "cpu/static_inst.hh"
@@ -130,7 +131,14 @@ registerName(RegId reg)
             return str.str();
         }
         return float_reg::RegNames[reg.index()];
-    } else {
+    } else if (reg.is(VecRegClass)) {
+        if (reg.index() >= NumVecRegs) {
+            std::stringstream str;
+            str << "?? (v" << reg.index() << ')';
+            return str.str();
+        }
+        return VecRegNames[reg.index()];
+    } else  {
         /* It must be an InvalidRegClass, in RISC-V we should treat it as a
          * zero register for the disassembler to work correctly.
          */

From e14e066fde5516b7ab3921687248f9b74219bdf0 Mon Sep 17 00:00:00 2001
From: Xuan Hu <huxuan@bosc.ac.cn>
Date: Tue, 21 Feb 2023 11:48:54 +0800
Subject: [PATCH 04/10] arch-riscv: Add risc-v vector ext v1.0 vset insts
 support

Change-Id: I84363164ca327151101e8a1c3d8441a66338c909
Co-authored-by: Yang Liu <numbksco@gmail.com>
Co-authored-by: Fan Yang <1209202421@qq.com>

arch-riscv: Add a todo to fix vsetvl stall on decode

Change-Id: Iafb129648fba89009345f0c0ad3710f773379bf6
---
 src/arch/riscv/decoder.cc                  |  27 +++++
 src/arch/riscv/decoder.hh                  |  12 +-
 src/arch/riscv/insts/SConscript            |   1 +
 src/arch/riscv/insts/static_inst.hh        |   1 +
 src/arch/riscv/insts/vector.cc             | 126 +++++++++++++++++++++
 src/arch/riscv/insts/vector.hh             |  88 ++++++++++++++
 src/arch/riscv/isa/decoder.isa             |  33 ++++++
 src/arch/riscv/isa/formats/formats.isa     |   1 +
 src/arch/riscv/isa/formats/vector_conf.isa |  96 ++++++++++++++++
 src/arch/riscv/isa/includes.isa            |   2 +
 10 files changed, 386 insertions(+), 1 deletion(-)
 create mode 100644 src/arch/riscv/insts/vector.cc
 create mode 100644 src/arch/riscv/insts/vector.hh
 create mode 100644 src/arch/riscv/isa/formats/vector_conf.isa

diff --git a/src/arch/riscv/decoder.cc b/src/arch/riscv/decoder.cc
index 7faa310b1e..ce362ad522 100644
--- a/src/arch/riscv/decoder.cc
+++ b/src/arch/riscv/decoder.cc
@@ -42,6 +42,7 @@ void Decoder::reset()
 {
     aligned = true;
     mid = false;
+    vConfigDone = true;
     machInst = 0;
     emi = 0;
 }
@@ -49,6 +50,15 @@ void Decoder::reset()
 void
 Decoder::moreBytes(const PCStateBase &pc, Addr fetchPC)
 {
+    // TODO: Current vsetvl instructions stall decode. Future fixes should
+    // enable speculation, and this code will be removed.
+    if (GEM5_UNLIKELY(!this->vConfigDone)) {
+        DPRINTF(Decode, "Waiting for vset*vl* to be executed\n");
+        instDone = false;
+        outOfBytes = false;
+        return;
+    }
+
     // The MSB of the upper and lower halves of a machine instruction.
     constexpr size_t max_bit = sizeof(machInst) * 8 - 1;
     constexpr size_t mid_bit = sizeof(machInst) * 4 - 1;
@@ -78,6 +88,14 @@ Decoder::moreBytes(const PCStateBase &pc, Addr fetchPC)
             instDone = compressed(emi);
         }
     }
+    if (instDone) {
+        emi.vl      = this->machVl;
+        emi.vtype8   = this->machVtype & 0xff;
+        emi.vill    = this->machVtype.vill;
+        if (vconf(emi)) {
+            this->vConfigDone = false; // set true when vconfig inst execute
+        }
+    }
 }
 
 StaticInstPtr
@@ -116,5 +134,14 @@ Decoder::decode(PCStateBase &_next_pc)
     return decode(emi, next_pc.instAddr());
 }
 
+void
+Decoder::setVlAndVtype(uint32_t vl, VTYPE vtype)
+{
+    this->machVtype = vtype;
+    this->machVl = vl;
+
+    this->vConfigDone = true;
+}
+
 } // namespace RiscvISA
 } // namespace gem5
diff --git a/src/arch/riscv/decoder.hh b/src/arch/riscv/decoder.hh
index 15cbefe39c..d1d2f3cb0c 100644
--- a/src/arch/riscv/decoder.hh
+++ b/src/arch/riscv/decoder.hh
@@ -32,6 +32,7 @@
 
 #include "arch/generic/decode_cache.hh"
 #include "arch/generic/decoder.hh"
+#include "arch/riscv/insts/vector.hh"
 #include "arch/riscv/types.hh"
 #include "base/logging.hh"
 #include "base/types.hh"
@@ -53,12 +54,16 @@ class Decoder : public InstDecoder
     decode_cache::InstMap<ExtMachInst> instMap;
     bool aligned;
     bool mid;
+    bool vConfigDone;
 
   protected:
     //The extended machine instruction being generated
     ExtMachInst emi;
     uint32_t machInst;
 
+    VTYPE machVtype = 0; // read by moreBytes() before any vset*vl* runs
+    uint32_t machVl = 0; // likewise; updated via setVlAndVtype()
+
     StaticInstPtr decodeInst(ExtMachInst mach_inst);
 
     /// Decode a machine instruction.
@@ -74,13 +79,18 @@ class Decoder : public InstDecoder
 
     void reset() override;
 
-    inline bool compressed(ExtMachInst inst) { return (inst & 0x3) < 0x3; }
+    inline bool compressed(ExtMachInst inst) { return inst.quadRant < 0x3; }
+    inline bool vconf(ExtMachInst inst) {
+      return inst.opcode == 0b1010111u && inst.funct3 == 0b111u;
+    }
 
     //Use this to give data to the decoder. This should be used
     //when there is control flow.
     void moreBytes(const PCStateBase &pc, Addr fetchPC) override;
 
     StaticInstPtr decode(PCStateBase &nextPC) override;
+
+    void setVlAndVtype(uint32_t vl, VTYPE vtype);
 };
 
 } // namespace RiscvISA
diff --git a/src/arch/riscv/insts/SConscript b/src/arch/riscv/insts/SConscript
index 704152c040..2822cf86b4 100644
--- a/src/arch/riscv/insts/SConscript
+++ b/src/arch/riscv/insts/SConscript
@@ -33,3 +33,4 @@ Source('compressed.cc', tags='riscv isa')
 Source('mem.cc', tags='riscv isa')
 Source('standard.cc', tags='riscv isa')
 Source('static_inst.cc', tags='riscv isa')
+Source('vector.cc', tags='riscv isa')
diff --git a/src/arch/riscv/insts/static_inst.hh b/src/arch/riscv/insts/static_inst.hh
index f835713505..74f9ddb452 100644
--- a/src/arch/riscv/insts/static_inst.hh
+++ b/src/arch/riscv/insts/static_inst.hh
@@ -33,6 +33,7 @@
 #include <string>
 
 #include "arch/riscv/pcstate.hh"
+#include "arch/riscv/regs/misc.hh"
 #include "arch/riscv/types.hh"
 #include "cpu/exec_context.hh"
 #include "cpu/static_inst.hh"
diff --git a/src/arch/riscv/insts/vector.cc b/src/arch/riscv/insts/vector.cc
new file mode 100644
index 0000000000..3965a45b26
--- /dev/null
+++ b/src/arch/riscv/insts/vector.cc
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2022 PLCT Lab
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "arch/riscv/insts/vector.hh"
+
+#include <sstream>
+#include <string>
+
+#include "arch/riscv/insts/static_inst.hh"
+#include "arch/riscv/utility.hh"
+#include "cpu/static_inst.hh"
+
+namespace gem5
+{
+
+namespace RiscvISA
+{
+
+/**
+ * This function translates the 3-bit value of vlmul bits to the corresponding
+ * lmul value as specified in RVV 1.0 spec p11-12 chapter 3.4.2.
+ *
+ * I.e.,
+ * vlmul = -3 -> LMUL = 1/8
+ * vlmul = -2 -> LMUL = 1/4
+ * vlmul = -1 -> LMUL = 1/2
+ * vlmul = 0 -> LMUL = 1
+ * vlmul = 1 -> LMUL = 2
+ * vlmul = 2 -> LMUL = 4
+ * vlmul = 3 -> LMUL = 8
+ *
+**/
+float
+getVflmul(uint32_t vlmul_encoding) {
+  int vlmul = sext<3>(vlmul_encoding & 7);
+  float vflmul = vlmul >= 0 ? 1 << vlmul : 1.0 / (1 << -vlmul);
+  return vflmul;
+}
+
+uint32_t
+getVlmax(VTYPE vtype, uint32_t vlen) {
+  uint32_t sew = getSew(vtype.vsew);
+  // vlmax is defined in RVV 1.0 spec p12 chapter 3.4.2.
+  uint32_t vlmax = (vlen/sew) * getVflmul(vtype.vlmul);
+  return vlmax;
+}
+
+std::string
+VConfOp::generateDisassembly(Addr pc, const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss;
+    ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
+    if (bit31 && bit30 == 0) {
+        ss << registerName(srcRegIdx(0)) << ", " << registerName(srcRegIdx(1));
+    } else if (bit31 && bit30) {
+        ss << uimm << ", " << generateZimmDisassembly();
+    } else {
+        ss << registerName(srcRegIdx(0)) << ", " << generateZimmDisassembly();
+    }
+    return ss.str();
+}
+
+std::string
+VConfOp::generateZimmDisassembly() const
+{
+    std::stringstream s;
+    // Renders the vtype immediate as "e<SEW>, m<LMUL>, t[au], m[au]".
+    // VSETIVLI uses ZIMM10 and VSETVLI uses ZIMM11
+    uint64_t zimm = (bit31 && bit30) ? zimm10 : zimm11;
+    // zimm[2:0] = vlmul, zimm[5:3] = vsew, zimm[6] = vta, zimm[7] = vma
+    bool frac_lmul = bits(zimm, 2);
+    int sew = 1 << (bits(zimm, 5, 3) + 3);
+    int lmul = bits(zimm, 1, 0);
+    auto vta = bits(zimm, 6) == 1 ? "ta" : "tu";
+    auto vma = bits(zimm, 7) == 1 ? "ma" : "mu";
+    s << "e" << sew;
+    if (frac_lmul) {
+        std::string lmul_str = "";
+        switch(lmul){
+        case 3:
+            lmul_str = "f2";
+            break;
+        case 2:
+            lmul_str = "f4";
+            break;
+        case 1:
+            lmul_str = "f8";
+            break;
+        default: // vlmul == 0b100 has no LMUL mapping
+            panic("Unsupported fractional LMUL");
+        }
+        s << ", m" << lmul_str;
+    } else {
+        s << ", m" << (1 << lmul);
+    }
+    s << ", " << vta << ", " << vma;
+    return s.str();
+}
+
+} // namespace RiscvISA
+} // namespace gem5
diff --git a/src/arch/riscv/insts/vector.hh b/src/arch/riscv/insts/vector.hh
new file mode 100644
index 0000000000..cdeb48360c
--- /dev/null
+++ b/src/arch/riscv/insts/vector.hh
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022 PLCT Lab
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ARCH_RISCV_INSTS_VECTOR_HH__
+#define __ARCH_RISCV_INSTS_VECTOR_HH__
+
+#include <string>
+
+#include "arch/riscv/insts/static_inst.hh"
+#include "arch/riscv/regs/misc.hh"
+#include "arch/riscv/regs/vector.hh"
+#include "arch/riscv/utility.hh"
+#include "cpu/exec_context.hh"
+#include "cpu/static_inst.hh"
+
+namespace gem5
+{
+
+namespace RiscvISA
+{
+
+float
+getVflmul(uint32_t vlmul_encoding);
+
+inline uint32_t getSew(uint32_t vsew) { // vsew encoding -> SEW in bits
+    assert(vsew <= 3); // only encodings 0..3 are accepted
+    return (8 << vsew); // 8, 16, 32 or 64
+}
+
+uint32_t
+getVlmax(VTYPE vtype, uint32_t vlen);
+
+/**
+ * Base class for Vector Config operations (vsetvli, vsetvl and vsetivli)
+ */
+class VConfOp : public RiscvStaticInst
+{
+  protected:
+    uint64_t bit30;  // copy of _extMachInst.bit30
+    uint64_t bit31;  // copy of _extMachInst.bit31; with bit30 picks the form
+    uint64_t zimm10; // _extMachInst.zimm_vsetivli (vtype immediate)
+    uint64_t zimm11; // _extMachInst.zimm_vsetvli (vtype immediate)
+    uint64_t uimm;   // _extMachInst.uimm_vsetivli (requested vl immediate)
+    VConfOp(const char *mnem, ExtMachInst _extMachInst, OpClass __opClass)
+        : RiscvStaticInst(mnem, _extMachInst, __opClass),
+          bit30(_extMachInst.bit30), bit31(_extMachInst.bit31),
+          zimm10(_extMachInst.zimm_vsetivli),
+          zimm11(_extMachInst.zimm_vsetvli),
+          uimm(_extMachInst.uimm_vsetivli)
+    {}
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+
+    std::string generateZimmDisassembly() const; // e.g. "e8, m1, ta, ma"
+};
+
+
+} // namespace RiscvISA
+} // namespace gem5
+
+
+#endif // __ARCH_RISCV_INSTS_VECTOR_HH__
diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa
index a339c11375..2e5b52a879 100644
--- a/src/arch/riscv/isa/decoder.isa
+++ b/src/arch/riscv/isa/decoder.isa
@@ -2012,6 +2012,39 @@ decode QUADRANT default Unknown::unknown() {
             }
         }
 
+        0x15: decode FUNCT3 {
+            0x7: decode BIT31 {
+                format VConfOp {
+                    0x0: vsetvli({{
+                        uint64_t rd_bits = RD;
+                        uint64_t rs1_bits = RS1;
+                        uint64_t requested_vl = Rs1_ud;
+                        uint64_t requested_vtype = zimm11; // vtype immediate
+
+                        Rd_ud = 0; // VConfExecute sets Rd
+                    }}, VectorConfigOp, IsDirectControl, IsCondControl);
+                    0x1: decode BIT30 {
+                        0x0: vsetvl({{
+                            uint64_t rd_bits = RD;
+                            uint64_t rs1_bits = RS1;
+                            uint64_t requested_vl = Rs1_ud;
+                            uint64_t requested_vtype = Rs2_ud; // from rs2
+
+                            Rd_ud = 0; // VConfExecute sets Rd
+                        }}, VectorConfigOp, IsDirectControl, IsCondControl);
+                        0x1: vsetivli({{
+                            uint64_t rd_bits = RD;
+                            uint64_t rs1_bits = -1; // no rs1; kept nonzero
+                            uint64_t requested_vl = uimm; // immediate
+                            uint64_t requested_vtype = zimm10; // immediate
+
+                            Rd_ud = 0; // VConfExecute sets Rd
+                        }}, VectorConfigOp, IsDirectControl, IsCondControl);
+                    }
+                }
+            }
+        }
+
         0x18: decode FUNCT3 {
             format BOp {
                 0x0: beq({{
diff --git a/src/arch/riscv/isa/formats/formats.isa b/src/arch/riscv/isa/formats/formats.isa
index 19749438a8..0f7c94da9a 100644
--- a/src/arch/riscv/isa/formats/formats.isa
+++ b/src/arch/riscv/isa/formats/formats.isa
@@ -37,6 +37,7 @@
 ##include "fp.isa"
 ##include "amo.isa"
 ##include "bs.isa"
+##include "vector_conf.isa"
 
 // Include formats for nonstandard extensions
 ##include "compressed.isa"
diff --git a/src/arch/riscv/isa/formats/vector_conf.isa b/src/arch/riscv/isa/formats/vector_conf.isa
new file mode 100644
index 0000000000..556e230075
--- /dev/null
+++ b/src/arch/riscv/isa/formats/vector_conf.isa
@@ -0,0 +1,96 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+def format VConfOp(code, *flags) {{
+    iop = InstObjParams(name, Name, 'VConfOp', code, flags)
+    header_output = BasicDeclare.subst(iop)
+    decoder_output = BasicConstructor.subst(iop)
+    decode_block = BasicDecode.subst(iop)
+    exec_output = VConfExecute.subst(iop)  # execute() from VConfExecute
+}};
+
+def template VConfExecute {{
+    Fault
+    %(class_name)s::execute(ExecContext *xc,
+        trace::InstRecord *traceData) const
+    {
+        auto tc = xc->tcBase();
+
+        %(op_decl)s;
+        %(op_rd)s;
+        %(code)s;
+        // Every instruction using this template resets vstart to 0.
+        tc->setMiscReg(MISCREG_VSTART, 0);
+
+        uint32_t vlen = xc->readMiscReg(MISCREG_VLENB) * 8; // VLENB * 8
+        uint32_t vlmax = getVlmax(xc->readMiscReg(MISCREG_VTYPE), vlen);
+        // vtype is only validated and written when the request differs.
+        VTYPE new_vtype = requested_vtype;
+        if (xc->readMiscReg(MISCREG_VTYPE) != new_vtype) {
+            vlmax = getVlmax(new_vtype, vlen);
+            // NOTE(review): getVlmax()/getSew() run before the vill check;
+            float vflmul = getVflmul(new_vtype.vlmul);
+            // getSew() asserts vsew <= 3. Confirm reserved vsew is safe.
+            uint32_t sew = getSew(new_vtype.vsew);
+            // vill: LMUL out of range, SEW too wide, or bits 30..8 set.
+            uint32_t new_vill =
+                !(vflmul >= 0.125 && vflmul <= 8) ||
+                    sew > std::min(vflmul, 1.0f) * ELEN ||
+                    bits(requested_vtype, 30, 8) != 0;
+            if (new_vill) {
+                vlmax = 0;
+                new_vtype = 0;
+                new_vtype.vill = 1;
+            }
+
+            xc->setMiscReg(MISCREG_VTYPE, new_vtype);
+        }
+        // Choose the new vl from (rd_bits, rs1_bits), capped at vlmax.
+        uint32_t current_vl = xc->readMiscReg(MISCREG_VL);
+        uint32_t new_vl = 0;
+        if (vlmax == 0) {
+            new_vl = 0;
+        } else if (rd_bits == 0 && rs1_bits == 0) { // keep vl, clamp it
+            new_vl = current_vl > vlmax ? vlmax : current_vl;
+        } else if (rd_bits != 0 && rs1_bits == 0) { // take vlmax
+            new_vl = vlmax;
+        } else if (rs1_bits != 0) { // requested_vl, clamped to vlmax
+            new_vl = requested_vl > vlmax ? vlmax : requested_vl;
+        }
+
+        xc->setMiscReg(MISCREG_VL, new_vl);
+        // Hand the new vl and vtype to the decoder as well.
+        tc->getDecoderPtr()->as<Decoder>().setVlAndVtype(new_vl, new_vtype);
+
+        Rd = new_vl;
+
+        %(op_wb)s;
+        return NoFault;
+    }
+}};
diff --git a/src/arch/riscv/isa/includes.isa b/src/arch/riscv/isa/includes.isa
index cb95f58f7e..1d544f40ed 100644
--- a/src/arch/riscv/isa/includes.isa
+++ b/src/arch/riscv/isa/includes.isa
@@ -34,6 +34,7 @@
 //
 
 output header {{
+#include <functional>
 #include <iomanip>
 #include <sstream>
 #include <string>
@@ -45,6 +46,7 @@ output header {{
 #include <softfloat.h>
 #include <specialize.h>
 
+#include "arch/riscv/decoder.hh"
 #include "arch/riscv/insts/amo.hh"
 #include "arch/riscv/insts/bs.hh"
 #include "arch/riscv/insts/compressed.hh"

From 91b1d50f59b558a90eb529e8c8135d3a6d774464 Mon Sep 17 00:00:00 2001
From: Xuan Hu <huxuan@bosc.ac.cn>
Date: Tue, 21 Feb 2023 12:58:30 +0800
Subject: [PATCH 05/10] arch-riscv: Add risc-v vector ext v1.0 mem insts
 support

* TODOs:
  + Vector Segment Load/Store
  + Vector Fault-only-first Load

Change-Id: I2815c76404e62babab7e9466e4ea33ea87e66e75
Co-authored-by: Yang Liu <numbksco@gmail.com>
Co-authored-by: Fan Yang <1209202421@qq.com>
Co-authored-by: Jerin Joy <joy@rivosinc.com>
---
 src/arch/riscv/insts/vector.cc              |  173 +++
 src/arch/riscv/insts/vector.hh              |  341 +++++
 src/arch/riscv/isa/decoder.isa              |  268 ++++
 src/arch/riscv/isa/formats/formats.isa      |    1 +
 src/arch/riscv/isa/formats/vector_mem.isa   |  205 +++
 src/arch/riscv/isa/includes.isa             |    8 +
 src/arch/riscv/isa/main.isa                 |    3 +
 src/arch/riscv/isa/templates/templates.isa  |    2 +
 src/arch/riscv/isa/templates/vector_mem.isa | 1349 +++++++++++++++++++
 src/arch/riscv/utility.hh                   |   55 +
 10 files changed, 2405 insertions(+)
 create mode 100644 src/arch/riscv/isa/formats/vector_mem.isa
 create mode 100644 src/arch/riscv/isa/templates/templates.isa
 create mode 100644 src/arch/riscv/isa/templates/vector_mem.isa

diff --git a/src/arch/riscv/insts/vector.cc b/src/arch/riscv/insts/vector.cc
index 3965a45b26..f2bde629e9 100644
--- a/src/arch/riscv/insts/vector.cc
+++ b/src/arch/riscv/insts/vector.cc
@@ -122,5 +122,178 @@ VConfOp::generateZimmDisassembly() const
     return s.str();
 }
 
+std::string VleMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // Layout: <mnemonic> <dest0>, <VLENB * microIdx>(<src0>), <src1>[, v0.t]
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
+    os << VLENB * microIdx << '(' << registerName(srcRegIdx(0)) << "), ";
+    os << registerName(srcRegIdx(1)) << (machInst.vm ? "" : ", v0.t");
+    return os.str();
+}
+
+std::string VlWholeMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream os; // <mnemonic> <dest0>, <VLENB * microIdx>(<src0>)
+    os << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
+    os << VLENB * microIdx << '(' << registerName(srcRegIdx(0)) << ')';
+    return os.str();
+}
+
+std::string VseMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(srcRegIdx(1)) << ", ";
+    os << VLENB * microIdx  << '(' << registerName(srcRegIdx(0)) << ')';
+    os << (machInst.vm ? "" : ", v0.t"); // suffix only when vm is clear
+    return os.str();
+}
+
+std::string VsWholeMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream os; // <mnemonic> <src1>, <VLENB * microIdx>(<src0>)
+    os << mnemonic << ' ' << registerName(srcRegIdx(1)) << ", ";
+    os << VLENB * microIdx << '(' << registerName(srcRegIdx(0)) << ')';
+    return os.str();
+}
+
+std::string VleMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
+    os << '(' << registerName(srcRegIdx(0)) << ')';
+    os << (machInst.vm ? "" : ", v0.t"); // suffix only when vm is clear
+    return os.str();
+}
+
+std::string VlWholeMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream os; // <mnemonic> <dest0>, (<src0>)
+    os << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
+    os << '(' << registerName(srcRegIdx(0)) << ')';
+    return os.str();
+}
+
+std::string VseMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(srcRegIdx(1)) << ", ";
+    os << '(' << registerName(srcRegIdx(0)) << ')';
+    os << (machInst.vm ? "" : ", v0.t"); // suffix only when vm is clear
+    return os.str();
+}
+
+std::string VsWholeMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream os; // <mnemonic> <src1>, (<src0>)
+    os << mnemonic << ' ' << registerName(srcRegIdx(1)) << ", ";
+    os << '(' << registerName(srcRegIdx(0)) << ')';
+    return os.str();
+}
+
+std::string VlStrideMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // Layout: <mnemonic> <dest0>, (<src0>), <src1>[, v0.t]
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
+    os << '(' << registerName(srcRegIdx(0)) << "), ";
+    os << registerName(srcRegIdx(1)) << (machInst.vm ? "" : ", v0.t");
+    return os.str();
+}
+
+std::string VlStrideMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream os; // <src2> is omitted only if microIdx==0, vma, vta
+    os << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
+    os << '(' << registerName(srcRegIdx(0)) << "), ";
+    os << registerName(srcRegIdx(1));
+    const bool agnostic = machInst.vtype8.vma != 0 && machInst.vtype8.vta != 0;
+    if (microIdx != 0 || !agnostic) os << ", " << registerName(srcRegIdx(2));
+    os << (machInst.vm ? "" : ", v0.t");
+    return os.str();
+}
+
+std::string VsStrideMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // Layout: <mnemonic> <src2>, (<src0>), <src1>[, v0.t]
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(srcRegIdx(2)) << ", ";
+    os << '(' << registerName(srcRegIdx(0)) << "), ";
+    os << registerName(srcRegIdx(1)) << (machInst.vm ? "" : ", v0.t");
+    return os.str();
+}
+
+std::string VsStrideMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream os; // NOTE(review): <src2> can be printed twice
+    os << mnemonic << ' ' << registerName(srcRegIdx(2)) << ", ";
+    os << '(' << registerName(srcRegIdx(0)) << "), ";
+    os << registerName(srcRegIdx(1));
+    const bool agnostic = machInst.vtype8.vma != 0 && machInst.vtype8.vta != 0;
+    if (microIdx != 0 || !agnostic) os << ", " << registerName(srcRegIdx(2));
+    os << (machInst.vm ? "" : ", v0.t");
+    return os.str();
+}
+
+std::string VlIndexMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss; // <mnemonic> <dest0>, (<src0>), <src1>[, v0.t]
+    ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+        << '(' << registerName(srcRegIdx(0)) << "), " // space after ','
+        << registerName(srcRegIdx(1));
+    if (!machInst.vm) ss << ", v0.t";
+    return ss.str();
+}
+
+std::string VlIndexMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream os;
+    os << mnemonic << ' ';
+    os << registerName(destRegIdx(0)) << '[' << uint16_t(vdElemIdx) << "], ";
+    os << '(' << registerName(srcRegIdx(0)) << "), ";
+    os << registerName(srcRegIdx(1)) << '[' << uint16_t(vs2ElemIdx) << ']';
+    const bool agnostic = machInst.vtype8.vma != 0 && machInst.vtype8.vta != 0;
+    if (microIdx != 0 || !agnostic) os << ", " << registerName(srcRegIdx(2));
+    os << (machInst.vm ? "" : ", v0.t");
+    return os.str();
+}
+
+std::string VsIndexMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss; // <mnemonic> <src2>, (<src0>), <src1>[, v0.t]
+    ss << mnemonic << ' ' << registerName(srcRegIdx(2)) << ", "
+        << '(' << registerName(srcRegIdx(0)) << "), " // space after ','
+        << registerName(srcRegIdx(1));
+    if (!machInst.vm) ss << ", v0.t";
+    return ss.str();
+}
+
+std::string VsIndexMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream os;
+    os << mnemonic << ' ';
+    os << registerName(srcRegIdx(2)) << '[' << uint16_t(vs3ElemIdx) << "], ";
+    os << '(' << registerName(srcRegIdx(0)) << "), ";
+    os << registerName(srcRegIdx(1)) << '[' << uint16_t(vs2ElemIdx) << ']';
+    os << (machInst.vm ? "" : ", v0.t");
+    return os.str();
+}
+
 } // namespace RiscvISA
 } // namespace gem5
diff --git a/src/arch/riscv/insts/vector.hh b/src/arch/riscv/insts/vector.hh
index cdeb48360c..f989d7ffbf 100644
--- a/src/arch/riscv/insts/vector.hh
+++ b/src/arch/riscv/insts/vector.hh
@@ -80,6 +80,347 @@ class VConfOp : public RiscvStaticInst
     std::string generateZimmDisassembly() const;
 };
 
+inline uint8_t checked_vtype(bool vill, uint8_t vtype) {
+    const uint8_t vlmul = bits(vtype, 2, 0); // vtype[2:0]
+    const uint8_t vsew = bits(vtype, 5, 3);  // vtype[5:3]
+    panic_if(vill, "vill has been set");
+    panic_if(vsew > 0b011, "vsew: %#x not supported", vsew);
+    panic_if(vlmul == 0b100, "vlmul: %#x not supported", vlmul);
+    return vtype; // returned unchanged; this helper only validates
+}
+
+class VectorMacroInst : public RiscvMacroInst // base of vector macro-ops
+{
+  protected:
+    uint32_t vl;   // copied from _machInst.vl
+    uint8_t vtype; // _machInst.vtype8 after checked_vtype() validation
+    VectorMacroInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : RiscvMacroInst(mnem, _machInst, __opClass),
+        vl(_machInst.vl),
+        vtype(checked_vtype(_machInst.vill, _machInst.vtype8))
+    {
+        this->flags[IsVector] = true; // tag every derived inst as vector
+    }
+};
+
+class VectorMicroInst : public RiscvMicroInst // base of vector micro-ops
+{
+protected:
+    uint8_t microVl;  // _microVl as given to the constructor
+    uint8_t microIdx; // _microIdx; e.g. scales the offset in disassembly
+    uint8_t vtype;    // raw _machInst.vtype8 (no checked_vtype() here)
+    VectorMicroInst(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+                    uint8_t _microVl, uint8_t _microIdx)
+        : RiscvMicroInst(mnem, _machInst, __opClass),
+        microVl(_microVl),
+        microIdx(_microIdx),
+        vtype(_machInst.vtype8)
+    {
+        this->flags[IsVector] = true; // tag every derived inst as vector
+    }
+};
+
+class VectorNopMicroInst : public RiscvMicroInst
+{
+public:
+    VectorNopMicroInst(ExtMachInst _machInst)
+        : RiscvMicroInst("vnop", _machInst, No_OpClass)
+    {}
+
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)
+        const override
+    {
+        return NoFault; // does nothing and never faults
+    }
+
+    std::string generateDisassembly(Addr pc, const loader::SymbolTable *symtab)
+      const override
+    {
+        // No operands: the text is just the mnemonic given to the
+        // constructor ("vnop").
+        return std::string(mnemonic);
+    }
+};
+
+class VectorArithMicroInst : public VectorMicroInst
+{
+protected:
+    VectorArithMicroInst(const char *mnem, ExtMachInst _machInst,
+                         OpClass __opClass, uint8_t _microVl,
+                         uint8_t _microIdx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {} // forwards every argument to VectorMicroInst
+
+    std::string generateDisassembly( // declared here, defined elsewhere
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VectorArithMacroInst : public VectorMacroInst
+{
+  protected:
+    VectorArithMacroInst(const char* mnem, ExtMachInst _machInst,
+                         OpClass __opClass)
+        : VectorMacroInst(mnem, _machInst, __opClass)
+    {
+        this->flags[IsVector] = true; // VectorMacroInst's ctor sets this too
+    }
+
+    std::string generateDisassembly( // declared here, defined elsewhere
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VectorMemMicroInst : public VectorMicroInst
+{
+  protected:
+    uint32_t offset; // Used to calculate EA.
+    Request::Flags memAccessFlags; // initialised to 0 by the constructor
+
+    VectorMemMicroInst(const char* mnem, ExtMachInst _machInst,
+                       OpClass __opClass, uint8_t _microVl, uint8_t _microIdx,
+                       uint32_t _offset)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+        , offset(_offset)
+        , memAccessFlags(0)
+    {}
+};
+
+class VectorMemMacroInst : public VectorMacroInst
+{
+  protected:
+    VectorMemMacroInst(const char* mnem, ExtMachInst _machInst,
+                       OpClass __opClass)
+        : VectorMacroInst(mnem, _machInst, __opClass)
+    {} // adds no state of its own; common base for the memory macro-ops
+};
+
+class VleMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VleMacroInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {} // no state of its own
+
+    std::string generateDisassembly( // prints "<dest0>, (<src0>)[, v0.t]"
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VseMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VseMacroInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {} // no state of its own
+
+    std::string generateDisassembly( // prints "<src1>, (<src0>)[, v0.t]"
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VleMicroInst : public VectorMicroInst
+{
+  protected:
+    Request::Flags memAccessFlags; // no initialiser in this constructor
+
+    VleMicroInst(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+                 uint8_t _microVl, uint8_t _microIdx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {
+        this->flags[IsLoad] = true; // mark the micro-op as a load
+    }
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VseMicroInst : public VectorMicroInst
+{
+  protected:
+    Request::Flags memAccessFlags; // no initialiser in this constructor
+
+    VseMicroInst(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+                 uint8_t _microVl, uint8_t _microIdx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {
+        this->flags[IsStore] = true; // mark the micro-op as a store
+    }
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VlWholeMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VlWholeMacroInst(const char *mnem, ExtMachInst _machInst,
+                     OpClass __opClass)
+      : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {} // no state of its own
+
+    std::string generateDisassembly( // prints "<dest0>, (<src0>)"
+      Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VlWholeMicroInst : public VectorMicroInst
+{
+  protected:
+    Request::Flags memAccessFlags; // no initialiser in this constructor
+
+    VlWholeMicroInst(const char *mnem, ExtMachInst _machInst,
+                     OpClass __opClass, uint8_t _microVl, uint8_t _microIdx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {} // NOTE(review): unlike VleMicroInst, IsLoad is not set here
+
+    std::string generateDisassembly(
+      Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VsWholeMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VsWholeMacroInst(const char *mnem, ExtMachInst _machInst,
+                     OpClass __opClass)
+        : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {} // no state of its own
+
+    std::string generateDisassembly( // prints "<src1>, (<src0>)"
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VsWholeMicroInst : public VectorMicroInst
+{
+  protected:
+    Request::Flags memAccessFlags; // no initialiser in this constructor
+    // The base class takes (_microVl, _microIdx), as in VlWholeMicroInst.
+    VsWholeMicroInst(const char *mnem, ExtMachInst _machInst,
+                     OpClass __opClass, uint8_t _microVl, uint8_t _microIdx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {}
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VlStrideMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VlStrideMacroInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {} // no state of its own
+
+    std::string generateDisassembly( // "<dest0>, (<src0>), <src1>[, v0.t]"
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VlStrideMicroInst : public VectorMemMicroInst
+{
+  protected:
+  uint8_t regIdx; // _regIdx as given to the constructor
+    VlStrideMicroInst(const char *mnem, ExtMachInst _machInst,
+                      OpClass __opClass, uint8_t _regIdx,
+                      uint8_t _microIdx, uint8_t _microVl) // Idx before Vl
+        : VectorMemMicroInst(mnem, _machInst, __opClass, _microVl,
+                             _microIdx, 0) // offset is fixed to 0
+        , regIdx(_regIdx)
+    {}
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VsStrideMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VsStrideMacroInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {} // no state of its own
+
+    std::string generateDisassembly( // "<src2>, (<src0>), <src1>[, v0.t]"
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VsStrideMicroInst : public VectorMemMicroInst
+{
+  protected:
+  uint8_t regIdx; // _regIdx as given to the constructor
+    VsStrideMicroInst(const char *mnem, ExtMachInst _machInst,
+                      OpClass __opClass, uint8_t _regIdx,
+                      uint8_t _microIdx, uint8_t _microVl) // Idx before Vl
+        : VectorMemMicroInst(mnem, _machInst, __opClass, _microVl,
+                             _microIdx, 0) // offset is fixed to 0
+        , regIdx(_regIdx)
+    {}
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VlIndexMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VlIndexMacroInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {} // no state of its own
+
+    std::string generateDisassembly( // defined in vector.cc
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VlIndexMicroInst : public VectorMemMicroInst
+{
+  protected:
+    uint8_t vdRegIdx;   // _vdRegIdx as given to the constructor
+    uint8_t vdElemIdx;  // printed as the [index] after <dest0>
+    uint8_t vs2RegIdx;  // _vs2RegIdx as given to the constructor
+    uint8_t vs2ElemIdx; // printed as the [index] after <src1>
+    VlIndexMicroInst(const char *mnem, ExtMachInst _machInst,
+                    OpClass __opClass, uint8_t _vdRegIdx, uint8_t _vdElemIdx,
+                    uint8_t _vs2RegIdx, uint8_t _vs2ElemIdx)
+        : VectorMemMicroInst(mnem, _machInst, __opClass, 1, // microVl = 1
+                             0, 0) // microIdx = 0, offset = 0
+        , vdRegIdx(_vdRegIdx), vdElemIdx(_vdElemIdx)
+        , vs2RegIdx(_vs2RegIdx), vs2ElemIdx(_vs2ElemIdx)
+    {}
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VsIndexMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VsIndexMacroInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {} // no state of its own
+
+    std::string generateDisassembly( // defined in vector.cc
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VsIndexMicroInst : public VectorMemMicroInst
+{
+  protected:
+    uint8_t vs3RegIdx;  // _vs3RegIdx as given to the constructor
+    uint8_t vs3ElemIdx; // printed as the [index] after <src2>
+    uint8_t vs2RegIdx;  // _vs2RegIdx as given to the constructor
+    uint8_t vs2ElemIdx; // printed as the [index] after <src1>
+    VsIndexMicroInst(const char *mnem, ExtMachInst _machInst,
+                    OpClass __opClass, uint8_t _vs3RegIdx, uint8_t _vs3ElemIdx,
+                    uint8_t _vs2RegIdx, uint8_t _vs2ElemIdx)
+        : VectorMemMicroInst(mnem, _machInst, __opClass, 1, 0, 0) // Vl,Idx,off
+        , vs3RegIdx(_vs3RegIdx), vs3ElemIdx(_vs3ElemIdx)
+        , vs2RegIdx(_vs2RegIdx), vs2ElemIdx(_vs2ElemIdx)
+    {}
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
 
 } // namespace RiscvISA
 } // namespace gem5
diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa
index 2e5b52a879..0288f37ad8 100644
--- a/src/arch/riscv/isa/decoder.isa
+++ b/src/arch/riscv/isa/decoder.isa
@@ -500,6 +500,174 @@ decode QUADRANT default Unknown::unknown() {
                     Fd_bits = fd.v;
                 }}, inst_flags=FloatMemReadOp);
             }
+
+            0x0: decode MOP { // 8-bit vector loads (vle8, vlse8, v*xei8)
+                0x0: decode LUMOP { // unit-stride variants
+                    0x00: VleOp::vle8_v({{
+                        if ((machInst.vm || elem_mask(v0, ei)) &&
+                            i < this->microVl) {
+                            Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
+                        } else {
+                            Vd_ub[i] = Vs2_ub[i];
+                        }
+                    }}, inst_flags=VectorUnitStrideLoadOp);
+                    0x08: decode NF { // whole-register loads
+                        format VlWholeOp {
+                            0x0: vl1re8_v({{
+                                Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x1: vl2re8_v({{
+                                Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x3: vl4re8_v({{
+                                Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x7: vl8re8_v({{
+                                Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                        }
+                    } // 0x0b below is the mask load (vlm.v)
+                    0x0b: VlmOp::vlm_v({{
+                        Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
+                    }}, inst_flags=VectorUnitStrideMaskLoadOp);
+                } // Indexed (0x1, 0x3): 8-bit applies to the Vs2 index.
+                0x1: VlIndexOp::vluxei8_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_ub[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+                0x2: VlStrideOp::vlse8_v({{
+                    Vd_ub[microIdx] = Mem_vc.as<uint8_t>()[0];
+                }}, inst_flags=VectorStridedLoadOp);
+                0x3: VlIndexOp::vloxei8_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_ub[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+            }
+            0x5: decode MOP { // 16-bit vector loads (vle16, vlse16, v*xei16)
+                0x0: decode LUMOP { // unit-stride variants
+                    0x00: VleOp::vle16_v({{
+                        if ((machInst.vm || elem_mask(v0, ei)) &&
+                            i < this->microVl) {
+                            Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
+                        } else {
+                            Vd_uh[i] = Vs2_uh[i];
+                        }
+                    }}, inst_flags=VectorUnitStrideLoadOp);
+                    0x08: decode NF { // whole-register loads
+                        format VlWholeOp {
+                            0x0: vl1re16_v({{
+                                Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x1: vl2re16_v({{
+                                Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x3: vl4re16_v({{
+                                Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x7: vl8re16_v({{
+                                Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                        }
+                    }
+                } // Indexed (0x1, 0x3): 16-bit applies to the Vs2 index.
+                0x1: VlIndexOp::vluxei16_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_uh[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+                0x2: VlStrideOp::vlse16_v({{
+                    Vd_uh[microIdx] = Mem_vc.as<uint16_t>()[0];
+                }}, inst_flags=VectorStridedLoadOp);
+                0x3: VlIndexOp::vloxei16_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_uh[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+            }
+            0x6: decode MOP { // 32-bit vector loads (vle32, vlse32, v*xei32)
+                0x0: decode LUMOP { // unit-stride variants
+                    0x00: VleOp::vle32_v({{
+                        if ((machInst.vm || elem_mask(v0, ei)) &&
+                            i < this->microVl) {
+                            Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
+                        } else {
+                            Vd_uw[i] = Vs2_uw[i];
+                        }
+                    }}, inst_flags=VectorUnitStrideLoadOp);
+                    0x08: decode NF { // whole-register loads
+                        format VlWholeOp {
+                            0x0: vl1re32_v({{
+                                Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x1: vl2re32_v({{
+                                Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x3: vl4re32_v({{
+                                Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x7: vl8re32_v({{
+                                Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                        }
+                    }
+                } // Indexed (0x1, 0x3): 32-bit applies to the Vs2 index.
+                0x1: VlIndexOp::vluxei32_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_uw[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+                0x2: VlStrideOp::vlse32_v({{
+                    Vd_uw[microIdx] = Mem_vc.as<uint32_t>()[0];
+                }}, inst_flags=VectorStridedLoadOp);
+                0x3: VlIndexOp::vloxei32_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_uw[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+            }
+            0x7: decode MOP { // 64-bit vector loads (vle64, vlse64, v*xei64)
+                0x0: decode LUMOP { // unit-stride variants
+                    0x00: VleOp::vle64_v({{
+                        if ((machInst.vm || elem_mask(v0, ei)) &&
+                            i < this->microVl) {
+                            Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
+                        } else {
+                            Vd_ud[i] = Vs2_ud[i];
+                        }
+                    }}, inst_flags=VectorUnitStrideLoadOp);
+                    0x08: decode NF { // whole-register loads
+                        format VlWholeOp {
+                            0x0: vl1re64_v({{
+                                Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x1: vl2re64_v({{
+                                Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x3: vl4re64_v({{
+                                Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x7: vl8re64_v({{
+                                Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                        }
+                    }
+                } // Indexed (0x1, 0x3): 64-bit applies to the Vs2 index.
+                0x1: VlIndexOp::vluxei64_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_ud[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+                0x2: VlStrideOp::vlse64_v({{
+                    Vd_ud[microIdx] = Mem_vc.as<uint64_t>()[0];
+                }}, inst_flags=VectorStridedLoadOp);
+                0x3: VlIndexOp::vloxei64_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_ud[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+            }
         }
 
         0x03: decode FUNCT3 {
@@ -806,6 +974,106 @@ decode QUADRANT default Unknown::unknown() {
                     Mem_ud = Fs2_bits;
                 }}, inst_flags=FloatMemWriteOp);
             }
+
+            0x0: decode MOP { // 8-bit vector stores (vse8, vsse8, vs*xei8)
+                0x0: decode SUMOP { // unit-stride variants
+                    0x00: VseOp::vse8_v({{
+                        Mem_vc.as<uint8_t>()[i] = Vs3_ub[i];
+                    }}, inst_flags=VectorUnitStrideStoreOp);
+                    format VsWholeOp { // whole-register stores
+                        0x8: decode NF {
+                            0x0: vs1r_v({{
+                                Mem_vc.as<uint8_t>()[i] = Vs3_ub[i];
+                            }}, inst_flags=VectorWholeRegisterStoreOp);
+                            0x1: vs2r_v({{
+                                Mem_vc.as<uint8_t>()[i] = Vs3_ub[i];
+                            }}, inst_flags=VectorWholeRegisterStoreOp);
+                            0x3: vs4r_v({{
+                                Mem_vc.as<uint8_t>()[i] = Vs3_ub[i];
+                            }}, inst_flags=VectorWholeRegisterStoreOp);
+                            0x7: vs8r_v({{
+                                Mem_vc.as<uint8_t>()[i] = Vs3_ub[i];
+                            }}, inst_flags=VectorWholeRegisterStoreOp);
+                        }
+                    } // 0x0b below is the mask store (vsm.v)
+                    0x0b: VsmOp::vsm_v({{
+                        Mem_vc.as<uint8_t>()[i] = Vs3_ub[i];
+                    }}, inst_flags=VectorUnitStrideMaskStoreOp);
+                } // Indexed (0x1, 0x3): 8-bit applies to the Vs2 index.
+                0x1: VsIndexOp::vsuxei8_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_ub[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+                0x2: VsStrideOp::vsse8_v({{
+                    Mem_vc.as<uint8_t>()[0] = Vs3_ub[microIdx];
+                }}, inst_flags=VectorStridedStoreOp);
+                0x3: VsIndexOp::vsoxei8_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_ub[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+            }
+            0x5: decode MOP { // 16-bit vector stores (vse16, vsse16, vs*xei16)
+                0x0: decode SUMOP { // unit-stride variant
+                    0x00: VseOp::vse16_v({{
+                        Mem_vc.as<uint16_t>()[i] = Vs3_uh[i];
+                    }}, inst_flags=VectorUnitStrideStoreOp);
+                } // Indexed (0x1, 0x3): 16-bit applies to the Vs2 index.
+                0x1: VsIndexOp::vsuxei16_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_uh[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+                0x2: VsStrideOp::vsse16_v({{
+                    Mem_vc.as<uint16_t>()[0] = Vs3_uh[microIdx];
+                }}, inst_flags=VectorStridedStoreOp);
+                0x3: VsIndexOp::vsoxei16_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_uh[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+            }
+            0x6: decode MOP { // 32-bit vector stores (vse32, vsse32, vs*xei32)
+                0x0: decode SUMOP { // unit-stride variant
+                    0x00: VseOp::vse32_v({{
+                        Mem_vc.as<uint32_t>()[i] = Vs3_uw[i];
+                    }}, inst_flags=VectorUnitStrideStoreOp);
+                } // Indexed (0x1, 0x3): 32-bit applies to the Vs2 index.
+                0x1: VsIndexOp::vsuxei32_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_uw[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+                0x2: VsStrideOp::vsse32_v({{
+                    Mem_vc.as<uint32_t>()[0] = Vs3_uw[microIdx];
+                }}, inst_flags=VectorStridedStoreOp);
+                0x3: VsIndexOp::vsoxei32_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_uw[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+            }
+            0x7: decode MOP { // 64-bit vector stores (vse64, vsse64, vs*xei64)
+                0x0: decode SUMOP { // unit-stride variant
+                    0x00: VseOp::vse64_v({{
+                        Mem_vc.as<uint64_t>()[i] = Vs3_ud[i];
+                    }}, inst_flags=VectorUnitStrideStoreOp);
+                } // Indexed (0x1, 0x3): 64-bit applies to the Vs2 index.
+                0x1: VsIndexOp::vsuxei64_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_ud[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+                0x2: VsStrideOp::vsse64_v({{
+                    Mem_vc.as<uint64_t>()[0] = Vs3_ud[microIdx];
+                }}, inst_flags=VectorStridedStoreOp);
+                0x3: VsIndexOp::vsoxei64_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_ud[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+            }
         }
 
         0x0b: decode FUNCT3 {
diff --git a/src/arch/riscv/isa/formats/formats.isa b/src/arch/riscv/isa/formats/formats.isa
index 0f7c94da9a..4bdc3021d5 100644
--- a/src/arch/riscv/isa/formats/formats.isa
+++ b/src/arch/riscv/isa/formats/formats.isa
@@ -38,6 +38,7 @@
 ##include "amo.isa"
 ##include "bs.isa"
 ##include "vector_conf.isa"
+##include "vector_mem.isa"
 
 // Include formats for nonstandard extensions
 ##include "compressed.isa"
diff --git a/src/arch/riscv/isa/formats/vector_mem.isa b/src/arch/riscv/isa/formats/vector_mem.isa
new file mode 100644
index 0000000000..113250d5cf
--- /dev/null
+++ b/src/arch/riscv/isa/formats/vector_mem.isa
@@ -0,0 +1,205 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+let {{
+# Build the four ISA-parser output strings for one vector memory format.
+def VMemBase(name, Name, ea_code, memacc_code, mem_flags,
+                   inst_flags, base_class, postacc_code='',
+                   declare_template_base=VMemMacroDeclare,
+                   decode_template=BasicDecode, exec_template_base='',
+                   # If it's a macroop, the corresponding microops will be
+                   # generated.
+                   is_macroop=True):
+    # Make sure flags are in lists (convert to lists if not).
+    mem_flags = makeList(mem_flags)
+    inst_flags = makeList(inst_flags)
+    iop = InstObjParams(name, Name, base_class,
+        {'ea_code': ea_code,
+         'memacc_code': memacc_code,
+         'postacc_code': postacc_code },
+        inst_flags)
+    # Look up the "<exec_template_base>Constructor" template by its name.
+    constructTemplate = eval(exec_template_base + 'Constructor')
+    # Templated declarations keep their constructor in the header output.
+    header_output   = declare_template_base.subst(iop)
+    decoder_output  = ''
+    if declare_template_base is not VMemTemplateMacroDeclare:
+        decoder_output  += constructTemplate.subst(iop)
+    else:
+        header_output   += constructTemplate.subst(iop)
+    decode_block    = decode_template.subst(iop)
+    exec_output     = ''
+    if not is_macroop:  # NOTE: mem_flags is not applied on this path
+        return (header_output, decoder_output, decode_block, exec_output)
+    # Micro-op params: class "<Name>Micro", base "<exec_template_base>MicroInst".
+    microiop = InstObjParams(name + '_micro',
+        Name + 'Micro',
+        exec_template_base + 'MicroInst',
+        {'ea_code': ea_code,
+         'memacc_code': memacc_code,
+         'postacc_code': postacc_code},
+        inst_flags)
+    # Append the OR of the Request:: flags to the micro-op constructor text.
+    if mem_flags:
+        mem_flags = [ 'Request::%s' % flag for flag in mem_flags ]
+        s = '\n\tmemAccessFlags = ' + '|'.join(mem_flags) + ';'
+        microiop.constructor += s
+    # Fetch the four per-format micro-op templates by name.
+    microDeclTemplate = eval(exec_template_base + 'Micro' + 'Declare')
+    microExecTemplate = eval(exec_template_base + 'Micro' + 'Execute')
+    microInitTemplate = eval(exec_template_base + 'Micro' + 'InitiateAcc')
+    microCompTemplate = eval(exec_template_base + 'Micro' + 'CompleteAcc')
+    header_output = microDeclTemplate.subst(microiop) + header_output
+    micro_exec_output = (microExecTemplate.subst(microiop) +
+        microInitTemplate.subst(microiop) +
+        microCompTemplate.subst(microiop))
+    if declare_template_base is not VMemTemplateMacroDeclare:
+        exec_output += micro_exec_output
+    else:
+        header_output += micro_exec_output
+    # Same 4-tuple shape as the early return above.
+    return (header_output, decoder_output, decode_block, exec_output)
+
+}};
+
+def format VleOp( // Unit-stride vector load; uses the Vle* templates.
+    memacc_code,
+    ea_code={{ EA = Rs1 + VLENB * microIdx; }},
+    mem_flags=[],
+    inst_flags=[]
+) {{
+    (header_output, decoder_output, decode_block, exec_output) = \
+        VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                 'VleMacroInst', exec_template_base='Vle')
+}};
+
+def format VseOp( // Unit-stride vector store; uses the Vse* templates.
+    memacc_code,
+    ea_code={{ EA = Rs1 + VLENB * microIdx; }},
+    mem_flags=[],
+    inst_flags=[]
+) {{
+    (header_output, decoder_output, decode_block, exec_output) = \
+        VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                 'VseMacroInst', exec_template_base='Vse')
+}};
+
+def format VlmOp( // Mask load; is_macroop=False, so no micro-op classes.
+    memacc_code,
+    ea_code={{ EA = Rs1; }},
+    mem_flags=[],
+    inst_flags=[]
+) {{
+    (header_output, decoder_output, decode_block, exec_output) = \
+        VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                 'VleMacroInst', exec_template_base='Vlm', is_macroop=False)
+}};
+
+def format VsmOp( // Mask store; is_macroop=False, so no micro-op classes.
+  memacc_code,
+  ea_code={{ EA = Rs1; }},
+  mem_flags=[],
+  inst_flags=[]
+) {{
+    (header_output, decoder_output, decode_block, exec_output) = \
+        VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                 'VseMacroInst', exec_template_base='Vsm', is_macroop=False)
+}};
+
+def format VlWholeOp( // Whole-register load; uses the VlWhole* templates.
+    memacc_code,
+    ea_code={{ EA = Rs1 + VLENB * microIdx; }},
+    mem_flags=[],
+    inst_flags=[]
+) {{
+    (header_output, decoder_output, decode_block, exec_output) = \
+        VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                 'VlWholeMacroInst', exec_template_base='VlWhole')
+}};
+
+def format VsWholeOp( // Whole-register store; uses the VsWhole* templates.
+    memacc_code,
+    ea_code={{ EA = Rs1 + VLENB * microIdx; }},
+    mem_flags=[],
+    inst_flags=[]
+) {{
+    (header_output, decoder_output, decode_block, exec_output) = \
+        VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                 'VsWholeMacroInst', exec_template_base='VsWhole')
+}};
+
+def format VlStrideOp( // Strided load; default EA scales Rs2 per element.
+    memacc_code,
+    ea_code={{ EA = Rs1 + Rs2 * (regIdx * VLENB / elem_size + microIdx); }},
+    mem_flags=[],
+    inst_flags=[]
+) {{
+    (header_output, decoder_output, decode_block, exec_output) = \
+        VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                 'VlStrideMacroInst', exec_template_base='VlStride')
+}};
+
+def format VsStrideOp( // Strided store; default EA scales Rs2 per element.
+    memacc_code,
+    ea_code={{ EA = Rs1 + Rs2 * (regIdx * VLENB / elem_size + microIdx); }},
+    mem_flags=[],
+    inst_flags=[]
+) {{
+    (header_output, decoder_output, decode_block, exec_output) = \
+        VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                 'VsStrideMacroInst', exec_template_base='VsStride')
+}};
+
+def format VlIndexOp( // Indexed load; ea_code is required (no default).
+    memacc_code,
+    ea_code,
+    mem_flags=[],
+    inst_flags=[]
+) {{
+    (header_output, decoder_output, decode_block, exec_output) = \
+        VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                 'VlIndexMacroInst', exec_template_base='VlIndex',
+                 declare_template_base=VMemTemplateMacroDeclare,
+                 decode_template=VMemTemplateDecodeBlock
+                 )
+}};
+
+def format VsIndexOp( // Indexed store; ea_code is required (no default).
+    memacc_code,
+    ea_code,
+    mem_flags=[],
+    inst_flags=[]
+) {{
+    (header_output, decoder_output, decode_block, exec_output) = \
+        VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                 'VsIndexMacroInst', exec_template_base='VsIndex',
+                 declare_template_base=VMemTemplateMacroDeclare,
+                 decode_template=VMemTemplateDecodeBlock
+                 )
+}};
diff --git a/src/arch/riscv/isa/includes.isa b/src/arch/riscv/isa/includes.isa
index 1d544f40ed..76f2388faf 100644
--- a/src/arch/riscv/isa/includes.isa
+++ b/src/arch/riscv/isa/includes.isa
@@ -46,6 +46,7 @@ output header {{
 #include <softfloat.h>
 #include <specialize.h>
 
+#include "arch/generic/memhelpers.hh"
 #include "arch/riscv/decoder.hh"
 #include "arch/riscv/insts/amo.hh"
 #include "arch/riscv/insts/bs.hh"
@@ -55,6 +56,7 @@ output header {{
 #include "arch/riscv/insts/standard.hh"
 #include "arch/riscv/insts/static_inst.hh"
 #include "arch/riscv/insts/unknown.hh"
+#include "arch/riscv/insts/vector.hh"
 #include "arch/riscv/interrupts.hh"
 #include "cpu/static_inst.hh"
 #include "mem/packet.hh"
@@ -68,9 +70,15 @@ output decoder {{
 #include <limits>
 #include <string>
 
+/* riscv softfloat library */
+#include <internals.h>
+#include <softfloat.h>
+#include <specialize.h>
+
 #include "arch/riscv/decoder.hh"
 #include "arch/riscv/faults.hh"
 #include "arch/riscv/mmu.hh"
+#include "arch/riscv/regs/float.hh"
 #include "base/cprintf.hh"
 #include "base/loader/symtab.hh"
 #include "cpu/thread_context.hh"
diff --git a/src/arch/riscv/isa/main.isa b/src/arch/riscv/isa/main.isa
index 24f366b00c..2923a965da 100644
--- a/src/arch/riscv/isa/main.isa
+++ b/src/arch/riscv/isa/main.isa
@@ -50,6 +50,9 @@ namespace RiscvISA;
 //Include the operand_types and operand definitions
 ##include "operands.isa"
 
+//Include the definitions for the instruction templates
+##include "templates/templates.isa"
+
 //Include the definitions for the instruction formats
 ##include "formats/formats.isa"
 
diff --git a/src/arch/riscv/isa/templates/templates.isa b/src/arch/riscv/isa/templates/templates.isa
new file mode 100644
index 0000000000..b4de46d846
--- /dev/null
+++ b/src/arch/riscv/isa/templates/templates.isa
@@ -0,0 +1,2 @@
+// Include the vector memory instruction templates
+##include "vector_mem.isa"
diff --git a/src/arch/riscv/isa/templates/vector_mem.isa b/src/arch/riscv/isa/templates/vector_mem.isa
new file mode 100644
index 0000000000..d54243ad7d
--- /dev/null
+++ b/src/arch/riscv/isa/templates/vector_mem.isa
@@ -0,0 +1,1349 @@
+def template VMemMacroDeclare {{
+// Class declaration for a non-templated vector memory macro-op.
+class %(class_name)s : public %(base_class)s
+{
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VMemTemplateMacroDeclare {{
+// Class declaration for a macro-op templated on its element type.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VleConstructor {{
+// Macro-op constructor: one load micro-op per vector register touched.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    // Elements per register at this EEW; ceiling division done in integers.
+    const int32_t micro_vlmax = VLEN / width_EEW(_machInst.width);
+    const uint32_t num_microops = (this->vl + micro_vlmax - 1) / micro_vlmax;
+    int32_t remaining_vl = this->vl;
+    int32_t micro_vl = std::min(remaining_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // vl == 0: emit a single nop so front()/back() below stay valid.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (uint32_t i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro(_machInst, micro_vl, i);
+        microop->setDelayedCommit();
+        microop->setFlag(IsLoad);
+        this->microops.push_back(microop);
+        micro_vl = std::min(remaining_vl -= micro_vlmax, micro_vlmax);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VleMicroDeclare {{
+// Unit-stride load micro-op; its destination is register vd + _microIdx.
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[3]; // rs1, vd + _microIdx, and v0 when masked
+    RegId destRegIdxArr[1];
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl, uint8_t _microIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s, _microVl,
+                     _microIdx)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _microIdx]);
+        _numTypedDestRegs[VecRegClass]++;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _microIdx]);
+        if (!_machInst.vm) { // masked: v0 becomes the last source operand
+            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+        }
+    }
+    // Bodies come from the Execute/InitiateAcc/CompleteAcc templates.
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                      trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+
+};
+
+}};
+
+def template VleMicroExecute {{
+// execute(): read the micro-op's bytes, then run memacc for each element.
+Fault
+%(class_name)s::execute(ExecContext *xc, trace::InstRecord *traceData) const
+{
+    Addr EA;
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+    // v0 is null when unmasked (vm set) instead of being left indeterminate.
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0 = nullptr;
+    if(!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+    // Only the microVl active elements are fetched, with all bytes enabled.
+    uint32_t mem_size = width_EEW(machInst.width) / 8 * this->microVl;
+    const std::vector<bool> byte_enable(mem_size, true);
+    Fault fault = xc->readMem(EA, Mem.as<uint8_t>(), mem_size, memAccessFlags,
+                              byte_enable);
+    if (fault != NoFault)
+        return fault;
+    // ei is i offset by micro_vlmax * microIdx; memacc may use it for masks.
+    const size_t micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const size_t micro_elems = VLEN / width_EEW(machInst.width);
+    size_t ei;
+    for (size_t i = 0; i < micro_elems; i++) {
+        ei = i + micro_vlmax * microIdx;
+        %(memacc_code)s;
+    }
+
+    %(op_wb)s;
+    return fault;
+}
+
+}};
+
+def template VleMicroInitiateAcc {{
+// initiateAcc(): compute EA and issue the read via initiateMemRead.
+Fault
+%(class_name)s::initiateAcc(ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    Addr EA;
+    // Only source operands are declared and read here.
+    %(op_src_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+    // Request microVl elements of EEW / 8 bytes each, all bytes enabled.
+    uint32_t mem_size = width_EEW(this->machInst.width) / 8 * this->microVl;
+    const std::vector<bool> byte_enable(mem_size, true);
+    Fault fault = initiateMemRead(xc, EA, mem_size, memAccessFlags,
+                                  byte_enable);
+    return fault;
+}
+
+}};
+
+def template VleMicroCompleteAcc {{
+// completeAcc(): copy the packet data into Mem, then run memacc per element.
+Fault
+%(class_name)s::completeAcc(PacketPtr pkt, ExecContext *xc,
+                            trace::InstRecord *traceData) const
+{
+    %(op_decl)s;
+    %(op_rd)s;
+    // v0 is null when unmasked (vm set) instead of being left indeterminate.
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0 = nullptr;
+    if(!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+    // The packet holds the bytes requested by initiateAcc().
+    memcpy(Mem.as<uint8_t>(), pkt->getPtr<uint8_t>(), pkt->getSize());
+    // ei is i offset by micro_vlmax * microIdx; memacc may use it for masks.
+    const size_t micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const size_t micro_elems = VLEN / width_EEW(machInst.width);
+    size_t ei;
+    for (size_t i = 0; i < micro_elems; i++) {
+        ei = i + micro_vlmax * microIdx;
+        %(memacc_code)s;
+    }
+
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VseConstructor {{
+// Macro-op constructor: one store micro-op per vector register touched.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    // Elements per register at this EEW; ceiling division done in integers.
+    const int32_t micro_vlmax = VLEN / width_EEW(_machInst.width);
+    const uint32_t num_microops = (this->vl + micro_vlmax - 1) / micro_vlmax;
+    int32_t remaining_vl = this->vl;
+    int32_t micro_vl = std::min(remaining_vl, micro_vlmax);
+
+    StaticInstPtr microop;
+    // vl == 0: emit a single nop so front()/back() below stay valid.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (uint32_t i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro(_machInst, micro_vl, i);
+        microop->setDelayedCommit();
+        microop->setFlag(IsStore);
+        this->microops.push_back(microop);
+        micro_vl = std::min(remaining_vl -= micro_vlmax, micro_vlmax);
+    }
+    // Same StaticInst helpers that the unit-stride load constructor uses.
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VseMicroDeclare {{
+// Unit-stride store micro-op; its data source is register vs3 + _microIdx.
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[3]; // rs1, vs3 + _microIdx, and v0 when masked
+    RegId destRegIdxArr[0]; // NOTE(review): zero-size array (extension)
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl, uint8_t _microIdx)
+        : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
+                         _microVl, _microIdx)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _microIdx]);
+        if (!_machInst.vm) { // masked: v0 becomes the last source operand
+            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+        }
+        this->flags[IsVector] = true;
+        this->flags[IsStore] = true;
+    }
+    // Bodies come from the Execute/InitiateAcc/CompleteAcc templates.
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                      trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VseMicroExecute {{
+// execute(): gather the active elements into Mem and write them out.
+Fault
+%(class_name)s::execute(ExecContext *xc, trace::InstRecord *traceData) const
+{
+    Addr EA;
+    // v0 is null when unmasked; the loop reads it only when vm is clear.
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0 = nullptr;
+    if(!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+    // Masked-off elements keep their byte enables false.
+    const size_t micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const size_t eewb = width_EEW(machInst.width) / 8;
+    const size_t mem_size = eewb * microVl;
+    std::vector<bool> byte_enable(mem_size, false);
+    size_t ei;
+    for (size_t i = 0; i < microVl; i++) {
+        ei = i + micro_vlmax * microIdx;
+        if (machInst.vm || elem_mask(v0, ei)) {
+            %(memacc_code)s;
+            auto it = byte_enable.begin() + i * eewb;
+            std::fill(it, it + eewb, true);
+        }
+    }
+
+    Fault fault;
+    fault = xc->writeMem(Mem.as<uint8_t>(), mem_size, EA, memAccessFlags,
+                         nullptr, byte_enable);
+    return fault;
+}
+
+}};
+
+def template VseMicroInitiateAcc {{
+// initiateAcc() of a store micro-op: issues one byte-masked writeMem.
+Fault
+%(class_name)s::initiateAcc(ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    Addr EA;
+    // v0 stays null when vm is set; it is only read for masked ops.
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0 = nullptr;
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+    // Enable only the bytes of elements that are active under the mask.
+    const size_t micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const size_t eewb = width_EEW(machInst.width) / 8;
+    const size_t mem_size = eewb * microVl;
+    std::vector<bool> byte_enable(mem_size, false);
+    size_t ei;
+    for (size_t i = 0; i < microVl; i++) {
+        ei = i + micro_vlmax * microIdx; // index into the v0 mask bits
+        if (machInst.vm || elem_mask(v0, ei)) {
+            %(memacc_code)s;
+            auto it = byte_enable.begin() + i * eewb;
+            std::fill(it, it + eewb, true);
+        }
+    }
+
+    Fault fault;
+    fault = xc->writeMem(Mem.as<uint8_t>(), mem_size, EA, memAccessFlags,
+                         nullptr, byte_enable);
+    return fault;
+}
+
+}};
+
+def template VseMicroCompleteAcc {{
+// completeAcc() of a store micro-op: no work is done, always NoFault.
+Fault
+%(class_name)s::completeAcc(PacketPtr pkt, ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    return NoFault;
+}
+
+}};
+
+def template VlmConstructor {{
+// Macro-op constructor that expands to exactly one micro-op.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    // Element count rounded up to whole bytes: the 8-bit micro-op's length.
+    const uint32_t elems_per_vreg = VLEN / width_EEW(_machInst.width);
+    const int32_t mask_bytes = (std::min(this->vl, elems_per_vreg) + 7) / 8;
+    StaticInstPtr only_op;
+    if (mask_bytes != 0) {
+        only_op = new Vle8_vMicro(_machInst, mask_bytes, 0);
+        only_op->setDelayedCommit();
+        only_op->setFlag(IsLoad);
+    } else {
+        only_op = new VectorNopMicroInst(_machInst);
+    }
+    this->microops.push_back(only_op);
+    // The single micro-op is both the first and the last one.
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VsmConstructor {{
+// Macro-op constructor that expands to exactly one micro-op.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    // Element count rounded up to whole bytes: the 8-bit micro-op's length.
+    const uint32_t elems_per_vreg = VLEN / width_EEW(_machInst.width);
+    const int32_t mask_bytes = (std::min(this->vl, elems_per_vreg) + 7) / 8;
+    StaticInstPtr only_op;
+    if (mask_bytes != 0) {
+        only_op = new Vse8_vMicro(_machInst, mask_bytes, 0);
+        only_op->setDelayedCommit();
+        only_op->setFlag(IsStore);
+    } else {
+        only_op = new VectorNopMicroInst(_machInst);
+    }
+    this->microops.push_back(only_op);
+    // The single micro-op is both the first and the last one.
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VsWholeConstructor {{
+// Whole-register store macro-op: emits nf + 1 store micro-ops.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+  : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    // Each micro-op is handed a full register's element count.
+    const int32_t elems_per_vreg = VLEN / width_EEW(_machInst.width);
+    const size_t num_regs = machInst.nf + 1;
+    StaticInstPtr microop;
+    for (int reg = 0; reg < num_regs; ++reg) {
+        microop = new %(class_name)sMicro(_machInst, elems_per_vreg, reg);
+        microop->setDelayedCommit();
+        microop->setFlag(IsStore);
+        this->microops.push_back(microop);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VsWholeMicroDeclare {{
+// Micro-op class of a whole-register store; it has no destination regs.
+class %(class_name)s: public %(base_class)s
+{
+private:
+    RegId destRegIdxArr[0];
+    RegId srcRegIdxArr[2]; // rs1, vs3 + microIdx
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl, uint8_t _microIdx)
+        : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
+                         _microVl, _microIdx)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _microIdx]);
+        this->flags[IsVector] = true;
+        this->flags[IsStore] = true;
+    }
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                        trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VsWholeMicroExecute {{
+// execute() of a whole-register store micro-op.
+Fault
+%(class_name)s::execute(ExecContext *xc, trace::InstRecord *traceData) const
+{
+    Addr EA;
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+    // memacc_code runs once for each byte index i in [0, VLENB).
+    for (size_t i = 0; i < VLENB; i++) {
+        %(memacc_code)s;
+    }
+    // Mem is passed to the write helper as a raw vreg_t::Container.
+    Fault fault = writeMemAtomicLE(xc, traceData, *(vreg_t::Container*)(&Mem),
+                                   EA, memAccessFlags, nullptr);
+    return fault;
+}
+
+}};
+
+def template VsWholeMicroInitiateAcc {{
+// initiateAcc() of a whole-register store micro-op.
+Fault
+%(class_name)s::initiateAcc(ExecContext* xc,
+        trace::InstRecord* traceData) const
+{
+    Addr EA;
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+    // memacc_code runs once for each byte index i in [0, VLENB).
+    for (size_t i = 0; i < VLENB; i++) {
+        %(memacc_code)s;
+    }
+    // Mem is passed to the write helper as a raw vreg_t::Container.
+    Fault fault = writeMemTimingLE(xc, traceData, *(vreg_t::Container*)(&Mem),
+                                   EA, memAccessFlags, nullptr);
+    return fault;
+}
+
+}};
+
+def template VsWholeMicroCompleteAcc {{
+// completeAcc() of a whole-register store: no work, always NoFault.
+Fault
+%(class_name)s::completeAcc(PacketPtr pkt, ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    return NoFault;
+}
+
+}};
+
+def template VlWholeConstructor {{
+// Whole-register load macro-op: emits nf + 1 load micro-ops.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    // Each micro-op is handed a full register's element count.
+    const int32_t elems_per_vreg = VLEN / width_EEW(_machInst.width);
+    const size_t num_regs = machInst.nf + 1;
+    StaticInstPtr microop;
+    for (int reg = 0; reg < num_regs; ++reg) {
+        microop = new %(class_name)sMicro(_machInst, elems_per_vreg, reg);
+        microop->setDelayedCommit();
+        microop->setFlag(IsLoad);
+        this->microops.push_back(microop);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VlWholeMicroDeclare {{
+// Micro-op class of a whole-register load: writes vd + microIdx.
+class %(class_name)s: public %(base_class)s
+{
+private:
+    RegId destRegIdxArr[1]; // vd + microIdx
+    RegId srcRegIdxArr[1]; // rs1
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl, uint8_t _microIdx)
+        : %(base_class)s("%(mnemonic)s_micro", _machInst, %(op_class)s,
+                         _microVl, _microIdx)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _microIdx]);
+        _numTypedDestRegs[VecRegClass]++;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        this->flags[IsVector] = true;
+        this->flags[IsLoad] = true;
+    }
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                        trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VlWholeMicroExecute {{
+// execute() of a whole-register load micro-op.
+Fault
+%(class_name)s::execute(ExecContext *xc, trace::InstRecord *traceData) const
+{
+    Addr EA;
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+    // Mem is passed to the read helper as a raw vreg_t::Container.
+    Fault fault = readMemAtomicLE(xc, traceData, EA,
+                                  *(vreg_t::Container*)(&Mem), memAccessFlags);
+    if (fault != NoFault)
+        return fault;
+    // memacc_code runs once per element of one register at this EEW.
+    size_t elem_per_reg = VLEN / width_EEW(machInst.width);
+    for (size_t i = 0; i < elem_per_reg; i++) {
+        %(memacc_code)s;
+    }
+
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VlWholeMicroInitiateAcc {{
+// initiateAcc() of a whole-register load: only starts the memory read.
+Fault
+%(class_name)s::initiateAcc(ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    Addr EA;
+    %(op_src_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+
+    Fault fault = initiateMemRead(xc, traceData, EA, Mem, memAccessFlags);
+    return fault;
+}
+
+}};
+
+def template VlWholeMicroCompleteAcc {{
+// completeAcc() of a whole-register load: copy packet data, write back.
+Fault
+%(class_name)s::completeAcc(PacketPtr pkt, ExecContext* xc,
+        trace::InstRecord* traceData) const
+{
+    %(op_decl)s;
+    %(op_rd)s;
+    // Copy the returned packet payload into Mem before memacc_code runs.
+    memcpy(Mem.as<uint8_t>(), pkt->getPtr<uint8_t>(), pkt->getSize());
+
+    size_t elem_per_reg = VLEN / width_EEW(machInst.width);
+    for (size_t i = 0; i < elem_per_reg; ++i) {
+        %(memacc_code)s;
+    }
+
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VlStrideConstructor {{
+// Strided load macro-op: one load micro-op per element, vreg by vreg.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    // Elements that fit in one vector register at this EEW.
+    const int32_t vreg_elems = VLEN / width_EEW(_machInst.width);
+    int32_t elems_left = this->vl;
+    StaticInstPtr microop;
+
+    // vl == 0: keep the micro-op list non-empty with a single nop.
+    if (elems_left == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int reg = 0; elems_left > 0; ++reg, elems_left -= vreg_elems) {
+        // The last register may be only partially filled.
+        const int32_t micro_vl = std::min(elems_left, vreg_elems);
+        for (int elem = 0; elem < micro_vl; ++elem) {
+            microop = new %(class_name)sMicro(machInst, reg, elem, micro_vl);
+            microop->setFlag(IsDelayedCommit);
+            microop->setFlag(IsLoad);
+            this->microops.push_back(microop);
+        }
+    }
+
+    this->microops.front()->setFlag(IsFirstMicroop);
+    this->microops.back()->setFlag(IsLastMicroop);
+    this->flags[IsVector] = true;
+}
+
+}};
+
+def template VlStrideMicroDeclare {{
+// Micro-op class of a strided load (destination is vd + regIdx).
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // Sources in order: rs1, rs2, old vd, v0 (v0 only when vm is clear)
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _regIdx, uint8_t _microIdx,
+        uint8_t _microVl)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
+        _regIdx, _microIdx, _microVl)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _regIdx]);
+        _numTypedDestRegs[VecRegClass]++;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs2]);
+        // Agnostic is treated as undisturbed, so the old vd is a source
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _regIdx]);
+        if (!_machInst.vm) {
+            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+        }
+        this->flags[IsLoad] = true;
+    }
+
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                      trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VlStrideMicroExecute {{
+// execute() of a strided-load micro-op: reads at most one element.
+Fault
+%(class_name)s::execute(ExecContext *xc, trace::InstRecord *traceData) const
+{
+    Fault fault = NoFault;
+    Addr EA;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    constexpr uint8_t elem_size = sizeof(Vd[0]);
+    %(ea_code)s; // ea_code depends on elem_size
+    // v0 stays null when vm is set; it is only read for masked ops.
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0 = nullptr;
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    uint32_t mem_size = elem_size;
+    const std::vector<bool> byte_enable(mem_size, true);
+    // ei: position of this element in the v0 mask bits.
+    size_t ei = this->regIdx * VLENB / elem_size + this->microIdx;
+    if (machInst.vm || elem_mask(v0, ei)) {
+        fault = xc->readMem(EA, Mem.as<uint8_t>(), mem_size,
+                                memAccessFlags, byte_enable);
+        if (fault != NoFault)
+            return fault;
+        %(memacc_code)s; /* Vd[this->microIdx] = Mem[0]; */
+    }
+
+    %(op_wb)s;
+    return fault;
+}
+
+}};
+
+def template VlStrideMicroInitiateAcc {{
+// initiateAcc() of a strided-load micro-op: starts the one-element read.
+Fault
+%(class_name)s::initiateAcc(ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    Fault fault = NoFault;
+    Addr EA;
+
+    %(op_src_decl)s;
+    %(op_rd)s;
+    constexpr uint8_t elem_size = sizeof(Vd[0]);
+    %(ea_code)s; // ea_code depends on elem_size
+    // v0 stays null when vm is set; it is only read for masked ops.
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0 = nullptr;
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+    // A masked-off element still issues the read, with no byte enabled.
+    uint32_t mem_size = elem_size;
+    size_t ei = this->regIdx * VLENB / elem_size + this->microIdx;
+    bool need_load = machInst.vm || elem_mask(v0, ei);
+    const std::vector<bool> byte_enable(mem_size, need_load);
+    fault = initiateMemRead(xc, EA, mem_size, memAccessFlags, byte_enable);
+    return fault;
+}
+
+}};
+
+def template VlStrideMicroCompleteAcc {{
+// completeAcc() of a strided load: merge the loaded element into old vd.
+Fault
+%(class_name)s::completeAcc(PacketPtr pkt, ExecContext *xc,
+                            trace::InstRecord *traceData) const
+{
+    %(op_decl)s;
+    %(op_rd)s;
+
+    constexpr uint8_t elem_size = sizeof(Vd[0]);
+
+    RiscvISA::vreg_t old_vd;
+    decltype(Vd) old_Vd = nullptr;
+    // We treat agnostic as undisturbed: source operand 2 is the old vd
+    xc->getRegOperand(this, 2, &old_vd);
+    old_Vd = old_vd.as<std::remove_reference_t<decltype(Vd[0])> >();
+    // v0 stays null when vm is set; it is only read for masked ops.
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0 = nullptr;
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+    // Either branch copies all VLENB bytes of the old register into Vd.
+    if (microIdx == 0) {
+        // treat vma as vmu
+        // if (machInst.vtype8.vma == 0)
+        memcpy(Vd, old_Vd, microVl * elem_size);
+        // treat vta as vtu
+        // if (machInst.vtype8.vta == 0)
+        memcpy(Vd + microVl, old_Vd + microVl, VLENB - microVl * elem_size);
+    } else {
+        memcpy(Vd, old_Vd, VLENB);
+    }
+
+    size_t ei = this->regIdx * VLENB / sizeof(Vd[0]) + this->microIdx;
+    if (machInst.vm || elem_mask(v0, ei)) {
+        memcpy(Mem.as<uint8_t>(), pkt->getPtr<uint8_t>(), pkt->getSize());
+        %(memacc_code)s; /* Vd[this->microIdx] = Mem[0]; */
+    }
+
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VsStrideConstructor {{
+// Strided store macro-op: one store micro-op per element, vreg by vreg.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    // Elements that fit in one vector register at this EEW.
+    const int32_t vreg_elems = VLEN / width_EEW(_machInst.width);
+    int32_t elems_left = this->vl;
+    StaticInstPtr microop;
+
+    // vl == 0: keep the micro-op list non-empty with a single nop.
+    if (elems_left == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int reg = 0; elems_left > 0; ++reg, elems_left -= vreg_elems) {
+        // The last register may be only partially filled.
+        const int32_t micro_vl = std::min(elems_left, vreg_elems);
+        for (int elem = 0; elem < micro_vl; ++elem) {
+            microop = new %(class_name)sMicro(machInst, reg, elem, micro_vl);
+            microop->setFlag(IsDelayedCommit);
+            microop->setFlag(IsStore);
+            this->microops.push_back(microop);
+        }
+    }
+
+    this->microops.front()->setFlag(IsFirstMicroop);
+    this->microops.back()->setFlag(IsLastMicroop);
+    this->flags[IsVector] = true;
+}
+
+}};
+
+def template VsStrideMicroDeclare {{
+// Micro-op class of a strided store; it has no destination registers.
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // Sources in order: rs1, rs2, vs3, v0 (v0 only when vm is clear)
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[0];
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _regIdx, uint8_t _microIdx,
+            uint8_t _microVl)
+        : %(base_class)s("%(mnemonic)s""_micro", _machInst, %(op_class)s,
+            _regIdx, _microIdx, _microVl)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs2]);
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _regIdx]);
+        if (!_machInst.vm) {
+            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+        }
+        this->flags[IsStore] = true;
+    }
+
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                      trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VsStrideMicroExecute {{
+// execute() of a strided-store micro-op: writes at most one element.
+Fault
+%(class_name)s::execute(ExecContext *xc, trace::InstRecord *traceData) const
+{
+    Fault fault = NoFault;
+    Addr EA;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    constexpr uint8_t elem_size = sizeof(Vs3[0]);
+    %(ea_code)s;
+    // v0 stays null when vm is set; it is only read for masked ops.
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0 = nullptr;
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    uint32_t mem_size = elem_size;
+    const std::vector<bool> byte_enable(mem_size, true);
+    // ei: position of this element in the v0 mask bits.
+    size_t ei = this->regIdx * VLENB / elem_size + this->microIdx;
+    if (machInst.vm || elem_mask(v0, ei)) {
+        %(memacc_code)s;
+        fault = xc->writeMem(Mem.as<uint8_t>(), mem_size, EA,
+                             memAccessFlags, nullptr, byte_enable);
+    }
+    return fault;
+}
+
+}};
+
+def template VsStrideMicroInitiateAcc {{
+// initiateAcc() of a strided-store micro-op: writes at most one element.
+Fault
+%(class_name)s::initiateAcc(ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    Fault fault = NoFault;
+    Addr EA;
+    // v0 stays null when vm is set; it is only read for masked ops.
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0 = nullptr;
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    %(op_decl)s;
+    %(op_rd)s;
+    constexpr uint8_t elem_size = sizeof(Vs3[0]);
+    %(ea_code)s;
+    // Nothing is written when the element is masked off.
+    uint32_t mem_size = elem_size;
+    size_t ei = this->regIdx * VLENB / elem_size + this->microIdx;
+    bool need_store = machInst.vm || elem_mask(v0, ei);
+    if (need_store) {
+        const std::vector<bool> byte_enable(mem_size, need_store);
+        %(memacc_code)s;
+        fault = xc->writeMem(Mem.as<uint8_t>(), mem_size, EA,
+                            memAccessFlags, nullptr, byte_enable);
+    }
+    return fault;
+}
+
+}};
+
+def template VsStrideMicroCompleteAcc {{
+// completeAcc() of a strided-store micro-op: no work, always NoFault.
+Fault
+%(class_name)s::completeAcc(PacketPtr pkt, ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    return NoFault;
+}
+
+}};
+
+def template VlIndexConstructor {{
+// Indexed load macro-op: one load micro-op per element.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    // Byte widths of a data (vd) element and of an index (vs2) element.
+    const uint32_t vd_eewb = sizeof(ElemType);
+    const uint32_t vs2_eewb = width_EEW(_machInst.width) / 8;
+    const uint8_t vd_split_num = (vs2_eewb + vd_eewb - 1) / vd_eewb;
+    const uint8_t vs2_split_num = (vd_eewb + vs2_eewb - 1) / vs2_eewb;
+    const int32_t micro_vlmax = VLENB / std::max(vd_eewb, vs2_eewb);
+    int32_t elems_left = this->vl;
+    StaticInstPtr microop;
+
+    if (elems_left == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (uint8_t chunk = 0; elems_left > 0; chunk++) {
+        const int32_t micro_vl = std::min(elems_left, micro_vlmax);
+        const uint8_t vdRegIdx = chunk / vd_split_num;
+        const uint8_t vs2RegIdx = chunk / vs2_split_num;
+        for (uint8_t j = 0; j < micro_vl; ++j) {
+            uint8_t vdElemIdx = j + micro_vlmax * (chunk % vd_split_num);
+            uint8_t vs2ElemIdx = j + micro_vlmax * (chunk % vs2_split_num);
+            microop = new %(class_name)sMicro<ElemType>(machInst,
+                vdRegIdx, vdElemIdx, vs2RegIdx, vs2ElemIdx);
+            microop->setFlag(IsDelayedCommit);
+            microop->setFlag(IsLoad);
+            this->microops.push_back(microop);
+        }
+        elems_left -= micro_vlmax;
+    }
+
+    this->microops.front()->setFlag(IsFirstMicroop);
+    this->microops.back()->setFlag(IsLastMicroop);
+    this->flags[IsVector] = true;
+}
+
+}};
+
+def template VlIndexMicroDeclare {{
+// Micro-op class of an indexed load (destination is vd + vdRegIdx).
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // Sources in order: rs1, vs2, old vd, v0 (v0 only when vm is clear)
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+public:
+    %(class_name)s(ExtMachInst _machInst,
+        uint8_t _vdRegIdx, uint8_t _vdElemIdx,
+        uint8_t _vs2RegIdx, uint8_t _vs2ElemIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
+        _vdRegIdx, _vdElemIdx, _vs2RegIdx, _vs2ElemIdx)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _vdRegIdx]);
+        _numTypedDestRegs[VecRegClass]++;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2 + _vs2RegIdx]);
+        // Agnostic is treated as undisturbed, so the old vd is a source
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _vdRegIdx]);
+        if (!_machInst.vm) {
+            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+        }
+        this->flags[IsLoad] = true;
+    }
+
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                      trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VlIndexMicroExecute {{
+// execute() of an indexed-load micro-op: reads at most one element.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext *xc,
+    trace::InstRecord *traceData) const
+{
+    using vu = std::make_unsigned_t<ElemType>;
+    Fault fault = NoFault;
+    Addr EA;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+    constexpr uint8_t elem_size = sizeof(Vd[0]);
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0 = nullptr; // stays null when vm is set; read only if masked
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    uint32_t mem_size = elem_size;
+    const std::vector<bool> byte_enable(mem_size, true);
+    // ei: position of this element in the v0 mask bits.
+    size_t ei = this->vdRegIdx * VLENB / elem_size + this->vdElemIdx;
+    if (machInst.vm || elem_mask(v0, ei)) {
+        fault = xc->readMem(EA, Mem.as<uint8_t>(), mem_size,
+                                memAccessFlags, byte_enable);
+        if (fault != NoFault)
+            return fault;
+        %(memacc_code)s; /* Vd[this->vdElemIdx] = Mem[0]; */
+    }
+
+    %(op_wb)s;
+    return fault;
+}
+
+}};
+
+def template VlIndexMicroInitiateAcc {{
+// initiateAcc() of an indexed-load micro-op: starts the one-element read.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::initiateAcc(ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    using vu = std::make_unsigned_t<ElemType>;
+    Fault fault = NoFault;
+    Addr EA;
+
+    %(op_src_decl)s;
+    %(op_rd)s;
+    constexpr uint8_t elem_size = sizeof(Vd[0]);
+    %(ea_code)s; // ea_code depends on elem_size
+    // v0 stays null when vm is set; it is only read for masked ops.
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0 = nullptr;
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+    // A masked-off element still issues the read, with no byte enabled.
+    uint32_t mem_size = elem_size;
+    size_t ei = this->vdRegIdx * VLENB / elem_size + this->vdElemIdx;
+    bool need_load = machInst.vm || elem_mask(v0, ei);
+    const std::vector<bool> byte_enable(mem_size, need_load);
+    fault = initiateMemRead(xc, EA, mem_size, memAccessFlags, byte_enable);
+    return fault;
+}
+
+}};
+
+def template VlIndexMicroCompleteAcc {{
+// completeAcc() of an indexed load: merge the loaded element into old vd.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::completeAcc(PacketPtr pkt, ExecContext *xc,
+                            trace::InstRecord *traceData) const
+{
+    using vu = std::make_unsigned_t<ElemType>;
+    %(op_decl)s;
+    %(op_rd)s;
+
+    constexpr uint8_t elem_size = sizeof(Vd[0]);
+
+    RiscvISA::vreg_t old_vd;
+    decltype(Vd) old_Vd = nullptr;
+    // We treat agnostic as undisturbed: source operand 2 is the old vd
+    xc->getRegOperand(this, 2, &old_vd);
+    old_Vd = old_vd.as<std::remove_reference_t<decltype(Vd[0])> >();
+    // v0 stays null when vm is set; it is only read for masked ops.
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0 = nullptr;
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    memcpy(Vd, old_Vd, VLENB);
+
+    size_t ei = this->vdRegIdx * VLENB / elem_size + this->vdElemIdx;
+    if (machInst.vm || elem_mask(v0, ei)) {
+        memcpy(Mem.as<uint8_t>(), pkt->getPtr<uint8_t>(), pkt->getSize());
+        %(memacc_code)s; /* Vd[this->vdElemIdx] = Mem[0]; */
+    }
+
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VsIndexConstructor {{
+// Indexed store macro-op: one store micro-op per element.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    // Byte widths of a data (vs3) element and of an index (vs2) element.
+    const uint32_t vs3_eewb = sizeof(ElemType);
+    const uint32_t vs2_eewb = width_EEW(_machInst.width) / 8;
+    const uint8_t vs3_split_num = (vs2_eewb + vs3_eewb - 1) / vs3_eewb;
+    const uint8_t vs2_split_num = (vs3_eewb + vs2_eewb - 1) / vs2_eewb;
+    const int32_t micro_vlmax = VLENB / std::max(vs3_eewb, vs2_eewb);
+    int32_t elems_left = this->vl;
+    StaticInstPtr microop;
+
+    if (elems_left == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (uint8_t chunk = 0; elems_left > 0; chunk++) {
+        const int32_t micro_vl = std::min(elems_left, micro_vlmax);
+        const uint8_t vs3RegIdx = chunk / vs3_split_num;
+        const uint8_t vs2RegIdx = chunk / vs2_split_num;
+        for (uint8_t j = 0; j < micro_vl; ++j) {
+            uint8_t vs3ElemIdx = j + micro_vlmax * (chunk % vs3_split_num);
+            uint8_t vs2ElemIdx = j + micro_vlmax * (chunk % vs2_split_num);
+            microop = new %(class_name)sMicro<ElemType>(machInst,
+                vs3RegIdx, vs3ElemIdx, vs2RegIdx, vs2ElemIdx);
+            microop->setFlag(IsDelayedCommit);
+            microop->setFlag(IsStore);
+            this->microops.push_back(microop);
+        }
+        elems_left -= micro_vlmax;
+    }
+
+    this->microops.front()->setFlag(IsFirstMicroop);
+    this->microops.back()->setFlag(IsLastMicroop);
+    this->flags[IsVector] = true;
+}
+
+}};
+
+def template VsIndexMicroDeclare {{
+// Micro-op class of an indexed store; it has no destination registers.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // Sources in order: rs1, vs2, vs3, v0 (v0 only when vm is clear)
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[0];
+public:
+    %(class_name)s(ExtMachInst _machInst,
+        uint8_t _vs3RegIdx, uint8_t _vs3ElemIdx,
+        uint8_t _vs2RegIdx, uint8_t _vs2ElemIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
+        _vs3RegIdx, _vs3ElemIdx, _vs2RegIdx, _vs2ElemIdx)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2 + _vs2RegIdx]);
+        // vs3 + vs3RegIdx is registered as the third source operand
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _vs3RegIdx]);
+        if (!_machInst.vm) {
+            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+        }
+        this->flags[IsStore] = true;
+    }
+
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                      trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VsIndexMicroExecute {{
+// execute() of an indexed-store micro-op: writes at most one element.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext *xc,
+    trace::InstRecord *traceData) const
+{
+    using vu = std::make_unsigned_t<ElemType>;
+    Fault fault = NoFault;
+    Addr EA;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+    constexpr uint8_t elem_size = sizeof(Vs3[0]);
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0 = nullptr; // stays null when vm is set; read only if masked
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    uint32_t mem_size = elem_size;
+    const std::vector<bool> byte_enable(mem_size, true);
+    // ei: position of this element in the v0 mask bits.
+    size_t ei = this->vs3RegIdx * VLENB / elem_size + this->vs3ElemIdx;
+    if (machInst.vm || elem_mask(v0, ei)) {
+        %(memacc_code)s; /* Mem[0] = Vs3[this->vs3ElemIdx] */
+        fault = xc->writeMem(Mem.as<uint8_t>(), mem_size, EA,
+                             memAccessFlags, nullptr, byte_enable);
+    }
+    return fault;
+}
+
+}};
+
+def template VsIndexMicroInitiateAcc {{
+// initiateAcc() of an indexed-store micro-op: writes at most one element.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::initiateAcc(ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    using vu = std::make_unsigned_t<ElemType>;
+    Fault fault = NoFault;
+    Addr EA;
+
+    %(op_src_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+    constexpr uint8_t elem_size = sizeof(Vs3[0]);
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0 = nullptr; // stays null when vm is set; read only if masked
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    constexpr uint8_t mem_size = elem_size;
+    const std::vector<bool> byte_enable(mem_size, true);
+    // ei: position of this element in the v0 mask bits.
+    size_t ei = this->vs3RegIdx * VLENB / elem_size + this->vs3ElemIdx;
+    if (machInst.vm || elem_mask(v0, ei)) {
+        %(memacc_code)s; /* Mem[0] = Vs3[this->vs3ElemIdx] */
+        fault = xc->writeMem(Mem.as<uint8_t>(), mem_size, EA,
+                             memAccessFlags, nullptr, byte_enable);
+    }
+    return fault;
+}
+
+}};
+
+def template VsIndexMicroCompleteAcc {{
+// completeAcc() of an indexed-store micro-op: no work, always NoFault.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::completeAcc(PacketPtr pkt, ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    return NoFault;
+}
+
+}};
+
+def template VMemTemplateDecodeBlock {{
+// Pick the ElemType instantiation that matches the vtype.vsew encoding.
+switch(machInst.vtype8.vsew) {
+    // vsew 0: 8-bit elements
+    case 0b000:
+        return new %(class_name)s<uint8_t>(machInst);
+    // vsew 1: 16-bit elements
+    case 0b001:
+        return new %(class_name)s<uint16_t>(machInst);
+    // vsew 2: 32-bit elements
+    case 0b010:
+        return new %(class_name)s<uint32_t>(machInst);
+    // vsew 3: 64-bit elements
+    case 0b011:
+        return new %(class_name)s<uint64_t>(machInst);
+    default: GEM5_UNREACHABLE;
+}
+
+}};
diff --git a/src/arch/riscv/utility.hh b/src/arch/riscv/utility.hh
index e0a8494ece..1db6d6df3b 100644
--- a/src/arch/riscv/utility.hh
+++ b/src/arch/riscv/utility.hh
@@ -241,6 +241,61 @@ remu(T rs1, T rs2)
     return (rs2 == 0) ? rs1 : rs1 % rs2;
 }
 
+/*
+* Compute VLMAX from vtype. The 3-bit vlmul field is sign-extended to lmul:
+*     LMUL    vlmul    lmul
+*      1       000       0
+*      2       001       1
+*      4       010       2
+*      8       011       3
+*      -       100       -
+*     1/8      101      -3
+*     1/4      110      -2
+*     1/2      111      -1
+* With per_reg set, lmul is clamped to at most 0 (i.e. no more than LMUL = 1).
+* Then VLMAX = vlen >> (vsew + 3 - lmul), where SEW = 8 << vsew,
+* e.g. vlen = 256 bits, SEW = 16, LMUL = 1/8
+*      => VLMAX = vlen >> (1 + 3 - (-3))
+*               = 256 >> 7
+*               = 2
+* Ref: https://github.com/qemu/qemu/blob/5e9d14f2/target/riscv/cpu.h
+*/
+inline uint64_t
+vtype_VLMAX(const uint64_t vtype, const bool per_reg = false)
+{
+    int64_t lmul = (int64_t)sext<3>(bits(vtype, 2, 0));  // vtype[2:0], signed
+    lmul = per_reg ? std::min<int64_t>(0, lmul) : lmul;
+    int64_t vsew = bits(vtype, 5, 3);  // vtype[5:3]
+    return gem5::RiscvISA::VLEN >> (vsew + 3 - lmul);
+}
+
+inline uint64_t
+width_EEW(uint64_t width)  // map a width encoding to an EEW of 8/16/32/64
+{
+    switch (width) {
+    case 0b000: return 8;
+    case 0b101: return 16;
+    case 0b110: return 32;
+    case 0b111: return 64;
+    default: GEM5_UNREACHABLE;  // no other width encodings are handled
+    }
+}
+
+/*
+ * Return bit `index` of the mask image `vs`, viewed as an array of T words.
+ * See spec section 4.5; modelled on the helper in QEMU:
+ * https://github.com/qemu/qemu/blob/c7d773ae/target/riscv/vector_helper.c
+ */
+template<typename T>
+inline int
+elem_mask(const T* vs, const int index)
+{
+    static_assert(std::is_integral_v<T>);
+    int idx = index / (sizeof(T)*8);  // which T word holds the bit
+    int pos = index % (sizeof(T)*8);  // bit position inside that word
+    return (vs[idx] >> pos) & 1;
+}
+
 } // namespace RiscvISA
 } // namespace gem5
 

From a9f9c4d6d3850308697632b585c5c4bfad9fe7b6 Mon Sep 17 00:00:00 2001
From: Xuan Hu <huxuan@bosc.ac.cn>
Date: Mon, 27 Feb 2023 21:31:22 +0800
Subject: [PATCH 06/10] arch-riscv: Add risc-v vector ext v1.0 arith insts
 support

TODOs:
  + vcompress.vm

Change-Id: I86eceae66e90380416fd3be2c10ad616512b5eba
Co-authored-by: Yang Liu <numbksco@gmail.com>
Co-authored-by: Fan Yang <1209202421@qq.com>
Co-authored-by: Jerin Joy <joy@rivosinc.com>

arch-riscv: Add LICENCE to template files

Change-Id: I825e72bffb84cce559d2e4c1fc2246c3b05a1243
---
 src/arch/riscv/insts/vector.cc                |  107 +
 src/arch/riscv/insts/vector.hh                |  200 ++
 src/arch/riscv/isa/decoder.isa                | 2054 +++++++++++++++++
 src/arch/riscv/isa/formats/formats.isa        |    1 +
 src/arch/riscv/isa/formats/vector_arith.isa   | 1319 +++++++++++
 src/arch/riscv/isa/templates/templates.isa    |   30 +
 src/arch/riscv/isa/templates/vector_arith.isa | 1989 ++++++++++++++++
 src/arch/riscv/isa/templates/vector_mem.isa   |   28 +
 src/arch/riscv/regs/float.hh                  |   14 +
 src/arch/riscv/utility.hh                     |  481 ++++
 10 files changed, 6223 insertions(+)
 create mode 100644 src/arch/riscv/isa/formats/vector_arith.isa
 create mode 100644 src/arch/riscv/isa/templates/vector_arith.isa

diff --git a/src/arch/riscv/insts/vector.cc b/src/arch/riscv/insts/vector.cc
index f2bde629e9..a1ccf402c9 100644
--- a/src/arch/riscv/insts/vector.cc
+++ b/src/arch/riscv/insts/vector.cc
@@ -122,6 +122,93 @@ VConfOp::generateZimmDisassembly() const
     return s.str();
 }
 
+std::string
+VectorNonSplitInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss;  // builds "<mnemonic> <dest0>, <src0>[, v0.t]"
+    ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+        << registerName(srcRegIdx(0));
+    if (machInst.vm == 0) ss << ", v0.t";  // vm == 0: masked form
+    return ss.str();  // pc and symtab are not used
+}
+
+std::string VectorArithMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss;  // "<mnemonic> <dest0>, <operands>[, v0.t]"
+    ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
+    if (machInst.funct3 == 0x3) {
+        // OPIVI: print source 0 followed by the immediate (vecimm)
+      ss  << registerName(srcRegIdx(0)) << ", " << machInst.vecimm;
+    } else {  // all other funct3 values: source 1, then source 0
+      ss  << registerName(srcRegIdx(1)) << ", " << registerName(srcRegIdx(0));
+    }
+    if (machInst.vm == 0) ss << ", v0.t";  // vm == 0: masked form
+    return ss.str();
+}
+
+std::string VectorArithMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss;  // "<mnemonic> <dest0>, <operands>[, v0.t]"
+    ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
+    if (machInst.funct3 == 0x3) {
+        // OPIVI: print source 0 followed by the immediate (vecimm)
+      ss  << registerName(srcRegIdx(0)) << ", " << machInst.vecimm;
+    } else {  // all other funct3 values: source 1, then source 0
+      ss  << registerName(srcRegIdx(1)) << ", " << registerName(srcRegIdx(0));
+    }
+    if (machInst.vm == 0) ss << ", v0.t";  // vm == 0: masked form
+    return ss.str();
+}
+
+std::string VectorVMUNARY0MicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss;  // "<mnemonic> <dest0>[, v0.t]"; no sources printed
+    ss << mnemonic << ' ' << registerName(destRegIdx(0));
+    if (machInst.vm == 0) ss << ", v0.t";  // vm == 0: masked form
+    return ss.str();
+}
+
+std::string VectorVMUNARY0MacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss;  // "<mnemonic> <dest0>[, v0.t]"; no sources printed
+    ss << mnemonic << ' ' << registerName(destRegIdx(0));
+    if (machInst.vm == 0) ss << ", v0.t";  // vm == 0: masked form
+    return ss.str();
+}
+
+std::string VectorSlideMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss;  // "<mnemonic> <dest0>, <operands>[, v0.t]"
+    ss << mnemonic << ' ' << registerName(destRegIdx(0)) <<  ", ";
+    if (machInst.funct3 == 0x3) {  // immediate form: source 0, then vecimm
+      ss  << registerName(srcRegIdx(0)) << ", " << machInst.vecimm;
+    } else {  // all other funct3 values: source 1, then source 0
+      ss  << registerName(srcRegIdx(1)) << ", " << registerName(srcRegIdx(0));
+    }
+    if (machInst.vm == 0) ss << ", v0.t";  // vm == 0: masked form
+    return ss.str();
+}
+
+std::string VectorSlideMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss;  // "<mnemonic> <dest0>, <operands>[, v0.t]"
+    ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
+    if (machInst.funct3 == 0x3) {  // immediate form: source 0, then vecimm
+      ss  << registerName(srcRegIdx(0)) << ", " << machInst.vecimm;
+    } else {  // all other funct3 values: source 1, then source 0
+      ss  << registerName(srcRegIdx(1)) << ", " << registerName(srcRegIdx(0));
+    }
+    if (machInst.vm == 0) ss << ", v0.t";  // vm == 0: masked form
+    return ss.str();
+}
+
 std::string VleMicroInst::generateDisassembly(Addr pc,
         const loader::SymbolTable *symtab) const
 {
@@ -295,5 +382,25 @@ std::string VsIndexMicroInst::generateDisassembly(Addr pc,
     return ss.str();
 }
 
+std::string
+VMvWholeMacroInst::generateDisassembly(Addr pc,
+    const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss;  // "<mnemonic> <dest0>, <src1>"; no mask suffix
+    ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", " <<
+        registerName(srcRegIdx(1));  // NOTE(review): source 1, not 0 - confirm
+    return ss.str();
+}
+
+std::string
+VMvWholeMicroInst::generateDisassembly(Addr pc,
+    const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss;  // "<mnemonic> <dest0>, <src1>"; no mask suffix
+    ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", " <<
+        registerName(srcRegIdx(1));  // NOTE(review): source 1, not 0 - confirm
+    return ss.str();
+}
+
 } // namespace RiscvISA
 } // namespace gem5
diff --git a/src/arch/riscv/insts/vector.hh b/src/arch/riscv/insts/vector.hh
index f989d7ffbf..5d0874a994 100644
--- a/src/arch/riscv/insts/vector.hh
+++ b/src/arch/riscv/insts/vector.hh
@@ -89,6 +89,24 @@ inline uint8_t checked_vtype(bool vill, uint8_t vtype) {
     return vtype;
 }
 
+class VectorNonSplitInst : public RiscvStaticInst
+{   // Vector instruction base built directly on RiscvStaticInst.
+  protected:
+    uint32_t vl;    // initialised from _machInst.vl
+    uint8_t vtype;  // checked_vtype(_machInst.vill, _machInst.vtype8)
+    VectorNonSplitInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : RiscvStaticInst(mnem, _machInst, __opClass),
+        vl(_machInst.vl),
+        vtype(checked_vtype(_machInst.vill, _machInst.vtype8))
+    {
+        this->flags[IsVector] = true;  // tag the instruction as a vector op
+    }
+    // Overrides the base-class disassembly hook.
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
 class VectorMacroInst : public RiscvMacroInst
 {
   protected:
@@ -170,6 +188,63 @@ class VectorArithMacroInst : public VectorMacroInst
             Addr pc, const loader::SymbolTable *symtab) const override;
 };
 
+class VectorVMUNARY0MicroInst : public VectorMicroInst
+{   // Micro-op base that only adds its own disassembly.
+protected:
+    VectorVMUNARY0MicroInst(const char *mnem, ExtMachInst _machInst,
+                         OpClass __opClass, uint8_t _microVl,
+                         uint8_t _microIdx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {}  // all arguments are forwarded unchanged to VectorMicroInst
+    // Overrides the base-class disassembly hook.
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VectorVMUNARY0MacroInst : public VectorMacroInst
+{   // Macro-op base that sets IsVector and adds its own disassembly.
+  protected:
+    VectorVMUNARY0MacroInst(const char* mnem, ExtMachInst _machInst,
+                         OpClass __opClass)
+        : VectorMacroInst(mnem, _machInst, __opClass)
+    {
+        this->flags[IsVector] = true;  // tag the instruction as a vector op
+    }
+    // Overrides the base-class disassembly hook.
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VectorSlideMacroInst : public VectorMacroInst
+{   // Macro-op base that sets IsVector and adds its own disassembly.
+  protected:
+    VectorSlideMacroInst(const char* mnem, ExtMachInst _machInst,
+                         OpClass __opClass)
+        : VectorMacroInst(mnem, _machInst, __opClass)
+    {
+        this->flags[IsVector] = true;  // tag the instruction as a vector op
+    }
+    // Overrides the base-class disassembly hook.
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VectorSlideMicroInst : public VectorMicroInst
+{   // Micro-op base that additionally records two register indices.
+  protected:
+    uint8_t vdIdx;   // copied from the _vdIdx constructor argument
+    uint8_t vs2Idx;  // copied from the _vs2Idx constructor argument
+    VectorSlideMicroInst(const char *mnem, ExtMachInst _machInst,
+                         OpClass __opClass, uint8_t _microVl,
+                         uint8_t _microIdx, uint8_t _vdIdx, uint8_t _vs2Idx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+        , vdIdx(_vdIdx), vs2Idx(_vs2Idx)
+    {}
+    // Overrides the base-class disassembly hook.
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
 class VectorMemMicroInst : public VectorMicroInst
 {
   protected:
@@ -421,6 +496,131 @@ class VsIndexMicroInst : public VectorMemMicroInst
         Addr pc, const loader::SymbolTable *symtab) const override;
 };
 
+class VMvWholeMacroInst : public VectorArithMacroInst
+{   // Arithmetic macro-op variant that replaces only the disassembly.
+  protected:
+    VMvWholeMacroInst(const char* mnem, ExtMachInst _machInst,
+                         OpClass __opClass)
+        : VectorArithMacroInst(mnem, _machInst, __opClass)
+    {}  // all arguments are forwarded unchanged to VectorArithMacroInst
+    // Overrides the disassembly inherited from VectorArithMacroInst.
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VMvWholeMicroInst : public VectorArithMicroInst
+{   // Arithmetic micro-op variant that replaces only the disassembly.
+  protected:
+    VMvWholeMicroInst(const char *mnem, ExtMachInst _machInst,
+                         OpClass __opClass, uint8_t _microVl,
+                         uint8_t _microIdx)
+        : VectorArithMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {}  // all arguments are forwarded unchanged to VectorArithMicroInst
+    // Overrides the disassembly inherited from VectorArithMicroInst.
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+template<typename ElemType>
+class VMaskMergeMicroInst : public VectorArithMicroInst
+{
+  private:
+    RegId srcRegIdxArr[NumVecInternalRegs];
+    RegId destRegIdxArr[1];
+    // Merges mask-bit chunks from several source registers into one dest.
+  public:
+    VMaskMergeMicroInst(ExtMachInst extMachInst, uint8_t _dstReg,
+        uint8_t _numSrcs)
+        : VectorArithMicroInst("vmask_mv_micro", extMachInst,
+          VectorIntegerArithOp, 0, 0)
+    {
+        setRegIdxArrays(
+            reinterpret_cast<RegIdArrayPtr>(
+                &std::remove_pointer_t<decltype(this)>::srcRegIdxArr),
+            reinterpret_cast<RegIdArrayPtr>(
+                &std::remove_pointer_t<decltype(this)>::destRegIdxArr));
+        // Register counts are rebuilt from zero below.
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        // One vector destination; sources start at VecMemInternalReg0.
+        setDestRegIdx(_numDestRegs++, vecRegClass[_dstReg]);
+        _numTypedDestRegs[VecRegClass]++;
+        for (uint8_t i=0; i<_numSrcs; i++) {
+            setSrcRegIdx(_numSrcRegs++, vecRegClass[VecMemInternalReg0 + i]);
+        }
+    }
+    // Source i supplies elems_per_vreg bits starting at bit i * elems_per_vreg.
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)
+            const override {
+        vreg_t tmp_d0 = *(vreg_t *)xc->getWritableRegOperand(this, 0);
+        auto Vd = tmp_d0.as<uint8_t>();  // byte view of the local dest copy
+        constexpr uint8_t elems_per_vreg = VLENB / sizeof(ElemType);
+        size_t bit_cnt = elems_per_vreg;  // bit offset of source 1's chunk
+        vreg_t tmp_s;
+        xc->getRegOperand(this, 0, &tmp_s);
+        auto s = tmp_s.as<uint8_t>();
+        // Copy source 0 whole: it provides the first chunk and the tail.
+        memcpy(Vd, s, VLENB);
+        for (uint8_t i = 1; i < this->_numSrcRegs; i++) {
+            xc->getRegOperand(this, i, &tmp_s);
+            s = tmp_s.as<uint8_t>();
+            if constexpr (elems_per_vreg < 8) {  // chunk is part of one byte
+                constexpr uint8_t m = (1 << elems_per_vreg) - 1;
+                const uint8_t mask = m << (i * elems_per_vreg % 8);
+                // Clear this chunk's bits in Vd, then take them from source i.
+                Vd[bit_cnt/8] ^= Vd[bit_cnt/8] & mask;
+                Vd[bit_cnt/8] |= s[bit_cnt/8] & mask;
+                bit_cnt += elems_per_vreg;
+            } else {  // chunk is a whole number of bytes: copy them directly
+                constexpr uint8_t byte_offset = elems_per_vreg / 8;
+                memcpy(Vd + i * byte_offset, s + i * byte_offset, byte_offset);
+            }
+        }
+        xc->setRegOperand(this, 0, &tmp_d0);  // write the merged copy back
+        if (traceData)
+            traceData->setData(vecRegClass, &tmp_d0);
+        return NoFault;
+    }
+    // Prints mnemonic, dest 0, every source, then "offset:<elems per reg>".
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << registerName(destRegIdx(0));
+        for (uint8_t i = 0; i < this->_numSrcRegs; i++) {
+            ss << ", " << registerName(srcRegIdx(i));
+        }
+        ss << ", offset:" << VLENB / sizeof(ElemType);
+        return ss.str();
+    }
+};
+
+class VxsatMicroInst : public VectorArithMicroInst
+{   // Micro-op that writes the flag behind `vxsat` to VXSAT and VCSR bit 0.
+  private:
+    bool* vxsat;  // dereferenced at execute time; NOTE(review): must outlive us
+  public:
+    VxsatMicroInst(bool* Vxsat, ExtMachInst extMachInst)
+        : VectorArithMicroInst("vxsat_micro", extMachInst,
+          VectorIntegerArithOp, 0, 0)
+    {
+        vxsat = Vxsat;  // stores the pointer, not the current value
+    }
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)
+    const override
+    {
+        xc->setMiscReg(MISCREG_VXSAT,*vxsat);
+        auto vcsr = xc->readMiscReg(MISCREG_VCSR);
+        xc->setMiscReg(MISCREG_VCSR, ((vcsr&~1)|*vxsat));  // replace bit 0 only
+        return NoFault;
+    }
+    std::string generateDisassembly(Addr pc, const loader::SymbolTable *symtab)
+      const override
+    {
+        std::stringstream ss;  // "<mnemonic> VXSAT, 0x1" or "... 0x0"
+        ss << mnemonic << ' ' << "VXSAT" << ", " << (*vxsat ? "0x1" : "0x0");
+        return ss.str();
+    }
+};
 
 } // namespace RiscvISA
 } // namespace gem5
diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa
index 0288f37ad8..2b46752ffe 100644
--- a/src/arch/riscv/isa/decoder.isa
+++ b/src/arch/riscv/isa/decoder.isa
@@ -2281,6 +2281,2060 @@ decode QUADRANT default Unknown::unknown() {
         }
 
         0x15: decode FUNCT3 {
+            // OPIVV
+            0x0: decode VFUNCT6 {
+                format VectorIntFormat {  // element-wise add/sub/min/max/logic
+                    0x0: vadd_vv({{
+                        Vd_vu[i] = Vs2_vu[i] + Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x2: vsub_vv({{
+                        Vd_vu[i] = Vs2_vu[i] - Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x4: vminu_vv({{
+                        Vd_vu[i] = Vs2_vu[i] < Vs1_vu[i] ?
+                                Vs2_vu[i] : Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x5: vmin_vv({{
+                        Vd_vi[i] = Vs2_vi[i] < Vs1_vi[i] ?
+                                Vs2_vi[i] : Vs1_vi[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x6: vmaxu_vv({{
+                        Vd_vu[i] = Vs2_vu[i] > Vs1_vu[i] ?
+                                Vs2_vu[i] : Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x7: vmax_vv({{
+                        Vd_vi[i] = Vs2_vi[i] > Vs1_vi[i] ?
+                                Vs2_vi[i] : Vs1_vi[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x9: vand_vv({{
+                        Vd_vu[i] = Vs2_vu[i] & Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0xa: vor_vv({{
+                        Vd_vu[i] = Vs2_vu[i] | Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0xb: vxor_vv({{
+                        Vd_vu[i] = Vs2_vu[i] ^ Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                }
+                0x0c: VectorGatherFormat::vrgather_vv({{
+                    for (uint32_t i = 0; i < microVl; i++) {
+                        uint32_t ei = i + vs1_idx * vs1_elems + vs1_bias;
+                        if (this->vm || elem_mask(v0, ei)) {
+                            const uint64_t idx = Vs1_vu[i]
+                                - vs2_elems * vs2_idx;  // unsigned: may wrap
+                            auto res = (Vs1_vu[i] >= vlmax) ? 0
+                                : (idx < vs2_elems) ? Vs2_vu[idx]
+                                : Vs3_vu[i];  // index outside this chunk
+                            Vd_vu[i] = res;
+                        }
+                    }
+                }}, OPIVV, VectorMiscOp);
+                0x0e: VectorGatherFormat::vrgatherei16_vv({{
+                    for (uint32_t i = 0; i < microVl; i++) {
+                        uint32_t ei = i + vs1_idx * vs1_elems + vs1_bias;
+                        if (this->vm || elem_mask(v0, ei)) {
+                            const uint16_t idx = Vs1_uh[i + vs1_bias]
+                                - vs2_elems * vs2_idx;  // 16-bit: may wrap
+                            auto res = (Vs1_uh[i + vs1_bias] >= vlmax) ? 0
+                                : (idx < vs2_elems) ? Vs2_vu[idx]
+                                : Vs3_vu[i + vd_bias];  // outside this chunk
+                            Vd_vu[i + vd_bias] = res;
+                        }
+                    }
+                }}, OPIVV, VectorMiscOp);
+                format VectorIntFormat {  // ops that read the mask as data
+                    0x10: decode VM {
+                        0x0: vadc_vvm({{
+                            Vd_vi[i] = Vs2_vi[i] + Vs1_vi[i]
+                                    + elem_mask(v0, ei);
+                        }}, OPIVV, VectorIntegerArithOp);
+                        // the unmasked versions (vm=1) are reserved
+                    }
+                    0x12: decode VM {
+                        0x0: vsbc_vvm({{
+                            Vd_vi[i] = Vs2_vi[i] - Vs1_vi[i]
+                                    - elem_mask(v0, ei);
+                        }}, OPIVV, VectorIntegerArithOp);
+                        // the unmasked versions (vm=1) are reserved
+                    }
+                    0x17: decode VM {
+                        0x0: vmerge_vvm({{
+                            Vd_vu[i] = elem_mask(v0, ei)
+                                    ? Vs1_vu[i]
+                                    : Vs2_vu[i];
+                        }}, OPIVV, VectorIntegerArithOp);
+                        0x1: decode VS2 {  // only VS2 == 0 is decoded here
+                            0x0: vmv_v_v({{
+                                Vd_vu[i] = Vs1_vu[i];
+                            }}, OPIVV, VectorIntegerArithOp);
+                        }
+                    }
+                }
+                format VectorIntVxsatFormat{  // saturating ops using vxsatptr
+                    0x20: vsaddu_vv({{
+                        Vd_vu[i] = sat_addu<vu>(Vs2_vu[i], Vs1_vu[i],
+                            vxsatptr);
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x21: vsadd_vv({{
+                        Vd_vu[i] = sat_add<vi>(Vs2_vu[i], Vs1_vu[i],
+                            vxsatptr);
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x22: vssubu_vv({{
+                        Vd_vu[i] = sat_subu<vu>(Vs2_vu[i], Vs1_vu[i],
+                            vxsatptr);
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x23: vssub_vv({{
+                        Vd_vu[i] = sat_sub<vi>(Vs2_vu[i], Vs1_vu[i],
+                            vxsatptr);
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x27: vsmul_vv({{
+                        vi max = std::numeric_limits<vi>::max();
+                        vi min = std::numeric_limits<vi>::min();
+                        bool overflow = Vs1_vi[i] == Vs2_vi[i] &&
+                                        Vs1_vi[i] == min;  // only min * min
+                        __int128_t result = (__int128_t)Vs1_vi[i] *
+                                            (__int128_t)Vs2_vi[i];
+                        result = int_rounding<__int128_t>(
+                            result, 0 /* TODO: rounding mode fixed */, sew - 1);
+                        result = result >> (sew - 1);
+                        if (overflow) {
+                            result = max;
+                            *vxsatptr = true;
+                        }
+
+                        Vd_vi[i] = (vi)result;
+                    }}, OPIVV, VectorIntegerArithOp);
+                }
+                format VectorIntFormat {  // shifts; amount masked by sew - 1
+                    0x25: vsll_vv({{
+                        Vd_vu[i] = Vs2_vu[i] << (Vs1_vu[i] & (sew - 1));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x28: vsrl_vv({{
+                        Vd_vu[i] = Vs2_vu[i] >> (Vs1_vu[i] & (sew - 1));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x29: vsra_vv({{
+                        Vd_vi[i] = Vs2_vi[i] >> (Vs1_vu[i] & (sew - 1));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x2a: vssrl_vv({{
+                        int sh = Vs1_vu[i] & (sew - 1);
+                        __uint128_t val = Vs2_vu[i];
+                        // rounding uses the mode read from MISCREG_VXRM
+                        val = int_rounding<__uint128_t>(val,
+                            xc->readMiscReg(MISCREG_VXRM), sh);
+                        Vd_vu[i] = val >> sh;
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x2b: vssra_vv({{
+                        int sh = Vs1_vi[i] & (sew - 1);
+                        __int128_t val = Vs2_vi[i];
+                        // rounding uses the mode read from MISCREG_VXRM
+                        val = int_rounding<__int128_t>(val,
+                            xc->readMiscReg(MISCREG_VXRM), sh);
+                        Vd_vi[i] = val >> sh;
+                    }}, OPIVV, VectorIntegerArithOp);
+                }
+                format VectorReduceIntWideningFormat {  // sums into element 0
+                    0x30: vwredsumu_vs({{
+                        Vd_vwu[0] = reduce_loop(std::plus<vwu>(),
+                            Vs1_vwu, Vs2_vu);
+                    }}, OPIVV, VectorIntegerReduceOp);
+                    0x31: vwredsum_vs({{
+                        Vd_vwu[0] = reduce_loop(std::plus<vwi>(),
+                            Vs1_vwi, Vs2_vi);
+                    }}, OPIVV, VectorIntegerReduceOp);
+                }
+                format VectorIntMaskFormat {  // results go to bytes of Vd_ub
+                    0x11: decode VM {  // 0: mask bit is carry-in, 1: none
+                        0x0: vmadc_vvm({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vu[i], Vs1_vu[i],
+                                    elem_mask(v0, ei)));
+                        }}, OPIVV, VectorIntegerArithOp);
+                        0x1: vmadc_vv({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vu[i], Vs1_vu[i]));
+                        }}, OPIVV, VectorIntegerArithOp);
+                    }
+                    0x13: decode VM {  // 0: mask bit is borrow-in, 1: none
+                        0x0: vmsbc_vvm({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                borrow_out(Vs2_vi[i], Vs1_vi[i],
+                                    elem_mask(v0, ei)));
+                        }}, OPIVV, VectorIntegerArithOp);
+                        0x1: vmsbc_vv({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                borrow_out(Vs2_vi[i], Vs1_vi[i]));
+                        }}, OPIVV, VectorIntegerArithOp);
+                    }
+                    0x18: vmseq_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] == Vs1_vu[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x19: vmsne_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] != Vs1_vu[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x1a: vmsltu_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] < Vs1_vu[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x1b: vmslt_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] < Vs1_vi[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x1c: vmsleu_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] <= Vs1_vu[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x1d: vmsle_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] <= Vs1_vi[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                }
+                format VectorIntNarrowingFormat {  // wide vs2 -> narrow dest
+                    0x2c: vnsrl_wv({{
+                        Vd_vu[i + offset] = (vu)(Vs2_vwu[i] >>
+                            ((vwu)Vs1_vu[i + offset] & (sew * 2 - 1)));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x2d: vnsra_wv({{
+                        Vd_vi[i + offset] = (vi)(Vs2_vwi[i] >>
+                            ((vwu)Vs1_vu[i + offset] & (sew * 2 - 1)));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x2e: vnclipu_wv({{
+                        vu max = std::numeric_limits<vu>::max();
+                        uint64_t sign_mask =
+                            std::numeric_limits<uint64_t>::max() << sew;
+                        __uint128_t res = Vs2_vwu[i];
+                        unsigned shift = Vs1_vu[i + offset] & ((sew * 2) - 1);
+
+                        res = int_rounding<__uint128_t>(
+                            res, 0 /* TODO: rounding mode fixed */, shift) >> shift;
+
+                        if (res & sign_mask) {  // bits at or above sew are set
+                            res = max;
+                            // TODO: vxsat
+                        }
+
+                        Vd_vu[i + offset] = (vu)res;
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x2f: vnclip_wv({{
+                        vi max = std::numeric_limits<vi>::max();
+                        vi min = std::numeric_limits<vi>::min();
+                        __int128_t res = Vs2_vwi[i];
+                        unsigned shift = Vs1_vi[i + offset] & ((sew * 2) - 1);
+
+                        res = int_rounding<__int128_t>(
+                            res, 0 /* TODO: rounding mode fixed */, shift) >> shift;
+
+                        if (res < min) {  // clamp to the narrow signed range
+                            res = min;
+                            // TODO: vxsat
+                        } else if (res > max) {
+                            res = max;
+                            // TODO: vxsat
+                        }
+
+                        Vd_vi[i + offset] = (vi)res;
+                    }}, OPIVV, VectorIntegerArithOp);
+                }
+            }
+            // OPFVV
+            0x1: decode VFUNCT6 {
+                0x00: VectorFloatFormat::vfadd_vv({{
+                    auto fd = fadd<et>(ftype<et>(Vs2_vu[i]),
+                                       ftype<et>(Vs1_vu[i]));
+                    Vd_vu[i] = fd.v;
+                }}, OPFVV, VectorFloatArithOp);
+                0x01: VectorReduceFloatFormat::vfredusum_vs({{
+                    Vd_vu[0] = reduce_loop([](const vu& src1, const vu& src2) {
+                        return fadd<et>(ftype<et>(src1), ftype<et>(src2));
+                    }, Vs1_vu, Vs2_vu);
+                }}, OPFVV, VectorFloatReduceOp);
+                0x02: VectorFloatFormat::vfsub_vv({{
+                    auto fd = fsub<et>(ftype<et>(Vs2_vu[i]),
+                                       ftype<et>(Vs1_vu[i]));
+                    Vd_vu[i] = fd.v;
+                }}, OPFVV, VectorFloatArithOp);
+                0x03: VectorReduceFloatFormat::vfredosum_vs({{
+                    Vd_vu[0] = reduce_loop([](const vu& src1, const vu& src2) {
+                        return fadd<et>(ftype<et>(src1), ftype<et>(src2));
+                    }, Vs1_vu, Vs2_vu);
+                }}, OPFVV, VectorFloatReduceOp);
+                0x04: VectorFloatFormat::vfmin_vv({{
+                    auto fd = fmin<et>(ftype<et>(Vs2_vu[i]),
+                                       ftype<et>(Vs1_vu[i]));
+                    Vd_vu[i] = fd.v;
+                }}, OPFVV, VectorFloatArithOp);
+                0x05: VectorReduceFloatFormat::vfredmin_vs({{
+                    Vd_vu[0] = reduce_loop([](const vu& src1, const vu& src2) {
+                        return fmin<et>(ftype<et>(src1), ftype<et>(src2));
+                    }, Vs1_vu, Vs2_vu);
+                }}, OPFVV, VectorFloatReduceOp);
+                0x06: VectorFloatFormat::vfmax_vv({{
+                    auto fd = fmax<et>(ftype<et>(Vs2_vu[i]),
+                                       ftype<et>(Vs1_vu[i]));
+                    Vd_vu[i] = fd.v;
+                }}, OPFVV, VectorFloatArithOp);
+                0x07: VectorReduceFloatFormat::vfredmax_vs({{
+                    Vd_vu[0] = reduce_loop([](const vu& src1, const vu& src2) {
+                        return fmax<et>(ftype<et>(src1), ftype<et>(src2));
+                    }, Vs1_vu, Vs2_vu);
+                }}, OPFVV, VectorFloatReduceOp);
+                0x08: VectorFloatFormat::vfsgnj_vv({{
+                    Vd_vu[i] = fsgnj<et>(ftype<et>(Vs2_vu[i]),
+                                         ftype<et>(Vs1_vu[i]),
+                                         false, false).v;
+                }}, OPFVV, VectorFloatArithOp);
+                0x09: VectorFloatFormat::vfsgnjn_vv({{
+                    Vd_vu[i] = fsgnj<et>(ftype<et>(Vs2_vu[i]),
+                                         ftype<et>(Vs1_vu[i]),
+                                         true, false).v;
+                }}, OPFVV, VectorFloatArithOp);
+                0x0a: VectorFloatFormat::vfsgnjx_vv({{
+                    Vd_vu[i] = fsgnj<et>(ftype<et>(Vs2_vu[i]),
+                                         ftype<et>(Vs1_vu[i]),
+                                         false, true).v;
+                }}, OPFVV, VectorFloatArithOp);
+                // VWFUNARY0
+                0x10: decode VS1 {
+                    0x00: decode VM {
+                        // The encodings corresponding to the masked versions
+                        // (vm=0) of vfmv.f.s are reserved
+                        0x1: VectorNonSplitFormat::vfmv_f_s({{
+                            freg_t fd = freg(Vs2_vu[0]);
+                            Fd_bits = fd.v;
+                        }}, OPFVV, VectorMiscOp);
+                    }
+                }
+                0x12: decode VS1 {
+                    format VectorFloatCvtFormat { // converts; source and dest both use vu
+                        0x00: vfcvt_xu_f_v({{ // float -> unsigned integer
+                            Vd_vu[i] = f_to_ui<et>(ftype<et>(Vs2_vu[i]),
+                                                   softfloat_roundingMode); // current mode
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x01: vfcvt_x_f_v({{ // float -> signed integer
+                            Vd_vu[i] = f_to_i<et>(ftype<et>(Vs2_vu[i]),
+                                                  softfloat_roundingMode); // current mode
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x02: vfcvt_f_xu_v({{ // unsigned integer -> float
+                            auto fd = ui_to_f<et>(Vs2_vu[i]);
+                            Vd_vu[i] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x03: vfcvt_f_x_v({{ // signed integer -> float
+                            auto fd = i_to_f<et>(Vs2_vu[i]);
+                            Vd_vu[i] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x06: vfcvt_rtz_xu_f_v({{ // as vfcvt_xu_f_v, fixed rounding mode
+                            Vd_vu[i] = f_to_ui<et>(ftype<et>(Vs2_vu[i]),
+                                                   softfloat_round_minMag); // toward zero
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x07: vfcvt_rtz_x_f_v({{ // as vfcvt_x_f_v, fixed rounding mode
+                            Vd_vu[i] = f_to_i<et>(ftype<et>(Vs2_vu[i]),
+                                                  softfloat_round_minMag); // toward zero
+                        }}, OPFVV, VectorFloatConvertOp);
+                    } // end VectorFloatCvtFormat
+                    format VectorFloatWideningCvtFormat { // source vu -> destination vwu
+                        0x08: vfwcvt_xu_f_v({{ // float -> wider unsigned integer
+                            Vd_vwu[i] = f_to_wui<et>(
+                                ftype<et>(Vs2_vu[i + offset]), // source index is i + offset
+                                softfloat_roundingMode);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x09: vfwcvt_x_f_v({{ // float -> wider signed integer
+                            Vd_vwu[i] = f_to_wi<et>(
+                                ftype<et>(Vs2_vu[i + offset]),
+                                softfloat_roundingMode);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x0a: vfwcvt_f_xu_v({{ // NOTE(review): <vu> here, <et> elsewhere - confirm
+                            auto fd = ui_to_wf<vu>(Vs2_vu[i + offset]);
+                            Vd_vwu[i] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x0b: vfwcvt_f_x_v({{ // NOTE(review): <vu> here, <et> elsewhere - confirm
+                            auto fd = i_to_wf<vu>(Vs2_vu[i + offset]);
+                            Vd_vwu[i] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x0c: vfwcvt_f_f_v({{ // float -> wider float
+                            auto fd = f_to_wf<et>(
+                                ftype<et>(Vs2_vu[i + offset]));
+                            Vd_vwu[i] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x0e: vfwcvt_rtz_xu_f_v({{ // as vfwcvt_xu_f_v, fixed rounding mode
+                            Vd_vwu[i] = f_to_wui<et>(
+                                ftype<et>(Vs2_vu[i + offset]),
+                                softfloat_round_minMag); // toward zero
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x0f: vfwcvt_rtz_x_f_v({{ // as vfwcvt_x_f_v, fixed rounding mode
+                            Vd_vwu[i] = f_to_wi<et>(
+                                ftype<et>(Vs2_vu[i + offset]),
+                                softfloat_round_minMag); // toward zero
+                        }}, OPFVV, VectorFloatConvertOp);
+                    } // end VectorFloatWideningCvtFormat
+                    format VectorFloatNarrowingCvtFormat { // source vwu -> destination vu
+                        0x10: vfncvt_xu_f_w({{ // wide float -> narrower unsigned integer
+                            Vd_vu[i + offset] = f_to_nui<vu>( // dest index is i + offset
+                                ftype<ewt>(Vs2_vwu[i]),
+                                softfloat_roundingMode);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x11: vfncvt_x_f_w({{ // wide float -> narrower signed integer
+                            Vd_vu[i + offset] = f_to_ni<vu>(
+                                ftype<ewt>(Vs2_vwu[i]),
+                                softfloat_roundingMode);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x12: vfncvt_f_xu_w({{ // wide unsigned integer -> narrower float
+                            auto fd = ui_to_nf<et>(Vs2_vwu[i]);
+                            Vd_vu[i + offset] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x13: vfncvt_f_x_w({{ // wide signed integer -> narrower float
+                            auto fd = i_to_nf<et>(Vs2_vwu[i]);
+                            Vd_vu[i + offset] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x14: vfncvt_f_f_w({{ // wide float -> narrower float
+                            auto fd = f_to_nf<et>(ftype<ewt>(Vs2_vwu[i]));
+                            Vd_vu[i + offset] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x15: vfncvt_rod_f_f_w({{ // as vfncvt_f_f_w with round-to-odd
+                            softfloat_roundingMode = softfloat_round_odd; // not restored here
+                            auto fd = f_to_nf<et>(ftype<ewt>(Vs2_vwu[i]));
+                            Vd_vu[i + offset] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x16: vfncvt_rtz_xu_f_w({{ // as vfncvt_xu_f_w, fixed rounding mode
+                            Vd_vu[i + offset] = f_to_nui<vu>(
+                                ftype<ewt>(Vs2_vwu[i]),
+                                softfloat_round_minMag); // toward zero
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x17: vfncvt_rtz_x_f_w({{ // as vfncvt_x_f_w, fixed rounding mode
+                            Vd_vu[i + offset] = f_to_ni<vu>(
+                                ftype<ewt>(Vs2_vwu[i]),
+                                softfloat_round_minMag); // toward zero
+                        }}, OPFVV, VectorFloatConvertOp);
+                    } // end VectorFloatNarrowingCvtFormat
+                }
+                0x13: decode VS1 { // unary float ops selected by the VS1 field
+                    format VectorFloatCvtFormat { // one source (vs2), same-width result
+                        0x00: vfsqrt_v({{
+                            auto fd = fsqrt<et>(ftype<et>(Vs2_vu[i]));
+                            Vd_vu[i] = fd.v;
+                        }}, OPFVV, VectorFloatArithOp);
+                        0x04: vfrsqrt7_v({{ // presumably 7-bit reciprocal sqrt estimate
+                            auto fd = frsqrte7<et>(ftype<et>(Vs2_vu[i]));
+                            Vd_vu[i] = fd.v;
+                        }}, OPFVV, VectorFloatArithOp);
+                        0x05: vfrec7_v({{ // presumably 7-bit reciprocal estimate
+                            auto fd = frecip7<et>(ftype<et>(Vs2_vu[i]));
+                            Vd_vu[i] = fd.v;
+                        }}, OPFVV, VectorFloatArithOp);
+                        0x10: vfclass_v({{ // stores the .v field of the fclassify result
+                            auto fd = fclassify<et>(ftype<et>(Vs2_vu[i]));
+                            Vd_vu[i] = fd.v;
+                        }}, OPFVV, VectorFloatArithOp);
+                    }
+                } // end decode VS1 for funct6 0x13
+
+                format VectorFloatMaskFormat { // compares; result is one bit per element
+                    0x18: vmfeq_vv({{ // bit (i + offset) lives in byte (i + offset)/8
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            feq<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype<et>(Vs1_vu[i])));
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x19: vmfle_vv({{ // vs2[i] <= vs1[i]
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            fle<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype<et>(Vs1_vu[i])));
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x1b: vmflt_vv({{ // vs2[i] < vs1[i]
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            flt<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype<et>(Vs1_vu[i])));
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x1c: vmfne_vv({{ // computed as the negation of feq
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            !feq<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype<et>(Vs1_vu[i])));
+                    }}, OPFVV, VectorFloatArithOp);
+                } // end VectorFloatMaskFormat
+                format VectorFloatFormat { // fmadd(a, b, c) is presumably a*b + c
+                    0x20: vfdiv_vv({{ // vs2[i] / vs1[i]
+                        auto fd = fdiv<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype<et>(Vs1_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x24: vfmul_vv({{ // vs2[i] * vs1[i]
+                        auto fd = fmul<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype<et>(Vs1_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x28: vfmadd_vv({{ // vs3 is fmadd argument 1, vs2 is argument 3
+                        auto fd = fmadd<et>(ftype<et>(Vs3_vu[i]),
+                                            ftype<et>(Vs1_vu[i]),
+                                            ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x29: vfnmadd_vv({{ // as vfmadd_vv with arguments 1 and 3 negated
+                        auto fd = fmadd<et>(fneg(ftype<et>(Vs3_vu[i])),
+                                            ftype<et>(Vs1_vu[i]),
+                                            fneg(ftype<et>(Vs2_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2a: vfmsub_vv({{ // as vfmadd_vv with argument 3 negated
+                        auto fd = fmadd<et>(ftype<et>(Vs3_vu[i]),
+                                            ftype<et>(Vs1_vu[i]),
+                                            fneg(ftype<et>(Vs2_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2b: vfnmsub_vv({{ // as vfmadd_vv with argument 1 negated
+                        auto fd = fmadd<et>(fneg(ftype<et>(Vs3_vu[i])),
+                                            ftype<et>(Vs1_vu[i]),
+                                            ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2c: vfmacc_vv({{ // vs3 is fmadd argument 3 from here on
+                        auto fd = fmadd<et>(ftype<et>(Vs1_vu[i]),
+                                            ftype<et>(Vs2_vu[i]),
+                                            ftype<et>(Vs3_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2d: vfnmacc_vv({{ // as vfmacc_vv with arguments 1 and 3 negated
+                        auto fd = fmadd<et>(fneg(ftype<et>(Vs1_vu[i])),
+                                            ftype<et>(Vs2_vu[i]),
+                                            fneg(ftype<et>(Vs3_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2e: vfmsac_vv({{ // as vfmacc_vv with argument 3 negated
+                        auto fd = fmadd<et>(ftype<et>(Vs1_vu[i]),
+                                            ftype<et>(Vs2_vu[i]),
+                                            fneg(ftype<et>(Vs3_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2f: vfnmsac_vv({{ // as vfmacc_vv with argument 1 negated
+                        auto fd = fmadd<et>(fneg(ftype<et>(Vs1_vu[i])),
+                                            ftype<et>(Vs2_vu[i]),
+                                            ftype<et>(Vs3_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x31: VectorReduceFloatWideningFormat::vfwredusum_vs({{
+                        Vd_vwu[0] = reduce_loop( // single result in element 0
+                            [](const vwu& src1, const vu& src2) {
+                                return fadd<ewt>(
+                                    ftype<ewt>(src1), // wide operand used as is
+                                    f_to_wf<et>(ftype<et>(src2)) // narrow one widened
+                                );
+                            }, Vs1_vwu, Vs2_vu);
+                    }}, OPFVV, VectorFloatReduceOp);
+                    0x33: VectorReduceFloatWideningFormat::vfwredosum_vs({{
+                        Vd_vwu[0] = reduce_loop( // same body as vfwredusum_vs
+                            [](const vwu& src1, const vu& src2) {
+                                return fadd<ewt>(
+                                    ftype<ewt>(src1),
+                                    f_to_wf<et>(ftype<et>(src2))
+                                );
+                            }, Vs1_vwu, Vs2_vu);
+                    }}, OPFVV, VectorFloatReduceOp);
+                } // end VectorFloatFormat
+                format VectorFloatWideningFormat { // arithmetic in ewt, result in vwu
+                    0x30: vfwadd_vv({{ // both sources are widened with fwiden first
+                        auto fd = fadd<ewt>(
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fwiden(ftype<et>(Vs1_vu[i + offset])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x32: vfwsub_vv({{ // widened vs2 minus widened vs1
+                        auto fd = fsub<ewt>(
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fwiden(ftype<et>(Vs1_vu[i + offset])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x34: vfwadd_wv({{ // vs2 is already wide; only vs1 is widened
+                        auto fd = fadd<ewt>(
+                            ftype<ewt>(Vs2_vwu[i]),
+                            fwiden(ftype<et>(Vs1_vu[i + offset])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x36: vfwsub_wv({{ // vs2 is already wide; only vs1 is widened
+                        auto fd = fsub<ewt>(
+                            ftype<ewt>(Vs2_vwu[i]),
+                            fwiden(ftype<et>(Vs1_vu[i + offset])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x38: vfwmul_vv({{ // product of the two widened sources
+                        auto fd = fmul<ewt>(
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fwiden(ftype<et>(Vs1_vu[i + offset])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x3c: vfwmacc_vv({{ // wide vs3 is fmadd argument 3
+                        auto fd = fmadd<ewt>(
+                            fwiden(ftype<et>(Vs1_vu[i + offset])),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            ftype<ewt>(Vs3_vwu[i]));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x3d: vfwnmacc_vv({{ // vs1 is negated before it is widened
+                        auto fd = fmadd<ewt>(
+                            fwiden(fneg(ftype<et>(Vs1_vu[i + offset]))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fneg(ftype<ewt>(Vs3_vwu[i])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x3e: vfwmsac_vv({{ // as vfwmacc_vv with argument 3 negated
+                        auto fd = fmadd<ewt>(
+                            fwiden(ftype<et>(Vs1_vu[i + offset])),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fneg(ftype<ewt>(Vs3_vwu[i])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x3f: vfwnmsac_vv({{ // as vfwmacc_vv with vs1 negated
+                        auto fd = fmadd<ewt>(
+                            fwiden(fneg(ftype<et>(Vs1_vu[i + offset]))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            ftype<ewt>(Vs3_vwu[i]));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVV, VectorFloatArithOp);
+                } // end VectorFloatWideningFormat
+            }
+            // OPMVV
+            0x2: decode VFUNCT6 {
+                format VectorReduceIntFormat { // each op stores one result in element 0
+                    0x0: vredsum_vs({{ // reduce_loop with addition
+                        Vd_vi[0] =
+                            reduce_loop(std::plus<vi>(), Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x1: vredand_vs({{ // reduce_loop with bitwise AND
+                        Vd_vi[0] =
+                            reduce_loop(std::bit_and<vi>(), Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x2: vredor_vs({{ // reduce_loop with bitwise OR
+                        Vd_vi[0] =
+                            reduce_loop(std::bit_or<vi>(), Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x3: vredxor_vs({{ // reduce_loop with bitwise XOR
+                        Vd_vi[0] =
+                            reduce_loop(std::bit_xor<vi>(), Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x4: vredminu_vs({{ // minimum, compared as vu
+                        Vd_vu[0] =
+                            reduce_loop([](const vu& src1, const vu& src2) {
+                                return std::min<vu>(src1, src2);
+                            }, Vs1_vu, Vs2_vu);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x5: vredmin_vs({{ // minimum, compared as vi
+                        Vd_vi[0] =
+                            reduce_loop([](const vi& src1, const vi& src2) {
+                                return std::min<vi>(src1, src2);
+                            }, Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x6: vredmaxu_vs({{ // maximum, compared as vu
+                        Vd_vu[0] =
+                            reduce_loop([](const vu& src1, const vu& src2) {
+                                return std::max<vu>(src1, src2);
+                            }, Vs1_vu, Vs2_vu);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x7: vredmax_vs({{ // maximum, compared as vi
+                        Vd_vi[0] =
+                            reduce_loop([](const vi& src1, const vi& src2) {
+                                return std::max<vi>(src1, src2);
+                            }, Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                } // end VectorReduceIntFormat
+                format VectorIntFormat { // averaging add/sub: round, then shift right by 1
+                    0x8: vaaddu_vv({{ // 128-bit intermediate holds the full sum
+                        __uint128_t res = (__uint128_t)Vs2_vu[i] + Vs1_vu[i];
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vu[i] = res >> 1; // assignment keeps the low element bits
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x9: vaadd_vv({{ // same steps on the signed views
+                        __uint128_t res = (__uint128_t)Vs2_vi[i] + Vs1_vi[i];
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vi[i] = res >> 1;
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0xa: vasubu_vv({{ // subtraction wraps modulo 2^128
+                        __uint128_t res = (__uint128_t)Vs2_vu[i] - Vs1_vu[i];
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vu[i] = res >> 1;
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0xb: vasub_vv({{ // same steps on the signed views
+                        __uint128_t res = (__uint128_t)Vs2_vi[i] - Vs1_vi[i];
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vi[i] = res >> 1;
+                    }}, OPMVV, VectorIntegerArithOp);
+                } // NOTE(review): int_rounding arg 2 is a TODO constant; presumably vxrm
+                // VWXUNARY0
+                0x10: decode VS1 { // instruction is selected by the VS1 field
+                    0x00: decode VM {
+                        // The encodings corresponding to the masked versions
+                        // (vm=0) of vmv.x.s are reserved.
+                        0x1: VectorNonSplitFormat::vmv_x_s({{
+                            Rd_ud = Vs2_vi[0]; // element 0, read through the signed view
+                        }}, OPMVV, VectorMiscOp);
+                    }
+                    0x10: Vector1Vs1RdMaskFormat::vcpop_m({{
+                        uint64_t popcount = 0; // set vs2 mask bits among elements [0, vl)
+                        for (uint32_t i = 0; i < (uint32_t)machInst.vl; i++) {
+                            bool vs2_lsb = elem_mask(Vs2_vu, i);
+                            if(this->vm){ // unmasked: count every set bit
+                                popcount += vs2_lsb;
+                            }else{ // masked: the v0 bit must be set as well
+                                bool do_mask = elem_mask(v0, i);
+                                popcount += (vs2_lsb && do_mask);
+                            }
+                        }
+                        Rd_vu = popcount;
+                    }}, OPMVV, VectorMiscOp);
+                    0x11: Vector1Vs1RdMaskFormat::vfirst_m({{
+                        int64_t pos = -1; // stays -1 when no set bit is found
+                        for (uint32_t i = 0; i < (uint32_t)machInst.vl; i++) {
+                            if(this->vm == 0){ // masked execution
+                                if(elem_mask(v0, i)==0){
+                                    continue; // element is masked off by v0
+                                }
+                            }
+                            bool vs2_lsb = elem_mask(Vs2_vu, i);
+                            if (vs2_lsb) {
+                                pos = i; // index of the first active set bit
+                                break;
+                            }
+                        }
+                        Rd_vu = pos;
+                    }}, OPMVV, VectorMiscOp);
+                } // end decode VS1 for funct6 0x10
+                0x12: decode VS1 { // instruction is selected by the VS1 field
+                    format VectorIntExtFormat { // vext/vextu: presumably narrower source types
+                        0x02: vzext_vf8({{ // unsigned views: vextu -> vu
+                            Vd_vu[i] = Vs2_vextu[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                        0x03: vsext_vf8({{ // signed views: vext -> vi
+                            Vd_vi[i] = Vs2_vext[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                        0x04: vzext_vf4({{ // same body as vzext_vf8
+                            Vd_vu[i] = Vs2_vextu[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                        0x05: vsext_vf4({{ // same body as vsext_vf8
+                            Vd_vi[i] = Vs2_vext[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                        0x06: vzext_vf2({{ // same body as vzext_vf8
+                            Vd_vu[i] = Vs2_vextu[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                        0x07: vsext_vf2({{ // same body as vsext_vf8
+                            Vd_vi[i] = Vs2_vext[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                    } // the width ratio is not set in these snippets
+                }
+                0x14: decode VS1 { // instruction is selected by the VS1 field
+                    0x01: Vector1Vs1VdMaskFormat::vmsbf_m({{
+                        bool has_one = false; // true once an active set vs2 bit was seen
+                        for (uint32_t i = 0; i < (uint32_t)machInst.vl; i++) {
+                            bool vs2_lsb = elem_mask(Vs2_vu, i);
+                            bool do_mask = elem_mask(v0, i);
+                            if(this->vm||(this->vm == 0&&do_mask)){ // active elements only
+                                uint64_t res = 0;
+                                if (!has_one && !vs2_lsb) {
+                                    res = 1; // strictly before the first set bit
+                                } else if(!has_one && vs2_lsb) {
+                                    has_one = true; // the first set bit itself gets 0
+                                }
+                                Vd_ub[i/8] = ASSIGN_VD_BIT(i, res);
+                            }
+                        }
+                    }}, OPMVV, VectorMiscOp);
+                    0x02: Vector1Vs1VdMaskFormat::vmsof_m({{
+                        bool has_one = false; // true once an active set vs2 bit was seen
+                        for (uint32_t i = 0; i < (uint32_t)machInst.vl; i++) {
+                            bool vs2_lsb = elem_mask(Vs2_vu, i);
+                            bool do_mask = elem_mask(v0, i);
+                            if(this->vm||(this->vm == 0&&do_mask)){ // active elements only
+                                uint64_t res = 0;
+                                if(!has_one && vs2_lsb) {
+                                    has_one = true;
+                                    res = 1; // only the first set bit gets 1
+                                }
+                                Vd_ub[i/8] = ASSIGN_VD_BIT(i, res);
+                            }
+                        }
+                    }}, OPMVV, VectorMiscOp);
+                    0x03: Vector1Vs1VdMaskFormat::vmsif_m({{
+                        bool has_one = false; // true once an active set vs2 bit was seen
+                        for (uint32_t i = 0; i < (uint32_t)machInst.vl; i++) {
+                            bool vs2_lsb = elem_mask(Vs2_vu, i);
+                            bool do_mask = elem_mask(v0, i);
+                            if(this->vm||(this->vm == 0&&do_mask)){ // active elements only
+                                uint64_t res = 0;
+                                if (!has_one && !vs2_lsb) {
+                                    res = 1; // before the first set bit
+                                } else if(!has_one && vs2_lsb) {
+                                    has_one = true;
+                                    res = 1; // the first set bit is included
+                                }
+                                Vd_ub[i/8] = ASSIGN_VD_BIT(i, res);
+                            }
+                        }
+                    }}, OPMVV, VectorMiscOp);
+                    0x10: ViotaFormat::viota_m({{
+                        RiscvISAInst::VecRegContainer tmp_s2;
+                        xc->getRegOperand(this, 2, // reads source operand index 2
+                            &tmp_s2);
+                        auto Vs2bit = tmp_s2.as<vu>();
+                        for (uint32_t i = 0; i < this->microVl; i++) {
+                            uint32_t ei = i + // element index across micro-ops
+                                vtype_VLMAX(vtype, true) * this->microIdx;
+                            bool vs2_lsb = elem_mask(Vs2bit, ei);
+                            bool do_mask = elem_mask(v0, ei);
+                            bool has_one = false; // active element with its vs2 bit set
+                            if (this->vm || (do_mask && !this->vm)) {
+                                if (vs2_lsb) {
+                                    has_one = true;
+                                }
+                            }
+                            bool use_ori = (!this->vm) && !do_mask; // masked off
+                            if(use_ori == false){
+                                Vd_vu[i] = *cnt; // count of set bits before this element
+                            }
+                            if (has_one) {
+                                *cnt = *cnt+1; // cnt is declared outside this snippet
+                            }
+                        }
+                    }}, OPMVV, VectorMiscOp);
+                    0x11: VectorIntFormat::vid_v({{
+                        Vd_vu[i] = ei; // ei is declared outside this snippet
+                    }}, OPMVV, VectorMiscOp);
+                } // end decode VS1 for funct6 0x14
+                format VectorMaskFormat { // logic on mask bits; bit i lives in byte i/8
+                    0x18: vmandn_mm({{ // vs2 AND (NOT vs1)
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            elem_mask(Vs2_vu, i) & !elem_mask(Vs1_vu, i));
+                    }}, OPMVV, VectorMiscOp);
+                    0x19: vmand_mm({{ // vs2 AND vs1
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            elem_mask(Vs2_vu, i) & elem_mask(Vs1_vu, i));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1a: vmor_mm({{ // vs2 OR vs1
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            elem_mask(Vs2_vu, i) | elem_mask(Vs1_vu, i));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1b: vmxor_mm({{ // vs2 XOR vs1
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            elem_mask(Vs2_vu, i) ^ elem_mask(Vs1_vu, i));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1c: vmorn_mm({{ // vs2 OR (NOT vs1)
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            elem_mask(Vs2_vu, i) | !elem_mask(Vs1_vu, i));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1d: vmnand_mm({{ // NOT (vs2 AND vs1)
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            !(elem_mask(Vs2_vu, i) & elem_mask(Vs1_vu, i)));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1e: vmnor_mm({{ // NOT (vs2 OR vs1)
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            !(elem_mask(Vs2_vu, i) | elem_mask(Vs1_vu, i)));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1f: vmxnor_mm({{ // NOT (vs2 XOR vs1)
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            !(elem_mask(Vs2_vu, i) ^ elem_mask(Vs1_vu, i)));
+                    }}, OPMVV, VectorMiscOp);
+                } // end VectorMaskFormat
+                format VectorIntFormat { // element-wise divide, remainder and multiply
+                    0x20: vdivu_vv({{
+                        if (Vs1_vu[i] == 0)
+                            Vd_vu[i] = (vu)-1; // divide by zero yields (vu)-1
+                        else
+                            Vd_vu[i] = Vs2_vu[i] / Vs1_vu[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x21: vdiv_vv({{
+                        if (Vs1_vi[i] == 0)
+                            Vd_vi[i] = -1; // divide by zero yields -1
+                        else if (Vs2_vi[i] == std::numeric_limits<vi>::min()
+                                && Vs1_vi[i] == -1)
+                            Vd_vi[i] = Vs2_vi[i]; // min / -1 would overflow: keep dividend
+                        else
+                            Vd_vi[i] = Vs2_vi[i] / Vs1_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x22: vremu_vv({{
+                        if (Vs1_vu[i] == 0) {
+                            Vd_vu[i] = Vs2_vu[i]; // remainder by zero yields the dividend
+                        } else {
+                            Vd_vu[i] = Vs2_vu[i] % Vs1_vu[i];
+                        }
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x23: vrem_vv({{
+                        if (Vs1_vi[i] == 0) {
+                            Vd_vi[i] = Vs2_vi[i]; // remainder by zero yields the dividend
+                        } else if (Vs2_vi[i] == std::numeric_limits<vi>::min()
+                                && Vs1_vi[i] == -1) {
+                            Vd_vi[i] = 0; // min % -1 is defined as 0 here
+                        } else {
+                            Vd_vi[i] = Vs2_vi[i] % Vs1_vi[i];
+                        }
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x24: vmulhu_vv({{ // high half of the unsigned product
+                        if (sew < 64) { // full product fits in 64 bits
+                            Vd_vu[i] = ((uint64_t)Vs2_vu[i] * Vs1_vu[i])
+                                        >> sew;
+                        } else { // 64-bit elements use a helper
+                            Vd_vu[i] = mulhu_64(Vs2_vu[i], Vs1_vu[i]);
+                        }
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x25: vmul_vv({{ // product, truncated by the assignment
+                        Vd_vi[i] = Vs2_vi[i] * Vs1_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x26: vmulhsu_vv({{ // vs2 read as signed, vs1 as unsigned
+                        if (sew < 64) {
+                            Vd_vi[i] = ((int64_t)Vs2_vi[i] *
+                                        (uint64_t)Vs1_vu[i])
+                                        >> sew;
+                        } else { // 64-bit elements use a helper
+                            Vd_vi[i] = mulhsu_64(Vs2_vi[i], Vs1_vu[i]);
+                        }
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x27: vmulh_vv({{ // high half of the signed product
+                        if (sew < 64) {
+                            Vd_vi[i] = ((int64_t)Vs2_vi[i] * Vs1_vi[i])
+                                        >> sew;
+                        } else { // 64-bit elements use a helper
+                            Vd_vi[i] = mulh_64(Vs2_vi[i], Vs1_vi[i]);
+                        }
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x29: vmadd_vv({{ // vs3 is a multiplicand, vs2 the addend
+                        Vd_vi[i] = Vs3_vi[i] * Vs1_vi[i] + Vs2_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x2b: vnmsub_vv({{ // as vmadd_vv with the product negated
+                        Vd_vi[i] = -(Vs3_vi[i] * Vs1_vi[i]) + Vs2_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x2d: vmacc_vv({{ // vs3 is the addend
+                        Vd_vi[i] = Vs2_vi[i] * Vs1_vi[i] + Vs3_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x2f: vnmsac_vv({{ // as vmacc_vv with the product negated
+                        Vd_vi[i] = -(Vs2_vi[i] * Vs1_vi[i]) + Vs3_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                } // end VectorIntFormat (divide/multiply group)
+                format VectorIntWideningFormat { // narrow sources are cast to vwu/vwi first
+                    0x30: vwaddu_vv({{ // narrow sources are read at i + offset
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset])
+                                + vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x31: vwadd_vv({{ // signed variant of vwaddu_vv
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset])
+                                + vwi(Vs1_vi[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x32: vwsubu_vv({{ // vs2 minus vs1, unsigned casts
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset])
+                                - vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x33: vwsub_vv({{ // vs2 minus vs1, signed casts
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset])
+                                - vwi(Vs1_vi[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x34: vwaddu_wv({{ // vs2 is already wide; only vs1 is cast
+                        Vd_vwu[i] = Vs2_vwu[i] + vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x35: vwadd_wv({{ // vs2 is already wide; only vs1 is cast
+                        Vd_vwi[i] = Vs2_vwi[i] + vwi(Vs1_vi[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x36: vwsubu_wv({{ // vs2 is already wide; only vs1 is cast
+                        Vd_vwu[i] = Vs2_vwu[i] - vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x37: vwsub_wv({{ // vs2 is already wide; only vs1 is cast
+                        Vd_vwi[i] = Vs2_vwi[i] - vwi(Vs1_vi[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x38: vwmulu_vv({{ // unsigned * unsigned
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset])
+                                * vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x3a: vwmulsu_vv({{ // signed vs2 * unsigned vs1
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset])
+                                * vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x3b: vwmul_vv({{ // signed * signed
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset])
+                                * vwi(Vs1_vi[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x3c: vwmaccu_vv({{ // unsigned product plus wide vs3
+                        Vd_vwu[i] = vwu(Vs1_vu[i + offset])
+                                * vwu(Vs2_vu[i + offset])
+                                + Vs3_vwu[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x3d: vwmacc_vv({{ // signed product plus wide vs3
+                        Vd_vwi[i] = vwi(Vs1_vi[i + offset])
+                                * vwi(Vs2_vi[i + offset])
+                                + Vs3_vwi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x3f: vwmaccsu_vv({{ // signed vs1 * unsigned vs2, plus wide vs3
+                        Vd_vwi[i] = vwi(Vs1_vi[i + offset])
+                                * vwu(Vs2_vu[i + offset])
+                                + Vs3_vwi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                } // end VectorIntWideningFormat
+            }
+            // OPIVI
+            0x3: decode VFUNCT6 {
+                // Vector-immediate add, reverse-subtract and bitwise ops.
+                // The 5-bit SIMM5 field is sign-extended with sext<5>() and
+                // cast to vi before it is combined with each Vs2 element.
+                format VectorIntFormat {
+                    0x00: vadd_vi({{
+                        Vd_vi[i] = Vs2_vi[i] + (vi)sext<5>(SIMM5);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // Reverse subtract: immediate minus the Vs2 element.
+                    0x03: vrsub_vi({{
+                        Vd_vi[i] = (vi)sext<5>(SIMM5) - Vs2_vi[i];
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x09: vand_vi({{
+                        Vd_vi[i] = Vs2_vi[i] & (vi)sext<5>(SIMM5);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x0a: vor_vi({{
+                        Vd_vi[i] = Vs2_vi[i] | (vi)sext<5>(SIMM5);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x0b: vxor_vi({{
+                        Vd_vi[i] = Vs2_vi[i] ^ (vi)sext<5>(SIMM5);
+                    }}, OPIVI, VectorIntegerArithOp);
+                }
+                // vrgather.vi: every active destination element receives the
+                // Vs2 element selected by the 5-bit immediate, or 0 when the
+                // immediate is not below vlmax. The immediate is an unsigned
+                // element index (uimm), so SIMM5 is used zero-extended; sign
+                // extending it would turn indices 16..31 into huge values
+                // that always take the out-of-range path.
+                // NOTE(review): vs1_idx, vs1_elems, vs1_bias, vs2_idx,
+                // vs2_elems and vlmax come from VectorGatherFormat; Vs3 is
+                // presumably the current destination contents -- confirm.
+                0x0c: VectorGatherFormat::vrgather_vi({{
+                    for (uint32_t i = 0; i < microVl; i++) {
+                        uint32_t ei = i + vs1_idx * vs1_elems + vs1_bias;
+                        if (this->vm || elem_mask(v0, ei)) {
+                            // Index relative to the source register handled
+                            // here; the unsigned subtraction wraps to a
+                            // large value when the index is below its base.
+                            const uint64_t idx =
+                                (uint64_t)SIMM5 - vs2_elems * vs2_idx;
+                            // Out-of-range index gives 0; an index owned by
+                            // another source register falls back to the
+                            // third source operand.
+                            Vd_vu[i] = ((uint64_t)SIMM5 >= vlmax) ? 0
+                                : (idx < vs2_elems) ? Vs2_vu[idx]
+                                : Vs3_vu[i];
+                        }
+                    }
+                }}, OPIVI, VectorMiscOp);
+                // vslideup.vi: copies Vs2 elements to higher element indices
+                // of Vd, the distance being SIMM5 taken as unsigned. The
+                // body handles one (vdIdx, vs2Idx) register pair and copies
+                // only the elements of that pair that overlap.
+                // NOTE(review): vdIdx, vs2Idx and microVl come from
+                // VectorSlideUpFormat, and vtype_VLMAX(..., true) presumably
+                // yields the element count of one register -- confirm.
+                0x0e: VectorSlideUpFormat::vslideup_vi({{
+                    // slide amount, immediate taken without sign extension
+                    const int offset = (int)(uint64_t)(SIMM5);
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    // whole registers between destination and source
+                    const int vregOffset = vdIdx - vs2Idx;
+                    // slide amount left once that register distance is
+                    // accounted for; may be negative
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    // nothing to do unless the residual slide is shorter
+                    // than one register
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        // number of elements the two registers share
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        // first destination element of the overlap
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        // first source element of the overlap
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        // mask index of the first written element
+                        const int elemOffset = vdOffset + vdIdx * microVlmax;
+                        // copy the overlap, bounded by microVl, skipping
+                        // elements that are masked off
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            if (this->vm || elem_mask(v0, i + elemOffset)) {
+                                Vd_vu[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                            }
+                        }
+                    }
+                }}, OPIVI, VectorMiscOp);
+                // vslidedown.vi: copies Vs2 elements to lower element
+                // indices of Vd, the distance being SIMM5 taken as unsigned.
+                // The body handles one (vdIdx, vs2Idx) register pair: it
+                // stages the overlapping elements in a temporary register,
+                // zero-fills the rest when vs2Idx is the last register of
+                // the group, then copies the staged values to Vd under the
+                // mask.
+                // NOTE(review): vdIdx, vs2Idx and microVl come from
+                // VectorSlideDownFormat -- confirm their meaning there.
+                // NOTE(review): resVreg is not initialised; when
+                // needZeroTail is false the final loop can copy staged
+                // elements past the overlap that were never written --
+                // confirm another micro-op overwrites them.
+                0x0f: VectorSlideDownFormat::vslidedown_vi({{
+                    // slide amount, immediate taken without sign extension
+                    const int offset = (int)(uint64_t)(SIMM5);
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    // whole registers between source and destination
+                    const int vregOffset = vs2Idx - vdIdx;
+                    // slide amount left once that register distance is
+                    // accounted for; may be negative
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    const int numVs2s = vtype_regs_per_group(vtype);
+                    // nothing to do unless the residual slide is shorter
+                    // than one register
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        // true when this is the last source register
+                        const bool needZeroTail = numVs2s == vs2Idx + 1;
+                        // number of elements the two registers share
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        // first destination element of the overlap
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        // first source element of the overlap
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        // mask index of destination element 0
+                        const int elemIdxBase = vdIdx * microVlmax;
+                        // staging register for the unmasked result
+                        vreg_t resVreg;
+                        auto res = resVreg.as<vu>();
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            res[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                        }
+                        // elements past the last source register read as 0
+                        if (needZeroTail) {
+                            for (int i = upperBound + vdOffset;
+                                i < microVlmax; i++) {
+                                res[i] = 0;
+                            }
+                        }
+                        // commit staged values for unmasked elements only
+                        for (int i = vdOffset; i < microVl ; i++) {
+                            if (vm || elem_mask(v0, i + elemIdxBase)) {
+                                Vd_vu[i] = res[i];
+                            }
+                        }
+                    }
+                }}, OPIVI, VectorMiscOp);
+                // Immediate add-with-carry, merge and move. All three use
+                // SIMM5 sign-extended with sext<5>() and cast to vi; the VM
+                // bit selects which instruction a given encoding decodes to.
+                // NOTE(review): `ei` is supplied by VectorIntFormat,
+                // presumably the element index used for mask lookup.
+                format VectorIntFormat {
+                    0x10: decode VM {
+                        // vadc.vim: Vs2 + immediate + the v0 mask bit of
+                        // the element.
+                        0x0: vadc_vim({{
+                            Vd_vi[i] = Vs2_vi[i] +
+                                (vi)sext<5>(SIMM5) + elem_mask(v0, ei);
+                        }}, OPIVI, VectorIntegerArithOp);
+                        // the unmasked versions (vm=1) are reserved
+                    }
+                    0x17: decode VM {
+                        // vmerge.vim: immediate where the v0 mask bit is
+                        // set, the Vs2 element otherwise.
+                        0x0: vmerge_vim({{
+                            Vd_vi[i] = elem_mask(v0, ei)
+                                    ? (vi)sext<5>(SIMM5)
+                                    : Vs2_vi[i];
+                        }}, OPIVI, VectorIntegerArithOp);
+                        // vmv.v.i: immediate written to every element.
+                        0x1: vmv_v_i({{
+                            Vd_vi[i] = (vi)sext<5>(SIMM5);
+                        }}, OPIVI, VectorIntegerArithOp);
+                    }
+                }
+                // Saturating add with a 5-bit immediate. The immediate is
+                // sign-extended with sext<5>() and then cast to the element
+                // type, the same treatment the other OPIVI arithmetic and
+                // compare entries give SIMM5; casting the raw field instead
+                // would turn negative immediates into 16..31.
+                // Both helpers receive vxsatptr.
+                // NOTE(review): presumably they set *vxsatptr when the
+                // result saturates -- confirm in the helper definitions.
+                format VectorIntVxsatFormat{
+                    // vsaddu.vi: unsigned saturating add (sat_addu<vu>).
+                    0x20: vsaddu_vi({{
+                        Vd_vu[i] = sat_addu<vu>(Vs2_vu[i], (vu)sext<5>(SIMM5),
+                            vxsatptr);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vsadd.vi: signed saturating add (sat_add<vi>).
+                    0x21: vsadd_vi({{
+                        Vd_vu[i] = sat_add<vi>(Vs2_vu[i], (vu)sext<5>(SIMM5),
+                            vxsatptr);
+                    }}, OPIVI, VectorIntegerArithOp);
+                }
+                // Shifts by a 5-bit immediate. SIMM5 is used as an unsigned
+                // shift amount and masked with (SEW - 1).
+                format VectorIntFormat {
+                    // vsll.vi / vsrl.vi: shifts on the vu element view; the
+                    // amount is additionally masked with 0x1f.
+                    0x25: vsll_vi({{
+                        Vd_vu[i] = Vs2_vu[i] << ((vu)SIMM5 & (sew - 1) & 0x1f);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x28: vsrl_vi({{
+                        Vd_vu[i] = Vs2_vu[i] >> ((vu)SIMM5 & (sew - 1) & 0x1f);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vssrl.vi: scaling right shift on the vu view. The
+                    // value is rounded by int_rounding() using the mode read
+                    // from the VXRM register (as vssra.vi below does)
+                    // instead of a hard-coded mode 0, then shifted.
+                    0x2a: vssrl_vi({{
+                        int sh = SIMM5 & (vtype_SEW(vtype) - 1);
+                        __uint128_t res = Vs2_vu[i];
+
+                        res = int_rounding<__uint128_t>(res,
+                            xc->readMiscReg(MISCREG_VXRM), sh) >> sh;
+
+                        Vd_vu[i] = res;
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vsra.vi: right shift on the vi element view.
+                    0x29: vsra_vi({{
+                        Vd_vi[i] = Vs2_vi[i] >> ((vu)SIMM5 & (sew - 1) & 0x1f);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vssra.vi: scaling right shift on the vi view, rounded
+                    // with the mode read from the VXRM register.
+                    0x2b: vssra_vi({{
+                        int sh = SIMM5 & (sew - 1);
+                        __int128_t val = Vs2_vi[i];
+
+                        val = int_rounding<__int128_t>(val,
+                            xc->readMiscReg(MISCREG_VXRM), sh);
+                        Vd_vi[i] = val >> sh;
+                    }}, OPIVI, VectorIntegerArithOp);
+                }
+                // Whole-register moves vmv1r.v / vmv2r.v / vmv4r.v / vmv8r.v.
+                // According to Spec Section 16.6,
+                // vm must be 1 (unmasked) in vmv<nr>r.v instructions.
+                // Only VM == 1 is decoded; SIMM3 values 0, 1, 3 and 7 select
+                // the variant. Every variant has the same body, a copy of
+                // Vs2 to Vd through the _ud element view.
+                // NOTE(review): the number of registers moved is presumably
+                // handled by VMvWholeFormat -- confirm there.
+                0x27: decode VM { 0x1: decode SIMM3 {
+                    format VMvWholeFormat {
+                        0x0: vmv1r_v({{
+                            Vd_ud[i] = Vs2_ud[i];
+                        }}, OPIVI, VectorMiscOp);
+                        0x1: vmv2r_v({{
+                            Vd_ud[i] = Vs2_ud[i];
+                        }}, OPIVI, VectorMiscOp);
+                        0x3: vmv4r_v({{
+                            Vd_ud[i] = Vs2_ud[i];
+                        }}, OPIVI, VectorMiscOp);
+                        0x7: vmv8r_v({{
+                            Vd_ud[i] = Vs2_ud[i];
+                        }}, OPIVI, VectorMiscOp);
+                    }
+                }}
+                // Immediate instructions that produce one result bit per
+                // element. Each entry stores into byte (i + offset) / 8 of
+                // Vd (through the _ub view) the value ASSIGN_VD_BIT builds
+                // from bit position i + offset and the computed boolean.
+                // SIMM5 is sign-extended with sext<5>() in every entry; the
+                // unsigned compares then cast the result to vu.
+                // NOTE(review): ASSIGN_VD_BIT presumably returns the byte
+                // with that single bit replaced, and `offset`/`ei` come from
+                // VectorIntMaskFormat -- confirm.
+                format VectorIntMaskFormat {
+                    0x11: decode VM {
+                        // vmadc.vim: carry-out of Vs2 + immediate + v0 mask
+                        // bit.
+                        0x0: vmadc_vim({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vi[i], (vi)sext<5>(SIMM5),
+                                    elem_mask(v0, ei)));
+                        }}, OPIVI, VectorIntegerArithOp);
+                        // vmadc.vi: carry-out of Vs2 + immediate.
+                        0x1: vmadc_vi({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vi[i], (vi)sext<5>(SIMM5)));
+                        }}, OPIVI, VectorIntegerArithOp);
+                    }
+                    // Comparisons of each Vs2 element with the immediate.
+                    0x18: vmseq_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] == (vi)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x19: vmsne_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] != (vi)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x1c: vmsleu_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] <= (vu)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x1d: vmsle_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] <= (vi)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x1e: vmsgtu_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] > (vu)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x1f: vmsgt_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] > (vi)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                }
+                // Narrowing right shifts by an immediate: Vs2 is read
+                // through the vwu/vwi view, shifted by an amount masked with
+                // (2 * SEW - 1), and the result is written to Vd at element
+                // i + offset through the vu/vi view.
+                // NOTE(review): `offset` comes from
+                // VectorIntNarrowingFormat -- confirm its meaning there.
+                // NOTE(review): the vnclip entries take the shift amount
+                // from the VS1 bitfield while vnsrl/vnsra use SIMM5 --
+                // confirm both name the same instruction bits.
+                format VectorIntNarrowingFormat {
+                    // vnsrl.wi / vnsra.wi: plain shift then cast to vu / vi.
+                    0x2c: vnsrl_wi({{
+                        Vd_vu[i + offset] = (vu)(Vs2_vwu[i] >>
+                                            ((vwu)SIMM5 & (sew * 2 - 1)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    0x2d: vnsra_wi({{
+                        Vd_vi[i + offset] = (vi)(Vs2_vwi[i] >>
+                                            ((vwu)SIMM5 & (sew * 2 - 1)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vnclipu.wi: rounds (mode hard-coded to 0), shifts, and
+                    // replaces the result with the vu maximum when any bit
+                    // at position sew or above is set. vxsat is not updated.
+                    0x2e: vnclipu_wi({{
+                        vu max = std::numeric_limits<vu>::max();
+                        uint64_t sign_mask =
+                            std::numeric_limits<uint64_t>::max() << sew;
+                        __uint128_t res = Vs2_vwu[i];
+                        unsigned shift = VS1 & ((sew * 2) - 1);
+
+                        res = int_rounding<__uint128_t>(
+                            res, 0 /* TODO */, shift) >> shift;
+
+                        if (res & sign_mask) {
+                            // TODO: vxsat
+                            res = max;
+                        }
+
+                        Vd_vu[i + offset] = (vu)res;
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vnclip.wi: rounds (mode hard-coded to 0), shifts, and
+                    // clamps the result to the [min, max] range of vi.
+                    // vxsat is not updated.
+                    0x2f: vnclip_wi({{
+                        vi max = std::numeric_limits<vi>::max();
+                        vi min = std::numeric_limits<vi>::min();
+                        __int128_t res = Vs2_vwi[i];
+                        unsigned shift = VS1 & ((sew * 2) - 1);
+
+                        res = int_rounding<__int128_t>(
+                            res, 0 /* TODO */, shift) >> shift;
+
+                        if (res < min) {
+                            res = min;
+                            // TODO: vxsat
+                        } else if (res > max) {
+                            res = max;
+                            // TODO: vxsat
+                        }
+
+                        Vd_vi[i + offset] = (vi)res;
+                    }}, OPIVI, VectorIntegerArithOp);
+                }
+            }
+            // OPIVX
+            0x4: decode VFUNCT6 {
+                // Vector-scalar add/subtract, min/max and bitwise ops. Each
+                // Vs2 element is combined with the Rs1 operand, read through
+                // the _vu view for the unsigned forms and the _vi view for
+                // the signed min/max.
+                format VectorIntFormat {
+                    0x0: vadd_vx({{
+                        Vd_vu[i] = Vs2_vu[i] + Rs1_vu;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x2: vsub_vx({{
+                        Vd_vu[i] = Vs2_vu[i] - Rs1_vu;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // Reverse subtract: Rs1 minus the Vs2 element.
+                    0x3: vrsub_vx({{
+                        Vd_vu[i] = Rs1_vu - Vs2_vu[i];
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x4: vminu_vx({{
+                        Vd_vu[i] = std::min(Vs2_vu[i], Rs1_vu);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x5: vmin_vx({{
+                        Vd_vi[i] = std::min(Vs2_vi[i], Rs1_vi);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x6: vmaxu_vx({{
+                        Vd_vu[i] = std::max(Vs2_vu[i], Rs1_vu);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x7: vmax_vx({{
+                        Vd_vi[i] = std::max(Vs2_vi[i], Rs1_vi);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x9: vand_vx({{
+                        Vd_vu[i] = Vs2_vu[i] & Rs1_vu;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0xa: vor_vx({{
+                        Vd_vu[i] = Vs2_vu[i] | Rs1_vu;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0xb: vxor_vx({{
+                        Vd_vu[i] = Vs2_vu[i] ^ Rs1_vu;
+                    }}, OPIVX, VectorIntegerArithOp);
+                }
+                // vslideup.vx: copies Vs2 elements to higher element indices
+                // of Vd, the distance being the Rs1 operand. The body
+                // handles one (vdIdx, vs2Idx) register pair and copies only
+                // the elements of that pair that overlap.
+                // NOTE(review): vdIdx, vs2Idx and microVl come from
+                // VectorSlideUpFormat, and vtype_VLMAX(..., true) presumably
+                // yields the element count of one register -- confirm.
+                // NOTE(review): the cast to int truncates the scalar;
+                // confirm slide amounts of 2^31 and above behave as
+                // intended.
+                0x0e: VectorSlideUpFormat::vslideup_vx({{
+                    // slide amount taken from the scalar source
+                    const int offset = (int)Rs1_vu;
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    // whole registers between destination and source
+                    const int vregOffset = vdIdx - vs2Idx;
+                    // slide amount left once that register distance is
+                    // accounted for; may be negative
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    // nothing to do unless the residual slide is shorter
+                    // than one register
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        // number of elements the two registers share
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        // first destination element of the overlap
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        // first source element of the overlap
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        // mask index of the first written element
+                        const int elemOffset = vdOffset + vdIdx * microVlmax;
+                        // copy the overlap, bounded by microVl, skipping
+                        // elements that are masked off
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            if (this->vm || elem_mask(v0, i + elemOffset)) {
+                                Vd_vu[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                            }
+                        }
+                    }
+                }}, OPIVX, VectorMiscOp);
+                // vslidedown.vx: copies Vs2 elements to lower element
+                // indices of Vd, the distance being the Rs1 operand. The
+                // body handles one (vdIdx, vs2Idx) register pair: it stages
+                // the overlapping elements in a temporary register,
+                // zero-fills the rest when vs2Idx is the last register of
+                // the group, then copies the staged values to Vd under the
+                // mask.
+                // NOTE(review): vdIdx, vs2Idx and microVl come from
+                // VectorSlideDownFormat -- confirm their meaning there.
+                // NOTE(review): resVreg is not initialised; when
+                // needZeroTail is false the final loop can copy staged
+                // elements past the overlap that were never written --
+                // confirm another micro-op overwrites them.
+                // NOTE(review): the cast to int truncates the scalar;
+                // confirm slide amounts of 2^31 and above behave as
+                // intended.
+                0x0f: VectorSlideDownFormat::vslidedown_vx({{
+                    // slide amount taken from the scalar source
+                    const int offset = (int)Rs1_vu;
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    // whole registers between source and destination
+                    const int vregOffset = vs2Idx - vdIdx;
+                    // slide amount left once that register distance is
+                    // accounted for; may be negative
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    const int numVs2s = vtype_regs_per_group(vtype);
+                    // nothing to do unless the residual slide is shorter
+                    // than one register
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        // true when this is the last source register
+                        const bool needZeroTail = numVs2s == vs2Idx + 1;
+                        // number of elements the two registers share
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        // first destination element of the overlap
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        // first source element of the overlap
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        // mask index of destination element 0
+                        const int elemIdxBase = vdIdx * microVlmax;
+                        // staging register for the unmasked result
+                        vreg_t resVreg;
+                        auto res = resVreg.as<vu>();
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            res[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                        }
+                        // elements past the last source register read as 0
+                        if (needZeroTail) {
+                            for (int i = upperBound + vdOffset;
+                                i < microVlmax; i++) {
+                                res[i] = 0;
+                            }
+                        }
+                        // commit staged values for unmasked elements only
+                        for (int i = vdOffset; i < microVl ; i++) {
+                            if (vm || elem_mask(v0, i + elemIdxBase)) {
+                                Vd_vu[i] = res[i];
+                            }
+                        }
+                    }
+                }}, OPIVX, VectorMiscOp);
+                // vrgather.vx: every active destination element receives the
+                // Vs2 element selected by the Rs1 operand, or 0 when that
+                // index is not below vlmax.
+                // NOTE(review): vs1_idx, vs1_elems, vs1_bias, vs2_idx,
+                // vs2_elems and vlmax come from VectorGatherFormat; Vs3 is
+                // presumably the current destination contents -- confirm.
+                0x0c: VectorGatherFormat::vrgather_vx({{
+                    for (uint32_t i = 0; i < microVl; i++) {
+                        uint32_t ei = i + vs1_idx * vs1_elems + vs1_bias;
+                        if (this->vm || elem_mask(v0, ei)) {
+                            // Index relative to the source register handled
+                            // here; the unsigned subtraction wraps to a
+                            // large value when the index is below its base.
+                            const uint64_t idx = Rs1_vu - vs2_elems * vs2_idx;
+                            // Out-of-range index gives 0; an index owned by
+                            // another source register falls back to the
+                            // third source operand.
+                            Vd_vu[i] = (Rs1_vu >= vlmax) ? 0
+                                : (idx < vs2_elems) ? Vs2_vu[idx]
+                                : Vs3_vu[i];
+                        }
+                    }
+                }}, OPIVX, VectorMiscOp);
+                // Scalar add-with-carry, subtract-with-borrow, merge and
+                // move. The VM bit selects which instruction a given
+                // encoding decodes to.
+                // NOTE(review): `ei` is supplied by VectorIntFormat,
+                // presumably the element index used for mask lookup.
+                format VectorIntFormat {
+                    0x10: decode VM {
+                        // vadc.vxm: Vs2 + Rs1 + the v0 mask bit of the
+                        // element.
+                        0x0: vadc_vxm({{
+                            Vd_vi[i] = Vs2_vi[i] + Rs1_vi + elem_mask(v0, ei);
+                        }}, OPIVX, VectorIntegerArithOp);
+                        // the unmasked versions (vm=1) are reserved
+                    }
+                    0x12: decode VM {
+                        // vsbc.vxm: Vs2 - Rs1 - the v0 mask bit of the
+                        // element.
+                        0x0: vsbc_vxm({{
+                            Vd_vi[i] = Vs2_vi[i] - Rs1_vi - elem_mask(v0, ei);
+                        }}, OPIVX, VectorIntegerArithOp);
+                        // the unmasked versions (vm=1) are reserved
+                    }
+                    0x17: decode VM {
+                        // vmerge.vxm: Rs1 where the v0 mask bit is set, the
+                        // Vs2 element otherwise.
+                        0x0: vmerge_vxm({{
+                            Vd_vu[i] = elem_mask(v0, ei) ? Rs1_vu : Vs2_vu[i];
+                        }}, OPIVX, VectorIntegerArithOp);
+                        // vmv.v.x: Rs1 written to every element; decoded
+                        // only when the VS2 field is 0.
+                        0x1: decode VS2 {
+                            0x0: vmv_v_x({{
+                                Vd_vu[i] = Rs1_vu;
+                            }}, OPIVX, VectorIntegerArithOp);
+                        }
+                    }
+                }
+                // Saturating add/subtract and fractional multiply with the
+                // Rs1 operand. The add/subtract entries delegate to the
+                // sat_* helpers and pass them vxsatptr.
+                // NOTE(review): the helpers presumably set *vxsatptr when
+                // the result saturates -- confirm in their definitions.
+                format VectorIntVxsatFormat{
+                    0x20: vsaddu_vx({{
+                        Vd_vu[i] = sat_addu<vu>(Vs2_vu[i], Rs1_vu,
+                            vxsatptr);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x21: vsadd_vx({{
+                        Vd_vu[i] = sat_add<vi>(Vs2_vu[i], Rs1_vu,
+                            vxsatptr);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x22: vssubu_vx({{
+                        Vd_vu[i] = sat_subu<vu>(Vs2_vu[i], Rs1_vu,
+                            vxsatptr);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x23: vssub_vx({{
+                        Vd_vu[i] = sat_sub<vi>(Vs2_vu[i], Rs1_vu,
+                            vxsatptr);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vsmul.vx: 128-bit product of the two operands, rounded
+                    // at bit (sew - 1) with the mode hard-coded to 0, then
+                    // shifted right by (sew - 1). When both operands equal
+                    // the vi minimum the result is replaced by the vi
+                    // maximum and *vxsatptr is set.
+                    0x27: vsmul_vx({{
+                        vi max = std::numeric_limits<vi>::max();
+                        vi min = std::numeric_limits<vi>::min();
+                        bool overflow = Rs1_vi == Vs2_vi[i] && Rs1_vi == min;
+                        __int128_t result =
+                            (__int128_t)Rs1_vi * (__int128_t)Vs2_vi[i];
+                        result = int_rounding<__uint128_t>(
+                            result, 0 /* TODO */, sew - 1);
+                        result = result >> (sew - 1);
+                        if (overflow) {
+                            result = max;
+                            *vxsatptr = true;
+                        }
+
+                        Vd_vi[i] = (vi)result;
+                    }}, OPIVX, VectorIntegerArithOp);
+                }
+                // Shifts by the Rs1 operand. The shift amount is Rs1 masked
+                // with (sew - 1) in every entry.
+                format VectorIntFormat {
+                    // vsll.vx / vsrl.vx: shifts on the vu element view.
+                    0x25: vsll_vx({{
+                        Vd_vu[i] = Vs2_vu[i] << (Rs1_vu & (sew - 1));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x28: vsrl_vx({{
+                        Vd_vu[i] = Vs2_vu[i] >> (Rs1_vu & (sew - 1));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vsra.vx: right shift on the vi element view.
+                    0x29: vsra_vx({{
+                        Vd_vi[i] = Vs2_vi[i] >> (Rs1_vu & (sew - 1));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vssrl.vx / vssra.vx: scaling right shifts. The value
+                    // is rounded by int_rounding() with the mode read from
+                    // the VXRM register and then shifted.
+                    0x2a: vssrl_vx({{
+                        int sh = Rs1_vu & (sew - 1);
+                        __uint128_t val = Vs2_vu[i];
+
+                        val = int_rounding<__uint128_t>(val,
+                            xc->readMiscReg(MISCREG_VXRM), sh);
+                        Vd_vu[i] = val >> sh;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x2b: vssra_vx({{
+                        int sh = Rs1_vu & (sew - 1);
+                        __int128_t val = Vs2_vi[i];
+
+                        val = int_rounding<__int128_t>(val,
+                            xc->readMiscReg(MISCREG_VXRM), sh);
+                        Vd_vi[i] = val >> sh;
+                    }}, OPIVX, VectorIntegerArithOp);
+                }
+                // Narrowing right shifts by the Rs1 operand: Vs2 is read
+                // through the vwu/vwi view, shifted by Rs1 masked with
+                // (2 * SEW - 1), and the result is written to Vd at element
+                // i + offset through the vu/vi view.
+                // NOTE(review): `offset` comes from
+                // VectorIntNarrowingFormat -- confirm its meaning there.
+                format VectorIntNarrowingFormat {
+                    // vnsrl.wx / vnsra.wx: plain shift then cast to vu / vi.
+                    0x2c: vnsrl_wx({{
+                        Vd_vu[i + offset] = (vu)(Vs2_vwu[i] >>
+                                            ((vwu)Rs1_vu & (sew * 2 - 1)));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x2d: vnsra_wx({{
+                        Vd_vi[i + offset] = (vi)(Vs2_vwi[i] >>
+                                            ((vwu)Rs1_vu & (sew * 2 - 1)));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vnclipu.wx: rounds (mode hard-coded to 0), shifts, and
+                    // replaces the result with the vu maximum when any bit
+                    // at position sew or above is set. vxsat is not updated.
+                    0x2e: vnclipu_wx({{
+                        vu max = std::numeric_limits<vu>::max();
+                        uint64_t sign_mask =
+                            std::numeric_limits<uint64_t>::max() << sew;
+                        __uint128_t res = Vs2_vwu[i];
+                        unsigned shift = Rs1_vu & ((sew * 2) - 1);
+
+                        res = int_rounding<__uint128_t>(
+                            res, 0 /* TODO */, shift) >> shift;
+
+                        if (res & sign_mask) {
+                            // TODO: vxsat
+                            res = max;
+                        }
+
+                        Vd_vu[i + offset] = (vu)res;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vnclip.wx: rounds (mode hard-coded to 0), shifts, and
+                    // clamps the result to the [min, max] range of vi.
+                    // vxsat is not updated.
+                    0x2f: vnclip_wx({{
+                        vi max = std::numeric_limits<vi>::max();
+                        vi min = std::numeric_limits<vi>::min();
+                        __int128_t res = Vs2_vwi[i];
+                        unsigned shift = Rs1_vi & ((sew * 2) - 1);
+
+                        res = int_rounding<__int128_t>(
+                            res, 0 /* TODO */, shift) >> shift;
+
+                        if (res < min) {
+                            res = min;
+                            // TODO: vxsat
+                        } else if (res > max) {
+                            res = max;
+                            // TODO: vxsat
+                        }
+
+                        Vd_vi[i + offset] = (vi)res;
+                    }}, OPIVX, VectorIntegerArithOp);
+                }
+
+                // Vector-scalar instructions that produce one result bit per
+                // element. Each entry stores into byte (i + offset) / 8 of
+                // Vd (through the _ub view) the value ASSIGN_VD_BIT builds
+                // from bit position i + offset and the computed boolean.
+                // NOTE(review): ASSIGN_VD_BIT presumably returns the byte
+                // with that single bit replaced, and `offset`/`ei` come from
+                // VectorIntMaskFormat -- confirm.
+                format VectorIntMaskFormat {
+                    0x11: decode VM {
+                        // vmadc.vxm: carry-out of Vs2 + Rs1 + v0 mask bit.
+                        0x0: vmadc_vxm({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vi[i], Rs1_vi,
+                                    elem_mask(v0, ei)));
+                        }}, OPIVX, VectorIntegerArithOp);
+                        // vmadc.vx: carry-out of Vs2 + Rs1.
+                        0x1: vmadc_vx({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vi[i], Rs1_vi));
+                        }}, OPIVX, VectorIntegerArithOp);
+                    }
+                    0x13: decode VM {
+                        // vmsbc.vxm: borrow-out of Vs2 - Rs1 - v0 mask bit.
+                        0x0: vmsbc_vxm({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                borrow_out(Vs2_vi[i], Rs1_vi,
+                                    elem_mask(v0, ei)));
+                        }}, OPIVX, VectorIntegerArithOp);
+                        // vmsbc.vx: borrow-out of Vs2 - Rs1.
+                        0x1: vmsbc_vx({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                borrow_out(Vs2_vi[i], Rs1_vi));
+                        }}, OPIVX, VectorIntegerArithOp);
+                    }
+                    // Comparisons of each Vs2 element with Rs1; the *u
+                    // entries compare through the vu view, the signed ones
+                    // through the vi view.
+                    0x18: vmseq_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] == Rs1_vu));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x19: vmsne_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] != Rs1_vu));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1a: vmsltu_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] < Rs1_vu));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1b: vmslt_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] < Rs1_vi));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1c: vmsleu_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] <= Rs1_vu));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1d: vmsle_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] <= Rs1_vi));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1e: vmsgtu_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] > Rs1_vu));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1f: vmsgt_vx({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] > Rs1_vi));
+                    }}, OPIVX, VectorIntegerArithOp);
+                }
+            }
+            // OPFVF
+            0x5: decode VFUNCT6 {
+                // Vector-scalar floating-point ops. Each Vs2 element is
+                // wrapped with ftype<et>(), the scalar operand is built with
+                // ftype_freg<et>(freg(Fs1_bits)), and the `.v` member of the
+                // helper's result is stored to Vd through the _vu view.
+                // NOTE(review): `et` is supplied by VectorFloatFormat,
+                // presumably the float type matching SEW -- confirm.
+                format VectorFloatFormat{
+                    0x00: vfadd_vf({{
+                        auto fd = fadd<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype_freg<et>(freg(Fs1_bits)));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x02: vfsub_vf({{
+                        auto fd = fsub<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype_freg<et>(freg(Fs1_bits)));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x04: vfmin_vf({{
+                        auto fd = fmin<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype_freg<et>(freg(Fs1_bits)));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x06: vfmax_vf({{
+                        auto fd = fmax<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype_freg<et>(freg(Fs1_bits)));
+                            Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    // Sign-injection family: all three call fsgnj<et>() and
+                    // differ only in the two trailing flags (false/false,
+                    // true/false, false/true).
+                    // NOTE(review): the flags presumably select negated and
+                    // xor-ed sign injection respectively -- confirm in
+                    // fsgnj().
+                    0x08: vfsgnj_vf({{
+                        Vd_vu[i] = fsgnj<et>(ftype<et>(Vs2_vu[i]),
+                                             ftype_freg<et>(freg(Fs1_bits)),
+                                             false, false).v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x09: vfsgnjn_vf({{
+                        Vd_vu[i] = fsgnj<et>(ftype<et>(Vs2_vu[i]),
+                                             ftype_freg<et>(freg(Fs1_bits)),
+                                             true, false).v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x0a: vfsgnjx_vf({{
+                        Vd_vu[i] = fsgnj<et>(ftype<et>(Vs2_vu[i]),
+                                             ftype_freg<et>(freg(Fs1_bits)),
+                                             false, true).v;
+                    }}, OPFVF, VectorFloatArithOp);
+                }
+                // vfslide1up.vf: copies Vs2 elements one element index
+                // higher in Vd and, for the first register pair, stores the
+                // scalar operand into element 0 of tmp_d0 when that element
+                // is unmasked. The body handles one (vdIdx, vs2Idx) register
+                // pair.
+                // NOTE(review): vdIdx, vs2Idx, microVl and tmp_d0 come from
+                // VectorFloatSlideUpFormat; tmp_d0 is presumably the
+                // destination register buffer -- confirm.
+                // NOTE(review): the scalar is read as Rs1_vu although this
+                // is a floating-point form -- confirm the format maps it to
+                // the FP source register.
+                0x0e: VectorFloatSlideUpFormat::vfslide1up_vf({{
+                    // fixed slide amount of one element
+                    const int offset = 1;
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    // whole registers between destination and source
+                    const int vregOffset = vdIdx - vs2Idx;
+                    // slide amount left once that register distance is
+                    // accounted for; may be negative
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    // nothing to do unless the residual slide is shorter
+                    // than one register
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        // number of elements the two registers share
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        // first destination element of the overlap
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        // first source element of the overlap
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        // mask index of the first written element
+                        const int elemOffset = vdOffset + vdIdx * microVlmax;
+                        // copy the overlap, bounded by microVl, skipping
+                        // elements that are masked off
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            if (this->vm || elem_mask(v0, i + elemOffset)) {
+                                Vd_vu[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                            }
+                        }
+                        // TODO: dirty code
+                        // element 0 takes the scalar, only for the first
+                        // register pair and only when unmasked
+                        if (vdIdx == 0 && vs2Idx == 0 &&
+                                (this->vm || elem_mask(v0, 0))) {
+                            tmp_d0.as<vu>()[0] = Rs1_vu;
+                        }
+                    }
+                }}, OPFVF, VectorMiscOp);
+                0x0f: VectorFloatSlideDownFormat::vfslide1down_vf({{
+                    const int offset = 1; // slide distance: one element
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    const int vregOffset = vs2Idx - vdIdx; // in vregs
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    const int numVs2s = vtype_regs_per_group(vtype);
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        const bool needZeroTail = numVs2s == vs2Idx + 1;
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        const int elemIdxBase = vdIdx * microVlmax;
+                        vreg_t resVreg;
+                        auto res = resVreg.as<vu>();
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            res[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                        }
+                        if (needZeroTail) {
+                            for (int i = upperBound + vdOffset;
+                                i < microVlmax; i++) {
+                                res[i] = 0; // zero-fill the rest of res
+                            }
+                        }
+                        for (int i = vdOffset; i < microVl ; i++) {
+                            if (vm || elem_mask(v0, i + elemIdxBase)) {
+                                Vd_vu[i] = (i + elemIdxBase != machInst.vl - 1)
+                                    ? res[i]
+                                    : Rs1_vu; // NOTE(review): Rs1 in a .vf op?
+                            }
+                        }
+                    }
+                }}, OPFVF, VectorMiscOp);
+                // VRFUNARY0
+                0x10: decode VS2 {
+                    0x00: decode VM {
+                        // The encodings corresponding to the masked versions
+                        // (vm=0) of vfmv.s.f are reserved.
+                        0x1: VectorNonSplitFormat::vfmv_s_f({{
+                            auto fd = ftype_freg<et>(freg(Fs1_bits));
+                            Vd_vu[0] = fd.v;
+                        }}, OPFVV, VectorMiscOp); // NOTE(review): not OPFVF?
+                    }
+                }
+                format VectorFloatFormat{
+                    0x17: decode VM {
+                        0x0: vfmerge_vfm({{
+                            Vd_vu[i] = elem_mask(v0, ei)
+                                    ? ftype_freg<et>(freg(Fs1_bits)).v
+                                    : Vs2_vu[i];
+                        }}, OPFVF, VectorFloatArithOp); // v0 bit ? fs1 : vs2
+                        0x1: vfmv_v_f({{
+                            auto fd = ftype_freg<et>(freg(Fs1_bits));
+                            Vd_vu[i] = fd.v;
+                        }}, OPFVF, VectorFloatArithOp); // every elem = fs1
+                    }
+                }
+                format VectorFloatMaskFormat { // one Vd bit per element
+                    0x18: vmfeq_vf({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            feq<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype_freg<et>(freg(Fs1_bits))));
+                    }}, OPFVF, VectorFloatArithOp); // feq(vs2[i], fs1)
+                    0x19: vmfle_vf({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            fle<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype_freg<et>(freg(Fs1_bits))));
+                    }}, OPFVF, VectorFloatArithOp); // fle(vs2[i], fs1)
+                    0x1b: vmflt_vf({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            flt<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype_freg<et>(freg(Fs1_bits))));
+                    }}, OPFVF, VectorFloatArithOp); // flt(vs2[i], fs1)
+                    0x1c: vmfne_vf({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            !feq<et>(ftype<et>(Vs2_vu[i]),
+                                     ftype_freg<et>(freg(Fs1_bits))));
+                    }}, OPFVF, VectorFloatArithOp); // !feq(vs2[i], fs1)
+                    0x1d: vmfgt_vf({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            flt<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                    ftype<et>(Vs2_vu[i])));
+                    }}, OPFVF, VectorFloatArithOp); // flt(fs1, vs2[i])
+                    0x1f: vmfge_vf({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            fle<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                    ftype<et>(Vs2_vu[i])));
+                    }}, OPFVF, VectorFloatArithOp); // fle(fs1, vs2[i])
+                }
+                format VectorFloatFormat{ // fmadd(a,b,c): a*b+c presumably
+                    0x20: vfdiv_vf({{
+                        auto fd = fdiv<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype_freg<et>(freg(Fs1_bits)));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // vs2[i] / fs1
+                    0x21: vfrdiv_vf({{
+                        auto fd = fdiv<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                           ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // fs1 / vs2[i]
+                    0x24: vfmul_vf({{
+                        auto fd = fmul<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype_freg<et>(freg(Fs1_bits)));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // vs2[i] * fs1
+                    0x27: vfrsub_vf({{
+                        auto fd = fsub<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                           ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // fs1 - vs2[i]
+                    0x28: vfmadd_vf({{
+                        auto fd = fmadd<et>(ftype<et>(Vs3_vu[i]),
+                                            ftype_freg<et>(freg(Fs1_bits)),
+                                            ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // vs3*fs1 + vs2
+                    0x29: vfnmadd_vf({{
+                        auto fd = fmadd<et>(fneg(ftype<et>(Vs3_vu[i])),
+                                            ftype_freg<et>(freg(Fs1_bits)),
+                                            fneg(ftype<et>(Vs2_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // -(vs3*fs1) - vs2
+                    0x2a: vfmsub_vf({{
+                        auto fd = fmadd<et>(ftype<et>(Vs3_vu[i]),
+                                            ftype_freg<et>(freg(Fs1_bits)),
+                                            fneg(ftype<et>(Vs2_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // vs3*fs1 - vs2
+                    0x2b: vfnmsub_vf({{
+                        auto fd = fmadd<et>(fneg(ftype<et>(Vs3_vu[i])),
+                                            ftype_freg<et>(freg(Fs1_bits)),
+                                            ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // -(vs3*fs1) + vs2
+                    0x2c: vfmacc_vf({{
+                        auto fd = fmadd<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                            ftype<et>(Vs2_vu[i]),
+                                            ftype<et>(Vs3_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // fs1*vs2 + vs3
+                    0x2d: vfnmacc_vf({{
+                        auto fd = fmadd<et>(
+                            fneg(ftype_freg<et>(freg(Fs1_bits))),
+                            ftype<et>(Vs2_vu[i]),
+                            fneg(ftype<et>(Vs3_vu[i]))
+                        );
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // -(fs1*vs2) - vs3
+                    0x2e: vfmsac_vf({{
+                        auto fd = fmadd<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                            ftype<et>(Vs2_vu[i]),
+                                            fneg(ftype<et>(Vs3_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // fs1*vs2 - vs3
+                    0x2f: vfnmsac_vf({{
+                        auto fd = fmadd<et>(
+                            fneg(ftype_freg<et>(freg(Fs1_bits))),
+                            ftype<et>(Vs2_vu[i]),
+                            ftype<et>(Vs3_vu[i])
+                        );
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // -(fs1*vs2) + vs3
+                }
+                format VectorFloatWideningFormat { // fmadd(a,b,c): a*b+c (?)
+                    0x30: vfwadd_vf({{
+                        auto fd = fadd<ewt>(
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // wide(vs2) + wide(fs1)
+                    0x32: vfwsub_vf({{
+                        auto fd = fsub<ewt>(
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // wide(vs2) - wide(fs1)
+                    0x34: vfwadd_wf({{
+                        auto fd = fadd<ewt>(
+                            ftype<ewt>(Vs2_vwu[i]),
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // vs2 + wide(fs1)
+                    0x36: vfwsub_wf({{
+                        auto fd = fsub<ewt>(
+                            ftype<ewt>(Vs2_vwu[i]),
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // vs2 - wide(fs1)
+                    0x38: vfwmul_vf({{
+                        auto fd = fmul<ewt>(
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // wide(vs2) * wide(fs1)
+                    0x3c: vfwmacc_vf({{
+                        auto fd = fmadd<ewt>(
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            ftype<ewt>(Vs3_vwu[i]));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // fs1*vs2 + vs3
+                    0x3d: vfwnmacc_vf({{
+                        auto fd = fmadd<ewt>(
+                            fwiden(fneg(ftype_freg<et>(freg(Fs1_bits)))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fneg(ftype<ewt>(Vs3_vwu[i])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // -(fs1*vs2) - vs3
+                    0x3e: vfwmsac_vf({{
+                        auto fd = fmadd<ewt>(
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fneg(ftype<ewt>(Vs3_vwu[i])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // fs1*vs2 - vs3
+                    0x3f: vfwnmsac_vf({{
+                        auto fd = fmadd<ewt>(
+                            fwiden(fneg(ftype_freg<et>(freg(Fs1_bits)))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            ftype<ewt>(Vs3_vwu[i]));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp); // -(fs1*vs2) + vs3
+                }
+            }
+            // OPMVX
+            0x6: decode VFUNCT6 {
+                format VectorIntFormat { // averaging adds: sum >> 1
+                    0x08: vaaddu_vx({{
+                        __uint128_t res = (__uint128_t)Vs2_vu[i] + Rs1_vu;
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vu[i] = res >> 1;
+                    }}, OPMVX, VectorIntegerArithOp); // unsigned operands
+                    0x09: vaadd_vx({{
+                        __uint128_t res = (__uint128_t)Vs2_vi[i] + Rs1_vi;
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vi[i] = res >> 1;
+                    }}, OPMVX, VectorIntegerArithOp); // signed operands
+                }
+                0x0e: VectorSlideUpFormat::vslide1up_vx({{
+                    const int offset = 1; // slide distance: one element
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    const int vregOffset = vdIdx - vs2Idx; // in vregs
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        const int elemOffset = vdOffset + vdIdx * microVlmax;
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            if (this->vm || elem_mask(v0, i + elemOffset)) {
+                                Vd_vu[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                            }
+                        }
+                        // TODO: dirty code: element 0 is set through tmp_d0.
+                        if (vdIdx == 0 && vs2Idx == 0 &&
+                                (this->vm || elem_mask(v0, 0))) {
+                            tmp_d0.as<vu>()[0] = Rs1_vu;
+                        }
+                    }
+                }}, OPIVX, VectorMiscOp); // NOTE(review): OPIVX in OPMVX?
+                0x0f: VectorSlideDownFormat::vslide1down_vx({{
+                    const int offset = 1; // slide distance: one element
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    const int vregOffset = vs2Idx - vdIdx; // in vregs
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    const int numVs2s = vtype_regs_per_group(vtype);
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        const bool needZeroTail = numVs2s == vs2Idx + 1;
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        const int elemIdxBase = vdIdx * microVlmax;
+                        vreg_t resVreg;
+                        auto res = resVreg.as<vu>();
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            res[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                        }
+                        if (needZeroTail) {
+                            for (int i = upperBound + vdOffset;
+                                i < microVlmax; i++) {
+                                res[i] = 0; // zero-fill the rest of res
+                            }
+                        }
+                        for (int i = vdOffset; i < microVl ; i++) {
+                            if (vm || elem_mask(v0, i + elemIdxBase)) {
+                                Vd_vu[i] = (i + elemIdxBase != machInst.vl - 1)
+                                    ? res[i]
+                                    : Rs1_vu; // element vl-1 takes the scalar
+                            }
+                        }
+                    }
+                }}, OPIVX, VectorMiscOp); // NOTE(review): OPIVX in OPMVX?
+                // VRXUNARY0
+                0x10: decode VS2 {
+                    0x00: decode VM {
+                        // The encodings corresponding to the masked versions
+                        // (vm=0) of vmv.s.x are reserved.
+                        0x1: VectorNonSplitFormat::vmv_s_x({{
+                            Vd_vu[0] = Rs1_vu; // only element 0 is written
+                        }}, OPMVX, VectorMiscOp);
+                    }
+                }
+                format VectorIntFormat { // vasub, div/rem, mul, mul-add ops
+                    0x0a: vasubu_vx({{
+                        __uint128_t res = (__uint128_t)Vs2_vu[i] - Rs1_vu;
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vu[i] = res >> 1;
+                    }}, OPMVX, VectorIntegerArithOp); // unsigned operands
+                    0x0b: vasub_vx({{
+                        __uint128_t res = (__uint128_t)Vs2_vi[i] - Rs1_vi;
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vi[i] = res >> 1;
+                    }}, OPMVX, VectorIntegerArithOp); // signed operands
+                    0x20: vdivu_vx({{
+                        if (Rs1_vu == 0)
+                            Vd_vu[i] = (vu)-1; // x / 0 yields all ones
+                        else
+                            Vd_vu[i] = Vs2_vu[i] / Rs1_vu;
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x21: vdiv_vx({{
+                        if (Rs1_vi == 0)
+                            Vd_vi[i] = -1; // x / 0 yields -1
+                        else if (Vs2_vi[i] == std::numeric_limits<vi>::min()
+                                && Rs1_vi == -1)
+                            Vd_vi[i] = Vs2_vi[i]; // min / -1: keep dividend
+                        else
+                            Vd_vi[i] = Vs2_vi[i] / Rs1_vi;
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x22: vremu_vx({{
+                        if (Rs1_vu == 0)
+                            Vd_vu[i] = Vs2_vu[i]; // x % 0 keeps dividend
+                        else
+                            Vd_vu[i] = Vs2_vu[i] % Rs1_vu;
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x23: vrem_vx({{
+                        if (Rs1_vi == 0)
+                            Vd_vi[i] = Vs2_vi[i]; // x % 0 keeps dividend
+                        else if (Vs2_vi[i] == std::numeric_limits<vi>::min()
+                                && Rs1_vi == -1)
+                            Vd_vi[i] = 0; // min % -1: result is 0
+                        else
+                            Vd_vi[i] = Vs2_vi[i] % Rs1_vi;
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x24: vmulhu_vx({{
+                        if (sew < 64) // 64-bit product, then shift by sew
+                            Vd_vu[i] = ((uint64_t)Vs2_vu[i] * Rs1_vu)
+                                        >> sew;
+                        else
+                            Vd_vu[i] = mulhu_64(Vs2_vu[i], Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x25: vmul_vx({{
+                        Vd_vi[i] = Vs2_vi[i] * Rs1_vi;
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x26: vmulhsu_vx({{
+                        if (sew < 64) // 64-bit product, then shift by sew
+                            Vd_vi[i] = ((int64_t)Vs2_vi[i] *
+                                        (uint64_t)Rs1_vu)
+                                        >> sew;
+                        else
+                            Vd_vi[i] = mulhsu_64(Vs2_vi[i], Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x27: vmulh_vx({{
+                        if (sew < 64) // 64-bit product, then shift by sew
+                            Vd_vi[i] = ((int64_t)Vs2_vi[i] * Rs1_vi)
+                                        >> sew;
+                        else
+                            Vd_vi[i] = mulh_64(Vs2_vi[i], Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x29: vmadd_vx({{
+                        Vd_vi[i] = Vs3_vi[i] * Rs1_vi + Vs2_vi[i];
+                    }}, OPMVX, VectorIntegerArithOp); // vs3*x + vs2
+                    0x2b: vnmsub_vx({{
+                        Vd_vi[i] = -(Vs3_vi[i] * Rs1_vi) + Vs2_vi[i];
+                    }}, OPMVX, VectorIntegerArithOp); // -(vs3*x) + vs2
+                    0x2d: vmacc_vx({{
+                        Vd_vi[i] = Vs2_vi[i] * Rs1_vi + Vs3_vi[i];
+                    }}, OPMVX, VectorIntegerArithOp); // vs2*x + vs3
+                    0x2f: vnmsac_vx({{
+                        Vd_vi[i] = -(Vs2_vi[i] * Rs1_vi) + Vs3_vi[i];
+                    }}, OPMVX, VectorIntegerArithOp); // -(vs2*x) + vs3
+                }
+                format VectorIntWideningFormat { // Vd written via vwu/vwi
+                    0x30: vwaddu_vx({{
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset]) + vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x31: vwadd_vx({{
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset]) + vwi(Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x32: vwsubu_vx({{
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset]) - vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x33: vwsub_vx({{
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset]) - vwi(Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x34: vwaddu_wx({{
+                        Vd_vwu[i] = Vs2_vwu[i] + vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp); // vs2 read wide
+                    0x35: vwadd_wx({{
+                        Vd_vwi[i] = Vs2_vwi[i] + vwi(Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp); // vs2 read wide
+                    0x36: vwsubu_wx({{
+                        Vd_vwu[i] = Vs2_vwu[i] - vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp); // vs2 read wide
+                    0x37: vwsub_wx({{
+                        Vd_vwi[i] = Vs2_vwi[i] - vwi(Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp); // vs2 read wide
+                    0x38: vwmulu_vx({{
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset]) * vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x3a: vwmulsu_vx({{
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset]) * vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp); // signed * unsigned
+                    0x3b: vwmul_vx({{
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset]) * vwi(Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x3c: vwmaccu_vx({{
+                        Vd_vwu[i] = vwu(Rs1_vu) * vwu(Vs2_vu[i + offset])
+                                + Vs3_vwu[i];
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x3d: vwmacc_vx({{
+                        Vd_vwi[i] = vwi(Rs1_vi) * vwi(Vs2_vi[i + offset])
+                                + Vs3_vwi[i];
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x3e: vwmaccus_vx({{
+                        Vd_vwi[i] = vwu(Rs1_vu) * vwi(Vs2_vi[i + offset])
+                                + Vs3_vwi[i];
+                    }}, OPMVX, VectorIntegerArithOp); // unsigned x * signed
+                    0x3f: vwmaccsu_vx({{
+                        Vd_vwi[i] = vwi(Rs1_vi) * vwu(Vs2_vu[i + offset])
+                                + Vs3_vwi[i];
+                    }}, OPMVX, VectorIntegerArithOp); // signed x * unsigned
+                }
+            }
             0x7: decode BIT31 {
                 format VConfOp {
                     0x0: vsetvli({{
diff --git a/src/arch/riscv/isa/formats/formats.isa b/src/arch/riscv/isa/formats/formats.isa
index 4bdc3021d5..0102df17d7 100644
--- a/src/arch/riscv/isa/formats/formats.isa
+++ b/src/arch/riscv/isa/formats/formats.isa
@@ -38,6 +38,7 @@
 ##include "amo.isa"
 ##include "bs.isa"
 ##include "vector_conf.isa"
+##include "vector_arith.isa"
 ##include "vector_mem.isa"
 
 // Include formats for nonstandard extensions
diff --git a/src/arch/riscv/isa/formats/vector_arith.isa b/src/arch/riscv/isa/formats/vector_arith.isa
new file mode 100644
index 0000000000..c462e6c8d4
--- /dev/null
+++ b/src/arch/riscv/isa/formats/vector_arith.isa
@@ -0,0 +1,1319 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+let {{
+    def setDestWrapper(destRegId):  # C++ text: add destRegId as a vector dest
+        return "setDestRegIdx(_numDestRegs++, " + destRegId + ");\n" + \
+               "_numTypedDestRegs[VecRegClass]++;\n"
+    def setSrcWrapper(srcRegId):  # C++ text: add srcRegId as the next source
+        return "setSrcRegIdx(_numSrcRegs++, " + srcRegId + ");\n"
+    def setSrcVm():  # C++ text: add vecRegClass[0] as a source when !this->vm
+        return "if (!this->vm)\n" + \
+               "    setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);"
+    def vmDeclAndReadData():  # declare v0; if !vm, read it from the last source
+        return '''
+            [[maybe_unused]] RiscvISA::vreg_t tmp_v0;
+            [[maybe_unused]] uint8_t* v0;
+            if(!machInst.vm) {
+                xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0);
+                v0 = tmp_v0.as<uint8_t>();
+            }
+        '''
+    def copyOldVd(vd_idx):  # C++ text: COPY_OLD_VD(<vd_idx>);
+        return 'COPY_OLD_VD(%d);' % vd_idx
+    def loopWrapper(code, micro_inst = True):  # for-loop over i around code
+        if micro_inst:
+            upper_bound = "this->microVl"
+        else:
+            upper_bound = "(uint32_t)machInst.vl"
+        return '''
+            for (uint32_t i = 0; i < %s; i++) {
+                %s
+            }
+        ''' % (upper_bound, code)
+    def maskCondWrapper(code):  # guard code with this->vm || elem_mask(v0, ei)
+        return "if (this->vm || elem_mask(v0, ei)) {\n" + \
+               code + "}\n"
+    def eiDeclarePrefix(code, widening = False):  # declare ei ahead of code
+        if widening:
+            return '''
+            uint32_t ei = i + micro_vlmax * this->microIdx;
+            ''' + code
+        else:
+            return '''
+            uint32_t ei = i + vtype_VLMAX(vtype, true) * this->microIdx;
+            ''' + code
+    # Prepend IllegalInstFault checks: vd group alignment and vs2/vd overlap.
+    def wideningOpRegisterConstraintChecks(code):
+        return '''
+            const uint32_t num_microops = 1 << std::max<int64_t>(0, vtype_vlmul(machInst.vtype8) + 1);
+            if ((machInst.vd % alignToPowerOfTwo(num_microops)) != 0) {
+                std::string error =
+                    csprintf("Unaligned Vd group in Widening op");
+                return std::make_shared<IllegalInstFault>(error, machInst);
+            }
+            if ((machInst.vs2 <= machInst.vd) && (machInst.vd < (machInst.vs2 + num_microops - 1))) {
+                // A destination vector register group can overlap a source vector
+                // register group if The destination EEW is greater than the source
+                // EEW, the source EMUL is at least 1, and the overlap is in the
+                // highest- numbered part of the destination register group.
+                std::string error =
+                    csprintf("Unsupported overlap in Vs2 and Vd for Widening op");
+                return std::make_shared<IllegalInstFault>(error, machInst);
+            }
+            ''' + code
+    # Prepend IllegalInstFault checks: vs2 group alignment and vs2/vd overlap.
+    def narrowingOpRegisterConstraintChecks(code):
+        return '''
+            const uint32_t num_microops = 1 << std::max<int64_t>(0, vtype_vlmul(machInst.vtype8) + 1);
+            if ((machInst.vs2 % alignToPowerOfTwo(num_microops)) != 0) {
+                std::string error =
+                    csprintf("Unaligned VS2 group in Narrowing op");
+                return std::make_shared<IllegalInstFault>(error, machInst);
+            }
+            if ((machInst.vs2 < machInst.vd) && (machInst.vd <= (machInst.vs2 + num_microops - 1))) {
+                // A destination vector register group can overlap a source vector
+                // register group The destination EEW is smaller than the source EEW
+                // and the overlap is in the lowest-numbered part of the source
+                // register group
+                std::string error =
+                    csprintf("Unsupported overlap in Vs2 and Vd for Narrowing op");
+                return std::make_shared<IllegalInstFault>(error, machInst);
+            }
+        ''' + code
+    # Run code between an FFLAGS read and a write-back that ORs in softfloat flags.
+    def fflags_wrapper(code):
+        return '''
+        RegVal FFLAGS = xc->readMiscReg(MISCREG_FFLAGS);
+        std::feclearexcept(FE_ALL_EXCEPT);
+        ''' + code + '''
+        FFLAGS |= softfloat_exceptionFlags;
+        softfloat_exceptionFlags = 0;
+        xc->setMiscReg(MISCREG_FFLAGS, FFLAGS);
+        '''
+}};
+
+
+def format VectorIntFormat(code, category, *flags) {{
+    # Integer vector arithmetic.  `code` is the per-element C++ body and
+    # `category` picks the first source: OPIVV/OPMVV -> vs1, OPIVX/OPMVX ->
+    # rs1, OPIVI -> none.  Any other category is reported through error().
+    macroop_class_name = 'VectorArithMacroInst'
+    microop_class_name = 'VectorArithMicroInst'
+    # vid_v uses the VMUNARY0 base classes for its macro-op and micro-op.
+    if name == "vid_v" :
+        macroop_class_name = 'VectorVMUNARY0MacroInst'
+        microop_class_name = 'VectorVMUNARY0MicroInst'
+
+    iop = InstObjParams(name, Name, macroop_class_name, {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    # vmv adds no v0 source; vvm/vxm/vim read v0 but skip the per-element test.
+    v0_required = inst_name not in ["vmv"]
+    mask_cond = v0_required and (inst_suffix not in ['vvm', 'vxm', 'vim'])
+    need_elem_idx = mask_cond or code.find("ei") != -1
+
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    num_src_regs = 0
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    num_src_regs += 1
+    src1_reg_id = ""
+    if category in ["OPIVV", "OPMVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx]"
+        num_src_regs += 1
+    elif category in ["OPIVX", "OPMVX"]:
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+        num_src_regs += 1
+    elif category == "OPIVI":
+        pass
+    else:
+        error("not supported category for VectorIntFormat: %s" % category)
+
+    # Old vd follows src1/src2, so its source index is the count so far.
+    old_vd_idx = num_src_regs
+    src3_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    if category != "OPIVI":
+        set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    if v0_required:
+        set_src_reg_idx += setSrcVm()
+
+    # Wrap the element body: mask test, then ei declaration, then the loop.
+    if mask_cond:
+        code = maskCondWrapper(code)
+    if need_elem_idx:
+        code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+    vm_decl_rd = ""
+    if v0_required:
+        vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        microop_class_name,
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntMicroDeclare.subst(microiop) + \
+        VectorIntMicroConstructor.subst(microiop) + \
+        VectorIntMicroExecute.subst(microiop) + \
+        VectorIntMacroDeclare.subst(iop) + \
+        VectorIntMacroConstructor.subst(iop)
+
+    decode_block = VectorIntDecodeBlock.subst(iop)
+}};
+
+
+def format VectorIntExtFormat(code, category, *flags) {{
+    # Integer extension format: the last character of the mnemonic suffix is
+    # parsed as the divisor applied to _microIdx when selecting the vs2 register.
+    macro_params = InstObjParams(
+        name, Name, 'VectorArithMacroInst', {'code': code}, flags)
+    _, suffix = name.split("_", maxsplit=1)
+    ext_div = int(suffix[-1])
+
+    # The operand order below places the old destination at source index 1.
+    vd_reg = "vecRegClass[_machInst.vd + _microIdx]"
+    vs2_reg = "vecRegClass[_machInst.vs2 + _microIdx / %s]" % ext_div
+    old_vd_reg = "vecRegClass[_machInst.vs3 + _microIdx]"
+
+    src_setup = "".join([
+        setSrcWrapper(vs2_reg),
+        setSrcWrapper(old_vd_reg),
+        setSrcVm(),
+    ])
+
+    # Innermost to outermost: mask test, element index, per-element loop.
+    looped = loopWrapper(eiDeclarePrefix(maskCondWrapper(code)))
+    micro_params = InstObjParams(
+        name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {
+            'code': looped,
+            'set_dest_reg_idx': setDestWrapper(vd_reg),
+            'set_src_reg_idx': src_setup,
+            'vm_decl_rd': vmDeclAndReadData(),
+            'copy_old_vd': copyOldVd(1),
+            'ext_div': ext_div,
+        },
+        flags)
+
+    # Templates are involved, so every generated piece goes into the header
+    # to keep the compiler happy.
+    header_output = "".join([
+        VectorIntExtMicroDeclare.subst(micro_params),
+        VectorIntMicroConstructor.subst(micro_params),
+        VectorIntExtMicroExecute.subst(micro_params),
+        VectorIntExtMacroDeclare.subst(macro_params),
+        VectorIntMacroConstructor.subst(macro_params),
+    ])
+    decode_block = VectorIntDecodeBlock.subst(macro_params)
+}};
+
+def format VectorIntWideningFormat(code, category, *flags) {{
+    # Widening integer format.  vd steps once per micro-op; vs1 and a vv/vx
+    # vs2 step every second micro-op, while a wv/wx vs2 steps with vd.
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    v0_required = True
+    mask_cond = v0_required
+    need_elem_idx = mask_cond or code.find("ei") != -1
+    old_vd_idx = 2
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src1_reg_id = ""
+    if category in ["OPIVV", "OPMVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx / 2]"
+    elif category in ["OPIVX", "OPMVX"]:
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+    else:
+        error("not supported category for VectorIntWideningFormat: %s"
+              % category)
+    src2_reg_id = ""
+    if inst_suffix in ["vv", "vx"]:
+        src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx / 2]"
+    elif inst_suffix in ["wv", "wx"]:
+        src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    if v0_required:
+        set_src_reg_idx += setSrcVm()
+
+    # Wrap inside out: mask test, ei, element loop, register checks.
+    if mask_cond:
+        code = maskCondWrapper(code)
+    if need_elem_idx:
+        code = eiDeclarePrefix(code, widening=True)
+    code = loopWrapper(code)
+    code = wideningOpRegisterConstraintChecks(code)
+
+    vm_decl_rd = ""
+    if v0_required:
+        vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntWideningMicroDeclare.subst(microiop) + \
+        VectorIntWideningMicroConstructor.subst(microiop) + \
+        VectorIntWideningMicroExecute.subst(microiop) + \
+        VectorIntWideningMacroDeclare.subst(iop) + \
+        VectorIntWideningMacroConstructor.subst(iop)
+    decode_block = VectorIntWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorIntNarrowingFormat(code, category, *flags) {{
+    # Narrowing integer format.  vs2 steps once per micro-op; vd, vs1 and the
+    # old destination step every second micro-op.  OPIVI has no first source.
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+
+    old_vd_idx = 2
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx / 2]"
+    if category in ["OPIVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx / 2]"
+    elif category in ["OPIVX"]:
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+    elif category == "OPIVI":
+        old_vd_idx = 1
+    else:
+        error("not supported category for VectorIntNarrowingFormat: %s"
+              % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vs3 + _microIdx / 2]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = ""
+    if category != "OPIVI":
+        set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    # Wrap inside out: mask test, ei, element loop, register checks.
+    code = maskCondWrapper(code)
+    code = eiDeclarePrefix(code, widening=True)
+    code = loopWrapper(code)
+    code = narrowingOpRegisterConstraintChecks(code)
+    vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         },
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntWideningMicroDeclare.subst(microiop) + \
+        VectorIntWideningMicroConstructor.subst(microiop) + \
+        VectorIntNarrowingMicroExecute.subst(microiop) + \
+        VectorIntWideningMacroDeclare.subst(iop) + \
+        VectorIntWideningMacroConstructor.subst(iop)
+    decode_block = VectorIntWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorIntMaskFormat(code, category, *flags) {{
+    # Integer format writing VecMemInternalReg0 + _microIdx; the old vd is
+    # read as a source.  OPIVI has no first source.
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    # vmadc/vmsbc skip the mask test; their vv/vx/vi forms do not read v0.
+    v0_required = not (inst_name in ["vmadc", "vmsbc"] \
+        and inst_suffix in ["vv", "vx", "vi"])
+    mask_cond = inst_name not in ['vmadc', 'vmsbc']
+    need_elem_idx = mask_cond or code.find("ei") != -1
+
+    old_vd_idx = 2
+    dest_reg_id = "vecRegClass[VecMemInternalReg0 + _microIdx]"
+    src1_reg_id = ""
+    if category == "OPIVV":
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx]"
+    elif category == "OPIVX":
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+    elif category == "OPIVI":
+        old_vd_idx = 1
+    else:
+        error("not supported category for VectorIntMaskFormat: %s" % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = ""
+    if category != "OPIVI":
+        set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    if v0_required:
+        set_src_reg_idx += setSrcVm()
+
+    # Wrap the element body: mask test, then ei declaration, then the loop.
+    if mask_cond:
+        code = maskCondWrapper(code)
+    if need_elem_idx:
+        code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+
+    vm_decl_rd = ""
+    if v0_required:
+        vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntMaskMicroDeclare.subst(microiop) + \
+        VectorIntMaskMicroConstructor.subst(microiop) + \
+        VectorIntMaskMicroExecute.subst(microiop) + \
+        VectorIntMaskMacroDeclare.subst(iop) + \
+        VectorIntMaskMacroConstructor.subst(iop)
+    decode_block = VectorIntDecodeBlock.subst(iop)
+}};
+
+def format VectorGatherFormat(code, category, *flags) {{
+    # Gather format.  Register strings use vd_idx/vs1_idx/vs2_idx rather than
+    # _microIdx, and `code` is passed through without loop or mask wrapping.
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    # vrgatherei16 uses uint16_t as idx_type; everything else uses elem_type.
+    if inst_name == "vrgatherei16":
+        idx_type = "uint16_t"
+    else:
+        idx_type = "elem_type"
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst',
+        {'idx_type': idx_type,
+         'code': code},
+        flags)
+    old_vd_idx = 2
+    dest_reg_id = "vecRegClass[_machInst.vd + vd_idx]"
+    src1_reg_id = ""
+    if category in ["OPIVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + vs1_idx]"
+    elif category in ["OPIVX"]:
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+    elif category == "OPIVI":
+        old_vd_idx = 1
+    else:
+        error("not supported category for VectorGatherFormat: %s" % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + vs2_idx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + vd_idx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    if category != "OPIVI":
+        set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    set_src_reg_idx += setSrcVm()
+
+    vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         'idx_type': idx_type},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorGatherMicroDeclare.subst(microiop) + \
+        VectorGatherMicroConstructor.subst(microiop) + \
+        VectorGatherMicroExecute.subst(microiop) + \
+        VectorGatherMacroDeclare.subst(iop) + \
+        VectorGatherMacroConstructor.subst(iop)
+
+    decode_block = VectorGatherDecodeBlock.subst(iop)
+}};
+
+def format VectorFloatFormat(code, category, *flags) {{
+    # Floating-point vector arithmetic: OPFVV takes vs1 as the first source,
+    # OPFVF takes floating-point rs1; other categories go to error().
+    macro_params = InstObjParams(
+        name, Name, 'VectorArithMacroInst', {'code': code}, flags)
+    mnemonic, suffix = name.split("_", maxsplit=1)
+
+    # vfmv adds no v0 source; vvm/vfm read v0 but skip the per-element test.
+    uses_v0 = mnemonic not in ["vfmv"]
+    masked_body = uses_v0 and (suffix not in ['vvm', 'vfm'])
+    wants_ei = masked_body or "ei" in code
+
+    first_src_by_category = {
+        "OPFVV": "vecRegClass[_machInst.vs1 + _microIdx]",
+        "OPFVF": "floatRegClass[_machInst.rs1]",
+    }
+    if category not in first_src_by_category:
+        error("not supported category for VectorFloatFormat: %s" % category)
+    first_src = first_src_by_category.get(category, "")
+
+    # Source order: first source, vs2, old destination, then optionally v0.
+    src_setup = "".join([
+        setSrcWrapper(first_src),
+        setSrcWrapper("vecRegClass[_machInst.vs2 + _microIdx]"),
+        setSrcWrapper("vecRegClass[_machInst.vs3 + _microIdx]"),
+        setSrcVm() if uses_v0 else "",
+    ])
+    mask_read = vmDeclAndReadData() if uses_v0 else ""
+
+    # Build the body inside out; the FFLAGS handling wraps the whole loop.
+    body = maskCondWrapper(code) if masked_body else code
+    if wants_ei:
+        body = eiDeclarePrefix(body)
+    body = fflags_wrapper(loopWrapper(body))
+
+    micro_params = InstObjParams(
+        name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {
+            'code': body,
+            'set_dest_reg_idx':
+                setDestWrapper("vecRegClass[_machInst.vd + _microIdx]"),
+            'set_src_reg_idx': src_setup,
+            'vm_decl_rd': mask_read,
+            'copy_old_vd': copyOldVd(2),
+        },
+        flags)
+
+    # Templates are involved, so every generated piece goes into the header.
+    header_output = "".join([
+        VectorFloatMicroDeclare.subst(micro_params),
+        VectorFloatMicroConstructor.subst(micro_params),
+        VectorFloatMicroExecute.subst(micro_params),
+        VectorFloatMacroDeclare.subst(macro_params),
+        VectorFloatMacroConstructor.subst(macro_params),
+    ])
+
+    decode_block = VectorFloatDecodeBlock.subst(macro_params)
+}};
+
+def format VectorFloatCvtFormat(code, category, *flags) {{
+    # Floating-point conversion format: one vector source (vs2); the element
+    # body is always wrapped in the mask test and the old vd is source index 1.
+    macro_params = InstObjParams(
+        name, Name, 'VectorArithMacroInst', {'code': code}, flags)
+
+    vd_reg = "vecRegClass[_machInst.vd + _microIdx]"
+    vs2_reg = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_vd_reg = "vecRegClass[_machInst.vs3 + _microIdx]"
+
+    src_setup = "".join([
+        setSrcWrapper(vs2_reg),
+        setSrcWrapper(old_vd_reg),
+        setSrcVm(),
+    ])
+
+    # Inside out: mask test, ei declaration, element loop, FFLAGS handling.
+    body = fflags_wrapper(loopWrapper(eiDeclarePrefix(maskCondWrapper(code))))
+
+    micro_params = InstObjParams(
+        name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {
+            'code': body,
+            'set_dest_reg_idx': setDestWrapper(vd_reg),
+            'set_src_reg_idx': src_setup,
+            'vm_decl_rd': vmDeclAndReadData(),
+            'copy_old_vd': copyOldVd(1),
+        },
+        flags)
+
+    # Templates are involved, so every generated piece goes into the header.
+    header_output = "".join([
+        VectorFloatCvtMicroDeclare.subst(micro_params),
+        VectorFloatMicroConstructor.subst(micro_params),
+        VectorFloatMicroExecute.subst(micro_params),
+        VectorFloatCvtMacroDeclare.subst(macro_params),
+        VectorFloatMacroConstructor.subst(macro_params),
+    ])
+
+    decode_block = VectorFloatDecodeBlock.subst(macro_params)
+}};
+
+def format VectorFloatWideningFormat(code, category, *flags) {{
+    # Widening floating-point format.  vd steps once per micro-op; vs1 and a
+    # vv/vf vs2 step every second micro-op, while a wv/wf vs2 steps with vd.
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    v0_required = True
+    mask_cond = v0_required
+    need_elem_idx = mask_cond or code.find("ei") != -1
+
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src1_reg_id = ""
+    if category in ["OPFVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx / 2]"
+    elif category in ["OPFVF"]:
+        src1_reg_id = "floatRegClass[_machInst.rs1]"
+    else:
+        error("not supported category for VectorFloatWideningFormat: %s"
+              % category)
+    src2_reg_id = ""
+    if inst_suffix in ["vv", "vf"]:
+        src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx / 2]"
+    elif inst_suffix in ["wv", "wf"]:
+        src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    if v0_required:
+        set_src_reg_idx += setSrcVm()
+
+    # Wrap inside out: mask test, ei, loop, FFLAGS handling, register checks.
+    if mask_cond:
+        code = maskCondWrapper(code)
+    if need_elem_idx:
+        code = eiDeclarePrefix(code, widening=True)
+    code = loopWrapper(code)
+    code = fflags_wrapper(code)
+    code = wideningOpRegisterConstraintChecks(code)
+
+    vm_decl_rd = ""
+    if v0_required:
+        vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(2)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntWideningMicroDeclare.subst(microiop) + \
+        VectorIntWideningMicroConstructor.subst(microiop) + \
+        VectorFloatWideningMicroExecute.subst(microiop) + \
+        VectorIntWideningMacroDeclare.subst(iop) + \
+        VectorIntWideningMacroConstructor.subst(iop)
+    decode_block = VectorFloatWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorFloatWideningCvtFormat(code, category, *flags) {{
+    # Widening floating-point conversion: vd steps once per micro-op, vs2
+    # every second micro-op; the old destination is source index 1.
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    old_vd_idx = 1
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx / 2]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    set_src_reg_idx += setSrcVm()
+    # Same execute template as VectorFloatWideningFormat, so ei is built alike.
+    code = maskCondWrapper(code)
+    code = eiDeclarePrefix(code, widening=True)
+    code = loopWrapper(code)
+    code = fflags_wrapper(code)
+
+    vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorFloatCvtMicroDeclare.subst(microiop) + \
+        VectorFloatMicroConstructor.subst(microiop) + \
+        VectorFloatWideningMicroExecute.subst(microiop) + \
+        VectorFloatCvtMacroDeclare.subst(iop) + \
+        VectorIntWideningMacroConstructor.subst(iop)
+    decode_block = VectorFloatWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorFloatNarrowingCvtFormat(code, category, *flags) {{
+    # Narrowing floating-point conversion: vs2 steps every micro-op while vd
+    # and the old destination step every second micro-op.
+    macro_params = InstObjParams(
+        name, Name, 'VectorArithMacroInst', {'code': code}, flags)
+
+    vd_reg = "vecRegClass[_machInst.vd + _microIdx / 2]"
+    vs2_reg = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_vd_reg = "vecRegClass[_machInst.vs3 + _microIdx / 2]"
+
+    src_setup = "".join([
+        setSrcWrapper(vs2_reg),
+        setSrcWrapper(old_vd_reg),
+        setSrcVm(),
+    ])
+
+    # NOTE(review): ei uses the non-widening form here, unlike
+    # VectorIntNarrowingFormat (widening=True) -- confirm this is intended.
+    body = loopWrapper(eiDeclarePrefix(maskCondWrapper(code)))
+    body = narrowingOpRegisterConstraintChecks(fflags_wrapper(body))
+
+    micro_params = InstObjParams(
+        name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {
+            'code': body,
+            'set_dest_reg_idx': setDestWrapper(vd_reg),
+            'set_src_reg_idx': src_setup,
+            'vm_decl_rd': vmDeclAndReadData(),
+            'copy_old_vd': copyOldVd(1),
+        },
+        flags)
+
+    # Templates are involved, so every generated piece goes into the header.
+    header_output = "".join([
+        VectorFloatCvtMicroDeclare.subst(micro_params),
+        VectorFloatMicroConstructor.subst(micro_params),
+        VectorFloatNarrowingMicroExecute.subst(micro_params),
+        VectorFloatCvtMacroDeclare.subst(macro_params),
+        VectorIntWideningMacroConstructor.subst(macro_params),
+    ])
+    decode_block = VectorFloatWideningDecodeBlock.subst(macro_params)
+}};
+
+def format VectorFloatMaskFormat(code, category, *flags) {{
+    # Floating-point format writing VecMemInternalReg0 + _microIdx; the old
+    # vd is read as source index 2.  OPFVV uses vs1, OPFVF uses float rs1.
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    dest_reg_id = "vecRegClass[VecMemInternalReg0 + _microIdx]"
+    src1_reg_id = ""
+    if category == "OPFVV":
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx]"
+    elif category == "OPFVF":
+        src1_reg_id = "floatRegClass[_machInst.rs1]"
+    else:
+        error("not supported category for VectorFloatMaskFormat: %s"
+              % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+
+    code = maskCondWrapper(code)
+    code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+    code = fflags_wrapper(code)
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(2)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorFloatMaskMicroDeclare.subst(microiop) + \
+        VectorFloatMaskMicroConstructor.subst(microiop) + \
+        VectorFloatMaskMicroExecute.subst(microiop) + \
+        VectorFloatMaskMacroDeclare.subst(iop) + \
+        VectorFloatMaskMacroConstructor.subst(iop)
+    decode_block = VectorFloatDecodeBlock.subst(iop)
+}};
+
+def format VMvWholeFormat(code, category, *flags) {{
+    # Format backed by VMvWholeMacroInst/VMvWholeMicroInst; both parameter
+    # sets are built from the same, unwrapped `code`.
+    macro_params = InstObjParams(
+        name, Name, 'VMvWholeMacroInst', {'code': code}, flags)
+
+    micro_params = InstObjParams(
+        name + "_micro", Name + "Micro", 'VMvWholeMicroInst',
+        {'code': code}, flags)
+
+    # Declarations -> header, constructors -> decoder, execute -> exec output.
+    header_output = (VMvWholeMacroDeclare.subst(macro_params)
+                     + VMvWholeMicroDeclare.subst(micro_params))
+    decoder_output = (VMvWholeMacroConstructor.subst(macro_params)
+                      + VMvWholeMicroConstructor.subst(micro_params))
+    exec_output = VMvWholeMicroExecute.subst(micro_params)
+    decode_block = BasicDecode.subst(macro_params)
+}};
+
+def format ViotaFormat(code, category, *flags){{
+    # Format for instructions built on the Viota templates; the v0 source
+    # registration is passed separately as 'set_vm_idx'.
+    macro_params = InstObjParams(
+        name, Name, 'VectorArithMacroInst', {'code': code}, flags)
+
+    # Unpacking requires the mnemonic to contain an underscore.
+    _mnemonic, _suffix = name.split("_", maxsplit=1)
+    vd_reg = "vecRegClass[_machInst.vd + _microIdx]"
+    vs2_reg = "vecRegClass[_machInst.vs2 + _microIdx]"
+    # The tail of a vector mask instruction should be tail-agnostic; it is
+    # handled as tail-undisturbed because the test suites only support the
+    # undisturbed policy.
+    old_vd_reg = vd_reg
+
+    src_setup = setSrcWrapper(vs2_reg) + setSrcWrapper(old_vd_reg)
+
+    micro_params = InstObjParams(
+        name+"_micro",
+        Name+"Micro",
+        'VectorArithMicroInst',
+        {
+            'code': code,
+            'set_dest_reg_idx': setDestWrapper(vd_reg),
+            'set_src_reg_idx': src_setup,
+            'vm_decl_rd': vmDeclAndReadData(),
+            'set_vm_idx': setSrcVm(),
+            'copy_old_vd': copyOldVd(1),
+        },
+        flags)
+    # Templates are involved, so every generated piece goes into the header.
+    header_output = "".join([
+        ViotaMicroDeclare.subst(micro_params),
+        ViotaMicroConstructor.subst(micro_params),
+        ViotaMicroExecute.subst(micro_params),
+        ViotaMacroDeclare.subst(macro_params),
+        ViotaMacroConstructor.subst(macro_params),
+    ])
+
+    decode_block = VectorIntDecodeBlock.subst(macro_params)
+}};
+
+def format Vector1Vs1VdMaskFormat(code, category, *flags){{
+    inst_name, inst_suffix = name.split("_", maxsplit=1)  # unused below
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src2_reg_id = "vecRegClass[_machInst.vs2]"
+    # The tail of a vector mask inst should be treated as tail-agnostic.
+    # We treat it with the tail-undisturbed policy instead, since
+    # the test suites only support the undisturbed policy.
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_src_reg_idx = ""  # filled in order: vs2, then the old destination
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    vm_decl_rd = vmDeclAndReadData()
+    set_vm_idx = setSrcVm()
+    iop = InstObjParams(name,  # single non-split inst: no micro-op params
+        Name,
+        'VectorNonSplitInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'set_vm_idx': set_vm_idx,
+         'copy_old_vd': copyOldVd(1)},  # 1: presumably src index of old vd
+        flags)
+    # Templated classes, so every part is emitted into the header output.
+    # NOTE(review): the Declare template is the Rd variant -- confirm intended.
+    header_output = \
+        Vector1Vs1RdMaskDeclare.subst(iop) + \
+        Vector1Vs1VdMaskConstructor.subst(iop) + \
+        Vector1Vs1VdMaskExecute.subst(iop)
+
+    decode_block = VectorMaskDecodeBlock.subst(iop)
+}};
+
+def format Vector1Vs1RdMaskFormat(code, category, *flags){{
+    inst_name, inst_suffix = name.split("_", maxsplit=1)  # unused below
+    vm_decl_rd = vmDeclAndReadData()
+    set_vm_idx = setSrcVm()
+    iop = InstObjParams(name,
+        Name,
+        'VectorNonSplitInst',
+        {'code': code,  # no src/dest register snippets are supplied here
+         'vm_decl_rd': vm_decl_rd,
+         'set_vm_idx': set_vm_idx},
+        flags)
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        Vector1Vs1RdMaskDeclare.subst(iop) + \
+        Vector1Vs1RdMaskConstructor.subst(iop) + \
+        Vector1Vs1RdMaskExecute.subst(iop)
+    # Decoding goes through the mask decode template; `category` is unused.
+    decode_block = VectorMaskDecodeBlock.subst(iop)
+}};
+
+def format VectorNonSplitFormat(code, category, *flags) {{
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    vm_decl_rd = ""
+    # Handles only "vfmv" and "vmv" names; both mask snippets are left empty.
+    set_vm_idx = ""
+    # Only vfmv has its code passed through fflags_wrapper.
+    if inst_name == "vfmv" :
+        code = fflags_wrapper(code)
+
+    iop = InstObjParams(name,
+        Name,
+        'VectorNonSplitInst',
+        {'code': code,
+         'vm_decl_rd': vm_decl_rd,
+         'set_vm_idx': set_vm_idx},
+        flags)
+
+    # Pick the execute and decode templates from the name prefix.
+    if inst_name == "vfmv" :
+        execute_block = VectorFloatNonSplitExecute.subst(iop)
+        decode_block = VectorFloatDecodeBlock.subst(iop)
+    elif inst_name == "vmv" :
+        execute_block = VectorIntNonSplitExecute.subst(iop)
+        decode_block = VectorIntDecodeBlock.subst(iop)
+    else :
+        error("Unsupported inst for VectorNonSplitFormat: %s" % inst_name)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorNonSplitDeclare.subst(iop) + \
+        VectorNonSplitConstructor.subst(iop) + \
+        execute_block
+
+}};
+
+def format VectorMaskFormat(code, category, *flags) {{
+    inst_name, inst_suffix = name.split("_", maxsplit=1)  # unused below
+    old_vd_idx = 2  # sources are added as vs1, vs2, old vd (third one)
+    if category not in ["OPMVV"]:
+        error("not supported category for VectorMaskFormat: %s" % category)
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src1_reg_id = "vecRegClass[_machInst.vs1]"
+    src2_reg_id = "vecRegClass[_machInst.vs2]"
+
+    # The tail of a vector mask inst should be treated as tail-agnostic.
+    # We treat it with the tail-undisturbed policy instead, since
+    # the test suites only support the undisturbed policy.
+    # TODO: remove it
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    # The element loop is generated for a non-micro (unsplit) instruction.
+    code = loopWrapper(code, micro_inst = False)
+
+    iop = InstObjParams(name,
+        Name,
+        'VectorNonSplitInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorMaskDeclare.subst(iop) + \
+        VectorMaskConstructor.subst(iop) + \
+        VectorMaskExecute.subst(iop)
+
+    decode_block = VectorMaskDecodeBlock.subst(iop)
+}};
+
+def format VectorReduceIntFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)  # unused below
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src1_reg_id = "vecRegClass[_machInst.vs1]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"  # per micro-op
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    # Tail-undisturbed and tail-agnostic are treated the same, so the
+    # old vd is always needed as a source vreg.
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+    type_def = '''
+        using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+        using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    '''
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'type_def': type_def,
+         'copy_old_vd': copyOldVd(2)},  # old vd is the third source added
+        flags)
+    # Only the integer-specific execute template differs from the other
+    # reduce formats; declare/constructor templates are the shared ones.
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorReduceMicroDeclare.subst(microiop) + \
+        VectorReduceMicroConstructor.subst(microiop) + \
+        VectorReduceIntMicroExecute.subst(microiop) + \
+        VectorReduceMacroDeclare.subst(iop) + \
+        VectorReduceMacroConstructor.subst(iop)
+    decode_block = VectorIntDecodeBlock.subst(iop)
+}};
+
+def format VectorReduceFloatFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)  # unused below
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src1_reg_id = "vecRegClass[_machInst.vs1]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"  # per micro-op
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    # Tail-undisturbed and tail-agnostic are treated the same, so the
+    # old vd is always needed as a source vreg.
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+    type_def = '''
+        using et = ElemType;
+        using vu = decltype(et::v);
+    '''
+    # The macro-op iop above keeps the raw code; only the micro-op is wrapped.
+    code = fflags_wrapper(code)
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'type_def': type_def,
+         'copy_old_vd': copyOldVd(2)},  # old vd is the third source added
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorReduceMicroDeclare.subst(microiop) + \
+        VectorReduceMicroConstructor.subst(microiop) + \
+        VectorReduceFloatMicroExecute.subst(microiop) + \
+        VectorReduceMacroDeclare.subst(iop) + \
+        VectorReduceMacroConstructor.subst(iop)
+    decode_block = VectorFloatDecodeBlock.subst(iop)
+}};
+
+def format VectorReduceFloatWideningFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)  # unused below
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src1_reg_id = "vecRegClass[_machInst.vs1]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"  # per micro-op
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    # Tail-undisturbed and tail-agnostic are treated the same, so the
+    # old vd is always needed as a source vreg.
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+    type_def = '''
+        using et = ElemType;
+        using vu [[maybe_unused]] = decltype(et::v);
+        using ewt = typename double_width<et>::type;
+        using vwu = decltype(ewt::v);
+    '''
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'type_def': type_def,
+         'copy_old_vd': copyOldVd(2)},  # old vd is the third source added
+        flags)
+    # NOTE(review): code is not passed through fflags_wrapper here -- confirm.
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorReduceMicroDeclare.subst(microiop) + \
+        VectorReduceMicroConstructor.subst(microiop) + \
+        VectorReduceFloatWideningMicroExecute.subst(microiop) + \
+        VectorReduceMacroDeclare.subst(iop) + \
+        VectorReduceMacroConstructor.subst(iop)
+    decode_block = VectorFloatWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorIntVxsatFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)  # unused below
+    old_vd_idx = 2  # default source order: src1, vs2, vs3 (third one)
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src1_reg_id = ""
+    if category in ["OPIVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx]"
+    elif category in ["OPIVX"]:
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+    elif category == "OPIVI":
+        old_vd_idx = 1  # no src1 register is added, so vs3 is the second one
+    else:
+        error("not supported category for VectorIntVxsatFormat: %s" % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    # OPIVI has no src1 register source; the mask source is always added last.
+    set_src_reg_idx = ""
+    if category != "OPIVI":
+        set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+    # Wrapping order: maskCondWrapper, then eiDeclarePrefix, then loopWrapper.
+    code = maskCondWrapper(code)
+    code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+    # The execute template is the plain VectorIntMicroExecute one.
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntVxsatMicroDeclare.subst(microiop) + \
+        VectorIntVxsatMicroConstructor.subst(microiop) + \
+        VectorIntMicroExecute.subst(microiop) + \
+        VectorIntVxsatMacroDeclare.subst(iop) + \
+        VectorIntVxsatMacroConstructor.subst(iop)
+
+    decode_block = VectorIntDecodeBlock.subst(iop)
+}};
+
+def format VectorReduceIntWideningFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)  # unused below
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src1_reg_id = "vecRegClass[_machInst.vs1]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"  # per micro-op
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    # Tail-undisturbed and tail-agnostic are treated the same, so the
+    # old vd is always needed as a source vreg.
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+    microiop = InstObjParams(name + "_micro",  # no 'type_def' snippet here
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(2)},  # old vd is the third source added
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorReduceMicroDeclare.subst(microiop) + \
+        VectorReduceMicroConstructor.subst(microiop) + \
+        VectorReduceIntWideningMicroExecute.subst(microiop) + \
+        VectorReduceMacroDeclare.subst(iop) + \
+        VectorReduceMacroConstructor.subst(iop)
+    decode_block = VectorIntWideningDecodeBlock.subst(iop)
+}};
+
+let {{
+# Shared generator behind the four vector slide formats defined after it.
+def VectorSlideBase(name, Name, category, code, flags, macro_construtor,
+        decode_template, micro_execute_template):  # .subst() is called on each
+    macroop_class_name = 'VectorSlideMacroInst'
+    microop_class_name = 'VectorSlideMicroInst'
+    # Make sure flags are in lists (convert to lists if not).
+    flags = makeList(flags)
+    iop = InstObjParams(name, Name, macroop_class_name, {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)  # unused below
+    dest_reg_id = "vecRegClass[_machInst.vd + vdIdx]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + vs2Idx]"
+    src1_ireg_id = "intRegClass[_machInst.rs1]"
+    src1_freg_id = "floatRegClass[_machInst.rs1]"
+
+    # The tail of a vector mask inst should be treated as tail-agnostic.
+    # We treat it with the tail-undisturbed policy instead, since
+    # the test suites only support the undisturbed policy.
+    num_src_regs = 0  # counts the sources registered before the old vd
+    # Source order: optional scalar rs1 (int or float), vs2, old vd, mask.
+    old_dest_reg_id = "vecRegClass[_machInst.vd + vdIdx]"
+    set_src_reg_idx = ""
+    if category in ["OPIVX", "OPMVX"]:
+        set_src_reg_idx += setSrcWrapper(src1_ireg_id)
+        num_src_regs += 1
+    elif category in ["OPFVF"]:
+        set_src_reg_idx += setSrcWrapper(src1_freg_id)
+        num_src_regs += 1
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    num_src_regs += 1
+    old_vd_idx = num_src_regs
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    vm_decl_rd = vmDeclAndReadData()
+    set_src_reg_idx += setSrcVm()
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        microop_class_name,
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    # The macro constructor and micro execute templates come from the
+    # caller; the remaining VectorSlide* templates are fixed.
+    header_output = \
+        VectorSlideMicroDeclare.subst(microiop) + \
+        VectorSlideMicroConstructor.subst(microiop) + \
+        micro_execute_template.subst(microiop) + \
+        VectorSlideMacroDeclare.subst(iop) + \
+        macro_construtor.subst(iop)
+
+    decode_block = decode_template.subst(iop)
+    return (header_output, decode_block)  # both are generated code strings
+
+}};
+
+def format VectorSlideUpFormat(code, category, *flags) {{
+    (header_output, decode_block) = VectorSlideBase(name, Name, category, code,
+        flags,  # VectorSlideBase converts these to a list itself
+        macro_construtor = VectorSlideUpMacroConstructor,  # [sic] keyword name
+        decode_template = VectorIntDecodeBlock,
+        micro_execute_template = VectorSlideMicroExecute)
+}};
+
+def format VectorSlideDownFormat(code, category, *flags) {{
+    (header_output, decode_block) = VectorSlideBase(name, Name, category, code,
+        flags,  # VectorSlideBase converts these to a list itself
+        macro_construtor = VectorSlideDownMacroConstructor,  # [sic] keyword
+        decode_template = VectorIntDecodeBlock,
+        micro_execute_template = VectorSlideMicroExecute)
+}};
+
+def format VectorFloatSlideUpFormat(code, category, *flags) {{
+    (header_output, decode_block) = VectorSlideBase(name, Name, category, code,
+        flags,  # VectorSlideBase converts these to a list itself
+        macro_construtor = VectorSlideUpMacroConstructor,  # same as int form
+        decode_template = VectorFloatDecodeBlock,
+        micro_execute_template = VectorFloatSlideMicroExecute)
+}};
+
+def format VectorFloatSlideDownFormat(code, category, *flags) {{
+    (header_output, decode_block) = VectorSlideBase(name, Name, category, code,
+        flags,  # VectorSlideBase converts these to a list itself
+        macro_construtor = VectorSlideDownMacroConstructor,  # same as int form
+        decode_template = VectorFloatDecodeBlock,
+        micro_execute_template = VectorFloatSlideMicroExecute)
+}};
diff --git a/src/arch/riscv/isa/templates/templates.isa b/src/arch/riscv/isa/templates/templates.isa
index b4de46d846..ed3f5287c0 100644
--- a/src/arch/riscv/isa/templates/templates.isa
+++ b/src/arch/riscv/isa/templates/templates.isa
@@ -1,2 +1,32 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
 // Include
 ##include "vector_mem.isa"
+##include "vector_arith.isa"
diff --git a/src/arch/riscv/isa/templates/vector_arith.isa b/src/arch/riscv/isa/templates/vector_arith.isa
new file mode 100644
index 0000000000..d15ab70f20
--- /dev/null
+++ b/src/arch/riscv/isa/templates/vector_arith.isa
@@ -0,0 +1,1989 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+output header {{
+// ASSIGN_VD_BIT yields byte idx/8 of Vd with one bit replaced; it stores nothing.
+#define ASSIGN_VD_BIT(idx, bit) \
+    ((Vd[(idx)/8] & ~(1 << (idx)%8)) | ((bit) << (idx)%8))
+// COPY_OLD_VD reads source operand idx and copies VLENB bytes of it into Vd.
+#define COPY_OLD_VD(idx)                                             \
+    [[maybe_unused]] RiscvISA::vreg_t old_vd;                        \
+    [[maybe_unused]] decltype(Vd) old_Vd = nullptr;                  \
+    xc->getRegOperand(this, (idx), &old_vd);                           \
+    old_Vd = old_vd.as<std::remove_reference_t<decltype(Vd[0])> >(); \
+    memcpy(Vd, old_Vd, VLENB);
+// VRM_REQUIRED faults when FRM is above 4, else sets the softfloat rounding mode.
+#define VRM_REQUIRED                                                         \
+        uint_fast8_t frm = xc->readMiscReg(MISCREG_FRM);                     \
+        if (frm > 4)                                                         \
+            return std::make_shared<IllegalInstFault>("RM fault", machInst); \
+        softfloat_roundingMode = frm;
+// carry_out: true when the unsigned sum a + b + carry_in wraps around.
+template<typename Type>
+bool inline
+carry_out(Type a, Type b, bool carry_in = false) {
+    using TypeU = std::make_unsigned_t<Type>;
+    TypeU s = *reinterpret_cast<TypeU*>(&a)
+            + *reinterpret_cast<TypeU*>(&b) + carry_in;
+    return carry_in
+        ? (s <= *reinterpret_cast<TypeU*>(&a))
+        : (s <  *reinterpret_cast<TypeU*>(&a));
+}
+// borrow_out: true when a - b - borrow_in borrows, comparing as unsigned.
+template<typename Type>
+bool inline
+borrow_out(Type a, Type b, bool borrow_in = false) {
+    using TypeU = std::make_unsigned_t<Type>;
+    return borrow_in
+        ? (*reinterpret_cast<TypeU*>(&a) <= *reinterpret_cast<TypeU*>(&b))
+        : (*reinterpret_cast<TypeU*>(&a) <  *reinterpret_cast<TypeU*>(&b));
+}
+
+}};
+
+def template VectorIntMacroDeclare {{
+// Macro-op class declaration, templated on the element type.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;  // reuse the base class version
+};
+
+}};
+
+def template VectorIntMacroConstructor {{
+// Builds at most num_microops micro-ops, each covering up to micro_vlmax elems.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;  // elements still to be assigned to micro-ops
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // When micro_vl is 0 a single nop micro-op is pushed instead.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst, micro_vl, i);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    // front() and back() rely on at least one micro-op having been pushed.
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+}};
+
+def template VectorIntMicroDeclare {{
+// Micro-op class declaration; constructed with a micro vl and a micro index.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs1, vs2, vs3(old_vd), vm for *.vv, *.vx
+    // vs2, (old_vd), vm for *.vi
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;  // copied from the vm field of the machine inst by the constructor
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntMicroConstructor {{
+// Micro-op constructor: records vm, then runs the dest and src index snippets.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+                                         uint8_t _microVl, uint8_t _microIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;   // counters are reset before the snippets below run
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorIntMicroExecute {{
+// Execute body for vector integer micro-ops, templated on the element type.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    [[maybe_unused]] constexpr size_t sew = sizeof(vu) * 8;  // width in bits
+    // A set vill bit faults before any operand is declared or read.
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
+
+def template VectorIntExtMacroDeclare {{
+// Macro-op declaration with its own two-operand disassembly.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    std::string generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const override
+    {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+            << registerName(srcRegIdx(0));
+        if (machInst.vm == 0) ss << ", v0.t";  // mask suffix only when vm is 0
+        return ss.str();
+    }
+};
+
+}};
+
+def template VectorIntExtMicroDeclare {{
+// Micro-op declaration with its own two-operand disassembly.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[3];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    std::string generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const override
+    {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+            << registerName(srcRegIdx(0));
+        if (machInst.vm == 0) ss << ", v0.t";  // mask suffix only when vm is 0
+        return ss.str();
+    }
+};
+
+}};
+
+def template VectorIntExtMicroExecute {{
+// Execute body repeated once per source width (8, 16 or 32 bits).
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    // The source width is SEW divided by the ext_div template parameter.
+    auto SEW = vtype_SEW(vtype);
+    auto offset = (VLEN / SEW) * (microIdx % %(ext_div)d);
+    switch (SEW / %(ext_div)d) {
+      case 8: {
+        using vext  [[maybe_unused]] = int8_t;
+        using vextu [[maybe_unused]] = uint8_t;
+        %(op_decl)s;
+        %(op_rd)s;
+        %(vm_decl_rd)s;
+        %(copy_old_vd)s;
+        %(code)s;
+        %(op_wb)s;
+        break;
+      }
+      case 16: {
+        using vext  [[maybe_unused]] = int16_t;
+        using vextu [[maybe_unused]] = uint16_t;
+        %(op_decl)s;
+        %(op_rd)s;
+        %(vm_decl_rd)s;
+        %(copy_old_vd)s;
+        %(code)s;
+        %(op_wb)s;
+        break;
+      }
+      case 32: {
+        using vext  [[maybe_unused]] = int32_t;
+        using vextu [[maybe_unused]] = uint32_t;
+        %(op_decl)s;
+        %(op_rd)s;
+        %(vm_decl_rd)s;
+        %(copy_old_vd)s;
+        %(code)s;
+        %(op_wb)s;
+      break;
+      }
+      default: break;  // NOTE(review): other widths do nothing and return NoFault
+    }
+
+    return NoFault;
+}
+
+}};
+
+def template VectorIntDecodeBlock {{
+// Instantiates the class with an unsigned element type chosen by vsew.
+switch(machInst.vtype8.vsew) {
+case 0b000: return new %(class_name)s<uint8_t>(machInst);
+case 0b001: return new %(class_name)s<uint16_t>(machInst);
+case 0b010: return new %(class_name)s<uint32_t>(machInst);
+case 0b011: return new %(class_name)s<uint64_t>(machInst);
+default: GEM5_UNREACHABLE;  // any other vsew encoding is not handled here
+}
+
+}};
+
+def template VectorIntWideningMacroDeclare {{
+// Widening macro-op class declaration, templated on the element type.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;  // reuse the base class version
+};
+
+}};
+
+def template VectorIntWideningMacroConstructor {{
+// Splits a widening macro-op into micro-ops of at most micro_vlmax elements.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const int64_t vlmul = vtype_vlmul(_machInst.vtype8);
+    // TODO: move to Decode template
+    panic_if(vlmul == 3, "LMUL=8 is illegal for widening inst");
+    // When LMUL is set to m1, the inst needs to be split into 2 micro insts.
+    const uint32_t num_microops = 1 << std::max<int64_t>(0, vlmul + 1);
+
+    int32_t tmp_vl = this->vl;  // elements still to be assigned to micro-ops
+    const int32_t t_micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    const int32_t micro_vlmax = vlmul < 0 ? t_micro_vlmax : t_micro_vlmax / 2;
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // When micro_vl is 0 a single nop micro-op is pushed instead.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst, micro_vl, i);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    // front() and back() rely on at least one micro-op having been pushed.
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorIntWideningMicroDeclare {{
+// Widening micro-op declaration; constructed with a micro vl and micro index.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs1, vs2, vs3(old_vd), vm for *.vv, *.vx
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;  // copied from the vm field of the machine inst by the constructor
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntWideningMicroConstructor {{
+// Micro-op constructor: records vm, then runs the dest and src index snippets.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+        uint8_t _microVl, uint8_t _microIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;   // counters are reset before the snippets below run
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorIntWideningMicroExecute {{
+// Execute body for widening micro-ops; adds double-width element aliases.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    using vwu [[maybe_unused]] = typename double_width<vu>::type;
+    using vwi [[maybe_unused]] = typename double_width<vi>::type;
+    [[maybe_unused]] constexpr size_t sew = sizeof(vu) * 8;  // width in bits
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    const int64_t vlmul = vtype_vlmul(machInst.vtype8);
+    const int32_t t_micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const int32_t micro_vlmax = vlmul < 0 ? t_micro_vlmax : t_micro_vlmax / 2;
+    [[maybe_unused]] const size_t offset =
+        (this->microIdx % 2 == 0) ? 0 : micro_vlmax;
+    // offset is 0 for even micro indices and micro_vlmax for odd ones.
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorIntNarrowingMicroExecute {{
+// execute() for a narrowing integer micro-op; vwu/vwi are the doubled types.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    using vwu [[maybe_unused]] = typename double_width<vu>::type;
+    using vwi [[maybe_unused]] = typename double_width<vi>::type;
+    [[maybe_unused]] constexpr size_t sew = sizeof(vu) * 8;
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    // micro_vlmax is vtype_VLMAX(.., true) when vlmul < 0 and half of that
+    // otherwise; odd micro-ops get offset == micro_vlmax, even ones get 0.
+    const int64_t vlmul = vtype_vlmul(machInst.vtype8);
+    const int32_t t_micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const int32_t micro_vlmax = vlmul < 0 ? t_micro_vlmax : t_micro_vlmax / 2;
+    [[maybe_unused]] const size_t offset =
+        (this->microIdx % 2 == 0) ? 0 : micro_vlmax;
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+}};
+
+def template VectorIntWideningDecodeBlock {{
+// vtype8.vsew 0/1/2 selects the uint8_t/uint16_t/uint32_t instantiation.
+switch(machInst.vtype8.vsew) {
+case 0b000: return new %(class_name)s<uint8_t>(machInst);
+case 0b001: return new %(class_name)s<uint16_t>(machInst);
+case 0b010: return new %(class_name)s<uint32_t>(machInst);
+default: GEM5_UNREACHABLE;
+}
+// NOTE(review): any other vsew reaches GEM5_UNREACHABLE - confirm intended.
+}};
+
+def template VectorFloatMacroDeclare {{
+// Macro-op class declaration: it adds a constructor but no execute().
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorFloatMacroConstructor {{
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    // Split vl into chunks of at most chunk_max elements, one micro-op each.
+    const uint32_t reg_count = vtype_regs_per_group(vtype);
+    const int32_t chunk_max = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t elems_left = this->vl;
+    int32_t chunk_vl = std::min(elems_left, chunk_max);
+
+    // No elements: queue a lone nop so front()/back() below stay valid.
+    if (chunk_vl == 0)
+        this->microops.push_back(new VectorNopMicroInst(_machInst));
+    for (int idx = 0; idx < reg_count && chunk_vl > 0; ++idx) {
+        StaticInstPtr slice =
+            new %(class_name)sMicro<ElemType>(_machInst, chunk_vl, idx);
+        slice->setDelayedCommit();
+        this->microops.push_back(slice);
+        elems_left -= chunk_max;
+        chunk_vl = std::min(elems_left, chunk_max);
+    }
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+}};
+
+def template VectorFloatMicroDeclare {{
+// Micro-op class for one slice of a floating-point vector instruction.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs1, vs2, vs3(old_vd), vm
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;  // assigned from _machInst.vm by the constructor template
+public:
+    %(class_name)s(ExtMachInst _machInst,
+        uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorFloatMicroConstructor {{
+// Micro-op constructor; _microVl and _microIdx go straight to the base class.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+                                         uint8_t _microVl, uint8_t _microIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;  // both counts are zeroed before the
+    _numDestRegs = 0; // generated register setup below runs
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+}};
+
+def template VectorFloatMicroExecute {{
+// execute() for a floating-point vector micro-op.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu = decltype(et::v);  // type of the element's .v member
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    // NOTE(review): VRM_REQUIRED is a macro defined outside this template;
+    // presumably it validates the rounding mode - confirm.
+    VRM_REQUIRED;
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+}};
+
+def template VectorFloatDecodeBlock {{
+// vtype8.vsew 0b010 selects float32_t and 0b011 selects float64_t.
+switch(machInst.vtype8.vsew) {
+case 0b010: return new %(class_name)s<float32_t>(machInst);
+case 0b011: return new %(class_name)s<float64_t>(machInst);
+default: GEM5_UNREACHABLE;
+}
+// NOTE(review): any other vsew reaches GEM5_UNREACHABLE - confirm intended.
+}};
+
+def template VectorFloatCvtMacroDeclare {{
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    // Renders "<mnemonic> <dest0>, <src0>", plus ", v0.t" when vm is 0.
+    std::string generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const override
+    {
+        std::stringstream text;
+        text << mnemonic << ' ' << registerName(destRegIdx(0));
+        text << ", " << registerName(srcRegIdx(0));
+        if (machInst.vm == 0)
+            text << ", v0.t";
+        return text.str();
+    }
+};
+}};
+
+def template VectorFloatCvtMicroDeclare {{
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[3];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst,
+        uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    // Renders "<mnemonic> <dest0>, <src0>", plus ", v0.t" when vm is 0.
+    std::string generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const override
+    {
+        std::stringstream text;
+        text << mnemonic << ' ' << registerName(destRegIdx(0));
+        text << ", " << registerName(srcRegIdx(0));
+        if (machInst.vm == 0)
+            text << ", v0.t";
+        return text.str();
+    }
+};
+}};
+
+
+def template VectorFloatWideningMicroExecute {{
+// execute() for a widening float micro-op; ewt is the double-width type.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu [[maybe_unused]] = decltype(et::v);
+    using ewt = typename double_width<et>::type;
+    using vwu = decltype(ewt::v);
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    VRM_REQUIRED;
+    // micro_vlmax is vtype_VLMAX(.., true) when vlmul < 0 and half of that
+    // otherwise; odd micro-ops get offset == micro_vlmax, even ones get 0.
+    const int64_t vlmul = vtype_vlmul(machInst.vtype8);
+    const int32_t t_micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const int32_t micro_vlmax = vlmul < 0 ? t_micro_vlmax : t_micro_vlmax / 2;
+    [[maybe_unused]] const size_t offset =
+        (this->microIdx % 2 == 0) ? 0 : micro_vlmax;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+}};
+
+def template VectorFloatNarrowingMicroExecute {{
+// execute() for a narrowing float micro-op; ewt is the double-width type.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu [[maybe_unused]] = decltype(et::v);
+    using ewt = typename double_width<et>::type;
+    using vwu = decltype(ewt::v);
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    VRM_REQUIRED;
+    // micro_vlmax is vtype_VLMAX(.., true) when vlmul < 0 and half of that
+    // otherwise; odd micro-ops get offset == micro_vlmax, even ones get 0.
+    const int64_t vlmul = vtype_vlmul(machInst.vtype8);
+    const int32_t t_micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const int32_t micro_vlmax = vlmul < 0 ? t_micro_vlmax : t_micro_vlmax / 2;
+    [[maybe_unused]] const size_t offset =
+        (this->microIdx % 2 == 0) ? 0 : micro_vlmax;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+}};
+
+def template VectorFloatWideningDecodeBlock {{
+// Only vtype8.vsew == 0b010 (float32_t elements) is instantiated here.
+switch(machInst.vtype8.vsew) {
+case 0b010: return new %(class_name)s<float32_t>(machInst);
+default: GEM5_UNREACHABLE;
+}
+// NOTE(review): any other vsew reaches GEM5_UNREACHABLE - confirm intended.
+}};
+
+def template ViotaMacroDeclare {{
+// Macro-op class declaration for viota; no execute() is declared here.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    int cnt = 0;  // its address is handed to each micro-op constructor
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+
+def template ViotaMacroConstructor {{
+
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    // Split vl into chunks of at most chunk_max elements, one micro-op each;
+    // every micro-op also receives the address of the cnt member.
+    const uint32_t reg_count = vtype_regs_per_group(vtype);
+    const int32_t chunk_max = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t elems_left = this->vl;
+    int32_t chunk_vl = std::min(elems_left, chunk_max);
+
+    // The >= 0 test admits one empty micro-op that can hold IsLastMicroop.
+    for (int idx = 0; idx < reg_count && chunk_vl >= 0; ++idx) {
+        StaticInstPtr slice = new %(class_name)sMicro<ElemType>(
+            _machInst, chunk_vl, idx, &cnt);
+        slice->setDelayedCommit();
+        this->microops.push_back(slice);
+        elems_left -= chunk_max;
+        chunk_vl = std::min(elems_left, chunk_max);
+    }
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template ViotaMicroDeclare {{
+// Micro-op class for one slice of viota; it keeps a caller-supplied int*.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;   // assigned from _machInst.vm by the constructor template
+    int* cnt;  // stored as-is from the constructor argument
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx, int* cnt);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template ViotaMicroConstructor {{
+// Micro-op constructor; stores the cnt pointer and adds vs2 as a source.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+    uint8_t _microVl, uint8_t _microIdx, int* cnt)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    this->cnt = cnt;
+    %(set_reg_idx_arr)s;
+    // Zero both counts before the generated register setup below runs.
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+    setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2]);
+}
+}};
+
+def template ViotaMicroExecute {{
+// execute() for a viota micro-op; the work itself comes from the code slot.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    // A set vill bit is reported as an illegal-instruction fault.
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+}};
+
+
+def template Vector1Vs1VdMaskConstructor {{
+// Constructor taking only the machine instruction; it also caches vm.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+    %(set_vm_idx)s;
+}
+
+}};
+
+def template Vector1Vs1VdMaskExecute {{
+// execute() for the 1Vs1 / Vd mask form; vu is fixed to uint8_t.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu = uint8_t;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template Vector1Vs1RdMaskDeclare {{
+// Instruction class with its own execute(); no micro-op parameters.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    RegId srcRegIdxArr[2];
+    RegId destRegIdxArr[1];
+    bool vm;  // assigned from _machInst.vm by the constructor template
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template Vector1Vs1RdMaskConstructor {{
+// Constructor taking only the machine instruction; it also caches vm.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    %(set_vm_idx)s;
+}
+
+}};
+
+def template Vector1Vs1RdMaskExecute {{
+// execute() for the 1Vs1 / Rd mask form; Rd is declared by hand below.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    %(op_rd)s;
+    uint64_t Rd = 0;  // this template has no op_decl slot; Rd starts at 0
+    %(vm_decl_rd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorIntMaskMacroDeclare {{
+// Macro-op class declaration: it adds a constructor but no execute().
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntMaskMacroConstructor {{
+
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    // Split vl into chunks of at most chunk_max elements, one micro-op each.
+    const uint32_t reg_count = vtype_regs_per_group(vtype);
+    const int32_t chunk_max = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t elems_left = this->vl;
+    int32_t chunk_vl = std::min(elems_left, chunk_max);
+    // No elements: queue a nop in place of the compute micro-ops.
+    if (chunk_vl == 0)
+        this->microops.push_back(new VectorNopMicroInst(_machInst));
+    for (int idx = 0; idx < reg_count && chunk_vl > 0; ++idx) {
+        StaticInstPtr slice =
+            new %(class_name)sMicro<ElemType>(_machInst, chunk_vl, idx);
+        slice->setDelayedCommit();
+        this->microops.push_back(slice);
+        elems_left -= chunk_max;
+        chunk_vl = std::min(elems_left, chunk_max);
+    }
+    // A final merge micro-op gets vd and the number of micro-ops before it.
+    StaticInstPtr merge = new VMaskMergeMicroInst<ElemType>(
+        _machInst, _machInst.vd, this->microops.size());
+    this->microops.push_back(merge);
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorIntMaskMicroDeclare {{
+// Micro-op class for one slice of a VectorIntMask-format instruction.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs1(rs1), vs2, old_vd, v0 for *.vv[m] or *.vx[m]
+    // vs2, old_vd, v0 for *.vi[m]
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;  // assigned from _machInst.vm by the constructor template
+public:
+    %(class_name)s(ExtMachInst _machInst,
+                   uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntMaskMicroConstructor {{
+// Micro-op constructor; _microVl and _microIdx go straight to the base class.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+                                         uint8_t _microVl, uint8_t _microIdx)
+: %(base_class)s("%(mnemonic)s", _machInst,
+                 %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    // Zero both counts before the generated register setup below runs.
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+}};
+
+def template VectorIntMaskMicroExecute {{
+// execute() for one slice of a VectorIntMask-format instruction.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    // offset advances by VLENB / sizeof(ElemType) per micro-op index;
+    // presumably the first mask bit this micro-op writes - confirm.
+    constexpr uint16_t bit_offset = VLENB / sizeof(ElemType);
+    const uint16_t offset = bit_offset * microIdx;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorFloatMaskMacroDeclare {{
+// Macro-op class declaration: it adds a constructor but no execute().
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorFloatMaskMacroConstructor {{
+
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    // Split vl into chunks of at most chunk_max elements, one micro-op each.
+    const uint32_t reg_count = vtype_regs_per_group(vtype);
+    const int32_t chunk_max = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t elems_left = this->vl;
+    int32_t chunk_vl = std::min(elems_left, chunk_max);
+    // No elements: queue a nop in place of the compute micro-ops.
+    if (chunk_vl == 0)
+        this->microops.push_back(new VectorNopMicroInst(_machInst));
+    for (int idx = 0; idx < reg_count && chunk_vl > 0; ++idx) {
+        StaticInstPtr slice =
+            new %(class_name)sMicro<ElemType>(_machInst, chunk_vl, idx);
+        slice->setDelayedCommit();
+        this->microops.push_back(slice);
+        elems_left -= chunk_max;
+        chunk_vl = std::min(elems_left, chunk_max);
+    }
+    // A final merge micro-op gets vd and the number of micro-ops before it.
+    StaticInstPtr merge = new VMaskMergeMicroInst<ElemType>(
+        _machInst, _machInst.vd, this->microops.size());
+    this->microops.push_back(merge);
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorFloatMaskMicroDeclare {{
+// Micro-op class for one slice of a VectorFloatMask-format instruction.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs1(rs1), vs2, old_vd, v0 for *.vv or *.vf
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;  // assigned from _machInst.vm by the constructor template
+public:
+    %(class_name)s(ExtMachInst _machInst,
+                   uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorFloatMaskMicroConstructor {{
+// Micro-op constructor; _microVl and _microIdx go straight to the base class.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+                                         uint8_t _microVl, uint8_t _microIdx)
+: %(base_class)s("%(mnemonic)s", _machInst,
+                 %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    // Zero both counts before the generated register setup below runs.
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+}};
+
+def template VectorFloatMaskMicroExecute {{
+// execute() for one slice of a VectorFloatMask-format instruction.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu = decltype(et::v);
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    // offset advances by VLENB / sizeof(ElemType) per micro-op index;
+    // presumably the first mask bit this micro-op writes - confirm.
+    constexpr uint16_t bit_offset = VLENB / sizeof(ElemType);
+    const uint16_t offset = bit_offset * microIdx;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VMvWholeMacroDeclare {{
+// Non-templated macro-op class declaration for the VMvWhole format.
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VMvWholeMacroConstructor {{
+
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    // simm3 + 1 micro-ops are created, each built with a micro vl of 0
+    // and its own index.
+    const uint32_t copy_count = _machInst.simm3 + 1;
+    for (int idx = 0; idx < copy_count; ++idx) {
+        StaticInstPtr copy_op = new %(class_name)sMicro(_machInst, 0, idx);
+        copy_op->setDelayedCommit();
+        this->microops.push_back(copy_op);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VMvWholeMicroDeclare {{
+// Micro-op class with one source and one destination register.
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[1];
+    RegId destRegIdxArr[1];
+    bool vm;  // NOTE(review): VMvWholeMicroConstructor never assigns this
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VMvWholeMicroConstructor {{
+// Micro-op _microIdx: dest is vd + _microIdx, source is vs2 + _microIdx.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst,
+                               uint8_t _microVl, uint8_t _microIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    %(set_reg_idx_arr)s;
+    // Registers are added by hand here instead of through generated setup.
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _microIdx]);
+    _numTypedDestRegs[VecRegClass]++;
+    setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2 + _microIdx]);
+}
+}};
+
+def template VMvWholeMicroExecute {{
+// execute() for one VMvWhole micro-op; note there is no vill check here.
+Fault
+%(class_name)s::execute(ExecContext* xc, trace::InstRecord* traceData) const
+{
+    // TODO: Check register alignment.
+    // TODO: If vd is equal to vs2 the instruction is an architectural NOP.
+    %(op_decl)s;
+    %(op_rd)s;
+    // The code slot is expanded inside a loop over i in [0, VLEN / 64).
+    for (size_t i = 0; i < (VLEN / 64); i++) {
+        %(code)s;
+    }
+    %(op_wb)s;
+    return NoFault;
+}
+}};
+
+def template VectorMaskDeclare {{
+// Instruction class with its own execute(); it has no vm member.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    RegId srcRegIdxArr[3];
+    RegId destRegIdxArr[1];
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorMaskConstructor {{
+// Constructor made only of generated register setup; vm is not cached.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorMaskExecute {{
+// execute() for the VectorMask format; vu is fixed to uint8_t.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu = uint8_t;
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    // TODO: remove it (presumably the old-vd copy below - confirm)
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
+
+def template VectorMaskDecodeBlock {{
+// Always instantiated with uint8_t; vsew is not consulted.
+return new %(class_name)s<uint8_t>(machInst);
+
+}};
+
+def template VectorNonSplitDeclare {{
+// Instruction class with its own execute(); no micro-op parameters.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    RegId srcRegIdxArr[2];
+    RegId destRegIdxArr[1];
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorNonSplitConstructor {{
+// Constructor made only of generated setup; no micro-ops are created.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    %(set_vm_idx)s;
+}
+
+}};
+
+def template VectorIntNonSplitExecute {{
+// execute() for a non-split integer instruction; no copy_old_vd slot here.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                    trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorFloatNonSplitExecute {{
+// execute() for a non-split float instruction; no copy_old_vd slot here.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                    trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu = decltype(et::v);  // type of the element's .v member
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorReduceMacroDeclare {{
+// Macro-op class declaration: it adds a constructor but no execute().
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorReduceMacroConstructor {{
+
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    // Split vl into chunks of at most chunk_max elements, one micro-op each.
+    const uint32_t reg_count = vtype_regs_per_group(vtype);
+    const int32_t chunk_max = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t elems_left = this->vl;
+    int32_t chunk_vl = std::min(elems_left, chunk_max);
+    // No elements: queue a lone nop so front()/back() below stay valid.
+    if (chunk_vl == 0)
+        this->microops.push_back(new VectorNopMicroInst(_machInst));
+    for (int idx = 0; idx < reg_count && chunk_vl > 0; ++idx) {
+        StaticInstPtr slice =
+            new %(class_name)sMicro<ElemType>(_machInst, chunk_vl, idx);
+        slice->setDelayedCommit();
+        this->microops.push_back(slice);
+        elems_left -= chunk_max;
+        chunk_vl = std::min(elems_left, chunk_max);
+    }
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorReduceMicroDeclare {{
+// Micro-op class for one slice of a vector reduction instruction.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs2, vs1, vd, vm
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;  // assigned from _machInst.vm by the constructor template
+public:
+    %(class_name)s(ExtMachInst _machInst,
+                   uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorReduceMicroConstructor {{
+// Micro-op constructor; _microVl and _microIdx go straight to the base class.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+                                         uint8_t _microVl, uint8_t _microIdx)
+: %(base_class)s("%(mnemonic)s", _machInst,
+                 %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    // Zero both counts before the generated register setup below runs.
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+}};
+
+def template VectorReduceIntMicroExecute {{
+// execute() for an integer reduction micro-op; it defines reduce_loop.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    %(type_def)s;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    // reduce_loop folds f over the active Vs2 elements of this micro-op,
+    // seeded with Vs1[0] (micro-op 0) or old_Vd[0] (later ones). ei is the
+    // mask index; the _ and vs2 parameters are not read by the lambda body.
+    auto reduce_loop =
+        [&, this](const auto& f, const auto* _, const auto* vs2) {
+            ElemType microop_result = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
+            for (uint32_t i = 0; i < this->microVl; i++) {
+                uint32_t ei = i + vtype_VLMAX(vtype, true) * this->microIdx;
+                if (this->vm || elem_mask(v0, ei)) {
+                    microop_result = f(microop_result, Vs2[i]);
+                }
+            }
+            return microop_result;
+        };
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+}};
+
+def template VectorReduceFloatMicroExecute {{
+// execute() for a float reduction micro-op; it defines reduce_loop.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    %(type_def)s;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    // Seed Vd[0]: Vs1[0] for micro-op 0, old_Vd[0] for later micro-ops.
+    Vd[0] = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
+    // reduce_loop folds f over the active Vs2 elements starting from Vd[0],
+    // keeping the .v member of each result. ei is the mask index; the _ and
+    // vs2 parameters are not read by the lambda body.
+    auto reduce_loop =
+        [&, this](const auto& f, const auto* _, const auto* vs2) {
+            vu tmp_val = Vd[0];
+            for (uint32_t i = 0; i < this->microVl; i++) {
+                uint32_t ei = i + vtype_VLMAX(vtype, true) * this->microIdx;
+                if (this->vm || elem_mask(v0, ei)) {
+                    tmp_val = f(tmp_val, Vs2[i]).v;
+                }
+            }
+            return tmp_val;
+        };
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+}};
+
+def template VectorReduceFloatWideningMicroExecute {{
+// execute() for a widening float reduction micro-op (vwu accumulator).
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    %(type_def)s;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    // Seed Vd[0]: Vs1[0] for micro-op 0, old_Vd[0] for later micro-ops.
+    Vd[0] = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
+    // reduce_loop folds f over the active Vs2 elements starting from Vd[0],
+    // keeping the .v member of each result. ei is the mask index; the _ and
+    // vs2 parameters are not read by the lambda body.
+    auto reduce_loop =
+        [&, this](const auto& f, const auto* _, const auto* vs2) {
+            vwu tmp_val = Vd[0];
+            for (uint32_t i = 0; i < this->microVl; i++) {
+                uint32_t ei = i + vtype_VLMAX(vtype, true) * this->microIdx;
+                if (this->vm || elem_mask(v0, ei)) {
+                    tmp_val = f(tmp_val, Vs2[i]).v;
+                }
+            }
+            return tmp_val;
+        };
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+}};
+
+def template VectorGatherMacroDeclare {{
+// Macro-op class declaration, templated on both element and index type.
+template<typename ElemType, typename IndexType>
+class %(class_name)s : public %(base_class)s{
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorGatherMacroConstructor {{
+// Gather macro-op constructor; ElemType and IndexType sizes may differ.
+template<typename ElemType, typename IndexType>
+%(class_name)s<ElemType, IndexType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    constexpr uint32_t vd_eewb = sizeof(ElemType);
+    constexpr uint32_t vs2_eewb = sizeof(ElemType);
+    constexpr uint32_t vs1_eewb = sizeof(IndexType);
+    constexpr bool vs1_split = vd_eewb > vs1_eewb;
+    const int8_t lmul = vtype_vlmul(vtype);
+    // NOTE(review): lmul is used as a shift count, yet vs1_emul adds a plain
+    // width ratio (not its log2) to it - confirm this is intended.
+    const int8_t vs1_emul = lmul +
+        (vs1_split ? -(vs2_eewb / vs1_eewb) : vs1_eewb / vs2_eewb);
+    const uint8_t vs2_vregs = lmul < 0 ? 1 : 1 << lmul;
+    const uint8_t vs1_vregs = vs1_emul < 0 ? 1 : 1 << vs1_emul;
+    const uint8_t vd_vregs = vs2_vregs;
+    const int32_t micro_vlmax = VLENB / std::max(vd_eewb, vs1_eewb);
+    int32_t remaining_vl = this->vl;
+    int32_t micro_vl = std::min(remaining_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // Each outer step below emits vs2_vregs micro-ops sharing one micro_vl.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (uint8_t i = 0; i < std::max(vs1_vregs, vd_vregs) && micro_vl > 0;
+            i++) {
+        for (uint8_t j = 0; j < vs2_vregs; j++) {
+            microop = new %(class_name)sMicro<ElemType, IndexType>(
+                _machInst, micro_vl, i * vs2_vregs + j);
+            microop->setDelayedCommit();
+            this->microops.push_back(microop);
+        }
+        micro_vl = std::min(remaining_vl -= micro_vlmax, micro_vlmax);
+    }
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+}};
+
+def template VectorGatherMicroDeclare {{
+
+template<typename ElemType, typename IndexType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs2, vs1, vd, vm
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst,
+                   uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorGatherMicroConstructor {{
+
+template<typename ElemType, typename IndexType>
+%(class_name)s<ElemType, IndexType>::%(class_name)s(ExtMachInst _machInst,
+    uint8_t _microVl, uint8_t _microIdx)
+: %(base_class)s("%(mnemonic)s", _machInst,
+                 %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    [[maybe_unused]] constexpr uint32_t vd_eewb = sizeof(ElemType);
+    [[maybe_unused]] constexpr uint32_t vs2_eewb = sizeof(ElemType);
+    [[maybe_unused]] constexpr uint32_t vs1_eewb = sizeof(IndexType);
+    constexpr uint8_t vs1_split_num = (vd_eewb + vs1_eewb - 1) / vs1_eewb;
+    constexpr uint8_t vd_split_num = (vs1_eewb + vd_eewb - 1) / vd_eewb;
+    const int8_t lmul = vtype_vlmul(vtype);
+    const uint8_t vs2_vregs = lmul < 0 ? 1 : 1 << lmul;
+    [[maybe_unused]] const uint8_t vs2_idx = _microIdx % vs2_vregs;
+    [[maybe_unused]] const uint8_t vs1_idx =
+        _microIdx / vs2_vregs / vs1_split_num;
+    [[maybe_unused]] const uint8_t vd_idx =
+        _microIdx / vs2_vregs / vd_split_num;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorGatherMicroExecute {{
+
+template <typename ElemType, typename IndexType>
+Fault
+%(class_name)s<ElemType, IndexType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    [[maybe_unused]] constexpr size_t sew = sizeof(vu) * 8;
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    const uint32_t vlmax = vtype_VLMAX(vtype);
+    constexpr uint8_t vd_eewb = sizeof(ElemType);
+    constexpr uint8_t vs1_eewb = sizeof(IndexType);
+    constexpr uint8_t vs2_eewb = sizeof(ElemType);
+    constexpr uint8_t vs1_split_num = (vd_eewb + vs1_eewb - 1) / vs1_eewb;
+    constexpr uint8_t vd_split_num = (vs1_eewb + vd_eewb - 1) / vd_eewb;
+    [[maybe_unused]] constexpr uint16_t vd_elems = VLENB / vd_eewb;
+    [[maybe_unused]] constexpr uint16_t vs1_elems = VLENB / vs1_eewb;
+    [[maybe_unused]] constexpr uint16_t vs2_elems = VLENB / vs2_eewb;
+    [[maybe_unused]] const int8_t lmul = vtype_vlmul(vtype);
+    [[maybe_unused]] const uint8_t vs2_vregs = lmul < 0 ? 1 : 1 << lmul;
+    [[maybe_unused]] const uint8_t vs2_idx = microIdx % vs2_vregs;
+    [[maybe_unused]] const uint8_t vs1_idx =
+        microIdx / vs2_vregs / vs1_split_num;
+    [[maybe_unused]] const uint8_t vd_idx =
+        microIdx / vs2_vregs / vd_split_num;
+    [[maybe_unused]] const uint16_t vs1_bias =
+        vs1_elems * (vd_idx % vs1_split_num) / vs1_split_num;
+    [[maybe_unused]] const uint16_t vd_bias =
+        vd_elems * (vs1_idx % vd_split_num) / vd_split_num;
+
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
+
+def template VectorGatherDecodeBlock {{
+
+switch(machInst.vtype8.vsew) {
+    case 0b000: {
+        using elem_type [[maybe_unused]] = uint8_t;
+        return new %(class_name)s<uint8_t, %(idx_type)s>(machInst);
+    }
+    case 0b001: {
+        using elem_type [[maybe_unused]] = uint16_t;
+        return new %(class_name)s<uint16_t, %(idx_type)s>(machInst);
+    }
+    case 0b010: {
+        using elem_type [[maybe_unused]] = uint32_t;
+        return new %(class_name)s<uint32_t, %(idx_type)s>(machInst);
+    }
+    case 0b011: {
+        using elem_type [[maybe_unused]] = uint64_t;
+        return new %(class_name)s<uint64_t, %(idx_type)s>(machInst);
+    }
+    default: GEM5_UNREACHABLE;
+}
+
+}};
+
+def template VectorIntVxsatMacroDeclare {{
+
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s{
+private:
+    %(reg_idx_arr_decl)s;
+    bool vxsat = false;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntVxsatMacroConstructor {{
+// Macro-op constructor: per-register micro-ops plus a trailing vxsat op.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+
+    if (micro_vl == 0) {  // no elements to process: emit a nop
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst,
+            micro_vl, i, &vxsat);  // all micro-ops share the macro's flag
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+
+    microop = new VxsatMicroInst(&vxsat, _machInst);  // always appended
+    microop->setFlag(StaticInst::IsSerializeAfter);
+    microop->setFlag(StaticInst::IsNonSpeculative);
+    this->microops.push_back(microop);
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+}};
+
+def template VectorIntVxsatMicroDeclare {{
+
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+    bool* vxsatptr;
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx, bool* vxsatptr);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntVxsatMicroConstructor {{
+
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+    uint8_t _microVl, uint8_t _microIdx, bool* vxsatptr)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    this->vxsatptr = vxsatptr;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorReduceIntWideningMicroExecute {{
+// Widening integer reduction micro-op; reduce_loop folds Vs2 onto Vd[0].
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    using vwu [[maybe_unused]] = typename double_width<vu>::type;
+    using vwi [[maybe_unused]] = typename double_width<vi>::type;
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    // The first micro-op seeds Vd[0] from Vs1[0]; later ones keep old_Vd[0].
+    Vd[0] = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
+
+    auto reduce_loop =
+        [&, this](const auto& f, const auto* _, const auto* vs2) {
+            vwu tmp_val = Vd[0];
+            for (uint32_t i = 0; i < this->microVl; i++) {
+                uint32_t ei = i + vtype_VLMAX(vtype, true) * this->microIdx;
+                if (this->vm || elem_mask(v0, ei)) {  // vm or mask bit ei set
+                    tmp_val = f(tmp_val, Vs2[i]);
+                }
+            }
+            return tmp_val;
+        };
+
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorSlideMacroDeclare {{
+
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorSlideUpMacroConstructor {{
+// Slide-up macro-op: one micro-op per (vd i, vs2 j) register pair, j <= i.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+
+    if (micro_vl == 0) {  // no elements to process: emit a nop
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    // TODO: statically filter out useless micro-ops
+    int micro_idx = 0;
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        for (int j = 0; j <= i; ++j) {
+            microop = new %(class_name)sMicro<ElemType>(
+                _machInst, micro_vl, micro_idx++, i, j);
+            microop->setDelayedCommit();
+            this->microops.push_back(microop);
+        }
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorSlideDownMacroConstructor {{
+// Slide-down macro-op: one micro-op per (vd i, vs2 j) register pair, j >= i.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+
+    if (micro_vl == 0) {  // no elements to process: emit a nop
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    // TODO: statically filter out useless micro-ops
+    int micro_idx = 0;
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        for (int j = i; j < num_microops; ++j) {
+            microop = new %(class_name)sMicro<ElemType>(
+                _machInst, micro_vl, micro_idx++, i, j);
+            microop->setDelayedCommit();
+            this->microops.push_back(microop);
+        }
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorSlideMicroDeclare {{
+
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs2, vs1, vs3(old_vd), vm for *.vv, *.vx
+    // vs2, (old_vd), vm for *.vi
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+        uint8_t _microIdx, uint8_t _vdIdx, uint8_t _vs2Idx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorSlideMicroConstructor {{
+
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+        uint8_t _microVl, uint8_t _microIdx, uint8_t _vdIdx, uint8_t _vs2Idx)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s, _microVl,
+        _microIdx, _vdIdx, _vs2Idx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorSlideMicroExecute {{
+// Micro-op execute skeleton for integer vector slides.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    [[maybe_unused]] const uint32_t vlmax = vtype_VLMAX(vtype);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
+
+def template VectorFloatSlideMicroExecute {{
+// Micro-op execute skeleton for floating-point vector slides.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using et [[maybe_unused]] = ElemType;
+    using vu [[maybe_unused]] = decltype(et::v);
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    [[maybe_unused]] const uint32_t vlmax = vtype_VLMAX(vtype);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
diff --git a/src/arch/riscv/isa/templates/vector_mem.isa b/src/arch/riscv/isa/templates/vector_mem.isa
index d54243ad7d..f8be1e555b 100644
--- a/src/arch/riscv/isa/templates/vector_mem.isa
+++ b/src/arch/riscv/isa/templates/vector_mem.isa
@@ -1,3 +1,31 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 def template VMemMacroDeclare {{
 
 class %(class_name)s : public %(base_class)s
diff --git a/src/arch/riscv/regs/float.hh b/src/arch/riscv/regs/float.hh
index 4809372070..cca9e1be2f 100644
--- a/src/arch/riscv/regs/float.hh
+++ b/src/arch/riscv/regs/float.hh
@@ -211,6 +211,20 @@ const std::vector<std::string> RegNames = {
 
 } // namespace float_reg
 
+inline float32_t
+fsgnj32(float32_t a, float32_t b, bool n, bool x) {
+    // Sign comes from b: inverted when n, xor-ed with a when x (n wins).
+    const uint32_t sign_src = n ? ~b.v : (x ? (a.v ^ b.v) : b.v);
+    return f32(insertBits(sign_src, 30, 0, a.v));
+}
+
+inline float64_t
+fsgnj64(float64_t a, float64_t b, bool n, bool x) {
+    // Sign comes from b: inverted when n, xor-ed with a when x (n wins).
+    const uint64_t sign_src = n ? ~b.v : (x ? (a.v ^ b.v) : b.v);
+    return f64(insertBits(sign_src, 62, 0, a.v));
+}
+
 } // namespace RiscvISA
 } // namespace gem5
 
diff --git a/src/arch/riscv/utility.hh b/src/arch/riscv/utility.hh
index 1db6d6df3b..40054aec0f 100644
--- a/src/arch/riscv/utility.hh
+++ b/src/arch/riscv/utility.hh
@@ -241,6 +241,13 @@ remu(T rs1, T rs2)
     return (rs2 == 0) ? rs1 : rs1 % rs2;
 }
 
+// Vector extension functions
+inline uint64_t
+vtype_SEW(const uint64_t vtype)
+{
+    return 8 << bits(vtype, 5, 3);  // element width in bits: 8 << vtype[5:3]
+}
+
 /*
 * Encode LMUL to lmul as follows:
 *     LMUL    vlmul    lmul
@@ -269,6 +276,25 @@ vtype_VLMAX(const uint64_t vtype, const bool per_reg = false)
     return gem5::RiscvISA::VLEN >> (vsew + 3 - lmul);
 }
 
+inline int64_t
+vtype_vlmul(const uint64_t vtype)
+{
+    return (int64_t)sext<3>(bits(vtype, 2, 0));  // sign-extended vtype[2:0]
+}
+
+inline uint64_t
+vtype_regs_per_group(const uint64_t vtype)
+{
+    // A negative (fractional) lmul is clamped to 0, i.e. one register.
+    return 1 << std::max<int64_t>(0, vtype_vlmul(vtype));
+}
+
+inline void
+vtype_set_vill(uint64_t& vtype)
+{
+    vtype = uint64_t(1) << (sizeof(RegVal) * 8 - 1);  // top bit only, rest 0
+}
+
 inline uint64_t
 width_EEW(uint64_t width)
 {
@@ -296,6 +322,461 @@ elem_mask(const T* vs, const int index)
     return (vs[idx] >> pos) & 1;
 }
 
+template<typename Type> struct double_width;
+template<> struct double_width<uint8_t>     { using type = uint16_t;};
+template<> struct double_width<uint16_t>    { using type = uint32_t;};
+template<> struct double_width<uint32_t>    { using type = uint64_t;};
+template<> struct double_width<int8_t>      { using type = int16_t; };
+template<> struct double_width<int16_t>     { using type = int32_t; };
+template<> struct double_width<int32_t>     { using type = int64_t; };
+template<> struct double_width<float32_t>   { using type = float64_t;};
+
+template<typename Type> struct double_widthf;
+template<> struct double_widthf<uint32_t>    { using type = float64_t;};
+template<> struct double_widthf<int32_t>     { using type = float64_t;};
+
+template<typename FloatType, typename IntType = decltype(FloatType::v)> auto
+ftype(IntType a) -> FloatType
+{
+    if constexpr(std::is_same_v<uint32_t, IntType>)
+        return f32(a);
+    else if constexpr(std::is_same_v<uint64_t, IntType>)
+        return f64(a);
+    GEM5_UNREACHABLE;
+}
+
+// TODO: Consolidate ftype_freg(freg_t a) and ftype(IntType a) into a
+// single function
+template<typename FloatType, typename IntType = decltype(FloatType::v)> auto
+ftype_freg(freg_t a) -> FloatType
+{
+    if constexpr(std::is_same_v<uint32_t, IntType>)
+        return f32(a);
+    else if constexpr(std::is_same_v<uint64_t, IntType>)
+        return f64(a);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> FloatType
+fadd(FloatType a, FloatType b)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_add(a, b);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_add(a, b);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> FloatType
+fsub(FloatType a, FloatType b)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_sub(a, b);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_sub(a, b);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> FloatType
+fmin(FloatType a, FloatType b)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_min(a, b);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_min(a, b);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> FloatType
+fmax(FloatType a, FloatType b)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_max(a, b);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_max(a, b);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> FloatType
+fdiv(FloatType a, FloatType b)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_div(a, b);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_div(a, b);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> FloatType
+fmul(FloatType a, FloatType b)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_mul(a, b);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_mul(a, b);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> FloatType
+fsqrt(FloatType a)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_sqrt(a);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_sqrt(a);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> FloatType
+frsqrte7(FloatType a)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_rsqrte7(a);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_rsqrte7(a);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> FloatType
+frecip7(FloatType a)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_recip7(a);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_recip7(a);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> FloatType
+fclassify(FloatType a)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32(f32_classify(a));
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64(f64_classify(a));
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> FloatType
+fsgnj(FloatType a, FloatType b, bool n, bool x)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return fsgnj32(a, b, n, x);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return fsgnj64(a, b, n, x);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> bool
+fle(FloatType a, FloatType b)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_le(a, b);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_le(a, b);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> bool
+feq(FloatType a, FloatType b)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_eq(a, b);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_eq(a, b);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> bool
+flt(FloatType a, FloatType b)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_lt(a, b);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_lt(a, b);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> FloatType
+fmadd(FloatType a, FloatType b, FloatType c)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_mulAdd(a, b, c);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_mulAdd(a, b, c);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType> FloatType
+fneg(FloatType a)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32(a.v ^ uint32_t(mask(31, 31)));
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64(a.v ^ mask(63, 63));
+    GEM5_UNREACHABLE;
+}
+
+template<typename FT, typename WFT = typename double_width<FT>::type> WFT
+fwiden(FT a)
+{
+    if constexpr(std::is_same_v<float32_t, FT>)
+        return f32_to_f64(a);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType, typename IntType = decltype(FloatType::v)> IntType
+f_to_ui(FloatType a, uint_fast8_t mode)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_to_ui32(a, mode, true);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_to_ui64(a, mode, true);
+    GEM5_UNREACHABLE;
+}
+
+template<
+    typename FloatType,
+    typename IntType = decltype(double_width<FloatType>::type::v)
+> IntType
+f_to_wui(FloatType a, uint_fast8_t mode)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_to_ui64(a, mode, true);
+    GEM5_UNREACHABLE;
+}
+
+template<
+    typename IntType,
+    typename FloatType = typename double_widthf<IntType>::type
+> IntType
+f_to_nui(FloatType a, uint_fast8_t mode)
+{
+    if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_to_ui32(a, mode, true);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType, typename IntType = decltype(FloatType::v)> IntType
+f_to_i(FloatType a, uint_fast8_t mode)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return (uint32_t)f32_to_i32(a, mode, true);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return (uint64_t)f64_to_i64(a, mode, true);
+    GEM5_UNREACHABLE;
+}
+
+template<
+    typename FloatType,
+    typename IntType = decltype(double_width<FloatType>::type::v)
+> IntType
+f_to_wi(FloatType a, uint_fast8_t mode)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return (uint64_t)f32_to_i64(a, mode, true);
+    GEM5_UNREACHABLE;
+}
+
+template<
+    typename IntType,
+    typename FloatType = typename double_widthf<IntType>::type
+> IntType
+f_to_ni(FloatType a, uint_fast8_t mode)
+{
+    if constexpr(std::is_same_v<float64_t, FloatType>)
+        return (uint32_t)f64_to_i32(a, mode, true);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType, typename IntType = decltype(FloatType::v)>
+FloatType
+ui_to_f(IntType a)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return ui32_to_f32(a);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return ui64_to_f64(a);
+    GEM5_UNREACHABLE;
+}
+
+template<
+    typename IntType,
+    typename FloatType = typename double_widthf<IntType>::type
+> FloatType
+ui_to_wf(IntType a)
+{
+    if constexpr(std::is_same_v<float64_t, FloatType>)
+        return ui32_to_f64(a);
+    GEM5_UNREACHABLE;
+}
+
+template<
+    typename FloatType,
+    typename IntType = decltype(double_width<FloatType>::type::v)
+> FloatType
+ui_to_nf(IntType a)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return ui64_to_f32(a);
+    GEM5_UNREACHABLE;
+}
+
+template<typename FloatType, typename IntType = decltype(FloatType::v)>
+FloatType
+i_to_f(IntType a)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return i32_to_f32((int32_t)a);
+    else if constexpr(std::is_same_v<float64_t, FloatType>)
+        return i64_to_f64((int64_t)a);
+    GEM5_UNREACHABLE;
+}
+
+template<
+    typename IntType,
+    typename FloatType = typename double_widthf<IntType>::type
+> FloatType
+i_to_wf(IntType a)
+{
+    if constexpr(std::is_same_v<float64_t, FloatType>)
+        return i32_to_f64((int32_t)a);
+    GEM5_UNREACHABLE;
+}
+
+template<
+    typename FloatType,
+    typename IntType = std::make_signed_t<
+        decltype(double_width<FloatType>::type::v)
+    >
+> FloatType
+i_to_nf(IntType a)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return i64_to_f32(a);
+    GEM5_UNREACHABLE;
+}
+
+template<
+    typename FloatType,
+    typename FloatWType = typename double_width<FloatType>::type
+> FloatWType
+f_to_wf(FloatType a)
+{
+    if constexpr(std::is_same_v<float32_t, FloatType>)
+        return f32_to_f64(a);
+    GEM5_UNREACHABLE;
+}
+
+template<
+    typename FloatNType,
+    typename FloatType = typename double_width<FloatNType>::type
+> FloatNType
+f_to_nf(FloatType a)
+{
+    if constexpr(std::is_same_v<float64_t, FloatType>)
+        return f64_to_f32(a);
+    GEM5_UNREACHABLE;
+}
+
+//ref:  https://locklessinc.com/articles/sat_arithmetic/
+template<typename T> T
+sat_add(T x, T y, bool* sat)
+{
+    using UT = std::make_unsigned_t<T>;
+    constexpr int sign_bit = sizeof(T) * 8 - 1;
+    const UT ux = x;
+    const UT uy = y;
+    const UT wrapped = ux + uy;
+
+    // Saturation value chosen by x's sign: T max if x >= 0, else T min.
+    const UT limit = (ux >> sign_bit) + (((UT)0x1 << sign_bit) - 1);
+
+    // Overflow iff x and y share a sign that the wrapped sum lacks.
+    if ((T) ((limit ^ uy) | ~(uy ^ wrapped)) < 0)
+        return wrapped;
+    *sat = true;
+    return limit;
+}
+
+template<typename T> T
+sat_sub(T x, T y, bool* sat)
+{
+    using UT = std::make_unsigned_t<T>;
+    constexpr int sign_bit = sizeof(T) * 8 - 1;
+    const UT ux = x;
+    const UT uy = y;
+    const UT wrapped = ux - uy;
+
+    // Saturation value chosen by x's sign: T max if x >= 0, else T min.
+    const UT limit = (ux >> sign_bit) + (((UT)0x1 << sign_bit) - 1);
+
+    // Overflow iff x and y differ in sign and the result left x's sign.
+    if ((T) ((limit ^ uy) & (limit ^ wrapped)) >= 0)
+        return wrapped;
+    *sat = true;
+    return limit;
+}
+
+template<typename T> T
+sat_addu(T x, T y, bool* sat)
+{
+    // Unsigned addition wraps; the sum overflowed iff it is smaller than
+    // an operand.
+    const T sum = x + y;
+    const bool overflowed = sum < x;
+
+    // *sat is sticky: an earlier saturation is never cleared here.
+    if (!*sat)
+        *sat = overflowed;
+    return overflowed ? static_cast<T>(~T(0)) : sum;
+}
+
+template<typename T> T
+sat_subu(T x, T y, bool* sat)
+{
+    // Unsigned subtraction wraps; it borrowed iff the difference is
+    // larger than the minuend.
+    const T diff = x - y;
+    const bool borrowed = diff > x;
+
+    // *sat is sticky: an earlier saturation is never cleared here.
+    if (!*sat)
+        *sat = borrowed;
+    // Clamp to zero on underflow.
+    return borrowed ? T(0) : diff;
+}
+
+/**
+ * Pre-round result for rounding mode xrm, with bit gb as the kept lsb.
+ * Ref: https://github.com/riscv-software-src/riscv-isa-sim
+ */
+template<typename T> T
+int_rounding(T result, uint8_t xrm, unsigned gb) {
+    const uint64_t lsb = 1UL << gb;  // weight of bit gb
+    const uint64_t lsb_half = lsb >> 1;  // half of lsb (0 when gb is 0)
+    switch (xrm) {
+    case 0 /* RNU */:  // unconditionally add half an lsb
+        result += lsb_half;
+        break;
+    case 1 /* RNE */:  // bump if above half, or exactly half with lsb set
+        if ((result & lsb_half) &&
+            ((result & (lsb_half - 1)) || (result & lsb)))
+            result += lsb;
+        break;
+    case 2 /* RDN */:  // leave result untouched
+        break;
+    case 3 /* ROD */:  // set lsb if any lower bit is set
+        if (result & (lsb - 1))
+            result |= lsb;
+        break;
+    default:
+        panic("Invalid xrm value %d", (int)xrm);
+    }
+
+    return result;
+}
+
 } // namespace RiscvISA
 } // namespace gem5
 

From ae651f4de1a981a780bd450b031e6258ad022123 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A0=20Armejach?= <adria.armejach@bsc.es>
Date: Fri, 21 Jul 2023 18:36:48 +0200
Subject: [PATCH 07/10] configs: update riscv restore checkpoint test

Change-Id: I019fc6394a03196711ab52533ad8062b22c89daf
---
 .../gem5_library/checkpoints/riscv-hello-restore-checkpoint.py  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/example/gem5_library/checkpoints/riscv-hello-restore-checkpoint.py b/configs/example/gem5_library/checkpoints/riscv-hello-restore-checkpoint.py
index 9f9bf839a6..eed76e2448 100644
--- a/configs/example/gem5_library/checkpoints/riscv-hello-restore-checkpoint.py
+++ b/configs/example/gem5_library/checkpoints/riscv-hello-restore-checkpoint.py
@@ -90,7 +90,7 @@ board = SimpleBoard(
 board.set_se_binary_workload(
     # the workload should be the same as the save-checkpoint script
     obtain_resource("riscv-hello"),
-    checkpoint=obtain_resource("riscv-hello-example-checkpoint-v23"),
+    checkpoint=obtain_resource("riscv-hello-example-checkpoint"),
 )
 
 simulator = Simulator(

From af1b2ec2d5768a08bd42f11466be991302149482 Mon Sep 17 00:00:00 2001
From: Jason Lowe-Power <jason@lowepower.com>
Date: Fri, 28 Jul 2023 09:44:28 -0700
Subject: [PATCH 08/10] arch-riscv: Add fatal if RVV used with o3 or minor

Since the O3 and Minor CPU models do not support RVV right now as the
implementation stalls the decode until vsetvl instructions are executed,
this change calls `fatal` if RVV is not explicitly enabled.

It is possible to override this if you explicitly enable RVV in the
config file.

Change-Id: Ia801911141bb2fb2bedcff3e139bf41ba8936085
Signed-off-by: Jason Lowe-Power <jason@lowepower.com>
---
 src/arch/riscv/RiscvCPU.py     | 15 +++++++++++++--
 src/arch/riscv/RiscvDecoder.py |  1 +
 src/arch/riscv/RiscvISA.py     |  2 ++
 src/arch/riscv/decoder.cc      | 12 ++++++++++++
 src/arch/riscv/decoder.hh      |  6 ++----
 src/arch/riscv/isa.cc          |  9 ++++++---
 src/arch/riscv/isa.hh          |  3 +++
 7 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/src/arch/riscv/RiscvCPU.py b/src/arch/riscv/RiscvCPU.py
index 1c77045c67..449bf5e7af 100644
--- a/src/arch/riscv/RiscvCPU.py
+++ b/src/arch/riscv/RiscvCPU.py
@@ -41,6 +41,17 @@ class RiscvCPU:
     ArchISA = RiscvISA
 
 
+class RiscvISANoRVV(RiscvISA):
+    enable_rvv = False  # overrides the RiscvISA default of True
+
+
+class RiscvCPUNoRVV:
+    ArchDecoder = RiscvDecoder
+    ArchMMU = RiscvMMU
+    ArchInterrupts = RiscvInterrupts
+    ArchISA = RiscvISANoRVV  # RVV-disabled ISA; RiscvCPU uses RiscvISA
+
+
 class RiscvAtomicSimpleCPU(BaseAtomicSimpleCPU, RiscvCPU):
     mmu = RiscvMMU()
 
@@ -53,9 +64,9 @@ class RiscvTimingSimpleCPU(BaseTimingSimpleCPU, RiscvCPU):
     mmu = RiscvMMU()
 
 
-class RiscvO3CPU(BaseO3CPU, RiscvCPU):
+class RiscvO3CPU(BaseO3CPU, RiscvCPUNoRVV):
     mmu = RiscvMMU()
 
 
-class RiscvMinorCPU(BaseMinorCPU, RiscvCPU):
+class RiscvMinorCPU(BaseMinorCPU, RiscvCPUNoRVV):
     mmu = RiscvMMU()
diff --git a/src/arch/riscv/RiscvDecoder.py b/src/arch/riscv/RiscvDecoder.py
index 30c1077662..4100a3c5b3 100644
--- a/src/arch/riscv/RiscvDecoder.py
+++ b/src/arch/riscv/RiscvDecoder.py
@@ -24,6 +24,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 from m5.objects.InstDecoder import InstDecoder
+from m5.params import *
 
 
 class RiscvDecoder(InstDecoder):
diff --git a/src/arch/riscv/RiscvISA.py b/src/arch/riscv/RiscvISA.py
index bb9a05babe..f66171a95a 100644
--- a/src/arch/riscv/RiscvISA.py
+++ b/src/arch/riscv/RiscvISA.py
@@ -56,3 +56,5 @@ class RiscvISA(BaseISA):
         True, "whether to check memory access alignment"
     )
     riscv_type = Param.RiscvType("RV64", "RV32 or RV64")
+
+    enable_rvv = Param.Bool(True, "Enable vector extension")
diff --git a/src/arch/riscv/decoder.cc b/src/arch/riscv/decoder.cc
index ce362ad522..702d84fd91 100644
--- a/src/arch/riscv/decoder.cc
+++ b/src/arch/riscv/decoder.cc
@@ -28,6 +28,7 @@
  */
 
 #include "arch/riscv/decoder.hh"
+#include "arch/riscv/isa.hh"
 #include "arch/riscv/types.hh"
 #include "base/bitfield.hh"
 #include "debug/Decode.hh"
@@ -38,6 +39,13 @@ namespace gem5
 namespace RiscvISA
 {
 
+Decoder::Decoder(const RiscvDecoderParams &p) : InstDecoder(p, &machInst)
+{
+    ISA *isa = dynamic_cast<ISA*>(p.isa);
+    enableRvv = isa->getEnableRvv();
+    reset();
+}
+
 void Decoder::reset()
 {
     aligned = true;
@@ -53,6 +61,10 @@ Decoder::moreBytes(const PCStateBase &pc, Addr fetchPC)
     // TODO: Current vsetvl instructions stall decode. Future fixes should
     // enable speculation, and this code will be removed.
     if (GEM5_UNLIKELY(!this->vConfigDone)) {
+        fatal_if(!enableRvv,
+            "Vector extension is not enabled for this CPU type\n"
+            "You can manually enable vector extensions by setting enable_rvv "
+            "to true for each ISA object after `createThreads()`\n");
         DPRINTF(Decode, "Waiting for vset*vl* to be executed\n");
         instDone = false;
         outOfBytes = false;
diff --git a/src/arch/riscv/decoder.hh b/src/arch/riscv/decoder.hh
index d1d2f3cb0c..1f510e8280 100644
--- a/src/arch/riscv/decoder.hh
+++ b/src/arch/riscv/decoder.hh
@@ -61,6 +61,7 @@ class Decoder : public InstDecoder
     ExtMachInst emi;
     uint32_t machInst;
 
+    bool enableRvv = false;
     VTYPE machVtype;
     uint32_t machVl;
 
@@ -72,10 +73,7 @@ class Decoder : public InstDecoder
     StaticInstPtr decode(ExtMachInst mach_inst, Addr addr);
 
   public:
-    Decoder(const RiscvDecoderParams &p) : InstDecoder(p, &machInst)
-    {
-        reset();
-    }
+    Decoder(const RiscvDecoderParams &p);
 
     void reset() override;
 
diff --git a/src/arch/riscv/isa.cc b/src/arch/riscv/isa.cc
index 2f9d52e1b2..84205eb57a 100644
--- a/src/arch/riscv/isa.cc
+++ b/src/arch/riscv/isa.cc
@@ -253,7 +253,8 @@ RegClass ccRegClass(CCRegClass, CCRegClassName, 0, debug::IntRegs);
 } // anonymous namespace
 
 ISA::ISA(const Params &p) :
-    BaseISA(p), rv_type(p.riscv_type), checkAlignment(p.check_alignment)
+    BaseISA(p), rv_type(p.riscv_type), checkAlignment(p.check_alignment),
+    enableRvv(p.enable_rvv)
 {
     _regClasses.push_back(&intRegClass);
     _regClasses.push_back(&floatRegClass);
@@ -324,8 +325,10 @@ void ISA::clear()
         case RV64:
           misa.rv64_mxl = 2;
           status.uxl = status.sxl = 2;
-          status.vs = VPUStatus::INITIAL;
-          misa.rvv = 1;
+          if (getEnableRvv()) {
+              status.vs = VPUStatus::INITIAL;
+              misa.rvv = 1;
+          }
           break;
         default:
           panic("%s: Unknown rv_type: %d", name(), (int)rv_type);
diff --git a/src/arch/riscv/isa.hh b/src/arch/riscv/isa.hh
index d7b0a21a1f..1be45ac7fa 100644
--- a/src/arch/riscv/isa.hh
+++ b/src/arch/riscv/isa.hh
@@ -75,6 +75,7 @@ class ISA : public BaseISA
     RiscvType rv_type;
     std::vector<RegVal> miscRegFile;
     bool checkAlignment;
+    bool enableRvv;
 
     bool hpmCounterEnabled(int counter) const;
 
@@ -138,6 +139,8 @@ class ISA : public BaseISA
 
     RiscvType rvType() const { return rv_type; }
 
+    bool getEnableRvv() const { return enableRvv; }
+
     void
     clearLoadReservation(ContextID cid)
     {

From 98d68a7307e2f93e547acf2ff67c957a198bd0ac Mon Sep 17 00:00:00 2001
From: Jason Lowe-Power <jason@lowepower.com>
Date: Fri, 28 Jul 2023 09:46:21 -0700
Subject: [PATCH 09/10] arch-riscv: Improve style

Minor style fixes in vector code

Change-Id: If0de45a2dbfb5d5aaa65ed3b5d91d9bee9bcc960
Signed-off-by: Jason Lowe-Power <jason@lowepower.com>
---
 src/arch/riscv/insts/vector.cc | 20 +++++++++++---------
 src/arch/riscv/insts/vector.hh | 33 +++++++++++++++++++--------------
 2 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/src/arch/riscv/insts/vector.cc b/src/arch/riscv/insts/vector.cc
index a1ccf402c9..6ecec44dc5 100644
--- a/src/arch/riscv/insts/vector.cc
+++ b/src/arch/riscv/insts/vector.cc
@@ -56,18 +56,20 @@ namespace RiscvISA
  *
 **/
 float
-getVflmul(uint32_t vlmul_encoding) {
-  int vlmul = sext<3>(vlmul_encoding & 7);
-  float vflmul = vlmul >= 0 ? 1 << vlmul : 1.0 / (1 << -vlmul);
-  return vflmul;
+getVflmul(uint32_t vlmul_encoding)
+{
+    int vlmul = sext<3>(vlmul_encoding & 7);
+    float vflmul = vlmul >= 0 ? 1 << vlmul : 1.0 / (1 << -vlmul);
+    return vflmul;
 }
 
 uint32_t
-getVlmax(VTYPE vtype, uint32_t vlen) {
-  uint32_t sew = getSew(vtype.vsew);
-  // vlmax is defined in RVV 1.0 spec p12 chapter 3.4.2.
-  uint32_t vlmax = (vlen/sew) * getVflmul(vtype.vlmul);
-  return vlmax;
+getVlmax(VTYPE vtype, uint32_t vlen)
+{
+    uint32_t sew = getSew(vtype.vsew);
+    // vlmax is defined in RVV 1.0 spec p12 chapter 3.4.2.
+    uint32_t vlmax = (vlen/sew) * getVflmul(vtype.vlmul);
+    return vlmax;
 }
 
 std::string
diff --git a/src/arch/riscv/insts/vector.hh b/src/arch/riscv/insts/vector.hh
index 5d0874a994..cae0dcac0a 100644
--- a/src/arch/riscv/insts/vector.hh
+++ b/src/arch/riscv/insts/vector.hh
@@ -47,7 +47,9 @@ namespace RiscvISA
 float
 getVflmul(uint32_t vlmul_encoding);
 
-inline uint32_t getSew(uint32_t vsew) {
+inline uint32_t
+getSew(uint32_t vsew)
+{
     assert(vsew <= 3);
     return (8 << vsew);
 }
@@ -124,7 +126,7 @@ class VectorMacroInst : public RiscvMacroInst
 
 class VectorMicroInst : public RiscvMicroInst
 {
-protected:
+  protected:
     uint8_t microVl;
     uint8_t microIdx;
     uint8_t vtype;
@@ -420,7 +422,7 @@ class VsStrideMacroInst : public VectorMemMacroInst
 class VsStrideMicroInst : public VectorMemMicroInst
 {
   protected:
-  uint8_t regIdx;
+    uint8_t regIdx;
     VsStrideMicroInst(const char *mnem, ExtMachInst _machInst,
                       OpClass __opClass, uint8_t _regIdx,
                       uint8_t _microIdx, uint8_t _microVl)
@@ -487,9 +489,9 @@ class VsIndexMicroInst : public VectorMemMicroInst
     VsIndexMicroInst(const char *mnem, ExtMachInst _machInst,
                     OpClass __opClass, uint8_t _vs3RegIdx, uint8_t _vs3ElemIdx,
                     uint8_t _vs2RegIdx, uint8_t _vs2ElemIdx)
-        : VectorMemMicroInst(mnem, _machInst, __opClass, 1, 0, 0)
-        , vs3RegIdx(_vs3RegIdx), vs3ElemIdx(_vs3ElemIdx)
-        , vs2RegIdx(_vs2RegIdx), vs2ElemIdx(_vs2ElemIdx)
+        : VectorMemMicroInst(mnem, _machInst, __opClass, 1, 0, 0),
+          vs3RegIdx(_vs3RegIdx), vs3ElemIdx(_vs3ElemIdx),
+          vs2RegIdx(_vs2RegIdx), vs2ElemIdx(_vs2ElemIdx)
     {}
 
     std::string generateDisassembly(
@@ -532,7 +534,7 @@ class VMaskMergeMicroInst : public VectorArithMicroInst
     VMaskMergeMicroInst(ExtMachInst extMachInst, uint8_t _dstReg,
         uint8_t _numSrcs)
         : VectorArithMicroInst("vmask_mv_micro", extMachInst,
-          VectorIntegerArithOp, 0, 0)
+                               VectorIntegerArithOp, 0, 0)
     {
         setRegIdxArrays(
             reinterpret_cast<RegIdArrayPtr>(
@@ -550,8 +552,9 @@ class VMaskMergeMicroInst : public VectorArithMicroInst
         }
     }
 
-    Fault execute(ExecContext* xc, trace::InstRecord* traceData)
-            const override {
+    Fault
+    execute(ExecContext* xc, trace::InstRecord* traceData) const override
+    {
         vreg_t tmp_d0 = *(vreg_t *)xc->getWritableRegOperand(this, 0);
         auto Vd = tmp_d0.as<uint8_t>();
         constexpr uint8_t elems_per_vreg = VLENB / sizeof(ElemType);
@@ -582,8 +585,10 @@ class VMaskMergeMicroInst : public VectorArithMicroInst
         return NoFault;
     }
 
-    std::string generateDisassembly(
-            Addr pc, const loader::SymbolTable *symtab) const override {
+    std::string
+    generateDisassembly(Addr pc, const loader::SymbolTable *symtab)
+        const override
+    {
         std::stringstream ss;
         ss << mnemonic << ' ' << registerName(destRegIdx(0));
         for (uint8_t i = 0; i < this->_numSrcRegs; i++) {
@@ -605,8 +610,8 @@ class VxsatMicroInst : public VectorArithMicroInst
     {
         vxsat = Vxsat;
     }
-    Fault execute(ExecContext* xc, trace::InstRecord* traceData)
-    const override
+    Fault
+    execute(ExecContext* xc, trace::InstRecord* traceData) const override
     {
         xc->setMiscReg(MISCREG_VXSAT,*vxsat);
         auto vcsr = xc->readMiscReg(MISCREG_VCSR);
@@ -614,7 +619,7 @@ class VxsatMicroInst : public VectorArithMicroInst
         return NoFault;
     }
     std::string generateDisassembly(Addr pc, const loader::SymbolTable *symtab)
-      const override
+        const override
     {
         std::stringstream ss;
         ss << mnemonic << ' ' << "VXSAT" << ", " << (*vxsat ? "0x1" : "0x0");

From 884d62b33af866664960e399ddc09b654753b794 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A0=20Armejach?= <adria.armejach@bsc.es>
Date: Wed, 2 Aug 2023 14:05:21 +0200
Subject: [PATCH 10/10] arch-riscv: Make vset*vl* instructions serialize

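Mark the vset*vl* instructions as IsSerializeAfter and IsNonSpeculative
(replacing IsDirectControl and IsCondControl) so that, with the current
implementation, they serialize the pipeline and are not executed
speculatively.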

Change-Id: Ibf93b60133fb3340690b126db12827e36e2c202d
---
 src/arch/riscv/isa/decoder.isa | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa
index 2b46752ffe..d34adfaa02 100644
--- a/src/arch/riscv/isa/decoder.isa
+++ b/src/arch/riscv/isa/decoder.isa
@@ -4344,7 +4344,7 @@ decode QUADRANT default Unknown::unknown() {
                         uint64_t requested_vtype = zimm11;
 
                         Rd_ud = 0;
-                    }}, VectorConfigOp, IsDirectControl, IsCondControl);
+                    }}, VectorConfigOp, IsSerializeAfter, IsNonSpeculative);
                     0x1: decode BIT30 {
                         0x0: vsetvl({{
                             uint64_t rd_bits = RD;
@@ -4353,7 +4353,8 @@ decode QUADRANT default Unknown::unknown() {
                             uint64_t requested_vtype = Rs2_ud;
 
                             Rd_ud = 0;
-                        }}, VectorConfigOp, IsDirectControl, IsCondControl);
+                        }}, VectorConfigOp, IsSerializeAfter,
+                        IsNonSpeculative);
                         0x1: vsetivli({{
                             uint64_t rd_bits = RD;
                             uint64_t rs1_bits = -1;
@@ -4361,7 +4362,8 @@ decode QUADRANT default Unknown::unknown() {
                             uint64_t requested_vtype = zimm10;
 
                             Rd_ud = 0;
-                        }}, VectorConfigOp, IsDirectControl, IsCondControl);
+                        }}, VectorConfigOp, IsSerializeAfter,
+                        IsNonSpeculative);
                     }
                 }
             }