diff --git a/.github/workflows/daily-tests.yaml b/.github/workflows/daily-tests.yaml
index 60203f1017..67024e6ee5 100644
--- a/.github/workflows/daily-tests.yaml
+++ b/.github/workflows/daily-tests.yaml
@@ -137,7 +137,7 @@ jobs:
     # run test
     - name: long ${{ matrix.test-type }} tests
       working-directory: ${{ github.workspace }}/tests
-      run: ./main.py run gem5/${{ matrix.image }} --length=long --skip-build -vv -t $(nproc)
+      run: ./main.py run gem5/${{ matrix.test-type }} --length=long --skip-build -vv -t $(nproc)
     - name: create zip of results
       if: success() || failure()
       run: |
@@ -147,7 +147,7 @@ jobs:
       if: success() || failure()
       uses: actions/upload-artifact@v3
       env:
-        MY_STEP_VAR: ${{github.job}}_COMMIT.${{github.sha}}_RUN.${{github.run_id}}_ATTEMPT.${{github.run_attempt}}
+        MY_STEP_VAR: ${{ matrix.test-type }}_COMMIT.${{github.sha}}_RUN.${{github.run_id}}_ATTEMPT.${{github.run_attempt}}
       with:
         name: ${{ env.MY_STEP_VAR }}
         path: output.zip
@@ -168,7 +168,7 @@ jobs:
       matrix:
         test-type: [gem5-library-example-x86-ubuntu-run-ALL-x86_64-opt, gem5-library-example-riscv-ubuntu-run-ALL-x86_64-opt, lupv-example-ALL-x86_64-opt, gem5-library-example-arm-ubuntu-run-test-ALL-x86_64-opt, gem5-library-example-riscvmatched-hello-ALL-x86_64-opt]
     container: gcr.io/gem5-test/ubuntu-22.04_all-dependencies:latest
-    needs: build-gem5
+    needs: [name-artifacts, build-gem5]
     timeout-minutes: 1440 # 24 hours
     steps:
     - uses: actions/checkout@v3
@@ -193,7 +193,7 @@ jobs:
       if: success() || failure()
       uses: actions/upload-artifact@v3
       env:
-        MY_STEP_VAR: ${{github.job}}_COMMIT.${{github.sha}}_RUN.${{github.run_id}}_ATTEMPT.${{github.run_attempt}}
+        MY_STEP_VAR: ${{ matrix.test-type }}_COMMIT.${{github.sha}}_RUN.${{github.run_id}}_ATTEMPT.${{github.run_attempt}}
       with:
         name: ${{ env.MY_STEP_VAR }}
         path: output.zip
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b41a73a4a6..208c9444e1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,425 +1,390 @@
-If you've made changes to gem5 that might benefit others, we strongly encourage
-you to contribute those changes to the public gem5 repository. There are
-several reasons to do this:
- * Share your work with others, so that they can benefit from new functionality.
- * Support the scientific principle by enabling others to evaluate your
-   suggestions without having to guess what you did.
- * Once your changes are part of the main repo, you no longer have to merge
-   them back in every time you update your local repo. This can be a huge time
-   saving!
- * Once your code is in the main repo, other people have to make their changes
-   work with your code, and not the other way around.
- * Others may build on your contributions to make them even better, or extend
-   them in ways you did not have time to do.
- * You will have the satisfaction of contributing back to the community.
+This document serves as a guide to contributing to gem5.
+The following subsections outline, in order, the steps involved in contributing
+to the gem5 project.
 
-The main method for contributing code to gem5 is via our code review website:
-https://github.com/gem5/gem5/pulls/. This documents describes the details of
-how to create code changes, upload your changes, have your changes
-reviewed, and finally push your changes to gem5. More information can be found
-from the following sources:
- * http://gem5.org/contributing
- * https://docs.github.com/en/pull-requests
- * https://git-scm.com/book
+## Determining what you can contribute
 
+The easiest way to see how you can contribute to gem5 is to check our Jira
+issue tracker: <https://gem5.atlassian.net> or GitHub issue tracker:
+<https://github.com/gem5/gem5/issues>.
 
-High-level flow for submitting changes
-======================================
+Browse these open issues and see if there are any which you are capable of
+handling. When you find a task you are happy to carry out, verify no one else
+is presently assigned, then leave a comment asking if you may assign yourself
+this task. Though not mandatory, we
+advise first-time contributors do this so developers more familiar with the
+task may give advice on how best to implement the necessary changes.
 
-    +-------------+
-    | Make change |
-    +------+------+
-           |
-           |
-           v
-    +-------------+
-    |  Run tests  |<--------------+
-    +------+------+               |
-           |                      |
-           |                      |
-           v                      |
-    +------+------+               |
-    | Post review |               |
-    +------+------+               |
-           |                      |
-           v                      |
-    +--------+---------+          |
-    | Wait for reviews |          |
-    +--------+---------+          |
-           |                      |
-           |                      |
-           v                      |
-      +----+----+   No     +------+------+
-      |Reviewers+--------->+ Update code |
-      |happy?   |          +------+------+
-      +----+----+                 ^
-           |                      |
-           | Yes                  |
-           v                      |
-      +----+-----+   No           |
-      |Maintainer+----------------+
-      |happy?    |
-      +----+-----+
-           |
-           | Yes
-           v
-    +------+------+
-    | Submit code |
-    +-------------+
+Once a developer has replied to your comment (and given any advice they may
+have), you may officially assign yourself the task. This helps the gem5
+development community understand which parts of the project are presently being
+worked on.
 
-After creating your change to gem5, you can post a review to git
-via a pull request at: https://github.com/gem5/gem5/pulls/. Before being able to
-submit your code to the mainline of gem5, the code is reviewed by others in the
-community. Additionally, the maintainer for that part of the code must sign off
-on it.
+**If, for whatever reason, you stop working on a task, please unassign
+yourself from the task.**
 
-Cloning the gem5 repo to contribute
-===================================
+## Obtaining the git repo
 
-If you plan on contributing, it is strongly encouraged for you to clone the
-repository directly, and checkout the `develop` branch from our git instance
-at https://github.com/gem5/gem5/.
+The gem5 git repository is hosted at <https://github.com/gem5/gem5>.
+**Please note: contributions made to other gem5 repos
+will not be considered. Please contribute to <https://github.com/gem5/gem5>
+exclusively.**
 
-To clone the gem5 repository:
+To pull the gem5 git repo:
 
-```
- git clone https://github.com/gem5/gem5/
+```sh
+git clone https://github.com/gem5/gem5
 ```
 
-By default, the stable branch is checked out. The stable branch contains the
-latest released version of gem5. To obtain code still under-development (and
-which contributions can be made):
+If you wish to use gem5 and never contribute, this is fine. However, to
+contribute, we use the [GitHub Pull-Request model](https://docs.github.com/en/pull-requests), and therefore recommend [Forking the gem5 repository](https://docs.github.com/en/get-started/quickstart/fork-a-repo) prior to contributing.
 
-```
-cd gem5
-git checkout --track origin/develop
+### Forking
+
+Please consult the [GitHub documentation on Forking a GitHub repository](https://docs.github.com/en/get-started/quickstart/fork-a-repo).
+As we will be working atop the `develop` branch, please ensure you Fork all the repository's branches, not just the `stable` branch.
+
+This will create your own forked version of the gem5 repo on your own GitHub account.
+You may then obtain it locally using:
+
+```sh
+git clone https://github.com/{your github account}/gem5
 ```
 
-Changes should be made to this develop branch. Changes to the stable branch
-will be blocked. Once a change on the develop branch is properly incorporated
-into the gem5 repo it will be merged into the stable branch upon the next
-release of gem5. New releases of gem5 occur three times a year. Ergo, changes
-made to the develop branch should appear on the stable branch within three to
-four months as part of a stable release.
+### stable / develop branch
 
-Other gem5 repositories
------------------------
+When cloned, the git repo will have the `stable` branch checked-out by default. The
+`stable` branch is the gem5 stable release branch. I.e., the HEAD
+of this branch contains the latest stable release of gem5. (Execute `git tag`
+on the `stable` branch to see the list of stable releases. A particular
+release may be checked out by executing `git checkout <release>`.) As the
+`stable` branch only contains officially released gem5 code, **contributors
+should not develop changes on top of the `stable` branch**; they should instead
+**develop changes on top of the `develop` branch**.
 
-There are a few repositories other than the main gem5 development repository.
+To switch to the `develop` branch:
 
- * public/m5threads: The code for a pthreads implementation that works with
-   gem5's syscall emulation mode.
- * public/gem5-resources: Resources to enable computer architecture research
-   with gem5. See the README.md file in the gem5-resources repository for more
-   information.
- * public/gem5-website: The gem5.org website source. See the README.md file in
-   the gem5-website repository for more information.
+```sh
+git switch develop
+```
 
-Making changes to gem5
-======================
+The `develop` branch is merged into the `stable` branch upon a gem5 release.
+Therefore, any changes you make exist on the develop branch until the next release.
 
-It is strongly encouraged to use git branches when making changes to gem5.
-Additionally, keeping changes small and concise and only have a single logical
-change per commit.
+We strongly recommend creating your own local branches to do changes.
+The flow of development works best if `develop` and `stable` are not modified directly.
+This helps keep your changes organized across different branches in your forked repository.
+The following example will create a new branch, from `develop`, called `new-feature`:
 
-Unlike our previous flow with Mercurial and patch queues, when using git, you
-will be committing changes to your local branch. By using separate branches in
-git, you will be able to pull in and merge changes from mainline and simply
-keep up with upstream changes.
+```sh
+git switch -c new-feature
+```
 
-We use a rebase-always model for contributions to the develop branch of gem5.
-In this model, the changes are rebased on top of the tip of develop instead of
-merged. This means that to contribute, you will have to frequently rebase any
-feature branches on top of develop. If you see a "merge conflict" in gerrit, it
-can often be solved with a simple rebase. To find out more information about
-rebasing and git, see the [git book].
+## Making modifications
 
-[git book]: https://git-scm.com/book/en/v2/Git-Branching-Rebasing
+### C/CPP
 
+Different tasks will require the project to be modified in different ways.
+Though, in all cases, our style-guide must be adhered to. The full C/C++ style
+guide is outlined [here](https://www.gem5.org/documentation/general_docs/development/coding_style).
 
-Setting up pre-commit
----------------------
+As a high-level overview:
 
-To help ensure the gem5 style guide is maintained, we use [pre-commit](
-https://pre-commit.com) to run checks on changes to be contributed.
+* Lines must not exceed 79 characters in length.
+* There should be no trailing white-space on any line.
+* Indentations must be 4 spaces (no tab characters).
+* Class names must use upper camel case (e.g., `ThisIsAClass`).
+* Class member variables must use lower camel case (e.g.,
+`thisIsAMemberVariable`).
+* Class member variables with their own public accessor must start with an
+underscore (e.g., `_variableWithAccessor`).
+* Local variables must use snake case (e.g., `this_is_a_local_variable`).
+* Functions must use lower camel case (e.g., `thisIsAFunction`).
+* Function parameters must use snake case.
+* Macros must be in all caps with underscores (e.g., `THIS_IS_A_MACRO`).
+* Function declaration return types must be on their own line.
+* Function brackets must be on their own line.
+* `for`/`if`/`while` branching operations must be followed by a white-space
+before the conditional statement (e.g., `for (...)`).
+* `for`/`if`/`while` branching operations' opening bracket must be on the
+same line, with the closing bracket on its own line (e.g.,
+`for (...) {\n ... \n}\n`). There should be a space between the condition(s)
+and the opening bracket.
+* C++ access modifiers must be indented by two spaces, with methods/variables
+defined within indented by four spaces.
 
-To setup pre-commit, run the following in your gem5 directory to install the
-pre-commit and commit message hooks.
+Below is a simple toy example of how a class should be formatted:
+
+```C++
+#define EXAMPLE_MACRO 7
+class ExampleClass
+{
+  private:
+    int _fooBar;
+    int barFoo;
+
+  public:
+    int
+    getFooBar()
+    {
+        return _fooBar;
+    }
+
+    int
+    aFunction(int parameter_one, int parameter_two)
+    {
+        int local_variable = 0;
+        if (true) {
+            int local_variable = parameter_one + parameter_two + barFoo
+                               + EXAMPLE_MACRO;
+        }
+        return local_variable;
+    }
+
+};
+```
+
+### Python
+
+We use [Python Black](https://github.com/psf/black) to format our Python code
+to the correct style. To install:
+
+```sh
+pip install black
+```
+
+Then run on modified/added python files using:
+
+```sh
+black <files/directories>
+```
+
+For variable/method/etc. naming conventions, please follow the [PEP 8 naming
+convention recommendations](
+https://peps.python.org/pep-0008/#naming-conventions). While we try our best to
+enforce naming conventions across the gem5 project, we are aware there are
+instances where they are not. In such cases please **follow the convention
+of the code you are modifying**.
+
+### Using pre-commit
+
+To help enforce our style guide we use [pre-commit](
+https://pre-commit.com). pre-commit is a git hook and, as such, must be
+explicitly installed by a gem5 developer.
+
+To install the gem5 pre-commit checks, execute the following in the gem5
+directory:
 
 ```sh
 pip install pre-commit
-pre-commit install -t pre-commit -t commit-msg
+pre-commit install
 ```
 
-The hooks are also automatically installed when gem5 is compiled.
+Once installed, pre-commit will run checks on modified code whenever you run the
+`git commit` command (see [our section on committing](#committing) for more
+details on committing your changes). If these checks fail you will not be able to
+commit.
 
-When you run a `git commit` command the pre-commit hook will run checks on your
-committed code. The commit will be blocked if a check fails.
+These same pre-commit checks are run as part of our CI checks (those
+which must pass in order for a change to be merged into the develop branch). It
+is therefore strongly recommended that developers install pre-commit to catch
+style errors early.
 
-The same checks are run as part of github actions CI tests (those required to obtain
-a Verified label, necessary for a change to be accepted to the develop branch).
-Therefore setting up pre-commit in your local gem5 development environment is
-recommended.
+## Compiling and running tests
 
-You can automatically format files to pass the pre-commit tests by running:
+The minimum criteria for a change to be submitted is that the code is
+compilable and the test cases pass.
+
+The following command both compiles the project and runs our "quick"
+system-level checks:
 
 ```sh
-pre-commit run --files <files to format>
+cd tests
+./main.py run
 ```
 
-Requirements for change descriptions
-------------------------------------
-To help reviewers and future contributors more easily understand and track
-changes, we require all change descriptions be strictly formatted.
+**Note: These tests can take several hours to build and execute. `main.py` may
+be run on multiple threads with the `-j` flag. E.g.: `python main.py run
+-j6`.**
 
-A canonical commit message consists of three parts:
- * A short summary line describing the change. This line starts with one or
-   more keywords (found in the MAINTAINERS file) separated by commas followed
-   by a colon and a description of the change. This short description is
-   written in the imperative mood, and should say what happens when the patch
-   is applied. Keep it short and simple. Write it in sentence case preferably
-   not ending in a period. This line should be no more than 65 characters long
-   since version control systems usually add a prefix that causes line-wrapping
-   for longer lines.
- * (Optional, but highly recommended) A detailed description. This describes
-   what you have done and why. If the change isn't obvious, you might want to
-   motivate why it is needed. Lines need to be wrapped to 72 characters or
-   less. Leave a blank line between the first short summary line and this
-   detailed description.
- * Tags describing patch metadata. You are highly recommended to use
-   tags to acknowledge reviewers for their work.
+The unit tests should also pass. To run the unit tests:
 
-Tags are an optional mechanism to store additional metadata about a patch and
-acknowledge people who reported a bug or reviewed that patch. Tags are
-generally appended to the end of the commit message in the order they happen.
-We currently use the following tags:
- * Signed-off-by: Added by the author and the submitter (if different).
-   This tag is a statement saying that you believe the patch to be correct and
-   have the right to submit the patch according to the license in the affected
-   files. Similarly, if you commit someone else's patch, this tells the rest
-   of the world that you have have the right to forward it to the main
-   repository. If you need to make any changes at all to submit the change,
-   these should be described within hard brackets just before your
-   Signed-off-by tag. By adding this line, the contributor certifies the
-   contribution is made under the terms of the Developer Certificate of Origin
-   (DCO) [https://developercertificate.org/].
- * Reviewed-by: Used to acknowledge patch reviewers. It's generally considered
-   good form to add these. Added automatically.
- * Reported-by: Used to acknowledge someone for finding and reporting a bug.
- * Reviewed-on: Link to the review request corresponding to this patch. Added
-   automatically.
- * Change-Id: Used by Gerrit to track changes across rebases. Added
-   automatically with a commit hook by git.
- * Tested-by: Used to acknowledge people who tested a patch. Sometimes added
-   automatically by review systems that integrate with CI systems.
- * Issue-On: Used to link a commit to an issue in gem5's [issue tracker]. The
-   format should be https://gem5.atlassian.net/browse/GEM5-<NUMBER>
-
-[issue tracker]: https://gem5.atlassian.net/
-
-Other than the "Signed-off-by", "Issue-On", "Reported-by", and "Tested-by"
-tags, you generally don't need to add these manually as they are added
-automatically by Gerrit.
-
-It is encouraged for the author of the patch and the submitter to add a
-Signed-off-by tag to the commit message. By adding this line, the contributor
-certifies the contribution is made under the terms of the Developer Certificate
-of Origin (DCO) [https://developercertificate.org/].
-
-If your change relates to a [Jira Issue](https://gem5.atlassian.net), it is
-advised that you provide a link to the issue in the commit message (or messages
-if the Jira Issue relates to multiple commits). Though optional, doing this
-can help reviewers understand the context of a change.
-
-It is imperative that you use your real name and your real email address in
-both tags and in the author field of the changeset.
-
-For significant changes, authors are encouraged to add copyright information
-and their names at the beginning of the file. The main purpose of the author
-names on the file is to track who is most knowledgeable about the file (e.g.,
-who has contributed a significant amount of code to the file). The
-`util/update-copyright.py` helper script can help to keep your copyright dates
-up-to-date when you make further changes to files which already have your
-copyright but with older dates.
-
-Note: If you do not follow these guidelines, the github actions will
-automatically reject your patch.
-If this happens, update your changeset descriptions to match the required style
-and resubmit. The following is a useful git command to update the most recent
-commit (HEAD).
-
-```
- git commit --amend
+```sh
+scons build/NULL/unittests.opt
 ```
 
-Running tests
-=============
+To compile an individual gem5 binary:
 
-Before posting a change to the code review site, you should always run the
-quick tests!
-See TESTING.md for more information.
-
-Posting a review
-================
-
-If you have not signed up for an account on the github
-(https://github.com/), you first have to create an account.
-
-Setting up an account
----------------------
- 1. Go to https://github.com/
- 2. Click "Sign up" in the upper right corner.
-
-Submitting a change
--------------------
-
-In github, to submit a review request, you can simply push your git commits to
-a special named branch. For more information on git push see
-https://git-scm.com/docs/git-push.
-
-Push changes to GitHub
-----------------------------
-1. Fork the gem5 repository on GitHub from https://github.com/gem5/gem5/.
-2. Create a new branch in your forked repository for your feature or bug fix.
-3. Commit your changes to the new branch.
-4. Push the branch to your forked repository.
-5. Open a pull request from your branch in your forked repository to the main gem5 repository.
-
-We will continue to use the “develop” branch for development, so please ensure your pull requests are for the gem5 develop branch. Pull requests to the stable branch will be blocked.
-
-Branches
-========
-
-By default, contributions to gem5 should be made on the develop branch. The
-stable branch is maintained as a stable release branch (i.e., it can be pulled
-to obtain the latest official release of gem5). Creation of additional branches
-is generally discouraged due to their tendency to bloat git repositories with
-abandoned code.
-
-Reviewing patches
-=================
-
-Reviewing patches is done on our github instance at
-https://github.com/gem5/gem5/pulls/.
-
-After logging in with your GitHub account, you will be able to comment, review,
-and push your own patches as well as review others' patches. All gem5 users are
-encouraged to review patches. The only requirement to review patches is to be
-polite and respectful of others.
-
-There are multiple labels in Gerrit that can be applied to each review detailed
-below.
- * Code-review: This is used by any gem5 user to review patches. When reviewing
-   a patch you can give it a score of -2 to +2 with the following semantics.
-   * -2: This blocks the patch. You believe that this patch should never be
-     committed. This label should be very rarely used.
-   * -1: You would prefer this is not merged as is
-   * 0: No score
-   * +1: This patch seems good, but you aren't 100% confident that it should be
-     pushed.
-   * +2: This is a good patch and should be pushed as is.
- * Maintainer: Currently only PMC members are maintainers. At least one
-   maintainer must review your patch and give it a +1 before it can be merged.
- * Verified: This is automatically generated from the continuous integrated
-   (CI) tests. Each patch must receive at least a +1 from the CI tests before
-   the patch can be merged. The patch will receive a +1 if gem5 builds and
-   runs, and it will receive a +2 if the stats match.
- * Style-Check: This is automatically generated and tests the patch against the
-   gem5 code style
-   (http://www.gem5.org/documentation/general_docs/development/coding_style/).
-   The patch must receive a +1 from the style checker to be pushed.
-
-Note: Whenever the patch creator updates the patch all reviewers must re-review
-the patch. There is no longer a "Fix it, then Ship It" option.
-
-Once you have received reviews for your patch, you will likely need to make
-changes. To do this, you should update the original git changeset. Then, you
-can simply push the changeset again to the same Gerrit branch to update the
-review request.
-
-```
- git push origin HEAD:refs/for/develop
+```sh
+scons build/ALL/gem5.opt
 ```
 
-Committing changes
-==================
+This compiles a gem5 binary containing "ALL" ISA targets. For more information
+on building gem5 please consult our [building documentation](
+https://www.gem5.org/documentation/general_docs/building).
 
-Each patch must meet the following criteria to be merged:
- * At least one review with +2
- * At least one maintainer with +1
- * At least +1 from the CI tests (gem5 must build and run)
- * At least +1 from the style checker
+## Committing
 
-Once a patch meets the above criteria, the submitter of the patch will be able
-to merge the patch by pressing the "Submit" button on Gerrit. When the patch is
-submitted, it is merged into the public gem5 branch.
+When you feel your change is done, you may commit. Start by adding the changed
+files:
 
-Review moderation and guidelines
---------------------------------
+```Shell
+git add <changed files>
+```
 
-Once a change is submitted, reviewers shall review the change. This may require
-several iterations before a merge. Comments from reviewers may include
-questions, and requests for alterations to the change prior to merging. The
-overarching philosophy in managing this process is that there should be
-politeness and clear communication between all parties at all times, and,
-whenever possible, permission should be asked before doing anything that may
-inconvenience another party. Included below are some guidelines we expect
-contributors and reviewers to follow.
+Make sure these changes are being added to your forked repository.
+Then commit using:
 
- * In all forms of communication, contributors and reviewers must be polite.
-   Comments seen as being needlessly hostile or dismissive will not be
-   tolerated.
- * Change contributors should respond to, or act upon, each item of feedback
-   given by reviewers. If there is disagreement with a piece of
-   feedback, a sufficiently detailed reason for this disagreement should
-   be given. Polite discussion, and sharing of information and expertise
-   is strongly encouraged.
- * Contributors are advised to assign reviewers when submitting a change.
-   Anyone who contributes to gem5 can be assigned as a reviewer. However,
-   all changes must be accepted by at least one maintainer prior to a
-   merge, ergo assigning of at least one maintainer as a reviewer is
-   strongly recommended. Please see MAINTAINERS for a breakdown of
-   gem5 maintainers and which components they claim responsibility for.
-   Maintainers should be chosen based on which components the change is
-   targeting. Assigning of reviewers is not strictly enforced, though not
-   assigning reviewers may slow the time in which a change is reviewed.
- * If a contributor posts a change and does not receive any reviews after two
-   working days (excluding regional holidays), it is acceptable to "prod"
-   reviewers. This can be done by adding a reply to the changeset review
-   (e.g., "Would it be possible for someone to review my change?"). If the
-   contributor has yet to assign reviewers, they are strongly advised to do so.
-   Reviewers will get notified when assigned to referee a change.
- * By default, the original contributor is assumed to own a change. I.e.,
-   they are assumed to be the sole party to submit patchsets. If someone
-   other than the original contributor wishes to submit patchsets to a
-   change on the original contributor's behalf, they should first ask
-   permission. If two working days pass without a response, a patchset may be
-   submitted without permission. Permission does not need to be asked to submit
-   a patchset consisting of minor, inoffensive, changes such a typo and format
-   fixes.
- * Once a change is ready to merge, it enters a "Ready to Submit" state. The
-   original contributor should  merge their change at this point, assuming they
-   are content with the commit in its present form. After two working days, a
-   reviewer may message a contributor to remind them of the change being in a
-   "Ready to Submit" state and ask if they can merge the change on the
-   contributors behalf. If a further two working days elapse without a
-   response, the reviewer may merge without permission. A contributor may keep
-   a change open for whatever reason though this should be communicated to the
-   reviewer when asked.
- * After a month of inactivity from a contributor on an active change, a
-   reviewer may post a message on the change reminding the submitter, and
-   anyone else watching the change, of its active status and ask if they are
-   still interested in eventually merging the change. After two weeks of no
-   response the reviewer reserves the right to abandon the change under the
-   assumption there is no longer interest.
- * The final arbiter in any dispute between reviewers and/or contributors
-   is the PMC (PMC members are highlighted in MAINTAINERS). Disputes requiring
-   intervention by the PMC are undesirable. Attempts should be made to resolve
-   disagreements via respectful and polite discourse before being escalated to
-   this level.
+```Shell
+git commit
+```
 
-Releases
-========
+The commit message must adhere to our style. The first line of the commit is
+the "header". The header starts with a tag (or tags, separated by a comma),
+then a colon. Which tags are used depend on which components of gem5
+you have modified. **Please refer to the [MAINTAINERS.yaml](
+https://github.com/gem5/gem5/blob/stable/MAINTAINERS.yaml) for
+a comprehensive list of accepted tags**. After this colon a short description
+of the commit must be provided. **This header line must not exceed 65
+characters**.
+
+After this, a more detailed description of the commit can be added. This is
+inserted below the header, separated by an empty line. Including a description
+is optional but it's strongly recommended. The description may span multiple
+lines, and multiple paragraphs. **No line in the description may exceed 72
+characters.**
+
+To improve the navigability of the gem5 project we would appreciate if commit
+messages include a link to the relevant Jira issue/issues.
+
+Below is an example of how a gem5 commit message should be formatted:
+
+```
+test,base: This commit tests some classes in the base component
+
+This is a more detailed description of the commit. This can be as long
+as is necessary to adequately describe the change.
+
+A description may span multiple paragraphs if desired.
+
+Jira Issue: https://gem5.atlassian.net/browse/GEM5-186
+```
+
+If you feel the need to change your commit, add the necessary files then
+_amend_ the changes to the commit using:
+
+```sh
+git commit --amend
+```
+
+This will give you the opportunity to edit the commit message.
+
+You may continue to add more commits as a chain of commits to be included in the pull-request.
+However, we recommend that pull-requests are kept small and focused.
+For example, if you wish to add a different feature or fix a different bug, we recommend doing so in another pull request.
+
+## Keeping your forked and local repositories up-to-date
+
+While working on your contribution, we recommend keeping your forked repository in-sync with the source gem5 repository.
+To do so, regularly [Sync your fork](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork).
+This can be done via the GitHub web interface and, if so, you should `git pull` on top of your local `stable` and `develop` branches to ensure your local repository is in-sync.
+To do so from the command line:
+
+```sh
+# Add the main gem5 repository as a remote on your local repository. This only
+# needs to be done once.
+git remote add upstream https://github.com/gem5/gem5.git
+
+git fetch upstream # Obtain the latest from the gem5 repo.
+git switch develop # Switch to the develop branch.
+git merge upstream/develop # Merge the latest changes into the develop branch.
+git push # Push the changes to develop to your forked repo.
+git switch stable # Switch to the stable branch.
+git merge upstream/stable # Merge the latest changes into the stable branch.
+git push # Push the changes to stable to your forked repo.
+```
+
+As our local branch works atop the `develop` branch, once we've synced our forked repository, we can rebase our local branch on top of the `develop` branch.
+Assuming our local branch is called `new-feature`:
+
+```sh
+git switch develop # Switching back to the develop branch.
+git pull # Ensuring we have the latest from the forked repository.
+git switch new-feature # Switching back to our local branch.
+git rebase develop # Rebasing our local branch on top of the develop branch.
+```
+
+Conflicts between your branch and the new changes may need to be resolved.
+
+## Pushing and creating a pull request
+
+Once you have completed your changes locally, you can push to your forked gem5 repository.
+Assuming the branch we are working on is `new-feature`:
+
+```sh
+git switch new-feature # Ensure we are on the 'new-feature' branch.
+git push --set-upstream origin new-feature
+```
+
+Now, via the GitHub web interface, you can [create a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) of your changes from your forked repository's branch into the gem5 `develop` branch.
+
+## Passing the checks
+
+Once you have created a pull request, the gem5 Continuous Integration (CI) tests will run.
+These run a series of checks to ensure your changes are valid.
+These must pass before your changes can be merged into the gem5 `develop` branch.
+
+In addition to the CI tests, your changes will be reviewed by the gem5 community.
+Your pull-request must have the approval of at least one community member prior to being merged.
+
+Once your pull-request has passed all the CI tests and has been approved by at least one community member, a gem5 maintainer will do a [Merge](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/incorporating-changes-from-a-pull-request/about-pull-request-merges) on the pull-request.
+The gem5 maintainers are individuals granted the ability to merge pull requests into the gem5 `develop` branch.
+
+
+### Making iterative improvements based on feedback
+
+A reviewer will ask questions and post suggestions on GitHub. You should read
+these comments and answer these questions. **All communications between
+reviewers and contributors should be done in a polite manner. Rude and/or
+dismissive remarks will not be tolerated.**
+
+When you understand what changes are required make amendments to the pull
+request by adding patches to the same branch and then pushing to the forked repository.
+A git "force push" (i.e., `git push --force`) is also acceptable if you wish to alter the commits locally in order to make the changes.
+We encourage contributors to help keep our `git log` clean and readable.
+We recommend that users rebase their changes frequently on top of the develop branch, squash their commits where appropriate (e.g., in cases where there are many small fix commits to a change in the same PR) then force push changes to keep their PR commits concise.
+
+Once pushed to the forked repository, the pull request will automatically update with your changes.
+The reviewer will then re-review your changes and, if necessary, ask for further changes, or approve your pull-request.
+
+## Reviewing other contributions
+
+We encourage all gem5 developers to review others' contributions.
+Anyone may review a gem5 change and, if they feel it is ready, approve it.
+All pull-requests can be found at <https://github.com/gem5/gem5/pulls>.
+
+When reviewing a pull request we enforce the following guidelines.
+These have been designed to ensure clear and polite communication between all parties:
+
+* In all forms of communication, contributors and reviewers must be polite.
+Comments seen as being rude or dismissive will not be tolerated.
+* If choosing to not approve a PR, please state clearly why.
+When asking for changes, the comments should be specific and actionable.
+General criticisms which cannot be addressed or understood by the contributor are unhelpful.
+If the contribution needs improvement, reviewers should state what their requested changes are.
+If more information is needed for the reviewers to make a decision the reviewer should ask clear questions.
+If the PR is generally not seen as a worthwhile contribution, a good justification should be given so the contributor may fairly rebut it.
+* By default, the original contributor is assumed to own a change.
+I.e., they are assumed to be the sole party to submit patches to the pull request.
+If someone other than the original contributor wishes to submit patches on the original contributor's behalf they should first ask permission.
+Pull requests which appear abandoned may be adopted by a new contributor as long as there is good enough reason to assume the original contributor is no longer working on the pull request.
+* Maintainers have the final say on whether a change is merged.
+Your review will be taken into account by the maintainer.
+It is expected, in all but the most extreme cases, that the reviewer's concerns are addressed and that the reviewer approves the contribution prior to the maintainer merging the pull request.
+
+We also recommend consulting Google's ["How to write code review comments"](https://google.github.io/eng-practices/review/reviewer/comments.html) for advice on giving feedback to contributors.
+
+## Releases
 
 gem5 releases occur 3 times per year. The procedure for releasing gem5 is as
 follows:
@@ -435,7 +400,7 @@ gem5-dev mailing list will be notified that the staging branch will be merged
 into the stable branch after two weeks, thus marking the new release.
 3. The staging branch will have the full suite of gem5 tests run on it to
 ensure all tests pass and the to-be-released code is in a decent state.
-4. If a user submits a changeset to the staging branch, it will be considered
+4. If a user submits a pull request to the staging branch, it will be considered
 and undergo the standard github review process. However, only alterations that
 cannot wait until the following release will be accepted for submission into
 the branch (i.e., submissions to the staging branch for "last minute"
@@ -444,8 +409,8 @@ fix). The project maintainers will use their discretion in deciding whether a
 change may be submitted directly to the staging branch. All other submissions
 to gem5 will continue to be made to the develop branch. Patches submitted
 into the staging branch do not need to be re-added to the develop branch.
-5. Once signed off by members of the PMC the staging branch shall be merged
-into the stable and develop branch. The staging branch will then be deleted.
+5. Once the staging branch has been deemed ready for release, the [release procedures](https://www.gem5.org/documentation/general_docs/development/release_procedures/) will be carried out.
+This will end with the staging branch being merged into the stable branch.
 6. The stable branch shall be tagged with the correct version number for that
 release. gem5 conforms to a "v{YY}.{MAJOR}.{MINOR}.{HOTFIX}" versioning system.
 E.g., the first major release of 2022 will be "v22.0.0.0", followed by
@@ -455,8 +420,16 @@ the minor release numbers in case this policy changes in the future.
 7. The gem5-dev and gem5-user mailing lists shall be notified of the new gem5
 release.
 
-Hotfixes
---------
+### Exemptions
+
+Due to limitations with GitHub we may update the ".github" directory in the gem5 repo's `stable` branch between gem5 releases.
+This is due to certain processes carried out by the GitHub Actions infrastructure which rely on configurations being present on a repository's primary branch.
+As the files in ".github" only influence the functionality of our GitHub actions and other GitHub activities, updating these files does not change the functionality of gem5 in any way.
+It is therefore safe to do this.
+Despite this exemption to our normal procedure we aim to ensure that **the ".github" directory on the `stable` branch is never "ahead" of that in the `develop` branch**.
+Therefore contributors who wish to update files in ".github" should submit their changes to `develop` and then request their changes to be applied to the `stable` branch.
+
+### Hotfixes
 
 There may be circumstances in which a change to gem5 is deemed critical and
 cannot wait for an official release (e.g., a high-priority bug fix). In these
diff --git a/configs/example/gem5_library/checkpoints/riscv-hello-restore-checkpoint.py b/configs/example/gem5_library/checkpoints/riscv-hello-restore-checkpoint.py
index 9f9bf839a6..eed76e2448 100644
--- a/configs/example/gem5_library/checkpoints/riscv-hello-restore-checkpoint.py
+++ b/configs/example/gem5_library/checkpoints/riscv-hello-restore-checkpoint.py
@@ -90,7 +90,7 @@ board = SimpleBoard(
 board.set_se_binary_workload(
     # the workload should be the same as the save-checkpoint script
     obtain_resource("riscv-hello"),
-    checkpoint=obtain_resource("riscv-hello-example-checkpoint-v23"),
+    checkpoint=obtain_resource("riscv-hello-example-checkpoint"),
 )
 
 simulator = Simulator(
diff --git a/configs/example/gpufs/runfs.py b/configs/example/gpufs/runfs.py
index 5346622155..0f090e2f89 100644
--- a/configs/example/gpufs/runfs.py
+++ b/configs/example/gpufs/runfs.py
@@ -179,10 +179,15 @@ def runGpuFSSystem(args):
         math.ceil(float(n_cu) / args.cu_per_scalar_cache)
     )
 
-    # Verify MMIO trace is valid
-    mmio_md5 = hashlib.md5(open(args.gpu_mmio_trace, "rb").read()).hexdigest()
-    if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d":
-        m5.util.panic("MMIO file does not match gem5 resources")
+    # Verify MMIO trace is valid. This is only needed for Vega10 simulations.
+    # The md5sum refers to the md5sum of the Vega10 MMIO hardware trace in
+    # the gem5-resources repository. By checking it here, we avoid potential
+    # errors that would cause the driver not to load and simulations to fail.
+    if args.gpu_device == "Vega10":
+        with open(args.gpu_mmio_trace, "rb") as mmio_file:
+            mmio_md5 = hashlib.md5(mmio_file.read()).hexdigest()
+        if mmio_md5 != "c4ff3326ae8a036e329b8b595c83bd6d":
+            m5.util.panic("MMIO file does not match gem5 resources")
 
     system = makeGpuFSSystem(args)
 
diff --git a/configs/example/gpufs/system/system.py b/configs/example/gpufs/system/system.py
index 19df310295..7ddc4f0752 100644
--- a/configs/example/gpufs/system/system.py
+++ b/configs/example/gpufs/system/system.py
@@ -231,6 +231,42 @@ def makeGpuFSSystem(args):
         clock=args.ruby_clock, voltage_domain=system.voltage_domain
     )
 
+    # If we are using KVM cpu, enable AVX. AVX is used in some ROCm libraries
+    # such as rocBLAS which is used in higher level libraries like PyTorch.
+    use_avx = False
+    if ObjectList.is_kvm_cpu(TestCPUClass):
+        # AVX also requires CR4.osxsave to be 1. These must be set together
+        # or KVM will error out.
+        system.workload.enable_osxsave = 1
+        use_avx = True
+
+    # These values are taken from a real CPU and are further explained here:
+    # https://sandpile.org/x86/cpuid.htm#level_0000_000Dh
+    avx_extended_state = [
+        0x00000007,
+        0x00000340,
+        0x00000000,
+        0x00000340,
+        0x0000000F,
+        0x00000340,
+        0x00000000,
+        0x00000000,
+        0x00000100,
+        0x00000240,
+        0x00000000,
+        0x00000040,
+        0x00000000,
+        0x00000000,
+        0x00000000,
+        0x00000000,
+    ]
+
+    # This modifies the default value for ECX only (4th in this array).
+    # See: https://sandpile.org/x86/cpuid.htm#level_0000_0001h
+    # Enables AVX, OSXSAVE, XSAVE, POPCNT, SSE4.2, SSE4.1, CMPXCHG16B,
+    # and FMA.
+    avx_cpu_features = [0x00020F51, 0x00000805, 0xEFDBFBFF, 0x1C983209]
+
     for (i, cpu) in enumerate(system.cpu):
         # Break once we reach the shader "CPU"
         if i == args.num_cpus:
@@ -247,6 +283,9 @@ def makeGpuFSSystem(args):
 
         for j in range(len(system.cpu[i].isa)):
             system.cpu[i].isa[j].vendor_string = "AuthenticAMD"
+            if use_avx:
+                system.cpu[i].isa[j].ExtendedState = avx_extended_state
+                system.cpu[i].isa[j].FamilyModelStepping = avx_cpu_features
 
     if args.host_parallel:
         # To get the KVM CPUs to run on different host CPUs, specify a
diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc
index 0d3f2dc00b..ab9c1cecf2 100644
--- a/src/arch/amdgpu/vega/insts/instructions.cc
+++ b/src/arch/amdgpu/vega/insts/instructions.cc
@@ -6394,7 +6394,7 @@ namespace VegaISA
             }
         };
 
-        vop2Helper<VecOperandU32>(gpuDynInst, opImpl);
+        vop2Helper<ConstVecOperandU32, VecOperandU32>(gpuDynInst, opImpl);
     } // execute
     // --- Inst_VOP2__V_MUL_HI_U32_U24 class methods ---
 
diff --git a/src/arch/amdgpu/vega/insts/op_encodings.hh b/src/arch/amdgpu/vega/insts/op_encodings.hh
index f1954723af..0f5f502add 100644
--- a/src/arch/amdgpu/vega/insts/op_encodings.hh
+++ b/src/arch/amdgpu/vega/insts/op_encodings.hh
@@ -339,7 +339,7 @@ namespace VegaISA
             return src0_dpp;
         }
 
-        template<typename T>
+        template<typename ConstT, typename T>
         void vop2Helper(GPUDynInstPtr gpuDynInst,
                         void (*fOpImpl)(T&, T&, T&, Wavefront*))
         {
@@ -359,7 +359,19 @@ namespace VegaISA
                 T src0_dpp = dppHelper(gpuDynInst, src1);
                 fOpImpl(src0_dpp, src1, vdst, wf);
             } else {
-                fOpImpl(src0, src1, vdst, wf);
+                // src0 is unmodified. We need to use the const container
+                // type to allow reading scalar operands from src0. Only
+                // src0 can index scalar operands. We copy this to vdst
+                // temporarily to pass to the lambda so the instruction
+                // does not need to write two lambda functions (one for
+                // a const src0 and one for a mutable src0).
+                ConstT const_src0(gpuDynInst, instData.SRC0);
+                const_src0.readSrc();
+
+                for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                    vdst[lane] = const_src0[lane];
+                }
+                fOpImpl(vdst, src1, vdst, wf);
             }
 
             vdst.write();
diff --git a/src/arch/amdgpu/vega/pagetable_walker.cc b/src/arch/amdgpu/vega/pagetable_walker.cc
index 96ac0fe179..6a71b14838 100644
--- a/src/arch/amdgpu/vega/pagetable_walker.cc
+++ b/src/arch/amdgpu/vega/pagetable_walker.cc
@@ -239,9 +239,22 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead,
     Addr part2 = 0;
     PageDirectoryEntry pde = static_cast<PageDirectoryEntry>(pte);
 
-    // For a four level page table block fragment size should not be needed.
-    // For now issue a panic to prevent strange behavior if it is non-zero.
-    panic_if(pde.blockFragmentSize, "PDE blockFragmentSize must be 0");
+    // Block fragment size can change the size of the pages pointed to while
+    // moving to the next PDE. A value of 0 implies native page size. A
+    // non-zero value implies the next leaf in the page table is a PTE unless
+    // the F bit is set. If we see a non-zero value, set it here and print
+    // for debugging.
+    if (pde.blockFragmentSize) {
+        DPRINTF(GPUPTWalker,
+                "blockFragmentSize: %d, pde: %#016lx, state: %d\n",
+                pde.blockFragmentSize, pde, state);
+        blockFragmentSize = pde.blockFragmentSize;
+
+        // At this time, only a value of 9 is used in the driver:
+        // https://github.com/torvalds/linux/blob/master/drivers/gpu/drm/
+        //     amd/amdgpu/gmc_v9_0.c#L1165
+        assert(pde.blockFragmentSize == 9);
+    }
 
     switch(state) {
       case PDE2:
@@ -287,7 +300,7 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead,
         nextState = PDE0;
         break;
       case PDE0:
-        if (pde.p) {
+        if (pde.p || (blockFragmentSize && !pte.f)) {
             DPRINTF(GPUPTWalker, "Treating PDE0 as PTE: %#016x frag: %d\n",
                     (uint64_t)pte, pte.fragment);
             entry.pte = pte;
@@ -299,7 +312,15 @@ Walker::WalkerState::walkStateMachine(PageTableEntry &pte, Addr &nextRead,
         }
         // Read the PteAddr
         part1 = ((((uint64_t)pte) >> 6) << 3);
-        part2 = offsetFunc(vaddr, 9, 0);
+        if (pte.f) {
+            // For F bit we want to use the blockFragmentSize in the previous
+            // PDE and the blockFragmentSize in this PTE for offset function.
+            part2 = offsetFunc(vaddr,
+                               blockFragmentSize,
+                               pde.blockFragmentSize);
+        } else {
+            part2 = offsetFunc(vaddr, 9, 0);
+        }
         nextRead = ((part1 + part2) << 3) & mask(48);
         DPRINTF(GPUPTWalker,
                 "Got PDE0 entry %#016x. write:%s->%#016x va:%#016x\n",
@@ -369,6 +390,7 @@ bool Walker::sendTiming(WalkerState* sending_walker, PacketPtr pkt)
         return true;
     } else {
         (void)pkt->popSenderState();
+        delete walker_state;
     }
 
     return false;
diff --git a/src/arch/amdgpu/vega/pagetable_walker.hh b/src/arch/amdgpu/vega/pagetable_walker.hh
index 2ad0748c14..232be5de70 100644
--- a/src/arch/amdgpu/vega/pagetable_walker.hh
+++ b/src/arch/amdgpu/vega/pagetable_walker.hh
@@ -99,11 +99,13 @@ class Walker : public ClockedObject
         bool started;
         bool timing;
         PacketPtr tlbPkt;
+        int blockFragmentSize;
 
       public:
         WalkerState(Walker *_walker, PacketPtr pkt, bool is_functional = false)
             : walker(_walker), state(Ready), nextState(Ready), dataSize(8),
-              enableNX(true), retrying(false), started(false), tlbPkt(pkt)
+              enableNX(true), retrying(false), started(false), tlbPkt(pkt),
+              blockFragmentSize(0)
         {
             DPRINTF(GPUPTWalker, "Walker::WalkerState %p %p %d\n",
                     this, walker, state);
diff --git a/src/arch/riscv/RiscvCPU.py b/src/arch/riscv/RiscvCPU.py
index 1c77045c67..449bf5e7af 100644
--- a/src/arch/riscv/RiscvCPU.py
+++ b/src/arch/riscv/RiscvCPU.py
@@ -41,6 +41,17 @@ class RiscvCPU:
     ArchISA = RiscvISA
 
 
+class RiscvISANoRVV(RiscvISA):
+    enable_rvv = False
+
+
+class RiscvCPUNoRVV:
+    ArchDecoder = RiscvDecoder
+    ArchMMU = RiscvMMU
+    ArchInterrupts = RiscvInterrupts
+    ArchISA = RiscvISANoRVV
+
+
 class RiscvAtomicSimpleCPU(BaseAtomicSimpleCPU, RiscvCPU):
     mmu = RiscvMMU()
 
@@ -53,9 +64,9 @@ class RiscvTimingSimpleCPU(BaseTimingSimpleCPU, RiscvCPU):
     mmu = RiscvMMU()
 
 
-class RiscvO3CPU(BaseO3CPU, RiscvCPU):
+class RiscvO3CPU(BaseO3CPU, RiscvCPUNoRVV):
     mmu = RiscvMMU()
 
 
-class RiscvMinorCPU(BaseMinorCPU, RiscvCPU):
+class RiscvMinorCPU(BaseMinorCPU, RiscvCPUNoRVV):
     mmu = RiscvMMU()
diff --git a/src/arch/riscv/RiscvDecoder.py b/src/arch/riscv/RiscvDecoder.py
index 30c1077662..4100a3c5b3 100644
--- a/src/arch/riscv/RiscvDecoder.py
+++ b/src/arch/riscv/RiscvDecoder.py
@@ -24,6 +24,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 from m5.objects.InstDecoder import InstDecoder
+from m5.params import *
 
 
 class RiscvDecoder(InstDecoder):
diff --git a/src/arch/riscv/RiscvISA.py b/src/arch/riscv/RiscvISA.py
index bb9a05babe..f66171a95a 100644
--- a/src/arch/riscv/RiscvISA.py
+++ b/src/arch/riscv/RiscvISA.py
@@ -56,3 +56,5 @@ class RiscvISA(BaseISA):
         True, "whether to check memory access alignment"
     )
     riscv_type = Param.RiscvType("RV64", "RV32 or RV64")
+
+    enable_rvv = Param.Bool(True, "Enable vector extension")
diff --git a/src/arch/riscv/decoder.cc b/src/arch/riscv/decoder.cc
index 7faa310b1e..702d84fd91 100644
--- a/src/arch/riscv/decoder.cc
+++ b/src/arch/riscv/decoder.cc
@@ -28,6 +28,7 @@
  */
 
 #include "arch/riscv/decoder.hh"
+#include "arch/riscv/isa.hh"
 #include "arch/riscv/types.hh"
 #include "base/bitfield.hh"
 #include "debug/Decode.hh"
@@ -38,10 +39,22 @@ namespace gem5
 namespace RiscvISA
 {
 
+// Cache whether the attached ISA enables the vector extension (RVV).
+Decoder::Decoder(const RiscvDecoderParams &p) : InstDecoder(p, &machInst)
+{
+    ISA *isa = dynamic_cast<ISA*>(p.isa);
+    // The cast yields nullptr if p.isa is not a RISC-V ISA; report that
+    // as a configuration error rather than dereferencing it.
+    fatal_if(!isa, "RISC-V decoder requires a RISC-V ISA object");
+    enableRvv = isa->getEnableRvv();
+    reset();
+}
+
 void Decoder::reset()
 {
     aligned = true;
     mid = false;
+    vConfigDone = true;
     machInst = 0;
     emi = 0;
 }
@@ -49,6 +58,19 @@ void Decoder::reset()
 void
 Decoder::moreBytes(const PCStateBase &pc, Addr fetchPC)
 {
+    // A decoded vset*vl* blocks further decode until setVlAndVtype() runs.
+    // TODO: enable speculation past vset*vl* and remove this stall.
+    if (GEM5_UNLIKELY(!this->vConfigDone)) {
+        fatal_if(!enableRvv,
+            "Vector extension is not enabled for this CPU type\n"
+            "You can manually enable vector extensions by setting enable_rvv "
+            "to true for each ISA object after `createThreads()`\n");
+        DPRINTF(Decode, "Waiting for vset*vl* to be executed\n");
+        instDone = false;
+        outOfBytes = false;
+        return;
+    }
+
     // The MSB of the upper and lower halves of a machine instruction.
     constexpr size_t max_bit = sizeof(machInst) * 8 - 1;
     constexpr size_t mid_bit = sizeof(machInst) * 4 - 1;
@@ -78,6 +100,14 @@ Decoder::moreBytes(const PCStateBase &pc, Addr fetchPC)
             instDone = compressed(emi);
         }
     }
+    if (instDone) {
+        emi.vl      = this->machVl;
+        emi.vtype8   = this->machVtype & 0xff;
+        emi.vill    = this->machVtype.vill;
+        if (vconf(emi)) {
+            this->vConfigDone = false; // set true when vconfig inst execute
+        }
+    }
 }
 
 StaticInstPtr
@@ -116,5 +146,14 @@ Decoder::decode(PCStateBase &_next_pc)
     return decode(emi, next_pc.instAddr());
 }
 
+void
+Decoder::setVlAndVtype(uint32_t vl, VTYPE vtype)
+{
+    this->machVtype = vtype;
+    this->machVl = vl;
+    // Unblocks moreBytes(), which returns early while vConfigDone is false.
+    this->vConfigDone = true;
+}
+
 } // namespace RiscvISA
 } // namespace gem5
diff --git a/src/arch/riscv/decoder.hh b/src/arch/riscv/decoder.hh
index 15cbefe39c..1f510e8280 100644
--- a/src/arch/riscv/decoder.hh
+++ b/src/arch/riscv/decoder.hh
@@ -32,6 +32,7 @@
 
 #include "arch/generic/decode_cache.hh"
 #include "arch/generic/decoder.hh"
+#include "arch/riscv/insts/vector.hh"
 #include "arch/riscv/types.hh"
 #include "base/logging.hh"
 #include "base/types.hh"
@@ -53,12 +54,17 @@ class Decoder : public InstDecoder
     decode_cache::InstMap<ExtMachInst> instMap;
     bool aligned;
     bool mid;
+    bool vConfigDone;
 
   protected:
     //The extended machine instruction being generated
     ExtMachInst emi;
     uint32_t machInst;
 
+    bool enableRvv = false;
+    VTYPE machVtype;
+    uint32_t machVl;
+
     StaticInstPtr decodeInst(ExtMachInst mach_inst);
 
     /// Decode a machine instruction.
@@ -67,20 +73,22 @@ class Decoder : public InstDecoder
     StaticInstPtr decode(ExtMachInst mach_inst, Addr addr);
 
   public:
-    Decoder(const RiscvDecoderParams &p) : InstDecoder(p, &machInst)
-    {
-        reset();
-    }
+    Decoder(const RiscvDecoderParams &p);
 
     void reset() override;
 
-    inline bool compressed(ExtMachInst inst) { return (inst & 0x3) < 0x3; }
+    inline bool compressed(ExtMachInst inst) { return inst.quadRant < 0x3; }
+    inline bool vconf(ExtMachInst inst) {
+      return inst.opcode == 0b1010111u && inst.funct3 == 0b111u;
+    }
 
     //Use this to give data to the decoder. This should be used
     //when there is control flow.
     void moreBytes(const PCStateBase &pc, Addr fetchPC) override;
 
     StaticInstPtr decode(PCStateBase &nextPC) override;
+
+    void setVlAndVtype(uint32_t vl, VTYPE vtype);
 };
 
 } // namespace RiscvISA
diff --git a/src/arch/riscv/faults.hh b/src/arch/riscv/faults.hh
index f687fd6f20..fa67e3b34c 100644
--- a/src/arch/riscv/faults.hh
+++ b/src/arch/riscv/faults.hh
@@ -173,7 +173,7 @@ class InstFault : public RiscvFault
         : RiscvFault(n, FaultType::OTHERS, INST_ILLEGAL), _inst(inst)
     {}
 
-    RegVal trap_value() const override { return bits(_inst, 31, 0); }
+    RegVal trap_value() const override { return _inst.instBits; }
 };
 
 class UnknownInstFault : public InstFault
diff --git a/src/arch/riscv/insts/SConscript b/src/arch/riscv/insts/SConscript
index 704152c040..2822cf86b4 100644
--- a/src/arch/riscv/insts/SConscript
+++ b/src/arch/riscv/insts/SConscript
@@ -33,3 +33,4 @@ Source('compressed.cc', tags='riscv isa')
 Source('mem.cc', tags='riscv isa')
 Source('standard.cc', tags='riscv isa')
 Source('static_inst.cc', tags='riscv isa')
+Source('vector.cc', tags='riscv isa')
diff --git a/src/arch/riscv/insts/static_inst.hh b/src/arch/riscv/insts/static_inst.hh
index f835713505..74f9ddb452 100644
--- a/src/arch/riscv/insts/static_inst.hh
+++ b/src/arch/riscv/insts/static_inst.hh
@@ -33,6 +33,7 @@
 #include <string>
 
 #include "arch/riscv/pcstate.hh"
+#include "arch/riscv/regs/misc.hh"
 #include "arch/riscv/types.hh"
 #include "cpu/exec_context.hh"
 #include "cpu/static_inst.hh"
diff --git a/src/arch/riscv/insts/vector.cc b/src/arch/riscv/insts/vector.cc
new file mode 100644
index 0000000000..6ecec44dc5
--- /dev/null
+++ b/src/arch/riscv/insts/vector.cc
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2022 PLCT Lab
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "arch/riscv/insts/vector.hh"
+
+#include <sstream>
+#include <string>
+
+#include "arch/riscv/insts/static_inst.hh"
+#include "arch/riscv/utility.hh"
+#include "cpu/static_inst.hh"
+
+namespace gem5
+{
+
+namespace RiscvISA
+{
+
+/**
+ * This function translates the 3-bit value of vlmul bits to the corresponding
+ * lmul value as specified in RVV 1.0 spec p11-12 chapter 3.4.2.
+ *
+ * I.e.,
+ * vlmul = -3 -> LMUL = 1/8
+ * vlmul = -2 -> LMUL = 1/4
+ * vlmul = -1 -> LMUL = 1/2
+ * vlmul = 0 -> LMUL = 1
+ * vlmul = 1 -> LMUL = 2
+ * vlmul = 2 -> LMUL = 4
+ * vlmul = 3 -> LMUL = 8
+ * Encoding 0b100 (vlmul = -4) is absent above; this code yields 1/16.
+**/
+float
+getVflmul(uint32_t vlmul_encoding)
+{
+    int vlmul = sext<3>(vlmul_encoding & 7);
+    float vflmul = vlmul >= 0 ? 1 << vlmul : 1.0 / (1 << -vlmul);
+    return vflmul;
+}
+
+uint32_t
+getVlmax(VTYPE vtype, uint32_t vlen)
+{
+    // VLMAX = (VLEN / SEW) * LMUL; see RVV 1.0 spec p12 chapter 3.4.2.
+    // The float product is truncated on conversion to uint32_t.
+    const uint32_t elem_bits = getSew(vtype.vsew);
+    return (vlen / elem_bits) * getVflmul(vtype.vlmul);
+}
+
+std::string
+VConfOp::generateDisassembly(Addr pc, const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss;
+    ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
+    if (bit31 && bit30 == 0) {
+        ss << registerName(srcRegIdx(0)) << ", " << registerName(srcRegIdx(1));
+    } else if (bit31 && bit30) {
+        ss << uimm << ", " << generateZimmDisassembly();
+    } else {
+        ss << registerName(srcRegIdx(0)) << ", " << generateZimmDisassembly();
+    }
+    return ss.str();
+}
+
+std::string
+VConfOp::generateZimmDisassembly() const
+{
+    // Render vtype as "e<SEW>, m<LMUL>, <ta|tu>, <ma|mu>". VSETIVLI
+    // carries it in ZIMM10 and VSETVLI in ZIMM11; bit 2 of the immediate
+    // selects the fractional LMUL spellings f2/f4/f8.
+    std::stringstream s;
+    uint64_t zimm = (bit31 && bit30) ? zimm10 : zimm11;
+    bool frac_lmul = bits(zimm, 2);
+    int sew = 1 << (bits(zimm, 5, 3) + 3);
+    int lmul = bits(zimm, 1, 0);
+    auto vta = bits(zimm, 6) == 1 ? "ta" : "tu";
+    auto vma = bits(zimm, 7) == 1 ? "ma" : "mu";
+    s << "e" << sew;
+    if (frac_lmul) {
+        std::string lmul_str = "";
+        switch(lmul){
+        case 3:
+            lmul_str = "f2";
+            break;
+        case 2:
+            lmul_str = "f4";
+            break;
+        case 1:
+            lmul_str = "f8";
+            break;
+        default:
+            panic("Unsupported fractional LMUL");
+        }
+        s << ", m" << lmul_str;
+    } else {
+        s << ", m" << (1 << lmul);
+    }
+    s << ", " << vta << ", " << vma;
+    return s.str();
+}
+
+std::string
+VectorNonSplitInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss;
+    ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+        << registerName(srcRegIdx(0));
+    if (machInst.vm == 0) ss << ", v0.t";
+    return ss.str();
+}
+
+std::string VectorArithMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss;
+    ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
+    if (machInst.funct3 == 0x3) {
+        // OPIVI
+      ss  << registerName(srcRegIdx(0)) << ", " << machInst.vecimm;
+    } else {
+      ss  << registerName(srcRegIdx(1)) << ", " << registerName(srcRegIdx(0));
+    }
+    if (machInst.vm == 0) ss << ", v0.t";
+    return ss.str();
+}
+
+std::string VectorArithMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream ss;
+    ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
+    if (machInst.funct3 == 0x3) {
+        // OPIVI
+      ss  << registerName(srcRegIdx(0)) << ", " << machInst.vecimm;
+    } else {
+      ss  << registerName(srcRegIdx(1)) << ", " << registerName(srcRegIdx(0));
+    }
+    if (machInst.vm == 0) ss << ", v0.t";
+    return ss.str();
+}
+
+std::string VectorVMUNARY0MicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // Only <dest0> is printed; ", v0.t" follows when vm is 0.
+    std::stringstream out;
+    out << mnemonic << ' ' << registerName(destRegIdx(0));
+    return machInst.vm == 0 ? out.str() + ", v0.t" : out.str();
+}
+
+std::string VectorVMUNARY0MacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // Prints the mnemonic and <dest0>, adding ", v0.t" when vm is 0.
+    std::stringstream out;
+    out << mnemonic << ' ' << registerName(destRegIdx(0));
+    return machInst.vm == 0 ? out.str() + ", v0.t" : out.str();
+}
+
+std::string VectorSlideMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // funct3 == 0x3 selects the immediate form: <src0>, <vecimm>.
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
+    if (machInst.funct3 == 0x3)
+        os << registerName(srcRegIdx(0)) << ", " << machInst.vecimm;
+    else
+        os << registerName(srcRegIdx(1)) << ", " << registerName(srcRegIdx(0));
+    if (!machInst.vm) os << ", v0.t";
+    return os.str();
+}
+
+std::string VectorSlideMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // With funct3 == 0x3 the last operand is vecimm; otherwise it is <src0>.
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
+    if (machInst.funct3 == 0x3)
+        os << registerName(srcRegIdx(0)) << ", " << machInst.vecimm;
+    else
+        os << registerName(srcRegIdx(1)) << ", " << registerName(srcRegIdx(0));
+    if (!machInst.vm) os << ", v0.t";
+    return os.str();
+}
+
+std::string VleMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // The displacement printed before (<src0>) is VLENB * microIdx.
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+       << VLENB * microIdx << '(' << registerName(srcRegIdx(0)) << "), "
+       << registerName(srcRegIdx(1)) << (!machInst.vm ? ", v0.t" : "");
+    return os.str();
+}
+
+std::string VlWholeMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream os;  // "<mnemonic> <dest0>, <VLENB * microIdx>(<src0>)"
+    os << mnemonic << ' ' << registerName(destRegIdx(0));
+    os << ", " << VLENB * microIdx << '(' << registerName(srcRegIdx(0)) << ')';
+    return os.str();
+}
+
+std::string VseMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // Format: "<mnemonic> <src1>, <VLENB * microIdx>(<src0>)[, v0.t]".
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(srcRegIdx(1)) << ", "
+       << VLENB * microIdx << '(' << registerName(srcRegIdx(0)) << ')';
+    return !machInst.vm ? os.str() + ", v0.t" : os.str();
+}
+
+std::string VsWholeMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream os;  // "<mnemonic> <src1>, <VLENB * microIdx>(<src0>)"
+    os << mnemonic << ' ' << registerName(srcRegIdx(1));
+    os << ", " << VLENB * microIdx << '(' << registerName(srcRegIdx(0)) << ')';
+    return os.str();
+}
+
+std::string VleMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // Format: "<mnemonic> <dest0>, (<src0>)[, v0.t]".
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ("
+       << registerName(srcRegIdx(0)) << ')';
+    return !machInst.vm ? os.str() + ", v0.t" : os.str();
+}
+
+std::string VlWholeMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream os;  // "<mnemonic> <dest0>, (<src0>)"; no v0.t suffix
+    os << mnemonic << ' ' << registerName(destRegIdx(0));
+    os << ", (" << registerName(srcRegIdx(0)) << ')';
+    return os.str();
+}
+
+std::string VseMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // Format: "<mnemonic> <src1>, (<src0>)[, v0.t]".
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(srcRegIdx(1)) << ", ("
+       << registerName(srcRegIdx(0)) << ')';
+    return !machInst.vm ? os.str() + ", v0.t" : os.str();
+}
+
+std::string VsWholeMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    std::stringstream os;  // "<mnemonic> <src1>, (<src0>)"; no v0.t suffix
+    os << mnemonic << ' ' << registerName(srcRegIdx(1));
+    os << ", (" << registerName(srcRegIdx(0)) << ')';
+    return os.str();
+}
+
+std::string VlStrideMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // Format: "<mnemonic> <dest0>, (<src0>), <src1>[, v0.t]".
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ("
+       << registerName(srcRegIdx(0)) << "), " << registerName(srcRegIdx(1));
+    if (!machInst.vm) os << ", v0.t";
+    return os.str();
+}
+
+std::string VlStrideMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // <src2> is appended unless microIdx is 0 and vma and vta are both set.
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ("
+       << registerName(srcRegIdx(0)) << "), " << registerName(srcRegIdx(1));
+    if (microIdx != 0 || machInst.vtype8.vma == 0 || machInst.vtype8.vta == 0)
+        os << ", " << registerName(srcRegIdx(2));
+    if (!machInst.vm) os << ", v0.t";
+    return os.str();
+}
+
+std::string VsStrideMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // Format: "<mnemonic> <src2>, (<src0>), <src1>[, v0.t]".
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(srcRegIdx(2)) << ", ("
+       << registerName(srcRegIdx(0)) << "), " << registerName(srcRegIdx(1));
+    if (!machInst.vm) os << ", v0.t";
+    return os.str();
+}
+
+std::string VsStrideMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{  // Builds "<mnemonic> <src2>, (<src0>), <src1>" plus optional tails.
+    std::stringstream ss;  // NOTE(review): src2 may print twice -- confirm
+    ss << mnemonic << ' ' << registerName(srcRegIdx(2)) << ", " <<
+        '(' << registerName(srcRegIdx(0)) << ')' <<
+        ", "<< registerName(srcRegIdx(1));
+    if (microIdx != 0 || machInst.vtype8.vma == 0 || machInst.vtype8.vta == 0)
+        ss << ", " << registerName(srcRegIdx(2));
+    if (!machInst.vm) ss << ", v0.t";  // mask suffix when vm is 0
+    return ss.str();
+}
+
+std::string VlIndexMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // Renders "<mnemonic> <dest0>, (<src0>), <src1>" and ", v0.t" if vm is 0.
+    std::stringstream ss;
+    ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ("
+        << registerName(srcRegIdx(0)) << "), " << registerName(srcRegIdx(1));
+    if (!machInst.vm) ss << ", v0.t";
+    return ss.str();
+}
+
+std::string VlIndexMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // Element indices are widened to uint16_t so they print as numbers.
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(destRegIdx(0));
+    os << '[' << uint16_t(vdElemIdx) << "], (" << registerName(srcRegIdx(0));
+    os << "), " << registerName(srcRegIdx(1));
+    os << '[' << uint16_t(vs2ElemIdx) << ']';
+    if (microIdx != 0 || machInst.vtype8.vma == 0 || machInst.vtype8.vta == 0)
+        os << ", " << registerName(srcRegIdx(2));
+    return !machInst.vm ? os.str() + ", v0.t" : os.str();
+}
+
+std::string VsIndexMacroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // Renders "<mnemonic> <src2>, (<src0>), <src1>" and ", v0.t" if vm is 0.
+    std::stringstream ss;
+    ss << mnemonic << ' ' << registerName(srcRegIdx(2)) << ", ("
+        << registerName(srcRegIdx(0)) << "), " << registerName(srcRegIdx(1));
+    if (!machInst.vm) ss << ", v0.t";
+    return ss.str();
+}
+
+std::string VsIndexMicroInst::generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const
+{
+    // Element indices are widened to uint16_t so they print as numbers.
+    std::stringstream os;
+    os << mnemonic << ' ' << registerName(srcRegIdx(2));
+    os << '[' << uint16_t(vs3ElemIdx) << "], (" << registerName(srcRegIdx(0));
+    os << "), " << registerName(srcRegIdx(1));
+    os << '[' << uint16_t(vs2ElemIdx) << ']';
+    return !machInst.vm ? os.str() + ", v0.t" : os.str();
+}
+
+std::string
+VMvWholeMacroInst::generateDisassembly(Addr pc,
+    const loader::SymbolTable *symtab) const
+{
+    std::stringstream os;  // "<mnemonic> <dest0>, <src1>"
+    os << mnemonic << ' ' << registerName(destRegIdx(0));
+    os << ", " << registerName(srcRegIdx(1));
+    return os.str();
+}
+
+std::string
+VMvWholeMicroInst::generateDisassembly(Addr pc,
+    const loader::SymbolTable *symtab) const
+{
+    std::stringstream os;  // "<mnemonic> <dest0>, <src1>"
+    os << mnemonic << ' ' << registerName(destRegIdx(0));
+    os << ", " << registerName(srcRegIdx(1));
+    return os.str();
+}
+
+} // namespace RiscvISA
+} // namespace gem5
diff --git a/src/arch/riscv/insts/vector.hh b/src/arch/riscv/insts/vector.hh
new file mode 100644
index 0000000000..cae0dcac0a
--- /dev/null
+++ b/src/arch/riscv/insts/vector.hh
@@ -0,0 +1,634 @@
+/*
+ * Copyright (c) 2022 PLCT Lab
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ARCH_RISCV_INSTS_VECTOR_HH__
+#define __ARCH_RISCV_INSTS_VECTOR_HH__
+
+#include <string>
+
+#include "arch/riscv/insts/static_inst.hh"
+#include "arch/riscv/regs/misc.hh"
+#include "arch/riscv/regs/vector.hh"
+#include "arch/riscv/utility.hh"
+#include "cpu/exec_context.hh"
+#include "cpu/static_inst.hh"
+
+namespace gem5
+{
+
+namespace RiscvISA
+{
+
+float
+getVflmul(uint32_t vlmul_encoding);
+
+inline uint32_t
+getSew(uint32_t vsew)
+{
+    assert(vsew < 4);  // only encodings 0..3 are accepted
+    return 8 << vsew;  // i.e. 8, 16, 32 or 64
+}
+
+uint32_t
+getVlmax(VTYPE vtype, uint32_t vlen);
+
+/**
+ * Base class for Vector Config operations
+ */
+class VConfOp : public RiscvStaticInst
+{
+  protected:
+    uint64_t bit30;  // copy of _extMachInst.bit30
+    uint64_t bit31;  // copy of _extMachInst.bit31
+    uint64_t zimm10;  // copy of _extMachInst.zimm_vsetivli
+    uint64_t zimm11;  // copy of _extMachInst.zimm_vsetvli
+    uint64_t uimm;  // copy of _extMachInst.uimm_vsetivli
+    VConfOp(const char *mnem, ExtMachInst _extMachInst, OpClass __opClass)
+        : RiscvStaticInst(mnem, _extMachInst, __opClass),
+          bit30(_extMachInst.bit30), bit31(_extMachInst.bit31),
+          zimm10(_extMachInst.zimm_vsetivli),
+          zimm11(_extMachInst.zimm_vsetvli),
+          uimm(_extMachInst.uimm_vsetivli)
+    {}
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+
+    std::string generateZimmDisassembly() const;  // defined out of line
+};
+
+inline uint8_t checked_vtype(bool vill, uint8_t vtype) {
+    panic_if(vill, "vill has been set");  // refuse when vill is flagged
+    const uint8_t sew_enc = bits(vtype, 5, 3);
+    panic_if(sew_enc > 0b011, "vsew: %#x not supported", sew_enc);
+    const uint8_t lmul_enc = bits(vtype, 2, 0);
+    panic_if(lmul_enc == 0b100, "vlmul: %#x not supported", lmul_enc);
+    return vtype;  // returned unchanged so it can be used in an init list
+}
+
+class VectorNonSplitInst : public RiscvStaticInst
+{
+  protected:
+    uint32_t vl;  // copied from _machInst.vl at construction
+    uint8_t vtype;  // _machInst.vtype8 after passing checked_vtype()
+    VectorNonSplitInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : RiscvStaticInst(mnem, _machInst, __opClass),
+        vl(_machInst.vl),
+        vtype(checked_vtype(_machInst.vill, _machInst.vtype8))
+    {
+        this->flags[IsVector] = true;  // mark every instance as vector
+    }
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VectorMacroInst : public RiscvMacroInst
+{
+  protected:
+    uint32_t vl;  // copied from _machInst.vl at construction
+    uint8_t vtype;  // _machInst.vtype8 after passing checked_vtype()
+    VectorMacroInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : RiscvMacroInst(mnem, _machInst, __opClass),
+        vl(_machInst.vl),
+        vtype(checked_vtype(_machInst.vill, _machInst.vtype8))
+    {
+        this->flags[IsVector] = true;  // derived macro-ops inherit this
+    }
+};
+
+class VectorMicroInst : public RiscvMicroInst
+{
+  protected:
+    uint8_t microVl;  // constructor argument _microVl, stored as given
+    uint8_t microIdx;  // constructor argument _microIdx, stored as given
+    uint8_t vtype;  // NOTE(review): raw vtype8, no checked_vtype() here
+    VectorMicroInst(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+                    uint8_t _microVl, uint8_t _microIdx)
+        : RiscvMicroInst(mnem, _machInst, __opClass),
+        microVl(_microVl),
+        microIdx(_microIdx),
+        vtype(_machInst.vtype8)
+    {
+        this->flags[IsVector] = true;  // derived micro-ops inherit this
+    }
+};
+
+class VectorNopMicroInst : public RiscvMicroInst
+{
+public:
+    VectorNopMicroInst(ExtMachInst _machInst)
+        : RiscvMicroInst("vnop", _machInst, No_OpClass)  // fixed mnemonic
+    {}
+
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)
+        const override
+    {
+        return NoFault;  // touches neither xc nor traceData
+    }
+
+    std::string generateDisassembly(Addr pc, const loader::SymbolTable *symtab)
+      const override
+    {
+        std::stringstream ss;
+        ss << mnemonic;  // the disassembly is the mnemonic alone
+        return ss.str();
+    }
+};
+
+class VectorArithMicroInst : public VectorMicroInst
+{
+protected:
+    VectorArithMicroInst(const char *mnem, ExtMachInst _machInst,
+                         OpClass __opClass, uint8_t _microVl,
+                         uint8_t _microIdx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {}
+
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+// Macro-op base that declares its own generateDisassembly() override;
+// the IsVector flag is already set by the VectorMacroInst constructor.
+class VectorArithMacroInst : public VectorMacroInst
+{
+  protected:
+    VectorArithMacroInst(const char* mnem, ExtMachInst _machInst,
+                         OpClass __opClass)
+        : VectorMacroInst(mnem, _machInst, __opClass)
+    {}
+
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VectorVMUNARY0MicroInst : public VectorMicroInst
+{
+protected:
+    VectorVMUNARY0MicroInst(const char *mnem, ExtMachInst _machInst,
+                         OpClass __opClass, uint8_t _microVl,
+                         uint8_t _microIdx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {}
+
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+// Macro-op base that declares its own generateDisassembly() override;
+// the IsVector flag is already set by the VectorMacroInst constructor.
+class VectorVMUNARY0MacroInst : public VectorMacroInst
+{
+  protected:
+    VectorVMUNARY0MacroInst(const char* mnem, ExtMachInst _machInst,
+                         OpClass __opClass)
+        : VectorMacroInst(mnem, _machInst, __opClass)
+    {}
+
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+// Macro-op base that declares its own generateDisassembly() override;
+// the IsVector flag is already set by the VectorMacroInst constructor.
+class VectorSlideMacroInst : public VectorMacroInst
+{
+  protected:
+    VectorSlideMacroInst(const char* mnem, ExtMachInst _machInst,
+                         OpClass __opClass)
+        : VectorMacroInst(mnem, _machInst, __opClass)
+    {}
+
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VectorSlideMicroInst : public VectorMicroInst
+{
+  protected:
+    uint8_t vdIdx;
+    uint8_t vs2Idx;
+    VectorSlideMicroInst(const char *mnem, ExtMachInst _machInst,
+                         OpClass __opClass, uint8_t _microVl,
+                         uint8_t _microIdx, uint8_t _vdIdx, uint8_t _vs2Idx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+        , vdIdx(_vdIdx), vs2Idx(_vs2Idx)
+    {}
+
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VectorMemMicroInst : public VectorMicroInst
+{
+  protected:
+    uint32_t offset; // Used to calculate EA.
+    Request::Flags memAccessFlags;
+
+    VectorMemMicroInst(const char* mnem, ExtMachInst _machInst,
+                       OpClass __opClass, uint8_t _microVl, uint8_t _microIdx,
+                       uint32_t _offset)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+        , offset(_offset)
+        , memAccessFlags(0)
+    {}
+};
+
+class VectorMemMacroInst : public VectorMacroInst
+{
+  protected:
+    VectorMemMacroInst(const char* mnem, ExtMachInst _machInst,
+                       OpClass __opClass)
+        : VectorMacroInst(mnem, _machInst, __opClass)
+    {}
+};
+
+class VleMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VleMacroInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {}
+
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VseMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VseMacroInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {}
+
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VleMicroInst : public VectorMicroInst
+{
+  protected:
+    Request::Flags memAccessFlags;
+
+    VleMicroInst(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+                 uint8_t _microVl, uint8_t _microIdx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {
+        this->flags[IsLoad] = true;
+    }
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VseMicroInst : public VectorMicroInst
+{
+  protected:
+    Request::Flags memAccessFlags;
+
+    VseMicroInst(const char *mnem, ExtMachInst _machInst, OpClass __opClass,
+                 uint8_t _microVl, uint8_t _microIdx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {
+        this->flags[IsStore] = true;
+    }
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VlWholeMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VlWholeMacroInst(const char *mnem, ExtMachInst _machInst,
+                     OpClass __opClass)
+      : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {}
+
+    std::string generateDisassembly(
+      Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VlWholeMicroInst : public VectorMicroInst
+{
+  protected:
+    Request::Flags memAccessFlags;
+
+    VlWholeMicroInst(const char *mnem, ExtMachInst _machInst,
+                     OpClass __opClass, uint8_t _microVl, uint8_t _microIdx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {}
+
+    std::string generateDisassembly(
+      Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VsWholeMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VsWholeMacroInst(const char *mnem, ExtMachInst _machInst,
+                     OpClass __opClass)
+        : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {}
+
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VsWholeMicroInst : public VectorMicroInst
+{
+  protected:
+    Request::Flags memAccessFlags;
+    // Forwards (_microVl, _microIdx) to VectorMicroInst in declared order.
+    VsWholeMicroInst(const char *mnem, ExtMachInst _machInst,
+                     OpClass __opClass, uint8_t _microVl, uint8_t _microIdx)
+        : VectorMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {}
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VlStrideMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VlStrideMacroInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {}
+
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VlStrideMicroInst : public VectorMemMicroInst
+{
+  protected:
+  uint8_t regIdx;
+    VlStrideMicroInst(const char *mnem, ExtMachInst _machInst,
+                      OpClass __opClass, uint8_t _regIdx,
+                      uint8_t _microIdx, uint8_t _microVl)
+        : VectorMemMicroInst(mnem, _machInst, __opClass, _microVl,
+                             _microIdx, 0)
+        , regIdx(_regIdx)
+    {}
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VsStrideMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VsStrideMacroInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {}
+
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VsStrideMicroInst : public VectorMemMicroInst
+{
+  protected:
+    uint8_t regIdx;
+    VsStrideMicroInst(const char *mnem, ExtMachInst _machInst,
+                      OpClass __opClass, uint8_t _regIdx,
+                      uint8_t _microIdx, uint8_t _microVl)
+        : VectorMemMicroInst(mnem, _machInst, __opClass, _microVl,
+                             _microIdx, 0)
+        , regIdx(_regIdx)
+    {}
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VlIndexMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VlIndexMacroInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {}
+
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VlIndexMicroInst : public VectorMemMicroInst
+{
+  protected:
+    uint8_t vdRegIdx;
+    uint8_t vdElemIdx;
+    uint8_t vs2RegIdx;
+    uint8_t vs2ElemIdx;
+    VlIndexMicroInst(const char *mnem, ExtMachInst _machInst,
+                    OpClass __opClass, uint8_t _vdRegIdx, uint8_t _vdElemIdx,
+                    uint8_t _vs2RegIdx, uint8_t _vs2ElemIdx)
+        : VectorMemMicroInst(mnem, _machInst, __opClass, 1,
+                             0, 0)
+        , vdRegIdx(_vdRegIdx), vdElemIdx(_vdElemIdx)
+        , vs2RegIdx(_vs2RegIdx), vs2ElemIdx(_vs2ElemIdx)
+    {}
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VsIndexMacroInst : public VectorMemMacroInst
+{
+  protected:
+    VsIndexMacroInst(const char* mnem, ExtMachInst _machInst,
+                   OpClass __opClass)
+        : VectorMemMacroInst(mnem, _machInst, __opClass)
+    {}
+
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VsIndexMicroInst : public VectorMemMicroInst
+{
+  protected:
+    uint8_t vs3RegIdx;
+    uint8_t vs3ElemIdx;
+    uint8_t vs2RegIdx;
+    uint8_t vs2ElemIdx;
+    VsIndexMicroInst(const char *mnem, ExtMachInst _machInst,
+                    OpClass __opClass, uint8_t _vs3RegIdx, uint8_t _vs3ElemIdx,
+                    uint8_t _vs2RegIdx, uint8_t _vs2ElemIdx)
+        : VectorMemMicroInst(mnem, _machInst, __opClass, 1, 0, 0),
+          vs3RegIdx(_vs3RegIdx), vs3ElemIdx(_vs3ElemIdx),
+          vs2RegIdx(_vs2RegIdx), vs2ElemIdx(_vs2ElemIdx)
+    {}
+
+    std::string generateDisassembly(
+        Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VMvWholeMacroInst : public VectorArithMacroInst
+{
+  protected:
+    VMvWholeMacroInst(const char* mnem, ExtMachInst _machInst,
+                         OpClass __opClass)
+        : VectorArithMacroInst(mnem, _machInst, __opClass)
+    {}
+
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+class VMvWholeMicroInst : public VectorArithMicroInst
+{
+  protected:
+    VMvWholeMicroInst(const char *mnem, ExtMachInst _machInst,
+                         OpClass __opClass, uint8_t _microVl,
+                         uint8_t _microIdx)
+        : VectorArithMicroInst(mnem, _machInst, __opClass, _microVl, _microIdx)
+    {}
+
+    std::string generateDisassembly(
+            Addr pc, const loader::SymbolTable *symtab) const override;
+};
+
+template<typename ElemType>
+class VMaskMergeMicroInst : public VectorArithMicroInst
+{  // Micro-op that merges one slice per source vreg into a single dest.
+  private:
+    RegId srcRegIdxArr[NumVecInternalRegs];
+    RegId destRegIdxArr[1];
+
+  public:
+    VMaskMergeMicroInst(ExtMachInst extMachInst, uint8_t _dstReg,
+        uint8_t _numSrcs)
+        : VectorArithMicroInst("vmask_mv_micro", extMachInst,
+                               VectorIntegerArithOp, 0, 0)
+    {
+        setRegIdxArrays(
+            reinterpret_cast<RegIdArrayPtr>(
+                &std::remove_pointer_t<decltype(this)>::srcRegIdxArr),
+            reinterpret_cast<RegIdArrayPtr>(
+                &std::remove_pointer_t<decltype(this)>::destRegIdxArr));
+
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+
+        setDestRegIdx(_numDestRegs++, vecRegClass[_dstReg]);  // single dest
+        _numTypedDestRegs[VecRegClass]++;
+        for (uint8_t i=0; i<_numSrcs; i++) {  // sources: internal regs 0..n-1
+            setSrcRegIdx(_numSrcRegs++, vecRegClass[VecMemInternalReg0 + i]);
+        }
+    }
+
+    Fault
+    execute(ExecContext* xc, trace::InstRecord* traceData) const override
+    {
+        vreg_t tmp_d0 = *(vreg_t *)xc->getWritableRegOperand(this, 0);
+        auto Vd = tmp_d0.as<uint8_t>();  // byte view of the local dest copy
+        constexpr uint8_t elems_per_vreg = VLENB / sizeof(ElemType);
+        size_t bit_cnt = elems_per_vreg;  // bit offset of source 1's slice
+        vreg_t tmp_s;
+        xc->getRegOperand(this, 0, &tmp_s);
+        auto s = tmp_s.as<uint8_t>();
+        // cp the first result and tail
+        memcpy(Vd, s, VLENB);  // start from all VLENB bytes of source 0
+        for (uint8_t i = 1; i < this->_numSrcRegs; i++) {
+            xc->getRegOperand(this, i, &tmp_s);
+            s = tmp_s.as<uint8_t>();
+            if constexpr (elems_per_vreg < 8) {  // slices share a byte
+                constexpr uint8_t m = (1 << elems_per_vreg) - 1;
+                const uint8_t mask = m << (i * elems_per_vreg % 8);
+                // clr & ext bits
+                Vd[bit_cnt/8] ^= Vd[bit_cnt/8] & mask;  // zero masked bits
+                Vd[bit_cnt/8] |= s[bit_cnt/8] & mask;  // take them from s
+                bit_cnt += elems_per_vreg;
+            } else {  // slices are whole bytes: copy source i's byte range
+                constexpr uint8_t byte_offset = elems_per_vreg / 8;
+                memcpy(Vd + i * byte_offset, s + i * byte_offset, byte_offset);
+            }
+        }
+        xc->setRegOperand(this, 0, &tmp_d0);  // write the merged copy back
+        if (traceData)
+            traceData->setData(vecRegClass, &tmp_d0);
+        return NoFault;
+    }
+
+    std::string
+    generateDisassembly(Addr pc, const loader::SymbolTable *symtab)
+        const override
+    {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << registerName(destRegIdx(0));
+        for (uint8_t i = 0; i < this->_numSrcRegs; i++) {  // every source
+            ss << ", " << registerName(srcRegIdx(i));
+        }
+        ss << ", offset:" << VLENB / sizeof(ElemType);  // elements per vreg
+        return ss.str();
+    }
+};
+
+class VxsatMicroInst : public VectorArithMicroInst
+{
+  private:
+    bool* vxsat;  // raw pointer handed in by the creator; read on use
+  public:
+    VxsatMicroInst(bool* Vxsat, ExtMachInst extMachInst)
+        : VectorArithMicroInst("vxsat_micro", extMachInst,
+                               VectorIntegerArithOp, 0, 0),
+          vxsat(Vxsat)
+    {}
+    // Writes *vxsat to VXSAT and mirrors it into bit 0 of VCSR.
+    Fault
+    execute(ExecContext* xc, trace::InstRecord* traceData) const override
+    {
+        xc->setMiscReg(MISCREG_VXSAT, *vxsat);
+        const auto vcsr = xc->readMiscReg(MISCREG_VCSR);
+        xc->setMiscReg(MISCREG_VCSR, (vcsr & ~1) | *vxsat);
+        return NoFault;
+    }
+    std::string generateDisassembly(Addr pc, const loader::SymbolTable *symtab)
+        const override
+    {
+        std::stringstream out;
+        out << mnemonic << " VXSAT, " << (*vxsat ? "0x1" : "0x0");
+        return out.str();
+    }
+};
+
+} // namespace RiscvISA
+} // namespace gem5
+
+
+#endif // __ARCH_RISCV_INSTS_VECTOR_HH__
diff --git a/src/arch/riscv/isa.cc b/src/arch/riscv/isa.cc
index 94a8239bac..4b6f49d807 100644
--- a/src/arch/riscv/isa.cc
+++ b/src/arch/riscv/isa.cc
@@ -43,6 +43,7 @@
 #include "arch/riscv/regs/float.hh"
 #include "arch/riscv/regs/int.hh"
 #include "arch/riscv/regs/misc.hh"
+#include "arch/riscv/regs/vector.hh"
 #include "base/bitfield.hh"
 #include "base/compiler.hh"
 #include "base/logging.hh"
@@ -52,6 +53,7 @@
 #include "debug/LLSC.hh"
 #include "debug/MatRegs.hh"
 #include "debug/RiscvMisc.hh"
+#include "debug/VecRegs.hh"
 #include "mem/packet.hh"
 #include "mem/request.hh"
 #include "params/RiscvISA.hh"
@@ -189,6 +191,14 @@ namespace RiscvISA
     [MISCREG_FFLAGS]        = "FFLAGS",
     [MISCREG_FRM]           = "FRM",
 
+    [MISCREG_VSTART]        = "VSTART",
+    [MISCREG_VXSAT]         = "VXSAT",
+    [MISCREG_VXRM]          = "VXRM",
+    [MISCREG_VCSR]          = "VCSR",
+    [MISCREG_VL]            = "VL",
+    [MISCREG_VTYPE]         = "VTYPE",
+    [MISCREG_VLENB]         = "VLENB",
+
     [MISCREG_NMIVEC]        = "NMIVEC",
     [MISCREG_NMIE]          = "NMIE",
     [MISCREG_NMIP]          = "NMIP",
@@ -234,17 +244,18 @@ namespace
 {
 
 /* Not applicable to RISCV */
-RegClass vecRegClass(VecRegClass, VecRegClassName, 1, debug::IntRegs);
-RegClass vecElemClass(VecElemClass, VecElemClassName, 2, debug::IntRegs);
-RegClass vecPredRegClass(VecPredRegClass, VecPredRegClassName, 1,
+RegClass vecElemClass(VecElemClass, VecElemClassName, 0, debug::IntRegs);
+RegClass vecPredRegClass(VecPredRegClass, VecPredRegClassName, 0,
         debug::IntRegs);
-RegClass matRegClass(MatRegClass, MatRegClassName, 1, debug::MatRegs);
+RegClass matRegClass(MatRegClass, MatRegClassName, 0, debug::MatRegs);
 RegClass ccRegClass(CCRegClass, CCRegClassName, 0, debug::IntRegs);
 
 } // anonymous namespace
 
 ISA::ISA(const Params &p) :
-    BaseISA(p), rv_type(p.riscv_type), checkAlignment(p.check_alignment)
+    BaseISA(p), _rvType(p.riscv_type), checkAlignment(p.check_alignment),
+    enableRvv(p.enable_rvv)
+
 {
     _regClasses.push_back(&intRegClass);
     _regClasses.push_back(&floatRegClass);
@@ -275,6 +286,13 @@ ISA::copyRegsFrom(ThreadContext *src)
     for (auto &id: floatRegClass)
         tc->setReg(id, src->getReg(id));
 
+    // Third loop through the vector registers.
+    RiscvISA::VecRegContainer vc;
+    for (auto &id: vecRegClass) {
+        src->getReg(id, &vc);
+        tc->setReg(id, &vc);
+    }
+
     // Lastly copy PC/NPC
     tc->pcState(src->pcState());
 }
@@ -299,17 +317,21 @@ void ISA::clear()
     // mark FS is initial
     status.fs = INITIAL;
 
-    // rv_type dependent init.
-    switch (rv_type) {
+    // _rvType dependent init.
+    switch (_rvType) {
         case RV32:
           misa.rv32_mxl = 1;
           break;
         case RV64:
           misa.rv64_mxl = 2;
           status.uxl = status.sxl = 2;
+          if (getEnableRvv()) {
+              status.vs = VPUStatus::INITIAL;
+              misa.rvv = 1;
+          }
           break;
         default:
-          panic("%s: Unknown rv_type: %d", name(), (int)rv_type);
+          panic("%s: Unknown _rvType: %d", name(), (int)_rvType);
     }
 
     miscRegFile[MISCREG_ISA] = misa;
@@ -465,7 +487,7 @@ ISA::readMiscReg(RegIndex idx)
                 (status.xs == 3) || (status.fs == 3) || (status.vs == 3);
             // For RV32, the SD bit is at index 31
             // For RV64, the SD bit is at index 63.
-            switch (rv_type) {
+            switch (_rvType) {
                 case RV32:
                     status.rv32_sd = sd_bit;
                     break;
@@ -473,12 +495,23 @@ ISA::readMiscReg(RegIndex idx)
                     status.rv64_sd = sd_bit;
                     break;
                 default:
-                    panic("%s: Unknown rv_type: %d", name(), (int)rv_type);
+                    panic("%s: Unknown _rvType: %d", name(), (int)_rvType);
             }
             setMiscRegNoEffect(idx, status);
 
             return readMiscRegNoEffect(idx);
         }
+      case MISCREG_VLENB:
+        {
+            return VLENB;
+        }
+        break;
+      case MISCREG_VCSR: // vcsr is a view: bit 0 = vxsat, bits 2:1 = vxrm
+        { // the two fields occupy disjoint bits, so merge them with OR
+            return readMiscRegNoEffect(MISCREG_VXSAT) |
+                  (readMiscRegNoEffect(MISCREG_VXRM) << 1);
+        }
+        break;
       default:
         // Try reading HPM counters
         // As a placeholder, all HPM counters are just cycle counters
@@ -541,7 +574,7 @@ ISA::setMiscReg(RegIndex idx, RegVal val)
                 assert(readMiscRegNoEffect(MISCREG_PRV) == PRV_M);
 
                 int regSize = 0;
-                switch (rv_type) {
+                switch (_rvType) {
                     case RV32:
                         regSize = 4;
                     break;
@@ -549,7 +582,7 @@ ISA::setMiscReg(RegIndex idx, RegVal val)
                         regSize = 8;
                     break;
                     default:
-                        panic("%s: Unknown rv_type: %d", name(), (int)rv_type);
+                        panic("%s: Unknown _rvType: %d", name(), (int)_rvType);
                 }
 
                 // Specs do not seem to mention what should be
@@ -643,7 +676,7 @@ ISA::setMiscReg(RegIndex idx, RegVal val)
             break;
           case MISCREG_STATUS:
             {
-                if (rv_type != RV32) {
+                if (_rvType != RV32) {
                     // SXL and UXL are hard-wired to 64 bit
                     auto cur = readMiscRegNoEffect(idx);
                     val &= ~(STATUS_SXL_MASK | STATUS_UXL_MASK);
@@ -652,6 +685,22 @@ ISA::setMiscReg(RegIndex idx, RegVal val)
                 setMiscRegNoEffect(idx, val);
             }
             break;
+          case MISCREG_VXSAT:
+            {
+                setMiscRegNoEffect(idx, val & 0x1); // keep only bit 0
+            }
+            break;
+          case MISCREG_VXRM:
+            {
+                setMiscRegNoEffect(idx, val & 0x3); // keep only bits 1:0
+            }
+            break;
+          case MISCREG_VCSR:
+            { // a VCSR write is split across the VXSAT and VXRM registers
+                setMiscRegNoEffect(MISCREG_VXSAT, val & 0x1); // bit 0
+                setMiscRegNoEffect(MISCREG_VXRM, (val & 0x6) >> 1); // bits 2:1
+            }
+            break;
           default:
             setMiscRegNoEffect(idx, val);
         }
diff --git a/src/arch/riscv/isa.hh b/src/arch/riscv/isa.hh
index 31001c04b4..f7726160c9 100644
--- a/src/arch/riscv/isa.hh
+++ b/src/arch/riscv/isa.hh
@@ -67,12 +67,15 @@ enum FPUStatus
     DIRTY = 3,
 };
 
+using VPUStatus = FPUStatus;
+
 class ISA : public BaseISA
 {
   protected:
-    RiscvType rv_type;
+    RiscvType _rvType;
     std::vector<RegVal> miscRegFile;
     bool checkAlignment;
+    bool enableRvv;
 
     bool hpmCounterEnabled(int counter) const;
 
@@ -89,7 +92,7 @@ class ISA : public BaseISA
     PCStateBase*
     newPCState(Addr new_inst_addr=0) const override
     {
-        return new PCState(new_inst_addr, rv_type);
+        return new PCState(new_inst_addr, _rvType);
     }
 
   public:
@@ -110,7 +113,7 @@ class ISA : public BaseISA
     virtual const std::unordered_map<int, RegVal>&
     getCSRMaskMap() const
     {
-        return CSRMasks[rv_type];
+        return CSRMasks[_rvType];
     }
 
     bool alignmentCheckEnabled() const { return checkAlignment; }
@@ -134,7 +137,9 @@ class ISA : public BaseISA
 
     void resetThread() override;
 
-    RiscvType rvType() const { return rv_type; }
+    RiscvType rvType() const { return _rvType; }
+
+    bool getEnableRvv() const { return enableRvv; }
 
     void
     clearLoadReservation(ContextID cid)
diff --git a/src/arch/riscv/isa/bitfields.isa b/src/arch/riscv/isa/bitfields.isa
index 8589269949..280bcbab22 100644
--- a/src/arch/riscv/isa/bitfields.isa
+++ b/src/arch/riscv/isa/bitfields.isa
@@ -133,3 +133,27 @@ def bitfield BIT25         <25>;
 def bitfield RNUM       <23:20>;
 def bitfield KFUNCT5    <29:25>;
 def bitfield BS         <31:30>;
+
+// Vector instructions
+def bitfield VFUNCT6    vfunct6;
+def bitfield VFUNCT5    vfunct5;
+def bitfield VFUNCT3    vfunct3;
+def bitfield VFUNCT2    vfunct2;
+
+def bitfield VS3        vs3;
+def bitfield VS2        vs2;
+def bitfield VS1        vs1;
+def bitfield VD         vd;
+
+def bitfield NF         nf;
+def bitfield MEW        mew;
+def bitfield MOP        mop;
+def bitfield VM         vm;
+def bitfield LUMOP      lumop;
+def bitfield SUMOP      sumop;
+def bitfield WIDTH      width;
+
+def bitfield BIT31      bit31;
+def bitfield BIT30      bit30;
+def bitfield SIMM5      uimm_vsetivli;
+def bitfield SIMM3      simm3;
diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa
index a339c11375..71efac5958 100644
--- a/src/arch/riscv/isa/decoder.isa
+++ b/src/arch/riscv/isa/decoder.isa
@@ -500,6 +500,174 @@ decode QUADRANT default Unknown::unknown() {
                     Fd_bits = fd.v;
                 }}, inst_flags=FloatMemReadOp);
             }
+
+            0x0: decode MOP {
+                0x0: decode LUMOP {
+                    0x00: VleOp::vle8_v({{
+                        if ((machInst.vm || elem_mask(v0, ei)) &&
+                            i < this->microVl) {
+                            Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
+                        } else {
+                            Vd_ub[i] = Vs2_ub[i];
+                        }
+                    }}, inst_flags=VectorUnitStrideLoadOp);
+                    0x08: decode NF {
+                        format VlWholeOp {
+                            0x0: vl1re8_v({{
+                                Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x1: vl2re8_v({{
+                                Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x3: vl4re8_v({{
+                                Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x7: vl8re8_v({{
+                                Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                        }
+                    }
+                    0x0b: VlmOp::vlm_v({{
+                        Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
+                    }}, inst_flags=VectorUnitStrideMaskLoadOp);
+                }
+                0x1: VlIndexOp::vluxei8_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_ub[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+                0x2: VlStrideOp::vlse8_v({{
+                    Vd_ub[microIdx] = Mem_vc.as<uint8_t>()[0];
+                }}, inst_flags=VectorStridedLoadOp);
+                0x3: VlIndexOp::vloxei8_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_ub[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+            }
+            0x5: decode MOP {
+                0x0: decode LUMOP {
+                    0x00: VleOp::vle16_v({{
+                        if ((machInst.vm || elem_mask(v0, ei)) &&
+                            i < this->microVl) {
+                            Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
+                        } else {
+                            Vd_uh[i] = Vs2_uh[i];
+                        }
+                    }}, inst_flags=VectorUnitStrideLoadOp);
+                    0x08: decode NF {
+                        format VlWholeOp {
+                            0x0: vl1re16_v({{
+                                Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x1: vl2re16_v({{
+                                Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x3: vl4re16_v({{
+                                Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x7: vl8re16_v({{
+                                Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                        }
+                    }
+                }
+                0x1: VlIndexOp::vluxei16_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_uh[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+                0x2: VlStrideOp::vlse16_v({{
+                    Vd_uh[microIdx] = Mem_vc.as<uint16_t>()[0];
+                }}, inst_flags=VectorStridedLoadOp);
+                0x3: VlIndexOp::vloxei16_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_uh[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+            }
+            0x6: decode MOP {
+                0x0: decode LUMOP {
+                    0x00: VleOp::vle32_v({{
+                        if ((machInst.vm || elem_mask(v0, ei)) &&
+                            i < this->microVl) {
+                            Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
+                        } else {
+                            Vd_uw[i] = Vs2_uw[i];
+                        }
+                    }}, inst_flags=VectorUnitStrideLoadOp);
+                    0x08: decode NF {
+                        format VlWholeOp {
+                            0x0: vl1re32_v({{
+                                Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x1: vl2re32_v({{
+                                Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x3: vl4re32_v({{
+                                Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x7: vl8re32_v({{
+                                Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                        }
+                    }
+                }
+                0x1: VlIndexOp::vluxei32_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_uw[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+                0x2: VlStrideOp::vlse32_v({{
+                    Vd_uw[microIdx] = Mem_vc.as<uint32_t>()[0];
+                }}, inst_flags=VectorStridedLoadOp);
+                0x3: VlIndexOp::vloxei32_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_uw[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+            }
+            0x7: decode MOP {
+                0x0: decode LUMOP {
+                    0x00: VleOp::vle64_v({{
+                        if ((machInst.vm || elem_mask(v0, ei)) &&
+                            i < this->microVl) {
+                            Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
+                        } else {
+                            Vd_ud[i] = Vs2_ud[i];
+                        }
+                    }}, inst_flags=VectorUnitStrideLoadOp);
+                    0x08: decode NF {
+                        format VlWholeOp {
+                            0x0: vl1re64_v({{
+                                Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x1: vl2re64_v({{
+                                Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x3: vl4re64_v({{
+                                Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                            0x7: vl8re64_v({{
+                                Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
+                            }}, inst_flags=VectorWholeRegisterLoadOp);
+                        }
+                    }
+                }
+                0x1: VlIndexOp::vluxei64_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_ud[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+                0x2: VlStrideOp::vlse64_v({{
+                    Vd_ud[microIdx] = Mem_vc.as<uint64_t>()[0];
+                }}, inst_flags=VectorStridedLoadOp);
+                0x3: VlIndexOp::vloxei64_v({{
+                    Vd_vu[vdElemIdx] = Mem_vc.as<vu>()[0];
+                }}, {{
+                    EA = Rs1 + Vs2_ud[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedLoadOp);
+            }
         }
 
         0x03: decode FUNCT3 {
@@ -509,6 +677,23 @@ decode QUADRANT default Unknown::unknown() {
                 0x1: fence_i({{
                 }}, uint64_t, IsNonSpeculative, IsSerializeAfter, No_OpClass);
             }
+
+            0x2: decode FUNCT12 {
+                format CBMOp {
+                    0x0: cbo_inval({{
+                        Mem = 0;
+                    }}, mem_flags=[INVALIDATE, DST_POC]);
+                    0x1: cbo_clean({{
+                        Mem = 0;
+                    }}, mem_flags=[CLEAN, DST_POC]);
+                    0x2: cbo_flush({{
+                        Mem = 0;
+                    }}, mem_flags=[CLEAN, INVALIDATE, DST_POC]);
+                    0x4: cbo_zero({{
+                        Mem = 0;
+                    }}, mem_flags=[CACHE_BLOCK_ZERO]);
+                }
+            }
         }
 
         0x04: decode FUNCT3 {
@@ -806,6 +991,106 @@ decode QUADRANT default Unknown::unknown() {
                     Mem_ud = Fs2_bits;
                 }}, inst_flags=FloatMemWriteOp);
             }
+
+            0x0: decode MOP {
+                0x0: decode SUMOP {
+                    0x00: VseOp::vse8_v({{
+                        Mem_vc.as<uint8_t>()[i] = Vs3_ub[i];
+                    }}, inst_flags=VectorUnitStrideStoreOp);
+                    format VsWholeOp {
+                        0x8: decode NF {
+                            0x0: vs1r_v({{
+                                Mem_vc.as<uint8_t>()[i] = Vs3_ub[i];
+                            }}, inst_flags=VectorWholeRegisterStoreOp);
+                            0x1: vs2r_v({{
+                                Mem_vc.as<uint8_t>()[i] = Vs3_ub[i];
+                            }}, inst_flags=VectorWholeRegisterStoreOp);
+                            0x3: vs4r_v({{
+                                Mem_vc.as<uint8_t>()[i] = Vs3_ub[i];
+                            }}, inst_flags=VectorWholeRegisterStoreOp);
+                            0x7: vs8r_v({{
+                                Mem_vc.as<uint8_t>()[i] = Vs3_ub[i];
+                            }}, inst_flags=VectorWholeRegisterStoreOp);
+                        }
+                    }
+                    0x0b: VsmOp::vsm_v({{
+                        Mem_vc.as<uint8_t>()[i] = Vs3_ub[i];
+                    }}, inst_flags=VectorUnitStrideMaskStoreOp);
+                }
+                0x1: VsIndexOp::vsuxei8_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_ub[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+                0x2: VsStrideOp::vsse8_v({{
+                    Mem_vc.as<uint8_t>()[0] = Vs3_ub[microIdx];
+                }}, inst_flags=VectorStridedStoreOp);
+                0x3: VsIndexOp::vsoxei8_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_ub[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+            }
+            0x5: decode MOP {
+                0x0: decode SUMOP {
+                    0x00: VseOp::vse16_v({{
+                        Mem_vc.as<uint16_t>()[i] = Vs3_uh[i];
+                    }}, inst_flags=VectorUnitStrideStoreOp);
+                }
+                0x1: VsIndexOp::vsuxei16_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_uh[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+                0x2: VsStrideOp::vsse16_v({{
+                    Mem_vc.as<uint16_t>()[0] = Vs3_uh[microIdx];
+                }}, inst_flags=VectorStridedStoreOp);
+                0x3: VsIndexOp::vsoxei16_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_uh[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+            }
+            0x6: decode MOP {
+                0x0: decode SUMOP {
+                    0x00: VseOp::vse32_v({{
+                        Mem_vc.as<uint32_t>()[i] = Vs3_uw[i];
+                    }}, inst_flags=VectorUnitStrideStoreOp);
+                }
+                0x1: VsIndexOp::vsuxei32_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_uw[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+                0x2: VsStrideOp::vsse32_v({{
+                    Mem_vc.as<uint32_t>()[0] = Vs3_uw[microIdx];
+                }}, inst_flags=VectorStridedStoreOp);
+                0x3: VsIndexOp::vsoxei32_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_uw[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+            }
+            0x7: decode MOP {
+                0x0: decode SUMOP {
+                    0x00: VseOp::vse64_v({{
+                        Mem_vc.as<uint64_t>()[i] = Vs3_ud[i];
+                    }}, inst_flags=VectorUnitStrideStoreOp);
+                }
+                0x1: VsIndexOp::vsuxei64_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_ud[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+                0x2: VsStrideOp::vsse64_v({{
+                    Mem_vc.as<uint64_t>()[0] = Vs3_ud[microIdx];
+                }}, inst_flags=VectorStridedStoreOp);
+                0x3: VsIndexOp::vsoxei64_v({{
+                    Mem_vc.as<vu>()[0] = Vs3_vu[vs3ElemIdx];
+                }}, {{
+                    EA = Rs1 + Vs2_ud[vs2ElemIdx];
+                }}, inst_flags=VectorIndexedStoreOp);
+            }
         }
 
         0x0b: decode FUNCT3 {
@@ -2012,6 +2297,2095 @@ decode QUADRANT default Unknown::unknown() {
             }
         }
 
+        0x15: decode FUNCT3 {
+            // OPIVV
+            0x0: decode VFUNCT6 {
+                format VectorIntFormat {
+                    0x0: vadd_vv({{
+                        Vd_vu[i] = Vs2_vu[i] + Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x2: vsub_vv({{
+                        Vd_vu[i] = Vs2_vu[i] - Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x4: vminu_vv({{
+                        Vd_vu[i] = Vs2_vu[i] < Vs1_vu[i] ?
+                                Vs2_vu[i] : Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x5: vmin_vv({{
+                        Vd_vi[i] = Vs2_vi[i] < Vs1_vi[i] ?
+                                Vs2_vi[i] : Vs1_vi[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x6: vmaxu_vv({{
+                        Vd_vu[i] = Vs2_vu[i] > Vs1_vu[i] ?
+                                Vs2_vu[i] : Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x7: vmax_vv({{
+                        Vd_vi[i] = Vs2_vi[i] > Vs1_vi[i] ?
+                                Vs2_vi[i] : Vs1_vi[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x9: vand_vv({{
+                        Vd_vu[i] = Vs2_vu[i] & Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0xa: vor_vv({{
+                        Vd_vu[i] = Vs2_vu[i] | Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0xb: vxor_vv({{
+                        Vd_vu[i] = Vs2_vu[i] ^ Vs1_vu[i];
+                    }}, OPIVV, VectorIntegerArithOp);
+                }
+                0x0c: VectorGatherFormat::vrgather_vv({{
+                    for (uint32_t i = 0; i < microVl; i++) {
+                        uint32_t ei = i + vs1_idx * vs1_elems + vs1_bias;
+                        if (this->vm || elem_mask(v0, ei)) {
+                            const uint64_t idx = Vs1_vu[i]
+                                - vs2_elems * vs2_idx;
+                            auto res = (Vs1_vu[i] >= vlmax) ? 0
+                                : (idx < vs2_elems) ? Vs2_vu[idx]
+                                : Vs3_vu[i];
+                            Vd_vu[i] = res;
+                        }
+                    }
+                }}, OPIVV, VectorMiscOp);
+                0x0e: VectorGatherFormat::vrgatherei16_vv({{
+                    for (uint32_t i = 0; i < microVl; i++) {
+                        uint32_t ei = i + vs1_idx * vs1_elems + vs1_bias;
+                        if (this->vm || elem_mask(v0, ei)) {
+                            const uint16_t idx = Vs1_uh[i + vs1_bias]
+                                - vs2_elems * vs2_idx;
+                            auto res = (Vs1_uh[i + vs1_bias] >= vlmax) ? 0
+                                : (idx < vs2_elems) ? Vs2_vu[idx]
+                                : Vs3_vu[i + vd_bias];
+                            Vd_vu[i + vd_bias] = res;
+                        }
+                    }
+                }}, OPIVV, VectorMiscOp);
+                format VectorIntFormat {
+                    0x10: decode VM {
+                        0x0: vadc_vvm({{
+                            Vd_vi[i] = Vs2_vi[i] + Vs1_vi[i]
+                                    + elem_mask(v0, ei);
+                        }}, OPIVV, VectorIntegerArithOp);
+                        // the unmasked versions (vm=1) are reserved
+                    }
+                    0x12: decode VM {
+                        0x0: vsbc_vvm({{
+                            Vd_vi[i] = Vs2_vi[i] - Vs1_vi[i]
+                                    - elem_mask(v0, ei);
+                        }}, OPIVV, VectorIntegerArithOp);
+                        // the unmasked versions (vm=1) are reserved
+                    }
+                    0x17: decode VM {
+                        0x0: vmerge_vvm({{
+                            Vd_vu[i] = elem_mask(v0, ei)
+                                    ? Vs1_vu[i]
+                                    : Vs2_vu[i];
+                        }}, OPIVV, VectorIntegerArithOp);
+                        0x1: decode VS2 {
+                            0x0: vmv_v_v({{
+                                Vd_vu[i] = Vs1_vu[i];
+                            }}, OPIVV, VectorIntegerArithOp);
+                        }
+                    }
+                }
+                format VectorIntVxsatFormat{
+                    0x20: vsaddu_vv({{
+                        Vd_vu[i] = sat_addu<vu>(Vs2_vu[i], Vs1_vu[i],
+                            vxsatptr);
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x21: vsadd_vv({{
+                        Vd_vu[i] = sat_add<vi>(Vs2_vu[i], Vs1_vu[i],
+                            vxsatptr);
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x22: vssubu_vv({{
+                        Vd_vu[i] = sat_subu<vu>(Vs2_vu[i], Vs1_vu[i],
+                            vxsatptr);
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x23: vssub_vv({{
+                        Vd_vu[i] = sat_sub<vi>(Vs2_vu[i], Vs1_vu[i],
+                            vxsatptr);
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x27: vsmul_vv({{
+                        vi max = std::numeric_limits<vi>::max();
+                        vi min = std::numeric_limits<vi>::min();
+                        bool overflow = Vs1_vi[i] == Vs2_vi[i] &&
+                                        Vs1_vi[i] == min;
+                        __int128_t result = (__int128_t)Vs1_vi[i] *
+                                            (__int128_t)Vs2_vi[i];
+                        result = int_rounding<__int128_t>(
+                            result, 0 /* TODO */, sew - 1);
+                        result = result >> (sew - 1);
+                        if (overflow) {
+                            result = max;
+                            *vxsatptr = true;
+                        }
+
+                        Vd_vi[i] = (vi)result;
+                    }}, OPIVV, VectorIntegerArithOp);
+                }
+                format VectorIntFormat {
+                    0x25: vsll_vv({{
+                        Vd_vu[i] = Vs2_vu[i] << (Vs1_vu[i] & (sew - 1));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x28: vsrl_vv({{
+                        Vd_vu[i] = Vs2_vu[i] >> (Vs1_vu[i] & (sew - 1));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x29: vsra_vv({{
+                        Vd_vi[i] = Vs2_vi[i] >> (Vs1_vu[i] & (sew - 1));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x2a: vssrl_vv({{
+                        int sh = Vs1_vu[i] & (sew - 1);
+                        __uint128_t val = Vs2_vu[i];
+
+                        val = int_rounding<__uint128_t>(val,
+                            xc->readMiscReg(MISCREG_VXRM), sh);
+                        Vd_vu[i] = val >> sh;
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x2b: vssra_vv({{
+                        int sh = Vs1_vi[i] & (sew - 1);
+                        __int128_t val = Vs2_vi[i];
+
+                        val = int_rounding<__int128_t>(val,
+                            xc->readMiscReg(MISCREG_VXRM), sh);
+                        Vd_vi[i] = val >> sh;
+                    }}, OPIVV, VectorIntegerArithOp);
+                }
+                format VectorReduceIntWideningFormat {
+                    0x30: vwredsumu_vs({{
+                        Vd_vwu[0] = reduce_loop(std::plus<vwu>(),
+                            Vs1_vwu, Vs2_vu);
+                    }}, OPIVV, VectorIntegerReduceOp);
+                    0x31: vwredsum_vs({{
+                        Vd_vwu[0] = reduce_loop(std::plus<vwi>(),
+                            Vs1_vwi, Vs2_vi);
+                    }}, OPIVV, VectorIntegerReduceOp);
+                }
+                format VectorIntMaskFormat {
+                    0x11: decode VM {
+                        0x0: vmadc_vvm({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vu[i], Vs1_vu[i],
+                                    elem_mask(v0, ei)));
+                        }}, OPIVV, VectorIntegerArithOp);
+                        0x1: vmadc_vv({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vu[i], Vs1_vu[i]));
+                        }}, OPIVV, VectorIntegerArithOp);
+                    }
+                    0x13: decode VM {
+                        0x0: vmsbc_vvm({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                borrow_out(Vs2_vi[i], Vs1_vi[i],
+                                    elem_mask(v0, ei)));
+                        }}, OPIVV, VectorIntegerArithOp);
+                        0x1: vmsbc_vv({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                borrow_out(Vs2_vi[i], Vs1_vi[i]));
+                        }}, OPIVV, VectorIntegerArithOp);
+                    }
+                    0x18: vmseq_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] == Vs1_vu[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x19: vmsne_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] != Vs1_vu[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x1a: vmsltu_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] < Vs1_vu[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x1b: vmslt_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] < Vs1_vi[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x1c: vmsleu_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] <= Vs1_vu[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                    0x1d: vmsle_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] <= Vs1_vi[i]));
+                    }}, OPIVV, VectorIntegerArithOp);
+                }
+                // Narrowing integer ops: destination element Vd[i + offset]
+                // is produced from the wide source element Vs2[i]. The shift
+                // amount is read from Vs1[i + offset] and masked with
+                // (2 * sew - 1).
+                format VectorIntNarrowingFormat {
+                    // Right shift of the wide unsigned view, truncated to vu.
+                    0x2c: vnsrl_wv({{
+                        const auto shamt =
+                            (vwu)Vs1_vu[i + offset] & (sew * 2 - 1);
+                        Vd_vu[i + offset] = (vu)(Vs2_vwu[i] >> shamt);
+                    }}, OPIVV, VectorIntegerArithOp);
+                    // Right shift of the wide signed view, truncated to vi.
+                    0x2d: vnsra_wv({{
+                        const auto shamt =
+                            (vwu)Vs1_vu[i + offset] & (sew * 2 - 1);
+                        Vd_vi[i + offset] = (vi)(Vs2_vwi[i] >> shamt);
+                    }}, OPIVV, VectorIntegerArithOp);
+                    // Rounded right shift followed by a clamp to the largest
+                    // vu value. The rounding-mode argument is hard-coded to 0
+                    // and the saturation flag is not recorded yet (see TODOs).
+                    0x2e: vnclipu_wv({{
+                        const unsigned shamt =
+                            Vs1_vu[i + offset] & ((sew * 2) - 1);
+                        // Mask of every bit at position sew and above.
+                        const uint64_t high_bits =
+                            std::numeric_limits<uint64_t>::max() << sew;
+                        const vu sat_max = std::numeric_limits<vu>::max();
+
+                        __uint128_t wide = Vs2_vwu[i];
+                        wide = int_rounding<__uint128_t>(
+                            wide, 0 /* TODO */, shamt) >> shamt;
+
+                        // Any bit left at or above sew means the value does
+                        // not fit in the narrow element.
+                        if (wide & high_bits) {
+                            wide = sat_max;
+                            // TODO: vxsat
+                        }
+
+                        Vd_vu[i + offset] = (vu)wide;
+                    }}, OPIVV, VectorIntegerArithOp);
+                    // Signed variant: rounded right shift followed by a clamp
+                    // to the [min, max] range of vi. Same TODOs as above.
+                    0x2f: vnclip_wv({{
+                        const unsigned shamt =
+                            Vs1_vi[i + offset] & ((sew * 2) - 1);
+                        const vi sat_min = std::numeric_limits<vi>::min();
+                        const vi sat_max = std::numeric_limits<vi>::max();
+
+                        __int128_t wide = Vs2_vwi[i];
+                        wide = int_rounding<__int128_t>(
+                            wide, 0 /* TODO */, shamt) >> shamt;
+
+                        if (wide > sat_max) {
+                            wide = sat_max;
+                            // TODO: vxsat
+                        } else if (wide < sat_min) {
+                            wide = sat_min;
+                            // TODO: vxsat
+                        }
+
+                        Vd_vi[i + offset] = (vi)wide;
+                    }}, OPIVV, VectorIntegerArithOp);
+                }
+            }
+            // OPFVV
+            0x1: decode VFUNCT6 {
+                // Element-wise add/sub/min/max store the raw bits (.v) of the
+                // result in Vd[i]. The *_vs reductions fold Vs1 and Vs2
+                // through reduce_loop and store the result in Vd[0].
+                0x00: VectorFloatFormat::vfadd_vv({{
+                    Vd_vu[i] = fadd<et>(ftype<et>(Vs2_vu[i]),
+                                        ftype<et>(Vs1_vu[i])).v;
+                }}, OPFVV, VectorFloatArithOp);
+                0x01: VectorReduceFloatFormat::vfredusum_vs({{
+                    Vd_vu[0] = reduce_loop([](const vu& lhs, const vu& rhs) {
+                        return fadd<et>(ftype<et>(lhs), ftype<et>(rhs));
+                    }, Vs1_vu, Vs2_vu);
+                }}, OPFVV, VectorFloatReduceOp);
+                0x02: VectorFloatFormat::vfsub_vv({{
+                    Vd_vu[i] = fsub<et>(ftype<et>(Vs2_vu[i]),
+                                        ftype<et>(Vs1_vu[i])).v;
+                }}, OPFVV, VectorFloatArithOp);
+                // Uses the same reduction lambda as vfredusum_vs.
+                0x03: VectorReduceFloatFormat::vfredosum_vs({{
+                    Vd_vu[0] = reduce_loop([](const vu& lhs, const vu& rhs) {
+                        return fadd<et>(ftype<et>(lhs), ftype<et>(rhs));
+                    }, Vs1_vu, Vs2_vu);
+                }}, OPFVV, VectorFloatReduceOp);
+                0x04: VectorFloatFormat::vfmin_vv({{
+                    Vd_vu[i] = fmin<et>(ftype<et>(Vs2_vu[i]),
+                                        ftype<et>(Vs1_vu[i])).v;
+                }}, OPFVV, VectorFloatArithOp);
+                0x05: VectorReduceFloatFormat::vfredmin_vs({{
+                    Vd_vu[0] = reduce_loop([](const vu& lhs, const vu& rhs) {
+                        return fmin<et>(ftype<et>(lhs), ftype<et>(rhs));
+                    }, Vs1_vu, Vs2_vu);
+                }}, OPFVV, VectorFloatReduceOp);
+                0x06: VectorFloatFormat::vfmax_vv({{
+                    Vd_vu[i] = fmax<et>(ftype<et>(Vs2_vu[i]),
+                                        ftype<et>(Vs1_vu[i])).v;
+                }}, OPFVV, VectorFloatArithOp);
+                0x07: VectorReduceFloatFormat::vfredmax_vs({{
+                    Vd_vu[0] = reduce_loop([](const vu& lhs, const vu& rhs) {
+                        return fmax<et>(ftype<et>(lhs), ftype<et>(rhs));
+                    }, Vs1_vu, Vs2_vu);
+                }}, OPFVV, VectorFloatReduceOp);
+                // Sign-injection family: all three call fsgnj on Vs2[i] and
+                // Vs1[i] and differ only in the two trailing boolean flags
+                // (presumably "negate" and "xor" -- confirm against fsgnj).
+                0x08: VectorFloatFormat::vfsgnj_vv({{
+                    auto fd = fsgnj<et>(ftype<et>(Vs2_vu[i]),
+                                        ftype<et>(Vs1_vu[i]),
+                                        false, false);
+                    Vd_vu[i] = fd.v;
+                }}, OPFVV, VectorFloatArithOp);
+                0x09: VectorFloatFormat::vfsgnjn_vv({{
+                    auto fd = fsgnj<et>(ftype<et>(Vs2_vu[i]),
+                                        ftype<et>(Vs1_vu[i]),
+                                        true, false);
+                    Vd_vu[i] = fd.v;
+                }}, OPFVV, VectorFloatArithOp);
+                0x0a: VectorFloatFormat::vfsgnjx_vv({{
+                    auto fd = fsgnj<et>(ftype<et>(Vs2_vu[i]),
+                                        ftype<et>(Vs1_vu[i]),
+                                        false, true);
+                    Vd_vu[i] = fd.v;
+                }}, OPFVV, VectorFloatArithOp);
+                // VWFUNARY0
+                0x10: decode VS1 {
+                    0x00: decode VM {
+                        // The encodings corresponding to the masked versions
+                        // (vm=0) of vfmv.f.s are reserved
+                        //
+                        // Copies element 0 of Vs2 into Fd: the raw element
+                        // bits go through freg() and the resulting .v is
+                        // written to Fd_bits.
+                        // NOTE(review): freg() receives the raw unsigned bits
+                        // here rather than ftype<et>(...) as the neighbouring
+                        // snippets use -- confirm the selected overload
+                        // NaN-boxes elements narrower than the FP register.
+                        0x1: VectorNonSplitFormat::vfmv_f_s({{
+                            freg_t fd = freg(Vs2_vu[0]);
+                            Fd_bits = fd.v;
+                        }}, OPFVV, VectorMiscOp);
+                    }
+                }
+                0x12: decode VS1 {
+                    // Same-width conversions between float and integer
+                    // elements. The plain forms pass softfloat_roundingMode;
+                    // the rtz forms pass softfloat_round_minMag instead.
+                    format VectorFloatCvtFormat {
+                        0x00: vfcvt_xu_f_v({{
+                            auto in_val = ftype<et>(Vs2_vu[i]);
+                            Vd_vu[i] = f_to_ui<et>(in_val,
+                                                   softfloat_roundingMode);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x01: vfcvt_x_f_v({{
+                            auto in_val = ftype<et>(Vs2_vu[i]);
+                            Vd_vu[i] = f_to_i<et>(in_val,
+                                                  softfloat_roundingMode);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x02: vfcvt_f_xu_v({{
+                            Vd_vu[i] = ui_to_f<et>(Vs2_vu[i]).v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x03: vfcvt_f_x_v({{
+                            Vd_vu[i] = i_to_f<et>(Vs2_vu[i]).v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x06: vfcvt_rtz_xu_f_v({{
+                            auto in_val = ftype<et>(Vs2_vu[i]);
+                            Vd_vu[i] = f_to_ui<et>(in_val,
+                                                   softfloat_round_minMag);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x07: vfcvt_rtz_x_f_v({{
+                            auto in_val = ftype<et>(Vs2_vu[i]);
+                            Vd_vu[i] = f_to_i<et>(in_val,
+                                                  softfloat_round_minMag);
+                        }}, OPFVV, VectorFloatConvertOp);
+                    }
+                    // Widening conversions: narrow source element
+                    // Vs2[i + offset] is converted into the wide destination
+                    // element Vd_vwu[i].
+                    format VectorFloatWideningCvtFormat {
+                        0x08: vfwcvt_xu_f_v({{
+                            auto in_val = ftype<et>(Vs2_vu[i + offset]);
+                            Vd_vwu[i] = f_to_wui<et>(
+                                in_val, softfloat_roundingMode);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x09: vfwcvt_x_f_v({{
+                            auto in_val = ftype<et>(Vs2_vu[i + offset]);
+                            Vd_vwu[i] = f_to_wi<et>(
+                                in_val, softfloat_roundingMode);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x0a: vfwcvt_f_xu_v({{
+                            Vd_vwu[i] = ui_to_wf<vu>(Vs2_vu[i + offset]).v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x0b: vfwcvt_f_x_v({{
+                            Vd_vwu[i] = i_to_wf<vu>(Vs2_vu[i + offset]).v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x0c: vfwcvt_f_f_v({{
+                            auto in_val = ftype<et>(Vs2_vu[i + offset]);
+                            Vd_vwu[i] = f_to_wf<et>(in_val).v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x0e: vfwcvt_rtz_xu_f_v({{
+                            auto in_val = ftype<et>(Vs2_vu[i + offset]);
+                            Vd_vwu[i] = f_to_wui<et>(
+                                in_val, softfloat_round_minMag);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x0f: vfwcvt_rtz_x_f_v({{
+                            auto in_val = ftype<et>(Vs2_vu[i + offset]);
+                            Vd_vwu[i] = f_to_wi<et>(
+                                in_val, softfloat_round_minMag);
+                        }}, OPFVV, VectorFloatConvertOp);
+                    }
+                    // Narrowing conversions: wide source element Vs2_vwu[i]
+                    // is converted into the narrow destination element
+                    // Vd_vu[i + offset]. The plain forms pass
+                    // softfloat_roundingMode; the rtz forms pass
+                    // softfloat_round_minMag.
+                    format VectorFloatNarrowingCvtFormat {
+                        0x10: vfncvt_xu_f_w({{
+                            Vd_vu[i + offset] = f_to_nui<vu>(
+                                ftype<ewt>(Vs2_vwu[i]),
+                                softfloat_roundingMode);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x11: vfncvt_x_f_w({{
+                            Vd_vu[i + offset] = f_to_ni<vu>(
+                                ftype<ewt>(Vs2_vwu[i]),
+                                softfloat_roundingMode);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x12: vfncvt_f_xu_w({{
+                            auto fd = ui_to_nf<et>(Vs2_vwu[i]);
+                            Vd_vu[i + offset] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x13: vfncvt_f_x_w({{
+                            auto fd = i_to_nf<et>(Vs2_vwu[i]);
+                            Vd_vu[i + offset] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x14: vfncvt_f_f_w({{
+                            auto fd = f_to_nf<et>(ftype<ewt>(Vs2_vwu[i]));
+                            Vd_vu[i + offset] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x15: vfncvt_rod_f_f_w({{
+                            // Overwrites softfloat_roundingMode before the
+                            // conversion; this snippet does not restore it.
+                            // NOTE(review): confirm the surrounding format
+                            // or later instructions reset the rounding mode.
+                            softfloat_roundingMode = softfloat_round_odd;
+                            auto fd = f_to_nf<et>(ftype<ewt>(Vs2_vwu[i]));
+                            Vd_vu[i + offset] = fd.v;
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x16: vfncvt_rtz_xu_f_w({{
+                            Vd_vu[i + offset] = f_to_nui<vu>(
+                                ftype<ewt>(Vs2_vwu[i]),
+                                softfloat_round_minMag);
+                        }}, OPFVV, VectorFloatConvertOp);
+                        0x17: vfncvt_rtz_x_f_w({{
+                            Vd_vu[i + offset] = f_to_ni<vu>(
+                                ftype<ewt>(Vs2_vwu[i]),
+                                softfloat_round_minMag);
+                        }}, OPFVV, VectorFloatConvertOp);
+                    }
+                }
+                // Single-source float ops selected by the VS1 field; each
+                // applies one helper to Vs2[i] and stores the result bits.
+                0x13: decode VS1 {
+                    format VectorFloatCvtFormat {
+                        0x00: vfsqrt_v({{
+                            Vd_vu[i] = fsqrt<et>(ftype<et>(Vs2_vu[i])).v;
+                        }}, OPFVV, VectorFloatArithOp);
+                        0x04: vfrsqrt7_v({{
+                            Vd_vu[i] = frsqrte7<et>(ftype<et>(Vs2_vu[i])).v;
+                        }}, OPFVV, VectorFloatArithOp);
+                        0x05: vfrec7_v({{
+                            Vd_vu[i] = frecip7<et>(ftype<et>(Vs2_vu[i])).v;
+                        }}, OPFVV, VectorFloatArithOp);
+                        0x10: vfclass_v({{
+                            Vd_vu[i] = fclassify<et>(ftype<et>(Vs2_vu[i])).v;
+                        }}, OPFVV, VectorFloatArithOp);
+                    }
+                }
+
+                // Float compares producing a mask: the comparison result for
+                // element i is merged into bit (i + offset) of the
+                // destination via ASSIGN_VD_BIT and stored in the byte
+                // Vd_ub[(i + offset)/8]. Vs2 is always the left-hand operand.
+                format VectorFloatMaskFormat {
+                    0x18: vmfeq_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            feq<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype<et>(Vs1_vu[i])));
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x19: vmfle_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            fle<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype<et>(Vs1_vu[i])));
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x1b: vmflt_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            flt<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype<et>(Vs1_vu[i])));
+                    }}, OPFVV, VectorFloatArithOp);
+                    // Not-equal is the logical negation of feq.
+                    0x1c: vmfne_vv({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            !feq<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype<et>(Vs1_vu[i])));
+                    }}, OPFVV, VectorFloatArithOp);
+                }
+                // Divide, multiply and the fused multiply-add family. Vs3 is
+                // read as a source in the FMA snippets (presumably the old
+                // value of the destination register -- confirm in the
+                // format). Negated variants apply fneg to the operands
+                // before calling fmadd.
+                format VectorFloatFormat {
+                    0x20: vfdiv_vv({{
+                        Vd_vu[i] = fdiv<et>(ftype<et>(Vs2_vu[i]),
+                                            ftype<et>(Vs1_vu[i])).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x24: vfmul_vv({{
+                        Vd_vu[i] = fmul<et>(ftype<et>(Vs2_vu[i]),
+                                            ftype<et>(Vs1_vu[i])).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    // The *madd/*msub forms pass (Vs3, Vs1, Vs2) to fmadd.
+                    0x28: vfmadd_vv({{
+                        Vd_vu[i] = fmadd<et>(ftype<et>(Vs3_vu[i]),
+                                             ftype<et>(Vs1_vu[i]),
+                                             ftype<et>(Vs2_vu[i])).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x29: vfnmadd_vv({{
+                        Vd_vu[i] = fmadd<et>(fneg(ftype<et>(Vs3_vu[i])),
+                                             ftype<et>(Vs1_vu[i]),
+                                             fneg(ftype<et>(Vs2_vu[i]))).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2a: vfmsub_vv({{
+                        Vd_vu[i] = fmadd<et>(ftype<et>(Vs3_vu[i]),
+                                             ftype<et>(Vs1_vu[i]),
+                                             fneg(ftype<et>(Vs2_vu[i]))).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2b: vfnmsub_vv({{
+                        Vd_vu[i] = fmadd<et>(fneg(ftype<et>(Vs3_vu[i])),
+                                             ftype<et>(Vs1_vu[i]),
+                                             ftype<et>(Vs2_vu[i])).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    // The *macc/*msac forms pass (Vs1, Vs2, Vs3) to fmadd.
+                    0x2c: vfmacc_vv({{
+                        Vd_vu[i] = fmadd<et>(ftype<et>(Vs1_vu[i]),
+                                             ftype<et>(Vs2_vu[i]),
+                                             ftype<et>(Vs3_vu[i])).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2d: vfnmacc_vv({{
+                        Vd_vu[i] = fmadd<et>(fneg(ftype<et>(Vs1_vu[i])),
+                                             ftype<et>(Vs2_vu[i]),
+                                             fneg(ftype<et>(Vs3_vu[i]))).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2e: vfmsac_vv({{
+                        Vd_vu[i] = fmadd<et>(ftype<et>(Vs1_vu[i]),
+                                             ftype<et>(Vs2_vu[i]),
+                                             fneg(ftype<et>(Vs3_vu[i]))).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x2f: vfnmsac_vv({{
+                        Vd_vu[i] = fmadd<et>(fneg(ftype<et>(Vs1_vu[i])),
+                                             ftype<et>(Vs2_vu[i]),
+                                             ftype<et>(Vs3_vu[i])).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    // Widening sum reductions: the narrow lambda argument is
+                    // widened with f_to_wf before being added to the wide
+                    // one. Both encodings use the same lambda.
+                    0x31: VectorReduceFloatWideningFormat::vfwredusum_vs({{
+                        Vd_vwu[0] = reduce_loop(
+                            [](const vwu& wide_in, const vu& narrow_in) {
+                                return fadd<ewt>(
+                                    ftype<ewt>(wide_in),
+                                    f_to_wf<et>(ftype<et>(narrow_in))
+                                );
+                            }, Vs1_vwu, Vs2_vu);
+                    }}, OPFVV, VectorFloatReduceOp);
+                    0x33: VectorReduceFloatWideningFormat::vfwredosum_vs({{
+                        Vd_vwu[0] = reduce_loop(
+                            [](const vwu& wide_in, const vu& narrow_in) {
+                                return fadd<ewt>(
+                                    ftype<ewt>(wide_in),
+                                    f_to_wf<et>(ftype<et>(narrow_in))
+                                );
+                            }, Vs1_vwu, Vs2_vu);
+                    }}, OPFVV, VectorFloatReduceOp);
+                }
+                // Widening float arithmetic: narrow sources are read at
+                // [i + offset] and passed through fwiden; wide operands
+                // (the _vwu views) are read at [i]. Results are stored as
+                // raw bits in Vd_vwu[i].
+                format VectorFloatWideningFormat {
+                    0x30: vfwadd_vv({{
+                        auto wide_s2 = fwiden(ftype<et>(Vs2_vu[i + offset]));
+                        auto wide_s1 = fwiden(ftype<et>(Vs1_vu[i + offset]));
+                        Vd_vwu[i] = fadd<ewt>(wide_s2, wide_s1).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x32: vfwsub_vv({{
+                        auto wide_s2 = fwiden(ftype<et>(Vs2_vu[i + offset]));
+                        auto wide_s1 = fwiden(ftype<et>(Vs1_vu[i + offset]));
+                        Vd_vwu[i] = fsub<ewt>(wide_s2, wide_s1).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    // The _wv forms take Vs2 already at the wide width.
+                    0x34: vfwadd_wv({{
+                        auto wide_s2 = ftype<ewt>(Vs2_vwu[i]);
+                        auto wide_s1 = fwiden(ftype<et>(Vs1_vu[i + offset]));
+                        Vd_vwu[i] = fadd<ewt>(wide_s2, wide_s1).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x36: vfwsub_wv({{
+                        auto wide_s2 = ftype<ewt>(Vs2_vwu[i]);
+                        auto wide_s1 = fwiden(ftype<et>(Vs1_vu[i + offset]));
+                        Vd_vwu[i] = fsub<ewt>(wide_s2, wide_s1).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x38: vfwmul_vv({{
+                        auto wide_s2 = fwiden(ftype<et>(Vs2_vu[i + offset]));
+                        auto wide_s1 = fwiden(ftype<et>(Vs1_vu[i + offset]));
+                        Vd_vwu[i] = fmul<ewt>(wide_s2, wide_s1).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    // Widening FMA forms pass (Vs1, Vs2, Vs3) to fmadd, with
+                    // fneg applied to the narrow Vs1 and/or the wide Vs3.
+                    0x3c: vfwmacc_vv({{
+                        Vd_vwu[i] = fmadd<ewt>(
+                            fwiden(ftype<et>(Vs1_vu[i + offset])),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            ftype<ewt>(Vs3_vwu[i])).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x3d: vfwnmacc_vv({{
+                        Vd_vwu[i] = fmadd<ewt>(
+                            fwiden(fneg(ftype<et>(Vs1_vu[i + offset]))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fneg(ftype<ewt>(Vs3_vwu[i]))).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x3e: vfwmsac_vv({{
+                        Vd_vwu[i] = fmadd<ewt>(
+                            fwiden(ftype<et>(Vs1_vu[i + offset])),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fneg(ftype<ewt>(Vs3_vwu[i]))).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                    0x3f: vfwnmsac_vv({{
+                        Vd_vwu[i] = fmadd<ewt>(
+                            fwiden(fneg(ftype<et>(Vs1_vu[i + offset]))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            ftype<ewt>(Vs3_vwu[i])).v;
+                    }}, OPFVV, VectorFloatArithOp);
+                }
+            }
+            // OPMVV
+            0x2: decode VFUNCT6 {
+                // Integer reductions: each snippet hands a binary functor
+                // plus the Vs1 and Vs2 views to reduce_loop and stores the
+                // returned value in element 0 of Vd. The unsigned min/max
+                // forms use the _vu views; all others use the _vi views.
+                format VectorReduceIntFormat {
+                    0x0: vredsum_vs({{
+                        Vd_vi[0] =
+                            reduce_loop(std::plus<vi>(), Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x1: vredand_vs({{
+                        Vd_vi[0] =
+                            reduce_loop(std::bit_and<vi>(), Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x2: vredor_vs({{
+                        Vd_vi[0] =
+                            reduce_loop(std::bit_or<vi>(), Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x3: vredxor_vs({{
+                        Vd_vi[0] =
+                            reduce_loop(std::bit_xor<vi>(), Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x4: vredminu_vs({{
+                        Vd_vu[0] =
+                            reduce_loop([](const vu& src1, const vu& src2) {
+                                return std::min<vu>(src1, src2);
+                            }, Vs1_vu, Vs2_vu);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x5: vredmin_vs({{
+                        Vd_vi[0] =
+                            reduce_loop([](const vi& src1, const vi& src2) {
+                                return std::min<vi>(src1, src2);
+                            }, Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x6: vredmaxu_vs({{
+                        Vd_vu[0] =
+                            reduce_loop([](const vu& src1, const vu& src2) {
+                                return std::max<vu>(src1, src2);
+                            }, Vs1_vu, Vs2_vu);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                    0x7: vredmax_vs({{
+                        Vd_vi[0] =
+                            reduce_loop([](const vi& src1, const vi& src2) {
+                                return std::max<vi>(src1, src2);
+                            }, Vs1_vi, Vs2_vi);
+                    }}, OPMVV, VectorIntegerReduceOp);
+                }
+                // Averaging add/subtract: the sum or difference is formed in
+                // a 128-bit unsigned temporary, passed through int_rounding
+                // with one bit to drop, then shifted right by one. The
+                // rounding-mode argument is hard-coded to 0 (see TODO).
+                format VectorIntFormat {
+                    0x8: vaaddu_vv({{
+                        __uint128_t sum = (__uint128_t)Vs2_vu[i] + Vs1_vu[i];
+                        const __uint128_t rounded =
+                            int_rounding<__uint128_t>(sum, 0 /* TODO */, 1);
+                        Vd_vu[i] = rounded >> 1;
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x9: vaadd_vv({{
+                        __uint128_t sum = (__uint128_t)Vs2_vi[i] + Vs1_vi[i];
+                        const __uint128_t rounded =
+                            int_rounding<__uint128_t>(sum, 0 /* TODO */, 1);
+                        Vd_vi[i] = rounded >> 1;
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0xa: vasubu_vv({{
+                        __uint128_t diff = (__uint128_t)Vs2_vu[i] - Vs1_vu[i];
+                        const __uint128_t rounded =
+                            int_rounding<__uint128_t>(diff, 0 /* TODO */, 1);
+                        Vd_vu[i] = rounded >> 1;
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0xb: vasub_vv({{
+                        __uint128_t diff = (__uint128_t)Vs2_vi[i] - Vs1_vi[i];
+                        const __uint128_t rounded =
+                            int_rounding<__uint128_t>(diff, 0 /* TODO */, 1);
+                        Vd_vi[i] = rounded >> 1;
+                    }}, OPMVV, VectorIntegerArithOp);
+                }
+                // VWXUNARY0
+                0x10: decode VS1 {
+                    0x00: decode VM {
+                        // The encodings corresponding to the masked versions
+                        // (vm=0) of vmv.x.s are reserved.
+                        // Copies element 0 of Vs2 (the _vi view) into Rd.
+                        0x1: VectorNonSplitFormat::vmv_x_s({{
+                            Rd_ud = Vs2_vi[0];
+                        }}, OPMVV, VectorMiscOp);
+                    }
+                    // Counts the set mask bits of Vs2 among the first
+                    // machInst.vl elements; when vm is clear only elements
+                    // whose v0 mask bit is set are counted.
+                    0x10: Vector1Vs1RdMaskFormat::vcpop_m({{
+                        uint64_t ones = 0;
+                        const uint32_t num_elems = (uint32_t)machInst.vl;
+                        for (uint32_t idx = 0; idx < num_elems; ++idx) {
+                            if (!elem_mask(Vs2_vu, idx)) {
+                                continue;
+                            }
+                            if (this->vm || elem_mask(v0, idx)) {
+                                ++ones;
+                            }
+                        }
+                        Rd_vu = ones;
+                    }}, OPMVV, VectorMiscOp);
+                    // Writes the index of the first set mask bit of Vs2
+                    // among the active elements, or -1 when none is set.
+                    0x11: Vector1Vs1RdMaskFormat::vfirst_m({{
+                        int64_t first_set = -1;
+                        const uint32_t num_elems = (uint32_t)machInst.vl;
+                        for (uint32_t idx = 0; idx < num_elems; ++idx) {
+                            // Masked-off elements are skipped entirely.
+                            const bool inactive =
+                                (this->vm == 0) && (elem_mask(v0, idx) == 0);
+                            if (!inactive && elem_mask(Vs2_vu, idx)) {
+                                first_set = idx;
+                                break;
+                            }
+                        }
+                        Rd_vu = first_set;
+                    }}, OPMVV, VectorMiscOp);
+                }
+                // Integer extension ops selected by the VS1 field. The zext
+                // forms copy from the _vextu view of Vs2 into Vd_vu, the
+                // sext forms from the _vext view into Vd_vi. The vf8/vf4/vf2
+                // snippets are textually identical.
+                // NOTE(review): presumably the format derives the source
+                // element width from the mnemonic suffix -- confirm.
+                0x12: decode VS1 {
+                    format VectorIntExtFormat {
+                        0x02: vzext_vf8({{
+                            Vd_vu[i] = Vs2_vextu[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                        0x03: vsext_vf8({{
+                            Vd_vi[i] = Vs2_vext[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                        0x04: vzext_vf4({{
+                            Vd_vu[i] = Vs2_vextu[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                        0x05: vsext_vf4({{
+                            Vd_vi[i] = Vs2_vext[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                        0x06: vzext_vf2({{
+                            Vd_vu[i] = Vs2_vextu[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                        0x07: vsext_vf2({{
+                            Vd_vi[i] = Vs2_vext[i + offset];
+                        }}, OPMVV, VectorIntegerExtensionOp);
+                    }
+                }
+                0x14: decode VS1 {
+                    // Set-before-first: among active elements, writes 1
+                    // until the first set bit of Vs2 is met, then 0 for that
+                    // element and every later one. Inactive elements are
+                    // left untouched and do not affect the scan.
+                    0x01: Vector1Vs1VdMaskFormat::vmsbf_m({{
+                        bool seen_one = false;
+                        for (uint32_t i = 0; i < (uint32_t)machInst.vl; i++) {
+                            const bool src_bit = elem_mask(Vs2_vu, i);
+                            const bool do_mask = elem_mask(v0, i);
+                            if (!(this->vm || do_mask)) {
+                                continue;
+                            }
+                            const uint64_t out_bit =
+                                (!seen_one && !src_bit) ? 1 : 0;
+                            seen_one = seen_one || src_bit;
+                            Vd_ub[i/8] = ASSIGN_VD_BIT(i, out_bit);
+                        }
+                    }}, OPMVV, VectorMiscOp);
+                    // Set-only-first: among active elements, writes 1 only
+                    // at the first set bit of Vs2 and 0 everywhere else.
+                    // Inactive elements are left untouched.
+                    0x02: Vector1Vs1VdMaskFormat::vmsof_m({{
+                        bool seen_one = false;
+                        for (uint32_t i = 0; i < (uint32_t)machInst.vl; i++) {
+                            const bool src_bit = elem_mask(Vs2_vu, i);
+                            const bool do_mask = elem_mask(v0, i);
+                            if (!(this->vm || do_mask)) {
+                                continue;
+                            }
+                            const uint64_t out_bit =
+                                (!seen_one && src_bit) ? 1 : 0;
+                            seen_one = seen_one || src_bit;
+                            Vd_ub[i/8] = ASSIGN_VD_BIT(i, out_bit);
+                        }
+                    }}, OPMVV, VectorMiscOp);
+                    // Set-including-first: among active elements, writes 1
+                    // up to and including the first set bit of Vs2, then 0
+                    // afterwards. Inactive elements are left untouched.
+                    0x03: Vector1Vs1VdMaskFormat::vmsif_m({{
+                        bool seen_one = false;
+                        for (uint32_t i = 0; i < (uint32_t)machInst.vl; i++) {
+                            const bool src_bit = elem_mask(Vs2_vu, i);
+                            const bool do_mask = elem_mask(v0, i);
+                            if (!(this->vm || do_mask)) {
+                                continue;
+                            }
+                            // The output depends only on whether a set bit
+                            // was seen before this element.
+                            const uint64_t out_bit = seen_one ? 0 : 1;
+                            seen_one = seen_one || src_bit;
+                            Vd_ub[i/8] = ASSIGN_VD_BIT(i, out_bit);
+                        }
+                    }}, OPMVV, VectorMiscOp);
+                    // viota.m: for each element of this micro-op, writes the
+                    // running count *cnt to Vd (unless the element is masked
+                    // off) and then increments *cnt when the element is
+                    // active and its Vs2 mask bit is set.
+                    // NOTE(review): cnt is defined outside this snippet;
+                    // presumably it is shared across micro-ops -- confirm in
+                    // ViotaFormat.
+                    0x10: ViotaFormat::viota_m({{
+                        // Read source operand 2 into a local container and
+                        // view it as vu so mask bits can be indexed by the
+                        // global element index ei.
+                        RiscvISAInst::VecRegContainer tmp_s2;
+                        xc->getRegOperand(this, 2,
+                            &tmp_s2);
+                        auto Vs2bit = tmp_s2.as<vu>();
+                        for (uint32_t i = 0; i < this->microVl; i++) {
+                            // ei offsets the micro-op local index i by
+                            // microIdx times the value of vtype_VLMAX.
+                            uint32_t ei = i +
+                                vtype_VLMAX(vtype, true) * this->microIdx;
+                            bool vs2_lsb = elem_mask(Vs2bit, ei);
+                            bool do_mask = elem_mask(v0, ei);
+                            bool has_one = false;
+                            if (this->vm || (do_mask && !this->vm)) {
+                                if (vs2_lsb) {
+                                    has_one = true;
+                                }
+                            }
+                            // use_ori is true for masked-off elements, whose
+                            // destination value is left unchanged.
+                            bool use_ori = (!this->vm) && !do_mask;
+                            if(use_ori == false){
+                                Vd_vu[i] = *cnt;
+                            }
+                            if (has_one) {
+                                *cnt = *cnt+1;
+                            }
+                        }
+                    }}, OPMVV, VectorMiscOp);
+                    // vid.v: writes ei into Vd[i]. ei is not defined in this
+                    // snippet; presumably the format supplies it as the
+                    // global element index -- confirm in VectorIntFormat.
+                    0x11: VectorIntFormat::vid_v({{
+                        Vd_vu[i] = ei;
+                    }}, OPMVV, VectorMiscOp);
+                }
+                // Mask-register logical ops: bit i of the result is a
+                // boolean function of bit i of Vs2 and bit i of Vs1 (read
+                // with elem_mask), merged into the destination byte
+                // Vd_ub[i/8] through ASSIGN_VD_BIT. In the andn/orn forms
+                // it is the Vs1 bit that is negated.
+                format VectorMaskFormat {
+                    0x18: vmandn_mm({{
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            elem_mask(Vs2_vu, i) & !elem_mask(Vs1_vu, i));
+                    }}, OPMVV, VectorMiscOp);
+                    0x19: vmand_mm({{
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            elem_mask(Vs2_vu, i) & elem_mask(Vs1_vu, i));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1a: vmor_mm({{
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            elem_mask(Vs2_vu, i) | elem_mask(Vs1_vu, i));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1b: vmxor_mm({{
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            elem_mask(Vs2_vu, i) ^ elem_mask(Vs1_vu, i));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1c: vmorn_mm({{
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            elem_mask(Vs2_vu, i) | !elem_mask(Vs1_vu, i));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1d: vmnand_mm({{
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            !(elem_mask(Vs2_vu, i) & elem_mask(Vs1_vu, i)));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1e: vmnor_mm({{
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            !(elem_mask(Vs2_vu, i) | elem_mask(Vs1_vu, i)));
+                    }}, OPMVV, VectorMiscOp);
+                    0x1f: vmxnor_mm({{
+                        Vd_ub[i/8] = ASSIGN_VD_BIT(i,
+                            !(elem_mask(Vs2_vu, i) ^ elem_mask(Vs1_vu, i)));
+                    }}, OPMVV, VectorMiscOp);
+                }
+                // Vector-vector integer divide, remainder, multiply and
+                // multiply-accumulate. vs2 is the left operand and vs1 the
+                // right operand of every binary operation below.
+                format VectorIntFormat {
+                    // vdivu.vv: unsigned divide; a zero divisor yields
+                    // all ones.
+                    0x20: vdivu_vv({{
+                        if (Vs1_vu[i] == 0)
+                            Vd_vu[i] = (vu)-1;
+                        else
+                            Vd_vu[i] = Vs2_vu[i] / Vs1_vu[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vdiv.vv: signed divide; a zero divisor yields -1 and
+                    // the overflowing case (min / -1) yields the dividend,
+                    // so the C++ division is never evaluated for either.
+                    0x21: vdiv_vv({{
+                        if (Vs1_vi[i] == 0)
+                            Vd_vi[i] = -1;
+                        else if (Vs2_vi[i] == std::numeric_limits<vi>::min()
+                                && Vs1_vi[i] == -1)
+                            Vd_vi[i] = Vs2_vi[i];
+                        else
+                            Vd_vi[i] = Vs2_vi[i] / Vs1_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vremu.vv: unsigned remainder; a zero divisor yields
+                    // the dividend.
+                    0x22: vremu_vv({{
+                        if (Vs1_vu[i] == 0) {
+                            Vd_vu[i] = Vs2_vu[i];
+                        } else {
+                            Vd_vu[i] = Vs2_vu[i] % Vs1_vu[i];
+                        }
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vrem.vv: signed remainder; a zero divisor yields the
+                    // dividend and the overflowing case (min % -1) yields 0.
+                    0x23: vrem_vv({{
+                        if (Vs1_vi[i] == 0) {
+                            Vd_vi[i] = Vs2_vi[i];
+                        } else if (Vs2_vi[i] == std::numeric_limits<vi>::min()
+                                && Vs1_vi[i] == -1) {
+                            Vd_vi[i] = 0;
+                        } else {
+                            Vd_vi[i] = Vs2_vi[i] % Vs1_vi[i];
+                        }
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vmulhu.vv: high half of unsigned * unsigned. Below
+                    // 64-bit SEW the product is formed in 64 bits and
+                    // shifted right by sew; at SEW=64 the mulhu_64 helper
+                    // is used instead.
+                    0x24: vmulhu_vv({{
+                        if (sew < 64) {
+                            Vd_vu[i] = ((uint64_t)Vs2_vu[i] * Vs1_vu[i])
+                                        >> sew;
+                        } else {
+                            Vd_vu[i] = mulhu_64(Vs2_vu[i], Vs1_vu[i]);
+                        }
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vmul.vv: product truncated to the element type.
+                    0x25: vmul_vv({{
+                        Vd_vi[i] = Vs2_vi[i] * Vs1_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vmulhsu.vv: high half of signed vs2 * unsigned vs1;
+                    // same 64-bit / helper split as vmulhu.vv.
+                    0x26: vmulhsu_vv({{
+                        if (sew < 64) {
+                            Vd_vi[i] = ((int64_t)Vs2_vi[i] *
+                                        (uint64_t)Vs1_vu[i])
+                                        >> sew;
+                        } else {
+                            Vd_vi[i] = mulhsu_64(Vs2_vi[i], Vs1_vu[i]);
+                        }
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vmulh.vv: high half of signed * signed; same 64-bit /
+                    // helper split as vmulhu.vv.
+                    0x27: vmulh_vv({{
+                        if (sew < 64) {
+                            Vd_vi[i] = ((int64_t)Vs2_vi[i] * Vs1_vi[i])
+                                        >> sew;
+                        } else {
+                            Vd_vi[i] = mulh_64(Vs2_vi[i], Vs1_vi[i]);
+                        }
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // Multiply-add family. NOTE(review): Vs3 is presumably
+                    // the previous value of the destination register --
+                    // confirm against the operand definitions.
+                    // vmadd.vv: vd = Vs3 * vs1 + vs2
+                    0x29: vmadd_vv({{
+                        Vd_vi[i] = Vs3_vi[i] * Vs1_vi[i] + Vs2_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vnmsub.vv: vd = -(Vs3 * vs1) + vs2
+                    0x2b: vnmsub_vv({{
+                        Vd_vi[i] = -(Vs3_vi[i] * Vs1_vi[i]) + Vs2_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vmacc.vv: vd = vs2 * vs1 + Vs3
+                    0x2d: vmacc_vv({{
+                        Vd_vi[i] = Vs2_vi[i] * Vs1_vi[i] + Vs3_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vnmsac.vv: vd = -(vs2 * vs1) + Vs3
+                    0x2f: vnmsac_vv({{
+                        Vd_vi[i] = -(Vs2_vi[i] * Vs1_vi[i]) + Vs3_vi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                }
+                // Widening integer add, subtract, multiply and
+                // multiply-accumulate. Narrow sources are converted to the
+                // wide types vwu / vwi before the operation, and the result
+                // is written to the wide destination view (Vd_vwu / Vd_vwi).
+                // Narrow sources are indexed with i + offset.
+                // NOTE(review): `offset` comes from the format template;
+                // presumably it selects the part of the narrow register
+                // that feeds this wide micro-op -- confirm.
+                format VectorIntWideningFormat {
+                    // vwaddu.vv / vwadd.vv: wide(vs2) + wide(vs1)
+                    0x30: vwaddu_vv({{
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset])
+                                + vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x31: vwadd_vv({{
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset])
+                                + vwi(Vs1_vi[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vwsubu.vv / vwsub.vv: wide(vs2) - wide(vs1)
+                    0x32: vwsubu_vv({{
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset])
+                                - vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x33: vwsub_vv({{
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset])
+                                - vwi(Vs1_vi[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // .wv forms: vs2 is already wide and is indexed with i;
+                    // only vs1 is widened.
+                    0x34: vwaddu_wv({{
+                        Vd_vwu[i] = Vs2_vwu[i] + vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x35: vwadd_wv({{
+                        Vd_vwi[i] = Vs2_vwi[i] + vwi(Vs1_vi[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x36: vwsubu_wv({{
+                        Vd_vwu[i] = Vs2_vwu[i] - vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    0x37: vwsub_wv({{
+                        Vd_vwi[i] = Vs2_vwi[i] - vwi(Vs1_vi[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vwmulu.vv: unsigned * unsigned
+                    0x38: vwmulu_vv({{
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset])
+                                * vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vwmulsu.vv: signed vs2 * unsigned vs1
+                    0x3a: vwmulsu_vv({{
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset])
+                                * vwu(Vs1_vu[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vwmul.vv: signed * signed
+                    0x3b: vwmul_vv({{
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset])
+                                * vwi(Vs1_vi[i + offset]);
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // Widening multiply-accumulate: wide product of vs1 and
+                    // vs2 plus the wide Vs3 element.
+                    // NOTE(review): Vs3 is presumably the previous
+                    // destination value -- confirm.
+                    // vwmaccu.vv: unsigned * unsigned
+                    0x3c: vwmaccu_vv({{
+                        Vd_vwu[i] = vwu(Vs1_vu[i + offset])
+                                * vwu(Vs2_vu[i + offset])
+                                + Vs3_vwu[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vwmacc.vv: signed * signed
+                    0x3d: vwmacc_vv({{
+                        Vd_vwi[i] = vwi(Vs1_vi[i + offset])
+                                * vwi(Vs2_vi[i + offset])
+                                + Vs3_vwi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                    // vwmaccsu.vv: signed vs1 * unsigned vs2
+                    0x3f: vwmaccsu_vv({{
+                        Vd_vwi[i] = vwi(Vs1_vi[i + offset])
+                                * vwu(Vs2_vu[i + offset])
+                                + Vs3_vwi[i];
+                    }}, OPMVV, VectorIntegerArithOp);
+                }
+            }
+            // OPIVI
+            0x3: decode VFUNCT6 {
+                // Vector-immediate add, reverse subtract and bitwise
+                // operations. The 5-bit SIMM5 field is sign-extended with
+                // sext<5> and cast to the signed element type vi before use.
+                format VectorIntFormat {
+                    // vadd.vi: vs2 + imm
+                    0x00: vadd_vi({{
+                        Vd_vi[i] = Vs2_vi[i] + (vi)sext<5>(SIMM5);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vrsub.vi: imm - vs2 (operands reversed)
+                    0x03: vrsub_vi({{
+                        Vd_vi[i] = (vi)sext<5>(SIMM5) - Vs2_vi[i];
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vand.vi: vs2 & imm
+                    0x09: vand_vi({{
+                        Vd_vi[i] = Vs2_vi[i] & (vi)sext<5>(SIMM5);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vor.vi: vs2 | imm
+                    0x0a: vor_vi({{
+                        Vd_vi[i] = Vs2_vi[i] | (vi)sext<5>(SIMM5);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vxor.vi: vs2 ^ imm
+                    0x0b: vxor_vi({{
+                        Vd_vi[i] = Vs2_vi[i] ^ (vi)sext<5>(SIMM5);
+                    }}, OPIVI, VectorIntegerArithOp);
+                }
+                // vrgather.vi: every active destination element receives the
+                // vs2 element selected by the 5-bit immediate. The RVV spec
+                // defines this immediate as a zero-extended, unsigned index,
+                // so it must not go through sext<5>: sign extension turned
+                // the indices 16..31 into huge values that always compared
+                // >= vlmax and therefore always produced 0.
+                0x0c: VectorGatherFormat::vrgather_vi({{
+                    // Zero-extended immediate index (uimm[4:0]).
+                    const uint64_t gatherIdx = (uint64_t)(SIMM5 & 0x1f);
+                    for (uint32_t i = 0; i < microVl; i++) {
+                        uint32_t ei = i + vs1_idx * vs1_elems + vs1_bias;
+                        if (this->vm || elem_mask(v0, ei)) {
+                            // Index relative to the vs2 register read by
+                            // this micro-op. The unsigned subtraction wraps
+                            // to a large value when the index lies in an
+                            // earlier register, failing the range test
+                            // below.
+                            const uint64_t idx =
+                                gatherIdx - vs2_elems * vs2_idx;
+                            // Index beyond vlmax -> 0; index inside this vs2
+                            // register -> gathered element; otherwise keep
+                            // Vs3_vu[i].
+                            // NOTE(review): Vs3 is presumably the current
+                            // destination contents -- confirm.
+                            Vd_vu[i] = (gatherIdx >= vlmax) ? 0
+                                : (idx < vs2_elems) ? Vs2_vu[idx]
+                                : Vs3_vu[i];
+                        }
+                    }
+                }}, OPIVI, VectorMiscOp);
+                // vslideup.vi: moves vs2 elements up by the immediate
+                // offset. The code below handles one (vdIdx, vs2Idx)
+                // destination/source register pair.
+                0x0e: VectorSlideUpFormat::vslideup_vi({{
+                    // The immediate is used as an unsigned offset (no sign
+                    // extension).
+                    const int offset = (int)(uint64_t)(SIMM5);
+                    // NOTE(review): presumably the number of elements held
+                    // by one vector register -- confirm vtype_VLMAX's
+                    // second argument.
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    // Register distance between destination and source.
+                    const int vregOffset = vdIdx - vs2Idx;
+                    // Element shift left over once the whole-register
+                    // distance has been accounted for; may be negative.
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    // Elements only move between this register pair when
+                    // the residual shift is smaller than one register.
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        // Number of elements the two registers share:
+                        // microVlmax - |offsetInVreg|.
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        // First destination element written.
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        // First source element read.
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        // Base index used for the v0 mask lookup of the
+                        // first written element.
+                        const int elemOffset = vdOffset + vdIdx * microVlmax;
+                        // Copy the shared elements, stopping at microVl and
+                        // skipping elements whose mask bit is clear.
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            if (this->vm || elem_mask(v0, i + elemOffset)) {
+                                Vd_vu[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                            }
+                        }
+                    }
+                }}, OPIVI, VectorMiscOp);
+                // vslidedown.vi: moves vs2 elements down by the immediate
+                // offset. The code below handles one (vdIdx, vs2Idx)
+                // destination/source register pair.
+                0x0f: VectorSlideDownFormat::vslidedown_vi({{
+                    // The immediate is used as an unsigned offset (no sign
+                    // extension).
+                    const int offset = (int)(uint64_t)(SIMM5);
+                    // NOTE(review): presumably the number of elements held
+                    // by one vector register -- confirm vtype_VLMAX's
+                    // second argument.
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    // Register distance between source and destination.
+                    const int vregOffset = vs2Idx - vdIdx;
+                    // Element shift left over once the whole-register
+                    // distance has been accounted for; may be negative.
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    const int numVs2s = vtype_regs_per_group(vtype);
+                    // Elements only move between this register pair when
+                    // the residual shift is smaller than one register.
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        // True when vs2Idx is the last register of the
+                        // group: nothing beyond it can be slid in, so the
+                        // remaining tail of the result is zeroed.
+                        const bool needZeroTail = numVs2s == vs2Idx + 1;
+                        // Number of elements the two registers share:
+                        // microVlmax - |offsetInVreg|.
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        // First destination element written.
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        // First source element read.
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        // Index of this destination register's element 0,
+                        // used for the v0 mask lookup.
+                        const int elemIdxBase = vdIdx * microVlmax;
+                        // The result is built in a scratch register first
+                        // and masked into Vd afterwards.
+                        vreg_t resVreg;
+                        auto res = resVreg.as<vu>();
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            res[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                        }
+                        if (needZeroTail) {
+                            for (int i = upperBound + vdOffset;
+                                i < microVlmax; i++) {
+                                res[i] = 0;
+                            }
+                        }
+                        // NOTE(review): when needZeroTail is false, entries
+                        // of res at or above upperBound + vdOffset are not
+                        // initialised here but can still be copied below;
+                        // presumably another micro-op overwrites them --
+                        // confirm.
+                        for (int i = vdOffset; i < microVl ; i++) {
+                            if (vm || elem_mask(v0, i + elemIdxBase)) {
+                                Vd_vu[i] = res[i];
+                            }
+                        }
+                    }
+                }}, OPIVI, VectorMiscOp);
+                // Add-with-carry, merge and move with a sign-extended 5-bit
+                // immediate. The VM field selects between the encodings
+                // that share a funct6 value.
+                format VectorIntFormat {
+                    0x10: decode VM {
+                        // vadc.vim: vs2 + imm + v0 mask bit of element ei
+                        // (the mask register supplies the carry-in).
+                        0x0: vadc_vim({{
+                            Vd_vi[i] = Vs2_vi[i] +
+                                (vi)sext<5>(SIMM5) + elem_mask(v0, ei);
+                        }}, OPIVI, VectorIntegerArithOp);
+                        // the unmasked versions (vm=1) are reserved
+                    }
+                    0x17: decode VM {
+                        // vmerge.vim: take the immediate where the v0 mask
+                        // bit is set, otherwise the vs2 element.
+                        0x0: vmerge_vim({{
+                            Vd_vi[i] = elem_mask(v0, ei)
+                                    ? (vi)sext<5>(SIMM5)
+                                    : Vs2_vi[i];
+                        }}, OPIVI, VectorIntegerArithOp);
+                        // vmv.v.i: write the immediate to the element.
+                        0x1: vmv_v_i({{
+                            Vd_vi[i] = (vi)sext<5>(SIMM5);
+                        }}, OPIVI, VectorIntegerArithOp);
+                    }
+                }
+                // Saturating add with a 5-bit immediate. The RVV spec
+                // sign-extends OPIVI immediates to SEW bits unless stated
+                // otherwise, as the other arithmetic .vi forms in this
+                // decoder already do with sext<5>. Using the raw SIMM5
+                // field made negative immediates (-16..-1) behave as
+                // +16..+31. Saturation is reported through vxsatptr by the
+                // sat_addu / sat_add helpers.
+                format VectorIntVxsatFormat{
+                    // vsaddu.vi: unsigned saturating add; the sign-extended
+                    // immediate is reinterpreted as an unsigned element.
+                    0x20: vsaddu_vi({{
+                        Vd_vu[i] = sat_addu<vu>(Vs2_vu[i],
+                            (vu)sext<5>(SIMM5), vxsatptr);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vsadd.vi: signed saturating add.
+                    0x21: vsadd_vi({{
+                        Vd_vu[i] = sat_add<vi>(Vs2_vu[i],
+                            (vi)sext<5>(SIMM5), vxsatptr);
+                    }}, OPIVI, VectorIntegerArithOp);
+                }
+                // Shifts by a 5-bit immediate. The immediate is used
+                // unsigned and reduced to the low log2(SEW) bits by masking
+                // with (sew - 1).
+                format VectorIntFormat {
+                    // vsll.vi: logical shift left
+                    0x25: vsll_vi({{
+                        Vd_vu[i] = Vs2_vu[i] << ((vu)SIMM5 & (sew - 1) & 0x1f);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vsrl.vi: logical shift right (unsigned element view)
+                    0x28: vsrl_vi({{
+                        Vd_vu[i] = Vs2_vu[i] >> ((vu)SIMM5 & (sew - 1) & 0x1f);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vssrl.vi: scaling logical shift right. The rounding
+                    // increment is chosen by the vxrm CSR, exactly as in
+                    // vssra.vi below; previously the mode was hard-coded
+                    // to 0, so software changes to vxrm were ignored.
+                    0x2a: vssrl_vi({{
+                        int sh = SIMM5 & (vtype_SEW(vtype) - 1);
+                        __uint128_t res = Vs2_vu[i];
+
+                        // Round in 128 bits so the increment cannot
+                        // overflow, then shift.
+                        res = int_rounding<__uint128_t>(
+                            res, xc->readMiscReg(MISCREG_VXRM), sh) >> sh;
+
+                        Vd_vu[i] = res;
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vsra.vi: arithmetic shift right (signed element view)
+                    0x29: vsra_vi({{
+                        Vd_vi[i] = Vs2_vi[i] >> ((vu)SIMM5 & (sew - 1) & 0x1f);
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vssra.vi: scaling arithmetic shift right, rounded
+                    // according to the vxrm CSR.
+                    0x2b: vssra_vi({{
+                        int sh = SIMM5 & (sew - 1);
+                        __int128_t val = Vs2_vi[i];
+
+                        val = int_rounding<__int128_t>(val,
+                            xc->readMiscReg(MISCREG_VXRM), sh);
+                        Vd_vi[i] = val >> sh;
+                    }}, OPIVI, VectorIntegerArithOp);
+                }
+                // Whole-register moves (vmv<nr>r.v).
+                // According to Spec Section 16.6,
+                // vm must be 1 (unmasked) in vmv<nr>r.v instructions.
+                // Only VM == 1 has an entry here. SIMM3 selects the variant:
+                // 0 -> vmv1r, 1 -> vmv2r, 3 -> vmv4r, 7 -> vmv8r. Every
+                // variant uses the same element copy on the 64-bit (_ud)
+                // register view.
+                0x27: decode VM { 0x1: decode SIMM3 {
+                    format VMvWholeFormat {
+                        0x0: vmv1r_v({{
+                            Vd_ud[i] = Vs2_ud[i];
+                        }}, OPIVI, VectorMiscOp);
+                        0x1: vmv2r_v({{
+                            Vd_ud[i] = Vs2_ud[i];
+                        }}, OPIVI, VectorMiscOp);
+                        0x3: vmv4r_v({{
+                            Vd_ud[i] = Vs2_ud[i];
+                        }}, OPIVI, VectorMiscOp);
+                        0x7: vmv8r_v({{
+                            Vd_ud[i] = Vs2_ud[i];
+                        }}, OPIVI, VectorMiscOp);
+                    }
+                }}
+                // Mask-producing carry and compare instructions with a
+                // sign-extended 5-bit immediate. Each computes one boolean
+                // per element and passes it to ASSIGN_VD_BIT for mask bit
+                // (i + offset); the value it yields is stored in the
+                // destination byte Vd_ub[(i + offset)/8].
+                // NOTE(review): `offset` and ASSIGN_VD_BIT come from the
+                // format template; presumably offset is the index of this
+                // micro-op's first element within the mask -- confirm.
+                format VectorIntMaskFormat {
+                    0x11: decode VM {
+                        // vmadc.vim: carry-out of vs2 + imm + v0 mask bit
+                        0x0: vmadc_vim({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vi[i], (vi)sext<5>(SIMM5),
+                                    elem_mask(v0, ei)));
+                        }}, OPIVI, VectorIntegerArithOp);
+                        // vmadc.vi: carry-out of vs2 + imm (no carry-in)
+                        0x1: vmadc_vi({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vi[i], (vi)sext<5>(SIMM5)));
+                        }}, OPIVI, VectorIntegerArithOp);
+                    }
+                    // vmseq.vi: vs2 == imm
+                    0x18: vmseq_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] == (vi)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vmsne.vi: vs2 != imm
+                    0x19: vmsne_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] != (vi)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vmsleu.vi: unsigned vs2 <= imm; the immediate is
+                    // sign-extended first and then viewed as unsigned.
+                    0x1c: vmsleu_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] <= (vu)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vmsle.vi: signed vs2 <= imm
+                    0x1d: vmsle_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] <= (vi)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vmsgtu.vi: unsigned vs2 > imm (sign-extended, then
+                    // viewed as unsigned)
+                    0x1e: vmsgtu_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] > (vu)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vmsgt.vi: signed vs2 > imm
+                    0x1f: vmsgt_vi({{
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] > (vi)sext<5>(SIMM5)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                }
+                // Narrowing shifts and clips by a 5-bit immediate. The
+                // source vs2 is read through its double-width view
+                // (vwu / vwi) and the shift amount is masked to
+                // 2*SEW - 1. Results are written to element i + offset of
+                // the narrow destination.
+                // NOTE(review): `offset` comes from the format template;
+                // presumably it selects the part of the narrow destination
+                // fed by this micro-op -- confirm.
+                format VectorIntNarrowingFormat {
+                    // vnsrl.wi: logical shift right, truncated to SEW bits
+                    0x2c: vnsrl_wi({{
+                        Vd_vu[i + offset] = (vu)(Vs2_vwu[i] >>
+                                            ((vwu)SIMM5 & (sew * 2 - 1)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vnsra.wi: arithmetic shift right, truncated to SEW bits
+                    0x2d: vnsra_wi({{
+                        Vd_vi[i + offset] = (vi)(Vs2_vwi[i] >>
+                                            ((vwu)SIMM5 & (sew * 2 - 1)));
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vnclipu.wi: unsigned narrowing clip. The rounding
+                    // mode now comes from the vxrm CSR (it was hard-coded
+                    // to 0), matching vssra.vi in this decoder.
+                    0x2e: vnclipu_wi({{
+                        vu max = std::numeric_limits<vu>::max();
+                        // Bits at or above position sew: any of them set
+                        // means the value does not fit in SEW bits.
+                        uint64_t sign_mask =
+                            std::numeric_limits<uint64_t>::max() << sew;
+                        __uint128_t res = Vs2_vwu[i];
+                        // NOTE(review): the shift amount is read through
+                        // the VS1 field name rather than SIMM5 as in the
+                        // shifts above; presumably both name the same
+                        // instruction bits -- confirm.
+                        unsigned shift = VS1 & ((sew * 2) - 1);
+
+                        res = int_rounding<__uint128_t>(
+                            res, xc->readMiscReg(MISCREG_VXRM), shift)
+                            >> shift;
+
+                        if (res & sign_mask) {
+                            // TODO: vxsat
+                            res = max;
+                        }
+
+                        Vd_vu[i + offset] = (vu)res;
+                    }}, OPIVI, VectorIntegerArithOp);
+                    // vnclip.wi: signed narrowing clip, rounded according
+                    // to the vxrm CSR (previously hard-coded to mode 0) and
+                    // clamped to the [min, max] range of the narrow type.
+                    0x2f: vnclip_wi({{
+                        vi max = std::numeric_limits<vi>::max();
+                        vi min = std::numeric_limits<vi>::min();
+                        __int128_t res = Vs2_vwi[i];
+                        unsigned shift = VS1 & ((sew * 2) - 1);
+
+                        res = int_rounding<__int128_t>(
+                            res, xc->readMiscReg(MISCREG_VXRM), shift)
+                            >> shift;
+
+                        if (res < min) {
+                            res = min;
+                            // TODO: vxsat
+                        } else if (res > max) {
+                            res = max;
+                            // TODO: vxsat
+                        }
+
+                        Vd_vi[i + offset] = (vi)res;
+                    }}, OPIVI, VectorIntegerArithOp);
+                }
+            }
+            // OPIVX
+            0x4: decode VFUNCT6 {
+                // Vector-scalar add, subtract, min/max and bitwise
+                // operations. The scalar operand is read as Rs1_vu
+                // (unsigned) or Rs1_vi (signed) to match the element view
+                // used for vs2.
+                // NOTE(review): presumably the _vu / _vi suffix narrows
+                // the scalar register to the element type -- confirm.
+                format VectorIntFormat {
+                    // vadd.vx: vs2 + rs1
+                    0x0: vadd_vx({{
+                        Vd_vu[i] = Vs2_vu[i] + Rs1_vu;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vsub.vx: vs2 - rs1
+                    0x2: vsub_vx({{
+                        Vd_vu[i] = Vs2_vu[i] - Rs1_vu;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vrsub.vx: rs1 - vs2 (operands reversed)
+                    0x3: vrsub_vx({{
+                        Vd_vu[i] = Rs1_vu - Vs2_vu[i];
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vminu.vx / vmin.vx: unsigned / signed minimum
+                    0x4: vminu_vx({{
+                        Vd_vu[i] = std::min(Vs2_vu[i], Rs1_vu);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x5: vmin_vx({{
+                        Vd_vi[i] = std::min(Vs2_vi[i], Rs1_vi);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vmaxu.vx / vmax.vx: unsigned / signed maximum
+                    0x6: vmaxu_vx({{
+                        Vd_vu[i] = std::max(Vs2_vu[i], Rs1_vu);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x7: vmax_vx({{
+                        Vd_vi[i] = std::max(Vs2_vi[i], Rs1_vi);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vand.vx: vs2 & rs1
+                    0x9: vand_vx({{
+                        Vd_vu[i] = Vs2_vu[i] & Rs1_vu;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vor.vx: vs2 | rs1
+                    0xa: vor_vx({{
+                        Vd_vu[i] = Vs2_vu[i] | Rs1_vu;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vxor.vx: vs2 ^ rs1
+                    0xb: vxor_vx({{
+                        Vd_vu[i] = Vs2_vu[i] ^ Rs1_vu;
+                    }}, OPIVX, VectorIntegerArithOp);
+                }
+                // vslideup.vx: moves vs2 elements up by the offset held in
+                // rs1. The code below handles one (vdIdx, vs2Idx)
+                // destination/source register pair.
+                0x0e: VectorSlideUpFormat::vslideup_vx({{
+                    // NOTE(review): the offset is read through the
+                    // element-typed view Rs1_vu and cast to int; if that
+                    // view is SEW bits wide, large offsets are truncated --
+                    // confirm.
+                    const int offset = (int)Rs1_vu;
+                    // NOTE(review): presumably the number of elements held
+                    // by one vector register -- confirm vtype_VLMAX's
+                    // second argument.
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    // Register distance between destination and source.
+                    const int vregOffset = vdIdx - vs2Idx;
+                    // Element shift left over once the whole-register
+                    // distance has been accounted for; may be negative.
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    // Elements only move between this register pair when
+                    // the residual shift is smaller than one register.
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        // Number of elements the two registers share:
+                        // microVlmax - |offsetInVreg|.
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        // First destination element written.
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        // First source element read.
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        // Base index used for the v0 mask lookup of the
+                        // first written element.
+                        const int elemOffset = vdOffset + vdIdx * microVlmax;
+                        // Copy the shared elements, stopping at microVl and
+                        // skipping elements whose mask bit is clear.
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            if (this->vm || elem_mask(v0, i + elemOffset)) {
+                                Vd_vu[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                            }
+                        }
+                    }
+                }}, OPIVX, VectorMiscOp);
+                // vslidedown.vx: moves vs2 elements down by the offset held
+                // in rs1. The code below handles one (vdIdx, vs2Idx)
+                // destination/source register pair.
+                0x0f: VectorSlideDownFormat::vslidedown_vx({{
+                    // NOTE(review): the offset is read through the
+                    // element-typed view Rs1_vu and cast to int; if that
+                    // view is SEW bits wide, large offsets are truncated --
+                    // confirm.
+                    const int offset = (int)Rs1_vu;
+                    // NOTE(review): presumably the number of elements held
+                    // by one vector register -- confirm vtype_VLMAX's
+                    // second argument.
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    // Register distance between source and destination.
+                    const int vregOffset = vs2Idx - vdIdx;
+                    // Element shift left over once the whole-register
+                    // distance has been accounted for; may be negative.
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    const int numVs2s = vtype_regs_per_group(vtype);
+                    // Elements only move between this register pair when
+                    // the residual shift is smaller than one register.
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        // True when vs2Idx is the last register of the
+                        // group: nothing beyond it can be slid in, so the
+                        // remaining tail of the result is zeroed.
+                        const bool needZeroTail = numVs2s == vs2Idx + 1;
+                        // Number of elements the two registers share:
+                        // microVlmax - |offsetInVreg|.
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;
+                        // First destination element written.
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;
+                        // First source element read.
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? offsetInVreg
+                            : 0;
+                        // Index of this destination register's element 0,
+                        // used for the v0 mask lookup.
+                        const int elemIdxBase = vdIdx * microVlmax;
+                        // The result is built in a scratch register first
+                        // and masked into Vd afterwards.
+                        vreg_t resVreg;
+                        auto res = resVreg.as<vu>();
+                        for (int i = 0;
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            res[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                        }
+                        if (needZeroTail) {
+                            for (int i = upperBound + vdOffset;
+                                i < microVlmax; i++) {
+                                res[i] = 0;
+                            }
+                        }
+                        // NOTE(review): when needZeroTail is false, entries
+                        // of res at or above upperBound + vdOffset are not
+                        // initialised here but can still be copied below;
+                        // presumably another micro-op overwrites them --
+                        // confirm.
+                        for (int i = vdOffset; i < microVl ; i++) {
+                            if (vm || elem_mask(v0, i + elemIdxBase)) {
+                                Vd_vu[i] = res[i];
+                            }
+                        }
+                    }
+                }}, OPIVX, VectorMiscOp);
+                // vrgather.vx: every active destination element receives
+                // the vs2 element selected by the index held in rs1.
+                // NOTE(review): the index is read through the element-typed
+                // view Rs1_vu; if that view is SEW bits wide, an index that
+                // does not fit in SEW bits is truncated before the
+                // `>= vlmax` test -- confirm.
+                0x0c: VectorGatherFormat::vrgather_vx({{
+                    for (uint32_t i = 0; i < microVl; i++) {
+                        // Element index used for the v0 mask lookup.
+                        uint32_t ei = i + vs1_idx * vs1_elems + vs1_bias;
+                        if (this->vm || elem_mask(v0, ei)) {
+                            // Index relative to the vs2 register read by
+                            // this micro-op; the unsigned subtraction wraps
+                            // to a large value when the index lies in an
+                            // earlier register.
+                            const uint64_t idx = Rs1_vu - vs2_elems * vs2_idx;
+                            // Index beyond vlmax -> 0; index inside this vs2
+                            // register -> gathered element; otherwise keep
+                            // Vs3_vu[i].
+                            Vd_vu[i] = (Rs1_vu >= vlmax) ? 0
+                                : (idx < vs2_elems) ? Vs2_vu[idx]
+                                : Vs3_vu[i];
+                        }
+                    }
+                }}, OPIVX, VectorMiscOp);
+                // Add-with-carry, subtract-with-borrow, merge and move with
+                // a scalar operand. The VM field selects between the
+                // encodings that share a funct6 value.
+                format VectorIntFormat {
+                    0x10: decode VM {
+                        // vadc.vxm: vs2 + rs1 + v0 mask bit of element ei
+                        // (the mask register supplies the carry-in).
+                        0x0: vadc_vxm({{
+                            Vd_vi[i] = Vs2_vi[i] + Rs1_vi + elem_mask(v0, ei);
+                        }}, OPIVX, VectorIntegerArithOp);
+                        // the unmasked versions (vm=1) are reserved
+                    }
+                    0x12: decode VM {
+                        // vsbc.vxm: vs2 - rs1 - v0 mask bit of element ei
+                        // (the mask register supplies the borrow-in).
+                        0x0: vsbc_vxm({{
+                            Vd_vi[i] = Vs2_vi[i] - Rs1_vi - elem_mask(v0, ei);
+                        }}, OPIVX, VectorIntegerArithOp);
+                        // the unmasked versions (vm=1) are reserved
+                    }
+                    0x17: decode VM {
+                        // vmerge.vxm: take rs1 where the v0 mask bit is
+                        // set, otherwise the vs2 element.
+                        0x0: vmerge_vxm({{
+                            Vd_vu[i] = elem_mask(v0, ei) ? Rs1_vu : Vs2_vu[i];
+                        }}, OPIVX, VectorIntegerArithOp);
+                        // vmv.v.x: write rs1 to the element; only decoded
+                        // when the VS2 field is 0.
+                        0x1: decode VS2 {
+                            0x0: vmv_v_x({{
+                                Vd_vu[i] = Rs1_vu;
+                            }}, OPIVX, VectorIntegerArithOp);
+                        }
+                    }
+                }
+                // Saturating add, subtract and fractional multiply with a
+                // scalar operand. Saturation is reported through vxsatptr.
+                format VectorIntVxsatFormat{
+                    // vsaddu.vx: unsigned saturating add
+                    0x20: vsaddu_vx({{
+                        Vd_vu[i] = sat_addu<vu>(Vs2_vu[i], Rs1_vu,
+                            vxsatptr);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vsadd.vx: signed saturating add (operands are passed
+                    // through their unsigned views and converted by
+                    // sat_add<vi>)
+                    0x21: vsadd_vx({{
+                        Vd_vu[i] = sat_add<vi>(Vs2_vu[i], Rs1_vu,
+                            vxsatptr);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vssubu.vx: unsigned saturating subtract
+                    0x22: vssubu_vx({{
+                        Vd_vu[i] = sat_subu<vu>(Vs2_vu[i], Rs1_vu,
+                            vxsatptr);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vssub.vx: signed saturating subtract
+                    0x23: vssub_vx({{
+                        Vd_vu[i] = sat_sub<vi>(Vs2_vu[i], Rs1_vu,
+                            vxsatptr);
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vsmul.vx: signed fractional multiply,
+                    // (rs1 * vs2) >> (sew - 1) with rounding. The rounding
+                    // mode now comes from the vxrm CSR (it was hard-coded
+                    // to 0), matching vssrl.vx / vssra.vx in this decoder.
+                    0x27: vsmul_vx({{
+                        vi max = std::numeric_limits<vi>::max();
+                        vi min = std::numeric_limits<vi>::min();
+                        // Only min * min overflows the result range.
+                        bool overflow = Rs1_vi == Vs2_vi[i] && Rs1_vi == min;
+                        __int128_t result =
+                            (__int128_t)Rs1_vi * (__int128_t)Vs2_vi[i];
+                        result = int_rounding<__uint128_t>(
+                            result, xc->readMiscReg(MISCREG_VXRM), sew - 1);
+                        result = result >> (sew - 1);
+                        if (overflow) {
+                            result = max;
+                            *vxsatptr = true;
+                        }
+
+                        Vd_vi[i] = (vi)result;
+                    }}, OPIVX, VectorIntegerArithOp);
+                }
+                // Shifts by a scalar amount. Only the low log2(SEW) bits of
+                // rs1 are used (masked with sew - 1).
+                format VectorIntFormat {
+                    // vsll.vx: logical shift left
+                    0x25: vsll_vx({{
+                        Vd_vu[i] = Vs2_vu[i] << (Rs1_vu & (sew - 1));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vsrl.vx: logical shift right (unsigned element view)
+                    0x28: vsrl_vx({{
+                        Vd_vu[i] = Vs2_vu[i] >> (Rs1_vu & (sew - 1));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vsra.vx: arithmetic shift right (signed element view)
+                    0x29: vsra_vx({{
+                        Vd_vi[i] = Vs2_vi[i] >> (Rs1_vu & (sew - 1));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vssrl.vx: scaling logical shift right; the value is
+                    // rounded in 128 bits using the mode read from the vxrm
+                    // CSR, then shifted.
+                    0x2a: vssrl_vx({{
+                        int sh = Rs1_vu & (sew - 1);
+                        __uint128_t val = Vs2_vu[i];
+
+                        val = int_rounding<__uint128_t>(val,
+                            xc->readMiscReg(MISCREG_VXRM), sh);
+                        Vd_vu[i] = val >> sh;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vssra.vx: scaling arithmetic shift right; same scheme
+                    // on the signed element view.
+                    0x2b: vssra_vx({{
+                        int sh = Rs1_vu & (sew - 1);
+                        __int128_t val = Vs2_vi[i];
+
+                        val = int_rounding<__int128_t>(val,
+                            xc->readMiscReg(MISCREG_VXRM), sh);
+                        Vd_vi[i] = val >> sh;
+                    }}, OPIVX, VectorIntegerArithOp);
+                }
+                // Narrowing shifts and clips by a scalar amount. The source
+                // vs2 is read through its double-width view (vwu / vwi) and
+                // the shift amount is masked to 2*SEW - 1. Results are
+                // written to element i + offset of the narrow destination.
+                // NOTE(review): `offset` comes from the format template;
+                // presumably it selects the part of the narrow destination
+                // fed by this micro-op -- confirm.
+                format VectorIntNarrowingFormat {
+                    // vnsrl.wx: logical shift right, truncated to SEW bits
+                    0x2c: vnsrl_wx({{
+                        Vd_vu[i + offset] = (vu)(Vs2_vwu[i] >>
+                                            ((vwu)Rs1_vu & (sew * 2 - 1)));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vnsra.wx: arithmetic shift right, truncated to SEW bits
+                    0x2d: vnsra_wx({{
+                        Vd_vi[i + offset] = (vi)(Vs2_vwi[i] >>
+                                            ((vwu)Rs1_vu & (sew * 2 - 1)));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vnclipu.wx: unsigned narrowing clip. The rounding
+                    // mode now comes from the vxrm CSR (it was hard-coded
+                    // to 0), matching vssrl.vx / vssra.vx in this decoder.
+                    0x2e: vnclipu_wx({{
+                        vu max = std::numeric_limits<vu>::max();
+                        // Bits at or above position sew: any of them set
+                        // means the value does not fit in SEW bits.
+                        uint64_t sign_mask =
+                            std::numeric_limits<uint64_t>::max() << sew;
+                        __uint128_t res = Vs2_vwu[i];
+                        unsigned shift = Rs1_vu & ((sew * 2) - 1);
+
+                        res = int_rounding<__uint128_t>(
+                            res, xc->readMiscReg(MISCREG_VXRM), shift)
+                            >> shift;
+
+                        if (res & sign_mask) {
+                            // TODO: vxsat
+                            res = max;
+                        }
+
+                        Vd_vu[i + offset] = (vu)res;
+                    }}, OPIVX, VectorIntegerArithOp);
+                    // vnclip.wx: signed narrowing clip, rounded according
+                    // to the vxrm CSR (previously hard-coded to mode 0) and
+                    // clamped to the [min, max] range of the narrow type.
+                    0x2f: vnclip_wx({{
+                        vi max = std::numeric_limits<vi>::max();
+                        vi min = std::numeric_limits<vi>::min();
+                        __int128_t res = Vs2_vwi[i];
+                        unsigned shift = Rs1_vi & ((sew * 2) - 1);
+
+                        res = int_rounding<__int128_t>(
+                            res, xc->readMiscReg(MISCREG_VXRM), shift)
+                            >> shift;
+
+                        if (res < min) {
+                            res = min;
+                            // TODO: vxsat
+                        } else if (res > max) {
+                            res = max;
+                            // TODO: vxsat
+                        }
+
+                        Vd_vi[i + offset] = (vi)res;
+                    }}, OPIVX, VectorIntegerArithOp);
+                }
+
+                format VectorIntMaskFormat {  // presumably 1 result bit/elem
+                    0x11: decode VM {  // 0x0 form adds the mask bit as input
+                        0x0: vmadc_vxm({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vi[i], Rs1_vi,
+                                    elem_mask(v0, ei)));
+                        }}, OPIVX, VectorIntegerArithOp);
+                        0x1: vmadc_vx({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                carry_out(Vs2_vi[i], Rs1_vi));
+                        }}, OPIVX, VectorIntegerArithOp);
+                    }
+                    0x13: decode VM {  // 0x0 form adds the mask bit as input
+                        0x0: vmsbc_vxm({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                borrow_out(Vs2_vi[i], Rs1_vi,
+                                    elem_mask(v0, ei)));
+                        }}, OPIVX, VectorIntegerArithOp);
+                        0x1: vmsbc_vx({{
+                            Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                                borrow_out(Vs2_vi[i], Rs1_vi));
+                        }}, OPIVX, VectorIntegerArithOp);
+                    }
+                    0x18: vmseq_vx({{  // vs2[i] == rs1
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] == Rs1_vu));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x19: vmsne_vx({{  // vs2[i] != rs1
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] != Rs1_vu));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1a: vmsltu_vx({{  // vs2[i] < rs1, unsigned
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] < Rs1_vu));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1b: vmslt_vx({{  // vs2[i] < rs1, signed
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] < Rs1_vi));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1c: vmsleu_vx({{  // vs2[i] <= rs1, unsigned
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] <= Rs1_vu));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1d: vmsle_vx({{  // vs2[i] <= rs1, signed
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] <= Rs1_vi));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1e: vmsgtu_vx({{  // vs2[i] > rs1, unsigned
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vu[i] > Rs1_vu));
+                    }}, OPIVX, VectorIntegerArithOp);
+                    0x1f: vmsgt_vx({{  // vs2[i] > rs1, signed
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            (Vs2_vi[i] > Rs1_vi));
+                    }}, OPIVX, VectorIntegerArithOp);
+                }
+            }
+            // OPFVF
+            0x5: decode VFUNCT6 {
+                format VectorFloatFormat{
+                    0x00: vfadd_vf({{  // vs2[i] + fs1 (scalar float reg)
+                        auto fd = fadd<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype_freg<et>(freg(Fs1_bits)));
+                        Vd_vu[i] = fd.v;  // store the raw result bits
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x02: vfsub_vf({{  // vs2[i] - fs1
+                        auto fd = fsub<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype_freg<et>(freg(Fs1_bits)));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x04: vfmin_vf({{  // min(vs2[i], fs1)
+                        auto fd = fmin<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype_freg<et>(freg(Fs1_bits)));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x06: vfmax_vf({{  // max(vs2[i], fs1)
+                        auto fd = fmax<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype_freg<et>(freg(Fs1_bits)));
+                        Vd_vu[i] = fd.v;  // store the raw result bits
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x08: vfsgnj_vf({{  // presumably sign of fs1 onto vs2[i]
+                        Vd_vu[i] = fsgnj<et>(ftype<et>(Vs2_vu[i]),
+                                             ftype_freg<et>(freg(Fs1_bits)),
+                                             false, false).v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x09: vfsgnjn_vf({{  // 3rd arg true: presumably negate
+                        Vd_vu[i] = fsgnj<et>(ftype<et>(Vs2_vu[i]),
+                                             ftype_freg<et>(freg(Fs1_bits)),
+                                             true, false).v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x0a: vfsgnjx_vf({{  // 4th arg true: presumably xor signs
+                        Vd_vu[i] = fsgnj<et>(ftype<et>(Vs2_vu[i]),
+                                             ftype_freg<et>(freg(Fs1_bits)),
+                                             false, true).v;
+                    }}, OPFVF, VectorFloatArithOp);
+                }
+                0x0e: VectorFloatSlideUpFormat::vfslide1up_vf({{
+                    const int offset = 1;  // slide distance: one element
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    const int vregOffset = vdIdx - vs2Idx;  // dest - src index
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;  // vlmax - |offset|
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? offsetInVreg  // first vd element written
+                            : 0;
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;  // first vs2 element read
+                        const int elemOffset = vdOffset + vdIdx * microVlmax;
+                        for (int i = 0;  // copy elements, honoring the mask
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            if (this->vm || elem_mask(v0, i + elemOffset)) {
+                                Vd_vu[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                            }
+                        }
+                        // TODO: dirty code; NOTE(review): int-style scalar name
+                        if (vdIdx == 0 && vs2Idx == 0 &&
+                                (this->vm || elem_mask(v0, 0))) {
+                            tmp_d0.as<vu>()[0] = Rs1_vu;  // scalar -> elem 0
+                        }
+                    }
+                }}, OPFVF, VectorMiscOp);
+                0x0f: VectorFloatSlideDownFormat::vfslide1down_vf({{
+                    const int offset = 1;  // slide distance: one element
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    const int vregOffset = vs2Idx - vdIdx;  // src - dest index
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    const int numVs2s = vtype_regs_per_group(vtype);
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        const bool needZeroTail = numVs2s == vs2Idx + 1;
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;  // vlmax - |offset|
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;  // first vd element written
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? offsetInVreg  // first vs2 element read
+                            : 0;
+                        const int elemIdxBase = vdIdx * microVlmax;
+                        vreg_t resVreg;  // staging buffer, no explicit init
+                        auto res = resVreg.as<vu>();
+                        for (int i = 0;  // stage the slid elements
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            res[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                        }
+                        if (needZeroTail) {  // last source reg of the group
+                            for (int i = upperBound + vdOffset;
+                                i < microVlmax; i++) {
+                                res[i] = 0;
+                            }
+                        }
+                        for (int i = vdOffset; i < microVl ; i++) {
+                            if (vm || elem_mask(v0, i + elemIdxBase)) {
+                                Vd_vu[i] = (i + elemIdxBase != machInst.vl - 1)
+                                    ? res[i]
+                                    : Rs1_vu;  // vl-1: int-style scalar?
+                            }
+                        }
+                    }
+                }}, OPFVF, VectorMiscOp);
+                // VRFUNARY0
+                0x10: decode VS2 {
+                    0x00: decode VM {
+                        // The encodings corresponding to the masked versions
+                        // (vm=0) of vfmv.s.f are reserved.
+                        0x1: VectorNonSplitFormat::vfmv_s_f({{
+                            auto fd = ftype_freg<et>(freg(Fs1_bits));
+                            Vd_vu[0] = fd.v;  // only element 0 is written
+                        }}, OPFVV, VectorMiscOp);  // NOTE(review): OPFVF?
+                    }
+                }
+                format VectorFloatFormat{
+                    0x17: decode VM {
+                        0x0: vfmerge_vfm({{  // mask bit ? fs1 : vs2[i]
+                            Vd_vu[i] = elem_mask(v0, ei)
+                                    ? ftype_freg<et>(freg(Fs1_bits)).v
+                                    : Vs2_vu[i];
+                        }}, OPFVF, VectorFloatArithOp);
+                        0x1: vfmv_v_f({{  // every element gets fs1
+                            auto fd = ftype_freg<et>(freg(Fs1_bits));
+                            Vd_vu[i] = fd.v;
+                        }}, OPFVF, VectorFloatArithOp);
+                    }
+                }
+                format VectorFloatMaskFormat {  // presumably 1 result bit/elem
+                    0x18: vmfeq_vf({{  // vs2[i] == fs1
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            feq<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype_freg<et>(freg(Fs1_bits))));
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x19: vmfle_vf({{  // vs2[i] <= fs1
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            fle<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype_freg<et>(freg(Fs1_bits))));
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x1b: vmflt_vf({{  // vs2[i] < fs1
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            flt<et>(ftype<et>(Vs2_vu[i]),
+                                    ftype_freg<et>(freg(Fs1_bits))));
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x1c: vmfne_vf({{  // negation of the feq result
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            !feq<et>(ftype<et>(Vs2_vu[i]),
+                                     ftype_freg<et>(freg(Fs1_bits))));
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x1d: vmfgt_vf({{  // operands swapped: fs1 < vs2[i]
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            flt<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                    ftype<et>(Vs2_vu[i])));
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x1f: vmfge_vf({{  // operands swapped: fs1 <= vs2[i]
+                        Vd_ub[(i + offset)/8] = ASSIGN_VD_BIT(i + offset,
+                            fle<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                    ftype<et>(Vs2_vu[i])));
+                    }}, OPFVF, VectorFloatArithOp);
+                }
+                format VectorFloatFormat{  // notes assume fmadd(a,b,c)=a*b+c
+                    0x20: vfdiv_vf({{  // vs2[i] / fs1
+                        auto fd = fdiv<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype_freg<et>(freg(Fs1_bits)));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x21: vfrdiv_vf({{  // reversed: fs1 / vs2[i]
+                        auto fd = fdiv<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                           ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x24: vfmul_vf({{  // vs2[i] * fs1
+                        auto fd = fmul<et>(ftype<et>(Vs2_vu[i]),
+                                           ftype_freg<et>(freg(Fs1_bits)));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x27: vfrsub_vf({{  // reversed: fs1 - vs2[i]
+                        auto fd = fsub<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                           ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x28: vfmadd_vf({{  // vs3[i]*fs1 + vs2[i]
+                        auto fd = fmadd<et>(ftype<et>(Vs3_vu[i]),
+                                            ftype_freg<et>(freg(Fs1_bits)),
+                                            ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x29: vfnmadd_vf({{  // -(vs3[i]*fs1) - vs2[i]
+                        auto fd = fmadd<et>(fneg(ftype<et>(Vs3_vu[i])),
+                                            ftype_freg<et>(freg(Fs1_bits)),
+                                            fneg(ftype<et>(Vs2_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x2a: vfmsub_vf({{  // vs3[i]*fs1 - vs2[i]
+                        auto fd = fmadd<et>(ftype<et>(Vs3_vu[i]),
+                                            ftype_freg<et>(freg(Fs1_bits)),
+                                            fneg(ftype<et>(Vs2_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x2b: vfnmsub_vf({{  // -(vs3[i]*fs1) + vs2[i]
+                        auto fd = fmadd<et>(fneg(ftype<et>(Vs3_vu[i])),
+                                            ftype_freg<et>(freg(Fs1_bits)),
+                                            ftype<et>(Vs2_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x2c: vfmacc_vf({{  // fs1*vs2[i] + vs3[i]
+                        auto fd = fmadd<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                            ftype<et>(Vs2_vu[i]),
+                                            ftype<et>(Vs3_vu[i]));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x2d: vfnmacc_vf({{  // -(fs1*vs2[i]) - vs3[i]
+                        auto fd = fmadd<et>(
+                            fneg(ftype_freg<et>(freg(Fs1_bits))),
+                            ftype<et>(Vs2_vu[i]),
+                            fneg(ftype<et>(Vs3_vu[i]))
+                        );
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x2e: vfmsac_vf({{  // fs1*vs2[i] - vs3[i]
+                        auto fd = fmadd<et>(ftype_freg<et>(freg(Fs1_bits)),
+                                            ftype<et>(Vs2_vu[i]),
+                                            fneg(ftype<et>(Vs3_vu[i])));
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x2f: vfnmsac_vf({{  // -(fs1*vs2[i]) + vs3[i]
+                        auto fd = fmadd<et>(
+                            fneg(ftype_freg<et>(freg(Fs1_bits))),
+                            ftype<et>(Vs2_vu[i]),
+                            ftype<et>(Vs3_vu[i])
+                        );
+                        Vd_vu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                }
+                format VectorFloatWideningFormat {  // results are ewt-wide
+                    0x30: vfwadd_vf({{  // widen(vs2) + widen(fs1)
+                        auto fd = fadd<ewt>(
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x32: vfwsub_vf({{  // widen(vs2) - widen(fs1)
+                        auto fd = fsub<ewt>(
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x34: vfwadd_wf({{  // already-wide vs2 + widen(fs1)
+                        auto fd = fadd<ewt>(
+                            ftype<ewt>(Vs2_vwu[i]),
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x36: vfwsub_wf({{  // already-wide vs2 - widen(fs1)
+                        auto fd = fsub<ewt>(
+                            ftype<ewt>(Vs2_vwu[i]),
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x38: vfwmul_vf({{  // widen(vs2) * widen(fs1)
+                        auto fd = fmul<ewt>(
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x3c: vfwmacc_vf({{  // fmadd(fs1, vs2, wide vs3)
+                        auto fd = fmadd<ewt>(
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            ftype<ewt>(Vs3_vwu[i]));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x3d: vfwnmacc_vf({{  // fmadd(-fs1, vs2, -wide vs3)
+                        auto fd = fmadd<ewt>(
+                            fwiden(fneg(ftype_freg<et>(freg(Fs1_bits)))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fneg(ftype<ewt>(Vs3_vwu[i])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x3e: vfwmsac_vf({{  // fmadd(fs1, vs2, -wide vs3)
+                        auto fd = fmadd<ewt>(
+                            fwiden(ftype_freg<et>(freg(Fs1_bits))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            fneg(ftype<ewt>(Vs3_vwu[i])));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                    0x3f: vfwnmsac_vf({{  // fmadd(-fs1, vs2, wide vs3)
+                        auto fd = fmadd<ewt>(
+                            fwiden(fneg(ftype_freg<et>(freg(Fs1_bits)))),
+                            fwiden(ftype<et>(Vs2_vu[i + offset])),
+                            ftype<ewt>(Vs3_vwu[i]));
+                        Vd_vwu[i] = fd.v;
+                    }}, OPFVF, VectorFloatArithOp);
+                }
+            }
+            // OPMVX
+            0x6: decode VFUNCT6 {
+                format VectorIntFormat {
+                    0x08: vaaddu_vx({{  // (vs2[i] + rs1) >> 1, 128-bit sum
+                        __uint128_t res = (__uint128_t)Vs2_vu[i] + Rs1_vu;
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vu[i] = res >> 1;
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x09: vaadd_vx({{  // as above on the signed operand views
+                        __uint128_t res = (__uint128_t)Vs2_vi[i] + Rs1_vi;
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vi[i] = res >> 1;  // truncated to the vi width
+                    }}, OPMVX, VectorIntegerArithOp);
+                }
+                0x0e: VectorSlideUpFormat::vslide1up_vx({{
+                    const int offset = 1;  // slide distance: one element
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    const int vregOffset = vdIdx - vs2Idx;  // dest - src index
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;  // vlmax - |offset|
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? offsetInVreg  // first vd element written
+                            : 0;
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;  // first vs2 element read
+                        const int elemOffset = vdOffset + vdIdx * microVlmax;
+                        for (int i = 0;  // copy elements, honoring the mask
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            if (this->vm || elem_mask(v0, i + elemOffset)) {
+                                Vd_vu[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                            }
+                        }
+                        // TODO: dirty code; element 0 takes the scalar
+                        if (vdIdx == 0 && vs2Idx == 0 &&
+                                (this->vm || elem_mask(v0, 0))) {
+                            tmp_d0.as<vu>()[0] = Rs1_vu;  // scalar -> elem 0
+                        }
+                    }
+                }}, OPIVX, VectorMiscOp);  // NOTE(review): OPMVX group
+                0x0f: VectorSlideDownFormat::vslide1down_vx({{
+                    const int offset = 1;  // slide distance: one element
+                    const int microVlmax = vtype_VLMAX(machInst.vtype8, true);
+                    const int vregOffset = vs2Idx - vdIdx;  // src - dest index
+                    const int offsetInVreg = offset - vregOffset * microVlmax;
+                    const int numVs2s = vtype_regs_per_group(vtype);
+                    if (std::abs(offsetInVreg) < uint32_t(microVlmax)) {
+                        const bool needZeroTail = numVs2s == vs2Idx + 1;
+                        const int upperBound = (offsetInVreg >= 0)
+                            ? microVlmax - offsetInVreg
+                            : microVlmax + offsetInVreg;  // vlmax - |offset|
+                        const int vdOffset = (offsetInVreg >= 0)
+                            ? 0
+                            : -offsetInVreg;  // first vd element written
+                        const int vs2Offset = (offsetInVreg >= 0)
+                            ? offsetInVreg  // first vs2 element read
+                            : 0;
+                        const int elemIdxBase = vdIdx * microVlmax;
+                        vreg_t resVreg;  // staging buffer, no explicit init
+                        auto res = resVreg.as<vu>();
+                        for (int i = 0;  // stage the slid elements
+                            i < upperBound && i + vdOffset < microVl;
+                            i++) {
+                            res[i + vdOffset] = Vs2_vu[i + vs2Offset];
+                        }
+                        if (needZeroTail) {  // last source reg of the group
+                            for (int i = upperBound + vdOffset;
+                                i < microVlmax; i++) {
+                                res[i] = 0;
+                            }
+                        }
+                        for (int i = vdOffset; i < microVl ; i++) {
+                            if (vm || elem_mask(v0, i + elemIdxBase)) {
+                                Vd_vu[i] = (i + elemIdxBase != machInst.vl - 1)
+                                    ? res[i]
+                                    : Rs1_vu;  // element vl-1 gets the scalar
+                            }
+                        }
+                    }
+                }}, OPIVX, VectorMiscOp);  // NOTE(review): OPMVX group
+                // VRXUNARY0
+                0x10: decode VS2 {
+                    0x00: decode VM {
+                        // The encodings corresponding to the masked versions
+                        // (vm=0) of vmv.s.x are reserved.
+                        0x1: VectorNonSplitFormat::vmv_s_x({{
+                            Vd_vu[0] = Rs1_vu;  // only element 0 is written
+                        }}, OPMVX, VectorMiscOp);
+                    }
+                }
+                format VectorIntFormat {
+                    0x0a: vasubu_vx({{  // (vs2[i] - rs1) >> 1, 128-bit diff
+                        __uint128_t res = (__uint128_t)Vs2_vu[i] - Rs1_vu;
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vu[i] = res >> 1;
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x0b: vasub_vx({{  // as above on the signed operand views
+                        __uint128_t res = (__uint128_t)Vs2_vi[i] - Rs1_vi;
+                        res = int_rounding<__uint128_t>(res, 0 /* TODO */, 1);
+                        Vd_vi[i] = res >> 1;  // truncated to the vi width
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x20: vdivu_vx({{  // unsigned divide; x/0 gives all ones
+                        if (Rs1_vu == 0)
+                            Vd_vu[i] = (vu)-1;
+                        else
+                            Vd_vu[i] = Vs2_vu[i] / Rs1_vu;
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x21: vdiv_vx({{  // signed divide; x/0 = -1, min/-1 = min
+                        if (Rs1_vi == 0)
+                            Vd_vi[i] = -1;
+                        else if (Vs2_vi[i] == std::numeric_limits<vi>::min()
+                                && Rs1_vi == -1)
+                            Vd_vi[i] = Vs2_vi[i];  // skips the min / -1 divide
+                        else
+                            Vd_vi[i] = Vs2_vi[i] / Rs1_vi;
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x22: vremu_vx({{  // unsigned remainder; x%0 = x
+                        if (Rs1_vu == 0)
+                            Vd_vu[i] = Vs2_vu[i];
+                        else
+                            Vd_vu[i] = Vs2_vu[i] % Rs1_vu;
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x23: vrem_vx({{  // signed remainder; x%0 = x, min%-1 = 0
+                        if (Rs1_vi == 0)
+                            Vd_vi[i] = Vs2_vi[i];
+                        else if (Vs2_vi[i] == std::numeric_limits<vi>::min()
+                                && Rs1_vi == -1)
+                            Vd_vi[i] = 0;  // skips the min % -1 operation
+                        else
+                            Vd_vi[i] = Vs2_vi[i] % Rs1_vi;
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x24: vmulhu_vx({{  // product >> sew, unsigned
+                        if (sew < 64)
+                            Vd_vu[i] = ((uint64_t)Vs2_vu[i] * Rs1_vu)
+                                        >> sew;
+                        else  // sew == 64 is delegated to a helper
+                            Vd_vu[i] = mulhu_64(Vs2_vu[i], Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x25: vmul_vx({{  // product truncated to the vi width
+                        Vd_vi[i] = Vs2_vi[i] * Rs1_vi;
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x26: vmulhsu_vx({{  // signed vs2 * unsigned rs1, >> sew
+                        if (sew < 64)
+                            Vd_vi[i] = ((int64_t)Vs2_vi[i] *
+                                        (uint64_t)Rs1_vu)
+                                        >> sew;
+                        else  // sew == 64 is delegated to a helper
+                            Vd_vi[i] = mulhsu_64(Vs2_vi[i], Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x27: vmulh_vx({{  // product >> sew, signed
+                        if (sew < 64)
+                            Vd_vi[i] = ((int64_t)Vs2_vi[i] * Rs1_vi)
+                                        >> sew;
+                        else  // sew == 64 is delegated to a helper
+                            Vd_vi[i] = mulh_64(Vs2_vi[i], Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x29: vmadd_vx({{  // vs3[i]*rs1 + vs2[i]
+                        Vd_vi[i] = Vs3_vi[i] * Rs1_vi + Vs2_vi[i];
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x2b: vnmsub_vx({{  // -(vs3[i]*rs1) + vs2[i]
+                        Vd_vi[i] = -(Vs3_vi[i] * Rs1_vi) + Vs2_vi[i];
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x2d: vmacc_vx({{  // vs2[i]*rs1 + vs3[i]
+                        Vd_vi[i] = Vs2_vi[i] * Rs1_vi + Vs3_vi[i];
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x2f: vnmsac_vx({{  // -(vs2[i]*rs1) + vs3[i]
+                        Vd_vi[i] = -(Vs2_vi[i] * Rs1_vi) + Vs3_vi[i];
+                    }}, OPMVX, VectorIntegerArithOp);
+                }
+                format VectorIntWideningFormat {  // results use vwu/vwi types
+                    0x30: vwaddu_vx({{  // widened vs2 + widened rs1, unsigned
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset]) + vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x31: vwadd_vx({{  // widened vs2 + widened rs1, signed
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset]) + vwi(Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x32: vwsubu_vx({{  // widened vs2 - widened rs1, unsigned
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset]) - vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x33: vwsub_vx({{  // widened vs2 - widened rs1, signed
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset]) - vwi(Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x34: vwaddu_wx({{  // already-wide vs2 + widened rs1
+                        Vd_vwu[i] = Vs2_vwu[i] + vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x35: vwadd_wx({{  // already-wide vs2 + widened rs1
+                        Vd_vwi[i] = Vs2_vwi[i] + vwi(Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x36: vwsubu_wx({{  // already-wide vs2 - widened rs1
+                        Vd_vwu[i] = Vs2_vwu[i] - vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x37: vwsub_wx({{  // already-wide vs2 - widened rs1
+                        Vd_vwi[i] = Vs2_vwi[i] - vwi(Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x38: vwmulu_vx({{  // unsigned * unsigned
+                        Vd_vwu[i] = vwu(Vs2_vu[i + offset]) * vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x3a: vwmulsu_vx({{  // signed vs2 * unsigned rs1
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset]) * vwu(Rs1_vu);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x3b: vwmul_vx({{  // signed * signed
+                        Vd_vwi[i] = vwi(Vs2_vi[i + offset]) * vwi(Rs1_vi);
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x3c: vwmaccu_vx({{  // unsigned rs1*vs2 + wide vs3
+                        Vd_vwu[i] = vwu(Rs1_vu) * vwu(Vs2_vu[i + offset])
+                                + Vs3_vwu[i];
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x3d: vwmacc_vx({{  // signed rs1*vs2 + wide vs3
+                        Vd_vwi[i] = vwi(Rs1_vi) * vwi(Vs2_vi[i + offset])
+                                + Vs3_vwi[i];
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x3e: vwmaccus_vx({{  // unsigned rs1 * signed vs2 + vs3
+                        Vd_vwi[i] = vwu(Rs1_vu) * vwi(Vs2_vi[i + offset])
+                                + Vs3_vwi[i];
+                    }}, OPMVX, VectorIntegerArithOp);
+                    0x3f: vwmaccsu_vx({{  // signed rs1 * unsigned vs2 + vs3
+                        Vd_vwi[i] = vwi(Rs1_vi) * vwu(Vs2_vu[i + offset])
+                                + Vs3_vwi[i];
+                    }}, OPMVX, VectorIntegerArithOp);
+                }
+            }
+            0x7: decode BIT31 {  // vector configuration instructions
+                format VConfOp {  // locals presumably used by the format
+                    0x0: vsetvli({{  // vl from rs1, vtype from zimm11
+                        uint64_t rd_bits = RD;
+                        uint64_t rs1_bits = RS1;
+                        uint64_t requested_vl = Rs1_ud;
+                        uint64_t requested_vtype = zimm11;
+
+                        Rd_ud = 0;  // NOTE(review): real value set elsewhere?
+                    }}, VectorConfigOp, IsSerializeAfter, IsNonSpeculative);
+                    0x1: decode BIT30 {
+                        0x0: vsetvl({{  // vl from rs1, vtype from rs2
+                            uint64_t rd_bits = RD;
+                            uint64_t rs1_bits = RS1;
+                            uint64_t requested_vl = Rs1_ud;
+                            uint64_t requested_vtype = Rs2_ud;
+
+                            Rd_ud = 0;  // NOTE(review): set elsewhere?
+                        }}, VectorConfigOp, IsSerializeAfter,
+                        IsNonSpeculative);
+                        0x1: vsetivli({{  // vl from uimm, vtype from zimm10
+                            uint64_t rd_bits = RD;
+                            uint64_t rs1_bits = -1;  // all ones: no rs1 here
+                            uint64_t requested_vl = uimm;
+                            uint64_t requested_vtype = zimm10;
+
+                            Rd_ud = 0;  // NOTE(review): set elsewhere?
+                        }}, VectorConfigOp, IsSerializeAfter,
+                        IsNonSpeculative);
+                    }
+                }
+            }
+        }
+
         0x18: decode FUNCT3 {
             format BOp {
                 0x0: beq({{
diff --git a/src/arch/riscv/isa/formats/formats.isa b/src/arch/riscv/isa/formats/formats.isa
index 19749438a8..0102df17d7 100644
--- a/src/arch/riscv/isa/formats/formats.isa
+++ b/src/arch/riscv/isa/formats/formats.isa
@@ -37,6 +37,9 @@
 ##include "fp.isa"
 ##include "amo.isa"
 ##include "bs.isa"
+##include "vector_conf.isa"
+##include "vector_arith.isa"
+##include "vector_mem.isa"
 
 // Include formats for nonstandard extensions
 ##include "compressed.isa"
diff --git a/src/arch/riscv/isa/formats/mem.isa b/src/arch/riscv/isa/formats/mem.isa
index 0d80260a25..7cec113ba1 100644
--- a/src/arch/riscv/isa/formats/mem.isa
+++ b/src/arch/riscv/isa/formats/mem.isa
@@ -243,3 +243,10 @@ def format Store(memacc_code, ea_code={{EA = rvZext(Rs1 + offset);}},
         LoadStoreBase(name, Name, offset_code, ea_code, memacc_code, mem_flags,
         inst_flags, 'Store', exec_template_base='Store')
 }};
+
+def format CBMOp(memacc_code, ea_code={{EA = rvZext(Rs1 + offset);}},
+        offset_code={{offset = 0;}}, mem_flags=[], inst_flags=[]) {{
+    (header_output, decoder_output, decode_block, exec_output) = \
+        LoadStoreBase(name, Name, offset_code, ea_code, memacc_code, mem_flags,
+        inst_flags, 'Store', exec_template_base='Store')  # same call as Store
+}};
diff --git a/src/arch/riscv/isa/formats/vector_arith.isa b/src/arch/riscv/isa/formats/vector_arith.isa
new file mode 100644
index 0000000000..c462e6c8d4
--- /dev/null
+++ b/src/arch/riscv/isa/formats/vector_arith.isa
@@ -0,0 +1,1319 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+let {{
+    def setDestWrapper(destRegId):  # C++ that registers one vector dest reg
+        return "setDestRegIdx(_numDestRegs++, " + destRegId + ");\n" + \
+               "_numTypedDestRegs[VecRegClass]++;\n"
+    def setSrcWrapper(srcRegId):  # C++ that appends one source register
+        return "setSrcRegIdx(_numSrcRegs++, " + srcRegId + ");\n"
+    def setSrcVm():  # C++ that adds vecRegClass[0] as a source if !vm
+        return "if (!this->vm)\n" + \
+               "    setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);"
+    def vmDeclAndReadData():  # C++ that loads the last source into v0 if !vm
+        return '''
+            [[maybe_unused]] RiscvISA::vreg_t tmp_v0;
+            [[maybe_unused]] uint8_t* v0;
+            if(!machInst.vm) {
+                xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0);
+                v0 = tmp_v0.as<uint8_t>();
+            }
+        '''
+    def copyOldVd(vd_idx):  # C++ COPY_OLD_VD call for source index vd_idx
+        return 'COPY_OLD_VD(%d);' % vd_idx
+    def loopWrapper(code, micro_inst = True):  # repeat code for each index i
+        if micro_inst:
+            upper_bound = "this->microVl"
+        else:
+            upper_bound = "(uint32_t)machInst.vl"
+        return '''
+            for (uint32_t i = 0; i < %s; i++) {
+                %s
+            }
+        ''' % (upper_bound, code)
+    def maskCondWrapper(code):  # guard code with the vm / v0 mask test
+        return "if (this->vm || elem_mask(v0, ei)) {\n" + \
+               code + "}\n"
+    def eiDeclarePrefix(code, widening = False):  # declare ei ahead of code
+        if widening:
+            return '''
+            uint32_t ei = i + micro_vlmax * this->microIdx;
+            ''' + code
+        else:
+            return '''
+            uint32_t ei = i + vtype_VLMAX(vtype, true) * this->microIdx;
+            ''' + code
+    # Prepend C++ checks that fault on illegal vd/vs2 use in widening ops.
+    def wideningOpRegisterConstraintChecks(code):
+        return '''
+            const uint32_t num_microops = 1 << std::max<int64_t>(0, vtype_vlmul(machInst.vtype8) + 1);
+            if ((machInst.vd % alignToPowerOfTwo(num_microops)) != 0) {
+                std::string error =
+                    csprintf("Unaligned Vd group in Widening op");
+                return std::make_shared<IllegalInstFault>(error, machInst);
+            }
+            if ((machInst.vs2 <= machInst.vd) && (machInst.vd < (machInst.vs2 + num_microops - 1))) {
+                // A destination vector register group can overlap a source vector
+                // register group if The destination EEW is greater than the source
+                // EEW, the source EMUL is at least 1, and the overlap is in the
+                // highest- numbered part of the destination register group.
+                std::string error =
+                    csprintf("Unsupported overlap in Vs2 and Vd for Widening op");
+                return std::make_shared<IllegalInstFault>(error, machInst);
+            }
+            ''' + code
+    # Prepend C++ checks that fault on illegal vd/vs2 use in narrowing ops.
+    def narrowingOpRegisterConstraintChecks(code):
+        return '''
+            const uint32_t num_microops = 1 << std::max<int64_t>(0, vtype_vlmul(machInst.vtype8) + 1);
+            if ((machInst.vs2 % alignToPowerOfTwo(num_microops)) != 0) {
+                std::string error =
+                    csprintf("Unaligned VS2 group in Narrowing op");
+                return std::make_shared<IllegalInstFault>(error, machInst);
+            }
+            if ((machInst.vs2 < machInst.vd) && (machInst.vd <= (machInst.vs2 + num_microops - 1))) {
+                // A destination vector register group can overlap a source vector
+                // register group The destination EEW is smaller than the source EEW
+                // and the overlap is in the lowest-numbered part of the source
+                // register group
+                std::string error =
+                    csprintf("Unsupported overlap in Vs2 and Vd for Narrowing op");
+                return std::make_shared<IllegalInstFault>(error, machInst);
+            }
+        ''' + code
+    # Wrap code so softfloat exception flags are accumulated into FFLAGS.
+    def fflags_wrapper(code):
+        return '''
+        RegVal FFLAGS = xc->readMiscReg(MISCREG_FFLAGS);
+        std::feclearexcept(FE_ALL_EXCEPT);
+        ''' + code + '''
+        FFLAGS |= softfloat_exceptionFlags;
+        softfloat_exceptionFlags = 0;
+        xc->setMiscReg(MISCREG_FFLAGS, FFLAGS);
+        '''
+}};
+
+
+def format VectorIntFormat(code, category, *flags) {{
+    macroop_class_name = 'VectorArithMacroInst'
+    microop_class_name = 'VectorArithMicroInst'
+    # vid_v is built on the VMUNARY0 macro/micro base classes instead.
+    if name == "vid_v" :
+        macroop_class_name = 'VectorVMUNARY0MacroInst'
+        microop_class_name = 'VectorVMUNARY0MicroInst'
+
+    iop = InstObjParams(name, Name, macroop_class_name, {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    v0_required = inst_name not in ["vmv"]
+    mask_cond = v0_required and (inst_suffix not in ['vvm', 'vxm', 'vim'])
+    need_elem_idx = mask_cond or code.find("ei") != -1
+    # Register ids below are C++ expressions pasted into the templates.
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+
+    num_src_regs = 0
+
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    num_src_regs += 1
+
+    src1_reg_id = ""
+    if category in ["OPIVV", "OPMVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx]"
+        num_src_regs += 1
+    elif category in ["OPIVX", "OPMVX"]:
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+        num_src_regs += 1
+    elif category == "OPIVI":
+        pass
+    else:
+        error("not supported category for VectorIntFormat: %s" % category)
+    # The old vd is registered right after the real sources.
+    old_vd_idx = num_src_regs
+    src3_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    # Source order: src1 (if any), src2, old vd, then v0 when it is needed.
+    set_src_reg_idx = ""
+    if category != "OPIVI":
+        set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    if v0_required:
+        set_src_reg_idx += setSrcVm()
+
+    # Wrap the body, innermost first: mask test, ei declaration, i loop.
+    if mask_cond:
+        code = maskCondWrapper(code)
+    if need_elem_idx:
+        code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+
+    vm_decl_rd = ""
+    if v0_required:
+        vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        microop_class_name,
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntMicroDeclare.subst(microiop) + \
+        VectorIntMicroConstructor.subst(microiop) + \
+        VectorIntMicroExecute.subst(microiop) + \
+        VectorIntMacroDeclare.subst(iop) + \
+        VectorIntMacroConstructor.subst(iop)
+
+    decode_block = VectorIntDecodeBlock.subst(iop)
+}};
+
+
+def format VectorIntExtFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    ext_div = int(inst_suffix[-1])  # trailing digit of the mnemonic suffix
+    # Sources are registered as vs2, old vd, then v0 (if !vm); old vd is 1.
+    old_vd_idx = 1
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx / " + \
+                      str(ext_div) + "]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    set_src_reg_idx += setSrcVm()
+    # Wrap the body, innermost first: mask test, ei declaration, i loop.
+    code = maskCondWrapper(code)
+    code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+    vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         'ext_div': ext_div},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntExtMicroDeclare.subst(microiop) + \
+        VectorIntMicroConstructor.subst(microiop) + \
+        VectorIntExtMicroExecute.subst(microiop) + \
+        VectorIntExtMacroDeclare.subst(iop) + \
+        VectorIntMacroConstructor.subst(iop)
+
+    decode_block = VectorIntDecodeBlock.subst(iop)
+}};
+
+def format VectorIntWideningFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    v0_required = True
+    mask_cond = v0_required
+    need_elem_idx = mask_cond or code.find("ei") != -1
+    old_vd_idx = 2  # sources are registered as src1, src2, old vd
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src1_reg_id = ""
+    if category in ["OPIVV", "OPMVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx / 2]"
+    elif category in ["OPIVX", "OPMVX"]:
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+    else:
+        error("not supported category for VectorIntWideningFormat: %s"
+              % category)
+    src2_reg_id = ""
+    if inst_suffix in ["vv", "vx"]:
+        src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx / 2]"
+    elif inst_suffix in ["wv", "wx"]:
+        src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    if v0_required:
+        set_src_reg_idx += setSrcVm()
+
+    # Wrap the body, innermost first: mask test, ei declaration, i loop.
+    if mask_cond:
+        code = maskCondWrapper(code)
+    if need_elem_idx:
+        code = eiDeclarePrefix(code, widening=True)
+    code = loopWrapper(code)
+    # The register-group checks are emitted ahead of the element loop.
+    code = wideningOpRegisterConstraintChecks(code)
+
+    vm_decl_rd = ""
+    if v0_required:
+        vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntWideningMicroDeclare.subst(microiop) + \
+        VectorIntWideningMicroConstructor.subst(microiop) + \
+        VectorIntWideningMicroExecute.subst(microiop) + \
+        VectorIntWideningMacroDeclare.subst(iop) + \
+        VectorIntWideningMacroConstructor.subst(iop)
+
+    decode_block = VectorIntWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorIntNarrowingFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+
+    # Sources: src1 (absent for OPIVI), vs2, old vd; old vd is index 2 or 1.
+    old_vd_idx = 2
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx / 2]"
+    if category in ["OPIVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx / 2]"
+    elif category in ["OPIVX"]:
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+    elif category == "OPIVI":
+        old_vd_idx = 1
+    else:
+        error("not supported category for VectorIntNarrowingFormat: %s"
+              % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vs3 + _microIdx / 2]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = ""
+    if category != "OPIVI":
+        set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    # Always masked: mask test, ei declaration, i loop, then group checks.
+    code = maskCondWrapper(code)
+    code = eiDeclarePrefix(code, widening=True)
+    code = loopWrapper(code)
+    code = narrowingOpRegisterConstraintChecks(code)
+    vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         },
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntWideningMicroDeclare.subst(microiop) + \
+        VectorIntWideningMicroConstructor.subst(microiop) + \
+        VectorIntNarrowingMicroExecute.subst(microiop) + \
+        VectorIntWideningMacroDeclare.subst(iop) + \
+        VectorIntWideningMacroConstructor.subst(iop)
+
+    decode_block = VectorIntWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorIntMaskFormat(code, category, *flags) {{
+    iop = InstObjParams(name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code},
+        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    v0_required = not (inst_name in ["vmadc", "vmsbc"] \
+        and inst_suffix in ["vv", "vx", "vi"])
+    mask_cond = inst_name not in ['vmadc', 'vmsbc']
+    need_elem_idx = mask_cond or code.find("ei") != -1
+    # Micro-ops write VecMemInternalReg0 + _microIdx; old vd is read unindexed.
+    old_vd_idx = 2
+    dest_reg_id = "vecRegClass[VecMemInternalReg0 + _microIdx]"
+    src1_reg_id = ""
+    if category == "OPIVV":
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx]"
+    elif category == "OPIVX":
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+    elif category == "OPIVI":
+        old_vd_idx = 1
+    else:
+        error("not supported category for VectorIntMaskFormat: %s" % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = ""
+    if category != "OPIVI":
+        set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    if v0_required:
+        set_src_reg_idx += setSrcVm()
+
+    # Wrap the body, innermost first: mask test, ei declaration, i loop.
+    if mask_cond:
+        code = maskCondWrapper(code)
+    if need_elem_idx:
+        code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+
+    vm_decl_rd = ""
+    if v0_required:
+        vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntMaskMicroDeclare.subst(microiop) + \
+        VectorIntMaskMicroConstructor.subst(microiop) + \
+        VectorIntMaskMicroExecute.subst(microiop) + \
+        VectorIntMaskMacroDeclare.subst(iop) + \
+        VectorIntMaskMacroConstructor.subst(iop)
+    decode_block = VectorIntDecodeBlock.subst(iop)
+}};
+
+def format VectorGatherFormat(code, category, *flags) {{
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    if inst_name == "vrgatherei16":
+        idx_type = "uint16_t"
+    else:
+        idx_type = "elem_type"
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst',
+        {'idx_type': idx_type,
+         'code': code},
+        flags)
+    old_vd_idx = 2
+    dest_reg_id = "vecRegClass[_machInst.vd + vd_idx]"
+    src1_reg_id = ""
+    if category in ["OPIVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + vs1_idx]"
+    elif category in ["OPIVX"]:
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+    elif category == "OPIVI":
+        old_vd_idx = 1
+    else:
+        error("not supported category for VectorGatherFormat: %s" % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + vs2_idx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + vd_idx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    if category != "OPIVI":
+        set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    set_src_reg_idx += setSrcVm()
+
+    # code is passed through as-is: no loop, ei or mask wrapper is added here.
+
+    vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         'idx_type': idx_type},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorGatherMicroDeclare.subst(microiop) + \
+        VectorGatherMicroConstructor.subst(microiop) + \
+        VectorGatherMicroExecute.subst(microiop) + \
+        VectorGatherMacroDeclare.subst(iop) + \
+        VectorGatherMacroConstructor.subst(iop)
+
+    decode_block = VectorGatherDecodeBlock.subst(iop)
+
+}};
+
+def format VectorFloatFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    v0_required = inst_name not in ["vfmv"]
+    mask_cond = v0_required and (inst_suffix not in ['vvm', 'vfm'])
+    need_elem_idx = mask_cond or code.find("ei") != -1
+    # Register ids below are C++ expressions pasted into the templates.
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src1_reg_id = ""
+    if category == "OPFVV":
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx]"
+    elif category == "OPFVF":
+        src1_reg_id = "floatRegClass[_machInst.rs1]"
+    else:
+        error("not supported category for VectorFloatFormat: %s" % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    if v0_required:
+        set_src_reg_idx += setSrcVm()
+    # Wrap innermost first: mask test, ei declaration, i loop, FFLAGS update.
+    if mask_cond:
+        code = maskCondWrapper(code)
+    if need_elem_idx:
+        code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+    code = fflags_wrapper(code)
+
+    vm_decl_rd = ""
+    if v0_required:
+        vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(2)},  # src1, src2, then old vd at index 2
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorFloatMicroDeclare.subst(microiop) + \
+        VectorFloatMicroConstructor.subst(microiop) + \
+        VectorFloatMicroExecute.subst(microiop) + \
+        VectorFloatMacroDeclare.subst(iop) + \
+        VectorFloatMacroConstructor.subst(iop)
+
+    decode_block = VectorFloatDecodeBlock.subst(iop)
+}};
+
+def format VectorFloatCvtFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    # Single vector source (vs2); the old vd follows it as source index 1.
+    old_vd_idx = 1
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    set_src_reg_idx += setSrcVm()
+    code = maskCondWrapper(code)  # innermost: per-element mask test
+    code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+    code = fflags_wrapper(code)  # outermost: FFLAGS read / update
+
+    vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorFloatCvtMicroDeclare.subst(microiop) + \
+        VectorFloatMicroConstructor.subst(microiop) + \
+        VectorFloatMicroExecute.subst(microiop) + \
+        VectorFloatCvtMacroDeclare.subst(iop) + \
+        VectorFloatMacroConstructor.subst(iop)
+
+    decode_block = VectorFloatDecodeBlock.subst(iop)
+}};
+
+def format VectorFloatWideningFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    v0_required = True
+    mask_cond = v0_required
+    need_elem_idx = mask_cond or code.find("ei") != -1
+
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src1_reg_id = ""
+    if category in ["OPFVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx / 2]"
+    elif category in ["OPFVF"]:
+        src1_reg_id = "floatRegClass[_machInst.rs1]"
+    else:
+        error("not supported category for VectorFloatWideningFormat: %s"
+              % category)
+    src2_reg_id = ""
+    if inst_suffix in ["vv", "vf"]:
+        src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx / 2]"
+    elif inst_suffix in ["wv", "wf"]:
+        src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    if v0_required:
+        set_src_reg_idx += setSrcVm()
+
+    # Wrap innermost first: mask test, ei declaration, i loop, FFLAGS update.
+    if mask_cond:
+        code = maskCondWrapper(code)
+    if need_elem_idx:
+        code = eiDeclarePrefix(code, widening=True)
+    code = loopWrapper(code)
+    code = fflags_wrapper(code)
+    # The register-group checks are emitted ahead of everything else.
+    code = wideningOpRegisterConstraintChecks(code)
+
+    vm_decl_rd = ""
+    if v0_required:
+        vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(2)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntWideningMicroDeclare.subst(microiop) + \
+        VectorIntWideningMicroConstructor.subst(microiop) + \
+        VectorFloatWideningMicroExecute.subst(microiop) + \
+        VectorIntWideningMacroDeclare.subst(iop) + \
+        VectorIntWideningMacroConstructor.subst(iop)
+
+    decode_block = VectorFloatWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorFloatWideningCvtFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    # vs2 advances one register per two micro-ops; old vd is source index 1.
+    old_vd_idx = 1
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx / 2]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    set_src_reg_idx += setSrcVm()
+    code = maskCondWrapper(code)
+    code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+    code = fflags_wrapper(code)
+    # NOTE(review): eiDeclarePrefix lacks widening=True here; confirm intent.
+    vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorFloatCvtMicroDeclare.subst(microiop) + \
+        VectorFloatMicroConstructor.subst(microiop) + \
+        VectorFloatWideningMicroExecute.subst(microiop) + \
+        VectorFloatCvtMacroDeclare.subst(iop) + \
+        VectorIntWideningMacroConstructor.subst(iop)
+
+    decode_block = VectorFloatWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorFloatNarrowingCvtFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    # vd advances one register per two micro-ops; old vd is source index 1.
+    old_vd_idx = 1
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx / 2]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx / 2]"
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    set_src_reg_idx += setSrcVm()
+    code = maskCondWrapper(code)
+    code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+    code = fflags_wrapper(code)
+    code = narrowingOpRegisterConstraintChecks(code)
+    # NOTE(review): eiDeclarePrefix lacks widening=True here; confirm intent.
+    vm_decl_rd = vmDeclAndReadData()
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorFloatCvtMicroDeclare.subst(microiop) + \
+        VectorFloatMicroConstructor.subst(microiop) + \
+        VectorFloatNarrowingMicroExecute.subst(microiop) + \
+        VectorFloatCvtMacroDeclare.subst(iop) + \
+        VectorIntWideningMacroConstructor.subst(iop)
+
+    decode_block = VectorFloatWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorFloatMaskFormat(code, category, *flags) {{
+    iop = InstObjParams(name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code},
+        flags)
+    dest_reg_id = "vecRegClass[VecMemInternalReg0 + _microIdx]"
+    src1_reg_id = ""
+    if category == "OPFVV":
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx]"
+    elif category == "OPFVF":
+        src1_reg_id = "floatRegClass[_machInst.rs1]"
+    else:
+        error("not supported category for VectorFloatMaskFormat: %s"
+              % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+    code = maskCondWrapper(code)  # innermost: per-element mask test
+    code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+    code = fflags_wrapper(code)  # outermost: FFLAGS read / update
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(2)},  # src1, src2, then old vd at index 2
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorFloatMaskMicroDeclare.subst(microiop) + \
+        VectorFloatMaskMicroConstructor.subst(microiop) + \
+        VectorFloatMaskMicroExecute.subst(microiop) + \
+        VectorFloatMaskMacroDeclare.subst(iop) + \
+        VectorFloatMaskMacroConstructor.subst(iop)
+    decode_block = VectorFloatDecodeBlock.subst(iop)
+}};
+
+def format VMvWholeFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VMvWholeMacroInst', {'code': code}, flags)
+    # The micro-op receives the same code snippet and flags as the macro-op.
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VMvWholeMicroInst',
+        {'code': code},
+        flags)
+    # Declarations, constructors and execute go to three separate outputs.
+    header_output = \
+        VMvWholeMacroDeclare.subst(iop) + \
+        VMvWholeMicroDeclare.subst(microiop)
+    decoder_output = \
+        VMvWholeMacroConstructor.subst(iop) + \
+        VMvWholeMicroConstructor.subst(microiop)
+    exec_output = VMvWholeMicroExecute.subst(microiop)
+    decode_block = BasicDecode.subst(iop)
+}};
+
+def format ViotaFormat(code, category, *flags){{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    # The tail of a vector mask instruction should be treated as
+    # tail-agnostic. We use the tail-undisturbed policy instead, since the
+    # test suites only support the undisturbed policy.
+    old_dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    # Sources are registered in order: vs2 first, then the old vd value.
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    vm_decl_rd = vmDeclAndReadData()
+    set_vm_idx = setSrcVm()
+    # The setSrcVm() result is passed as a separate 'set_vm_idx' snippet.
+    microiop = InstObjParams(name+"_micro",
+        Name+"Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'set_vm_idx': set_vm_idx,
+         'copy_old_vd': copyOldVd(1)},
+        flags)
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        ViotaMicroDeclare.subst(microiop) + \
+        ViotaMicroConstructor.subst(microiop) + \
+        ViotaMicroExecute.subst(microiop)+\
+        ViotaMacroDeclare.subst(iop) + \
+        ViotaMacroConstructor.subst(iop)
+
+    decode_block = VectorIntDecodeBlock.subst(iop)
+
+}};
+
+def format Vector1Vs1VdMaskFormat(code, category, *flags){{
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src2_reg_id = "vecRegClass[_machInst.vs2]"
+    # The tail of a vector mask instruction should be treated as
+    # tail-agnostic. We use the tail-undisturbed policy instead, since the
+    # test suites only support the undisturbed policy.
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    vm_decl_rd = vmDeclAndReadData()
+    set_vm_idx = setSrcVm()
+    iop = InstObjParams(name,
+        Name,
+        'VectorNonSplitInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'set_vm_idx': set_vm_idx,
+         'copy_old_vd': copyOldVd(1)},
+        flags)
+    # Because of the use of templates, all parts go in the header.
+    # NOTE(review): this uses the Rd-mask declare template -- confirm.
+    header_output = \
+        Vector1Vs1RdMaskDeclare.subst(iop) + \
+        Vector1Vs1VdMaskConstructor.subst(iop) + \
+        Vector1Vs1VdMaskExecute.subst(iop)
+
+    decode_block = VectorMaskDecodeBlock.subst(iop)
+}};
+
+def format Vector1Vs1RdMaskFormat(code, category, *flags){{
+    # Unpacking also checks that the mnemonic has the "<inst>_<suffix>" form.
+    mnemonic, variant = name.split("_", maxsplit=1)
+    snippets = {
+        'code': code,
+        'vm_decl_rd': vmDeclAndReadData(),
+        'set_vm_idx': setSrcVm(),
+    }
+    params = InstObjParams(name, Name, 'VectorNonSplitInst', snippets, flags)
+
+    # Because of the use of templates, the declaration, constructor and
+    # execute body are all emitted into the header.
+    header_output = ""
+    for piece in (Vector1Vs1RdMaskDeclare,
+                  Vector1Vs1RdMaskConstructor,
+                  Vector1Vs1RdMaskExecute):
+        header_output += piece.subst(params)
+
+    decode_block = VectorMaskDecodeBlock.subst(params)
+}};
+
+def format VectorNonSplitFormat(code, category, *flags) {{
+    # Only the mnemonic part before the first "_" selects the templates.
+    mnemonic, variant = name.split("_", maxsplit=1)
+    is_float_move = (mnemonic == "vfmv")
+
+    # The vfmv variant has its code passed through fflags_wrapper.
+    if is_float_move:
+        code = fflags_wrapper(code)
+
+    # Both mask-related snippets are left empty for this format.
+    snippets = {
+        'code': code,
+        'vm_decl_rd': "",
+        'set_vm_idx': "",
+    }
+    iop = InstObjParams(name, Name, 'VectorNonSplitInst', snippets, flags)
+
+    # Choose the execute and decode templates for the mnemonic; anything
+    # other than vfmv or vmv is rejected.
+    if is_float_move:
+        execute_block = VectorFloatNonSplitExecute.subst(iop)
+        decode_block = VectorFloatDecodeBlock.subst(iop)
+    elif mnemonic == "vmv":
+        execute_block = VectorIntNonSplitExecute.subst(iop)
+        decode_block = VectorIntDecodeBlock.subst(iop)
+    else:
+        error("Unsupported inst for VectorNonSplitFormat: %s" % mnemonic)
+
+    # Because of the use of templates, every part goes into the header.
+    header_output = (
+        VectorNonSplitDeclare.subst(iop) +
+        VectorNonSplitConstructor.subst(iop) +
+        execute_block
+    )
+}};
+
+def format VectorMaskFormat(code, category, *flags) {{
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    old_vd_idx = 2
+    if category not in ["OPMVV"]:
+        error("not supported category for VectorMaskFormat: %s" % category)
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src1_reg_id = "vecRegClass[_machInst.vs1]"
+    src2_reg_id = "vecRegClass[_machInst.vs2]"
+
+    # The tail of a vector mask instruction should be treated as
+    # tail-agnostic. We use the tail-undisturbed policy instead, since the
+    # test suites only support the undisturbed policy.
+    # TODO: remove it
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    # Sources are added as vs1, vs2, then old vd, matching old_vd_idx = 2.
+    set_src_reg_idx = ""
+    set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    # The loop wrapper is told this is not a micro-op (micro_inst = False).
+    code = loopWrapper(code, micro_inst = False)
+
+    iop = InstObjParams(name,
+        Name,
+        'VectorNonSplitInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorMaskDeclare.subst(iop) + \
+        VectorMaskConstructor.subst(iop) + \
+        VectorMaskExecute.subst(iop)
+
+    decode_block = VectorMaskDecodeBlock.subst(iop)
+}};
+
+def format VectorReduceIntFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src1_reg_id = "vecRegClass[_machInst.vs1]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    # Tail-undisturbed and tail-agnostic are treated the same, so the old
+    # vd is always added as a source register (after vs1 and vs2).
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+    type_def = '''
+        using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+        using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    '''
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'type_def': type_def,
+         'copy_old_vd': copyOldVd(2)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorReduceMicroDeclare.subst(microiop) + \
+        VectorReduceMicroConstructor.subst(microiop) + \
+        VectorReduceIntMicroExecute.subst(microiop) + \
+        VectorReduceMacroDeclare.subst(iop) + \
+        VectorReduceMacroConstructor.subst(iop)
+    decode_block = VectorIntDecodeBlock.subst(iop)
+}};
+
+def format VectorReduceFloatFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src1_reg_id = "vecRegClass[_machInst.vs1]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    # Tail-undisturbed and tail-agnostic are treated the same, so the old
+    # vd is always added as a source register (after vs1 and vs2).
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+    type_def = '''
+        using et = ElemType;
+        using vu = decltype(et::v);
+    '''
+    # Wrap only the micro-op code; iop above was built from the raw code.
+    code = fflags_wrapper(code)
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'type_def': type_def,
+         'copy_old_vd': copyOldVd(2)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorReduceMicroDeclare.subst(microiop) + \
+        VectorReduceMicroConstructor.subst(microiop) + \
+        VectorReduceFloatMicroExecute.subst(microiop) + \
+        VectorReduceMacroDeclare.subst(iop) + \
+        VectorReduceMacroConstructor.subst(iop)
+    decode_block = VectorFloatDecodeBlock.subst(iop)
+}};
+
+def format VectorReduceFloatWideningFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    dest_reg_id = "vecRegClass[_machInst.vd]"
+    src1_reg_id = "vecRegClass[_machInst.vs1]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    old_dest_reg_id = "vecRegClass[_machInst.vd]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    set_src_reg_idx = setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    # Tail-undisturbed and tail-agnostic are treated the same, so the old
+    # vd is always added as a source register (after vs1 and vs2).
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+    type_def = '''
+        using et = ElemType;
+        using vu [[maybe_unused]] = decltype(et::v);
+        using ewt = typename double_width<et>::type;
+        using vwu = decltype(ewt::v);
+    '''
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'type_def': type_def,
+         'copy_old_vd': copyOldVd(2)},
+        flags)
+    # NOTE(review): code is not run through fflags_wrapper here; confirm.
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorReduceMicroDeclare.subst(microiop) + \
+        VectorReduceMicroConstructor.subst(microiop) + \
+        VectorReduceFloatWideningMicroExecute.subst(microiop) + \
+        VectorReduceMacroDeclare.subst(iop) + \
+        VectorReduceMacroConstructor.subst(iop)
+    decode_block = VectorFloatWideningDecodeBlock.subst(iop)
+}};
+
+def format VectorIntVxsatFormat(code, category, *flags) {{
+    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    old_vd_idx = 2
+    dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
+    src1_reg_id = ""
+    if category in ["OPIVV"]:
+        src1_reg_id = "vecRegClass[_machInst.vs1 + _microIdx]"
+    elif category in ["OPIVX"]:
+        src1_reg_id = "intRegClass[_machInst.rs1]"
+    elif category == "OPIVI":
+        old_vd_idx = 1
+    else:
+        error("not supported category for VectorIntVxsatFormat: %s" % category)
+    src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
+    src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    # OPIVI adds no src1, so src3 moves from source position 2 to 1.
+    set_src_reg_idx = ""
+    if category != "OPIVI":
+        set_src_reg_idx += setSrcWrapper(src1_reg_id)
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    set_src_reg_idx += setSrcWrapper(src3_reg_id)
+    set_src_reg_idx += setSrcVm()
+    vm_decl_rd = vmDeclAndReadData()
+    # Apply the mask-condition, ei-declare and loop wrappers, in that order.
+    code = maskCondWrapper(code)
+    code = eiDeclarePrefix(code)
+    code = loopWrapper(code)
+
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        'VectorArithMicroInst',
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy.
+    header_output = \
+        VectorIntVxsatMicroDeclare.subst(microiop) + \
+        VectorIntVxsatMicroConstructor.subst(microiop) + \
+        VectorIntMicroExecute.subst(microiop) + \
+        VectorIntVxsatMacroDeclare.subst(iop) + \
+        VectorIntVxsatMacroConstructor.subst(iop)
+
+    decode_block = VectorIntDecodeBlock.subst(iop)
+}};
+
+def format VectorReduceIntWideningFormat(code, category, *flags) {{
+    macro_params = InstObjParams(name, Name, 'VectorArithMacroInst',
+                                 {'code': code}, flags)
+    # Unpacking requires the mnemonic to contain a "_" separator.
+    mnemonic, variant = name.split("_", maxsplit=1)
+    vd_id = "vecRegClass[_machInst.vd]"
+    dest_setup = setDestWrapper(vd_id)
+    # Sources are added as vs1, vs2 (offset by the micro-op index), the old
+    # vd, and finally the setSrcVm() snippet. Tail-undisturbed and
+    # tail-agnostic are treated the same, so the old vd is always a source.
+    src_setup = ""
+    for src_id in ("vecRegClass[_machInst.vs1]",
+                   "vecRegClass[_machInst.vs2 + _microIdx]",
+                   vd_id):
+        src_setup += setSrcWrapper(src_id)
+    src_setup += setSrcVm()
+
+    micro_snippets = {
+        'code': code,
+        'set_dest_reg_idx': dest_setup,
+        'set_src_reg_idx': src_setup,
+        'vm_decl_rd': vmDeclAndReadData(),
+        'copy_old_vd': copyOldVd(2),
+    }
+    micro_params = InstObjParams(name + "_micro", Name + "Micro",
+                                 'VectorArithMicroInst', micro_snippets, flags)
+
+    # Because of the use of templates, every part goes into the header.
+    header_output = ""
+    for piece in (VectorReduceMicroDeclare, VectorReduceMicroConstructor,
+                  VectorReduceIntWideningMicroExecute):
+        header_output += piece.subst(micro_params)
+    for piece in (VectorReduceMacroDeclare, VectorReduceMacroConstructor):
+        header_output += piece.subst(macro_params)
+    decode_block = VectorIntWideningDecodeBlock.subst(macro_params)
+}};
+
+let {{
+# Builds (header_output, decode_block) for the vector slide formats.
+def VectorSlideBase(name, Name, category, code, flags, macro_construtor,
+        decode_template, micro_execute_template):
+    macroop_class_name = 'VectorSlideMacroInst'
+    microop_class_name = 'VectorSlideMicroInst'
+    # Make sure flags are in lists (convert to lists if not).
+    flags = makeList(flags)
+    iop = InstObjParams(name, Name, macroop_class_name, {'code': code},
+                        flags)
+    inst_name, inst_suffix = name.split("_", maxsplit=1)
+    dest_reg_id = "vecRegClass[_machInst.vd + vdIdx]"
+    src2_reg_id = "vecRegClass[_machInst.vs2 + vs2Idx]"
+    src1_ireg_id = "intRegClass[_machInst.rs1]"
+    src1_freg_id = "floatRegClass[_machInst.rs1]"
+    # The tail should be treated as tail-agnostic. We use the
+    # tail-undisturbed policy instead, since the test suites only support
+    # the undisturbed policy, so the old vd is also added as a source.
+    # num_src_regs counts the sources added before the old vd; that count
+    # becomes old_vd_idx, the value handed to copyOldVd().
+    num_src_regs = 0
+    old_dest_reg_id = "vecRegClass[_machInst.vd + vdIdx]"
+    set_src_reg_idx = ""
+    if category in ["OPIVX", "OPMVX"]:
+        set_src_reg_idx += setSrcWrapper(src1_ireg_id)
+        num_src_regs += 1
+    elif category in ["OPFVF"]:
+        set_src_reg_idx += setSrcWrapper(src1_freg_id)
+        num_src_regs += 1
+    set_src_reg_idx += setSrcWrapper(src2_reg_id)
+    num_src_regs += 1
+    old_vd_idx = num_src_regs
+    set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
+    set_dest_reg_idx = setDestWrapper(dest_reg_id)
+    vm_decl_rd = vmDeclAndReadData()
+    set_src_reg_idx += setSrcVm()
+    microiop = InstObjParams(name + "_micro",
+        Name + "Micro",
+        microop_class_name,
+        {'code': code,
+         'set_dest_reg_idx': set_dest_reg_idx,
+         'set_src_reg_idx': set_src_reg_idx,
+         'vm_decl_rd': vm_decl_rd,
+         'copy_old_vd': copyOldVd(old_vd_idx)},
+        flags)
+    # Because of the use of templates, we had to put all parts in header to
+    # keep the compiler happy. The caller supplies the micro-op execute
+    # template and the macro-op constructor template; the remaining
+    # templates are the same for every caller.
+    header_output = \
+        VectorSlideMicroDeclare.subst(microiop) + \
+        VectorSlideMicroConstructor.subst(microiop) + \
+        micro_execute_template.subst(microiop) + \
+        VectorSlideMacroDeclare.subst(iop) + \
+        macro_construtor.subst(iop)
+
+    decode_block = decode_template.subst(iop)
+    return (header_output, decode_block)
+
+}};
+
+def format VectorSlideUpFormat(code, category, *flags) {{
+    # Delegates to VectorSlideBase with the slide-up and integer templates.
+    header_output, decode_block = VectorSlideBase(name, Name, category, code,
+        flags, macro_construtor=VectorSlideUpMacroConstructor,
+        decode_template=VectorIntDecodeBlock,
+        micro_execute_template=VectorSlideMicroExecute)
+}};
+
+def format VectorSlideDownFormat(code, category, *flags) {{
+    # Delegates to VectorSlideBase with the slide-down and integer templates.
+    header_output, decode_block = VectorSlideBase(name, Name, category, code,
+        flags, macro_construtor=VectorSlideDownMacroConstructor,
+        decode_template=VectorIntDecodeBlock,
+        micro_execute_template=VectorSlideMicroExecute)
+}};
+
+def format VectorFloatSlideUpFormat(code, category, *flags) {{
+    # Delegates to VectorSlideBase with the slide-up and float templates.
+    header_output, decode_block = VectorSlideBase(name, Name, category, code,
+        flags, macro_construtor=VectorSlideUpMacroConstructor,
+        decode_template=VectorFloatDecodeBlock,
+        micro_execute_template=VectorFloatSlideMicroExecute)
+}};
+
+def format VectorFloatSlideDownFormat(code, category, *flags) {{
+    # Delegates to VectorSlideBase with the slide-down and float templates.
+    header_output, decode_block = VectorSlideBase(name, Name, category, code,
+        flags, macro_construtor=VectorSlideDownMacroConstructor,
+        decode_template=VectorFloatDecodeBlock,
+        micro_execute_template=VectorFloatSlideMicroExecute)
+}};
diff --git a/src/arch/riscv/isa/formats/vector_conf.isa b/src/arch/riscv/isa/formats/vector_conf.isa
new file mode 100644
index 0000000000..556e230075
--- /dev/null
+++ b/src/arch/riscv/isa/formats/vector_conf.isa
@@ -0,0 +1,96 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+def format VConfOp(code, *flags) {{
+    conf_iop = InstObjParams(name, Name, 'VConfOp', code, flags)
+    header_output = BasicDeclare.subst(conf_iop)       # class declaration
+    decoder_output = BasicConstructor.subst(conf_iop)  # constructor
+    decode_block = BasicDecode.subst(conf_iop)         # decode case
+    exec_output = VConfExecute.subst(conf_iop)         # execute() body
+}};
+
+def template VConfExecute {{
+    Fault
+    %(class_name)s::execute(ExecContext *xc,
+        trace::InstRecord *traceData) const
+    {
+        auto tc = xc->tcBase();
+        // requested_vtype and requested_vl presumably come from the code below.
+        %(op_decl)s;
+        %(op_rd)s;
+        %(code)s;
+        // vstart is reset to zero by every execution of this instruction.
+        tc->setMiscReg(MISCREG_VSTART, 0);
+        // VLENB is read from a misc register and scaled by 8 to give vlen.
+        uint32_t vlen = xc->readMiscReg(MISCREG_VLENB) * 8;
+        uint32_t vlmax = getVlmax(xc->readMiscReg(MISCREG_VTYPE), vlen);
+        // vlmax and the vill check are redone only when vtype changes.
+        VTYPE new_vtype = requested_vtype;
+        if (xc->readMiscReg(MISCREG_VTYPE) != new_vtype) {
+            vlmax = getVlmax(new_vtype, vlen);
+
+            float vflmul = getVflmul(new_vtype.vlmul);
+
+            uint32_t sew = getSew(new_vtype.vsew);
+            // Illegal: LMUL out of range, SEW too wide, or bits 30:8 set.
+            uint32_t new_vill =
+                !(vflmul >= 0.125 && vflmul <= 8) ||
+                    sew > std::min(vflmul, 1.0f) * ELEN ||
+                    bits(requested_vtype, 30, 8) != 0;
+            if (new_vill) {
+                vlmax = 0;
+                new_vtype = 0;
+                new_vtype.vill = 1;
+            }
+
+            xc->setMiscReg(MISCREG_VTYPE, new_vtype);
+        }
+        // Select the new vl from the rd/rs1 cases, capped at vlmax.
+        uint32_t current_vl = xc->readMiscReg(MISCREG_VL);
+        uint32_t new_vl = 0;
+        if (vlmax == 0) {
+            new_vl = 0;
+        } else if (rd_bits == 0 && rs1_bits == 0) {
+            new_vl = current_vl > vlmax ? vlmax : current_vl;
+        } else if (rd_bits != 0 && rs1_bits == 0) {
+            new_vl = vlmax;
+        } else if (rs1_bits != 0) {
+            new_vl = requested_vl > vlmax ? vlmax : requested_vl;
+        }
+
+        xc->setMiscReg(MISCREG_VL, new_vl);
+        // Hand the new vl and vtype to the decoder as well.
+        tc->getDecoderPtr()->as<Decoder>().setVlAndVtype(new_vl, new_vtype);
+
+        Rd = new_vl;
+
+        %(op_wb)s;
+        return NoFault;
+    }
+}};
diff --git a/src/arch/riscv/isa/formats/vector_mem.isa b/src/arch/riscv/isa/formats/vector_mem.isa
new file mode 100644
index 0000000000..113250d5cf
--- /dev/null
+++ b/src/arch/riscv/isa/formats/vector_mem.isa
@@ -0,0 +1,205 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+let {{
+# Builds the four output strings used by the vector memory formats.
+def VMemBase(name, Name, ea_code, memacc_code, mem_flags,
+                   inst_flags, base_class, postacc_code='',
+                   declare_template_base=VMemMacroDeclare,
+                   decode_template=BasicDecode, exec_template_base='',
+                   # If it's a macroop, the corresponding microops will be
+                   # generated.
+                   is_macroop=True):
+    # Make sure flags are in lists (convert to lists if not).
+    mem_flags = makeList(mem_flags)
+    inst_flags = makeList(inst_flags)
+    iop = InstObjParams(name, Name, base_class,
+        {'ea_code': ea_code,
+         'memacc_code': memacc_code,
+         'postacc_code': postacc_code },
+        inst_flags)
+    # Templates are looked up by name: exec_template_base plus a suffix.
+    constructTemplate = eval(exec_template_base + 'Constructor')
+    # With VMemTemplateMacroDeclare the constructor goes in the header.
+    header_output   = declare_template_base.subst(iop)
+    decoder_output  = ''
+    if declare_template_base is not VMemTemplateMacroDeclare:
+        decoder_output  += constructTemplate.subst(iop)
+    else:
+        header_output   += constructTemplate.subst(iop)
+    decode_block    = decode_template.subst(iop)
+    exec_output     = ''
+    if not is_macroop:
+        return (header_output, decoder_output, decode_block, exec_output)
+    # Macro-ops also get a micro-op class built from the same snippets.
+    microiop = InstObjParams(name + '_micro',
+        Name + 'Micro',
+        exec_template_base + 'MicroInst',
+        {'ea_code': ea_code,
+         'memacc_code': memacc_code,
+         'postacc_code': postacc_code},
+        inst_flags)
+    # Append the memory access flags to the micro-op constructor code.
+    if mem_flags:
+        mem_flags = [ 'Request::%s' % flag for flag in mem_flags ]
+        s = '\n\tmemAccessFlags = ' + '|'.join(mem_flags) + ';'
+        microiop.constructor += s
+
+    microDeclTemplate = eval(exec_template_base + 'Micro' + 'Declare')
+    microExecTemplate = eval(exec_template_base + 'Micro' + 'Execute')
+    microInitTemplate = eval(exec_template_base + 'Micro' + 'InitiateAcc')
+    microCompTemplate = eval(exec_template_base + 'Micro' + 'CompleteAcc')
+    header_output = microDeclTemplate.subst(microiop) + header_output
+    micro_exec_output = (microExecTemplate.subst(microiop) +
+        microInitTemplate.subst(microiop) +
+        microCompTemplate.subst(microiop))
+    if declare_template_base is not VMemTemplateMacroDeclare:
+        exec_output += micro_exec_output
+    else:
+        header_output += micro_exec_output
+
+    return (header_output, decoder_output, decode_block, exec_output)
+
+}};
+
+def format VleOp(
+    memacc_code,
+    ea_code={{ EA = Rs1 + VLENB * microIdx; }},
+    mem_flags=[], inst_flags=[]
+) {{
+    # Default EA: Rs1 plus VLENB times the micro-op index.
+    outputs = VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                       'VleMacroInst', exec_template_base='Vle')
+    header_output, decoder_output, decode_block, exec_output = outputs
+}};
+
+def format VseOp(
+    memacc_code,
+    ea_code={{ EA = Rs1 + VLENB * microIdx; }},
+    mem_flags=[], inst_flags=[]
+) {{
+    # Default EA: Rs1 plus VLENB times the micro-op index.
+    outputs = VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                       'VseMacroInst', exec_template_base='Vse')
+    header_output, decoder_output, decode_block, exec_output = outputs
+}};
+
+def format VlmOp(
+    memacc_code,
+    ea_code={{ EA = Rs1; }},
+    mem_flags=[], inst_flags=[]
+) {{
+    # is_macroop=False: VMemBase emits no micro-op classes for this format.
+    outputs = VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                       'VleMacroInst', exec_template_base='Vlm', is_macroop=False)
+    header_output, decoder_output, decode_block, exec_output = outputs
+}};
+
+def format VsmOp(
+    memacc_code,
+    ea_code={{ EA = Rs1; }},
+    mem_flags=[], inst_flags=[]
+) {{
+    # is_macroop=False: VMemBase emits no micro-op classes for this format.
+    outputs = VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                       'VseMacroInst', exec_template_base='Vsm', is_macroop=False)
+    header_output, decoder_output, decode_block, exec_output = outputs
+}};
+
+def format VlWholeOp(
+    memacc_code,
+    ea_code={{ EA = Rs1 + VLENB * microIdx; }},
+    mem_flags=[], inst_flags=[]
+) {{
+    # Default EA: Rs1 plus VLENB times the micro-op index.
+    outputs = VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                       'VlWholeMacroInst', exec_template_base='VlWhole')
+    header_output, decoder_output, decode_block, exec_output = outputs
+}};
+
+def format VsWholeOp(
+    memacc_code,
+    ea_code={{ EA = Rs1 + VLENB * microIdx; }},
+    mem_flags=[], inst_flags=[]
+) {{
+    # Default EA: Rs1 plus VLENB times the micro-op index.
+    outputs = VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                       'VsWholeMacroInst', exec_template_base='VsWhole')
+    header_output, decoder_output, decode_block, exec_output = outputs
+}};
+
+def format VlStrideOp(
+    memacc_code,
+    ea_code={{ EA = Rs1 + Rs2 * (regIdx * VLENB / elem_size + microIdx); }},
+    mem_flags=[], inst_flags=[]
+) {{
+    # Default EA: Rs1 plus Rs2 times (regIdx * VLENB / elem_size + microIdx).
+    outputs = VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                       'VlStrideMacroInst', exec_template_base='VlStride')
+    header_output, decoder_output, decode_block, exec_output = outputs
+}};
+
+def format VsStrideOp(
+    memacc_code,
+    ea_code={{ EA = Rs1 + Rs2 * (regIdx * VLENB / elem_size + microIdx); }},
+    mem_flags=[], inst_flags=[]
+) {{
+    # Default EA: Rs1 plus Rs2 times (regIdx * VLENB / elem_size + microIdx).
+    outputs = VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                       'VsStrideMacroInst', exec_template_base='VsStride')
+    header_output, decoder_output, decode_block, exec_output = outputs
+}};
+
+def format VlIndexOp(
+    memacc_code,
+    ea_code,
+    mem_flags=[], inst_flags=[]
+) {{
+    # Passing VMemTemplateMacroDeclare makes VMemBase put the constructor
+    # and the micro-op execute code into the header output.
+    outputs = VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                       'VlIndexMacroInst', exec_template_base='VlIndex',
+                       declare_template_base=VMemTemplateMacroDeclare,
+                       decode_template=VMemTemplateDecodeBlock)
+    header_output, decoder_output, decode_block, exec_output = outputs
+}};
+
+def format VsIndexOp(
+    memacc_code,
+    ea_code,
+    mem_flags=[], inst_flags=[]
+) {{
+    # Passing VMemTemplateMacroDeclare makes VMemBase put the constructor
+    # and the micro-op execute code into the header output.
+    outputs = VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags,
+                       'VsIndexMacroInst', exec_template_base='VsIndex',
+                       declare_template_base=VMemTemplateMacroDeclare,
+                       decode_template=VMemTemplateDecodeBlock)
+    header_output, decoder_output, decode_block, exec_output = outputs
+}};
diff --git a/src/arch/riscv/isa/includes.isa b/src/arch/riscv/isa/includes.isa
index 8dddc2fb59..76f2388faf 100644
--- a/src/arch/riscv/isa/includes.isa
+++ b/src/arch/riscv/isa/includes.isa
@@ -34,6 +34,7 @@
 //
 
 output header {{
+#include <functional>
 #include <iomanip>
 #include <sstream>
 #include <string>
@@ -45,6 +46,8 @@ output header {{
 #include <softfloat.h>
 #include <specialize.h>
 
+#include "arch/generic/memhelpers.hh"
+#include "arch/riscv/decoder.hh"
 #include "arch/riscv/insts/amo.hh"
 #include "arch/riscv/insts/bs.hh"
 #include "arch/riscv/insts/compressed.hh"
@@ -53,6 +56,7 @@ output header {{
 #include "arch/riscv/insts/standard.hh"
 #include "arch/riscv/insts/static_inst.hh"
 #include "arch/riscv/insts/unknown.hh"
+#include "arch/riscv/insts/vector.hh"
 #include "arch/riscv/interrupts.hh"
 #include "cpu/static_inst.hh"
 #include "mem/packet.hh"
@@ -66,9 +70,15 @@ output decoder {{
 #include <limits>
 #include <string>
 
+/* riscv softfloat library */
+#include <internals.h>
+#include <softfloat.h>
+#include <specialize.h>
+
 #include "arch/riscv/decoder.hh"
 #include "arch/riscv/faults.hh"
 #include "arch/riscv/mmu.hh"
+#include "arch/riscv/regs/float.hh"
 #include "base/cprintf.hh"
 #include "base/loader/symtab.hh"
 #include "cpu/thread_context.hh"
@@ -95,6 +105,7 @@ output exec {{
 #include "arch/riscv/reg_abi.hh"
 #include "arch/riscv/regs/float.hh"
 #include "arch/riscv/regs/misc.hh"
+#include "arch/riscv/regs/vector.hh"
 #include "arch/riscv/utility.hh"
 #include "base/condcodes.hh"
 #include "cpu/base.hh"
diff --git a/src/arch/riscv/isa/main.isa b/src/arch/riscv/isa/main.isa
index 24f366b00c..2923a965da 100644
--- a/src/arch/riscv/isa/main.isa
+++ b/src/arch/riscv/isa/main.isa
@@ -50,6 +50,9 @@ namespace RiscvISA;
 //Include the operand_types and operand definitions
 ##include "operands.isa"
 
+//Include the definitions for the instruction templates
+##include "templates/templates.isa"
+
 //Include the definitions for the instruction formats
 ##include "formats/formats.isa"
 
diff --git a/src/arch/riscv/isa/operands.isa b/src/arch/riscv/isa/operands.isa
index 72d8f81bca..a81b28df57 100644
--- a/src/arch/riscv/isa/operands.isa
+++ b/src/arch/riscv/isa/operands.isa
@@ -38,7 +38,15 @@ def operand_types {{
     'sd' : 'int64_t',
     'ud' : 'uint64_t',
     'sf' : 'float',
-    'df' : 'double'
+    'df' : 'double',
+
+    'vi'    : 'vi',
+    'vu'    : 'vu',
+    'vwi'   : 'vwi',
+    'vwu'   : 'vwu',
+    'vext'  : 'vext',
+    'vextu' : 'vextu',
+    'vc'    : 'RiscvISA::VecRegContainer'
 }};
 
 let {{
@@ -79,6 +87,11 @@ def operands {{
     'Fp2': FloatRegOp('df', 'FP2 + 8', 'IsFloating', 2),
     'Fp2_bits': FloatRegOp('ud', 'FP2 + 8', 'IsFloating', 2),
 
+    'Vd':  VecRegOp('vc', 'VD', 'IsVector', 1),
+    'Vs1': VecRegOp('vc', 'VS1', 'IsVector', 2),
+    'Vs2': VecRegOp('vc', 'VS2', 'IsVector', 3),
+    'Vs3': VecRegOp('vc', 'VS3', 'IsVector', 4),
+
 #Memory Operand
     'Mem': MemOp('ud', None, (None, 'IsLoad', 'IsStore'), 5),
 
diff --git a/src/arch/riscv/isa/templates/templates.isa b/src/arch/riscv/isa/templates/templates.isa
new file mode 100644
index 0000000000..ed3f5287c0
--- /dev/null
+++ b/src/arch/riscv/isa/templates/templates.isa
@@ -0,0 +1,32 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+// Include the vector memory and vector arithmetic instruction templates
+##include "vector_mem.isa"
+##include "vector_arith.isa"
diff --git a/src/arch/riscv/isa/templates/vector_arith.isa b/src/arch/riscv/isa/templates/vector_arith.isa
new file mode 100644
index 0000000000..d15ab70f20
--- /dev/null
+++ b/src/arch/riscv/isa/templates/vector_arith.isa
@@ -0,0 +1,1989 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+output header {{
+// Byte Vd[idx / 8] with bit (idx mod 8) cleared, then OR-ed with the new bit.
+#define ASSIGN_VD_BIT(idx, bit) \
+    ((Vd[(idx)/8] & ~(1 << (idx)%8)) | ((bit) << (idx)%8))
+// Read source operand idx into old_vd and copy VLENB bytes of it into Vd.
+#define COPY_OLD_VD(idx)                                             \
+    [[maybe_unused]] RiscvISA::vreg_t old_vd;                        \
+    [[maybe_unused]] decltype(Vd) old_Vd = nullptr;                  \
+    xc->getRegOperand(this, (idx), &old_vd);                           \
+    old_Vd = old_vd.as<std::remove_reference_t<decltype(Vd[0])> >(); \
+    memcpy(Vd, old_Vd, VLENB);
+// Load FRM into softfloat_roundingMode; FRM above 4 is an illegal-inst fault.
+#define VRM_REQUIRED                                                         \
+        uint_fast8_t frm = xc->readMiscReg(MISCREG_FRM);                     \
+        if (frm > 4)                                                         \
+            return std::make_shared<IllegalInstFault>("RM fault", machInst); \
+        softfloat_roundingMode = frm;
+
+// True when the unsigned sum a + b + carry_in wraps around the type width.
+template<typename Type>
+inline bool
+carry_out(Type a, Type b, bool carry_in = false)
+{
+    using TypeU = std::make_unsigned_t<Type>;
+    const TypeU ua = static_cast<TypeU>(a);
+    const TypeU sum = ua + static_cast<TypeU>(b) + carry_in;
+    return carry_in ? sum <= ua : sum < ua;
+}
+
+// True when the unsigned difference a - b - borrow_in needs a borrow.
+template<typename Type>
+inline bool
+borrow_out(Type a, Type b, bool borrow_in = false)
+{
+    using TypeU = std::make_unsigned_t<Type>;
+    const TypeU ua = static_cast<TypeU>(a), ub = static_cast<TypeU>(b);
+    return borrow_in ? ua <= ub : ua < ub;
+}
+}};
+
+def template VectorIntMacroDeclare {{
+// Integer vector macro-op class; disassembly is inherited from the base.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntMacroConstructor {{
+// Builds the micro-op list: each micro-op gets at most micro_vlmax elements.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // When micro_vl is 0 the loop adds nothing, so push a single nop instead.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst, micro_vl, i);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+}};
+
+def template VectorIntMicroDeclare {{
+// Integer vector micro-op class; execute() is only declared here.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs1, vs2, vs3(old_vd), vm for *.vv, *.vx
+    // vs2, (old_vd), vm for *.vi
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntMicroConstructor {{
+// Stores vm, zeroes the operand counts, then expands the reg-index setup.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+                                         uint8_t _microVl, uint8_t _microIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorIntMicroExecute {{
+// execute(): faults when vill is set, otherwise runs the substituted code.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    [[maybe_unused]] constexpr size_t sew = sizeof(vu) * 8;  // bits
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
+
+def template VectorIntExtMacroDeclare {{
+// Macro-op class; prints mnemonic, dest 0, source 0, and v0.t when vm is 0.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    std::string generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const override
+    {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+            << registerName(srcRegIdx(0));
+        if (machInst.vm == 0) ss << ", v0.t";
+        return ss.str();
+    }
+};
+
+}};
+
+def template VectorIntExtMicroDeclare {{
+// Micro-op class; prints mnemonic, dest 0, source 0, and v0.t when vm is 0.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[3];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    std::string generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const override
+    {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+            << registerName(srcRegIdx(0));
+        if (machInst.vm == 0) ss << ", v0.t";
+        return ss.str();
+    }
+};
+
+}};
+
+def template VectorIntExtMicroExecute {{
+// execute(): the source element type is picked from SEW / ext_div at run time.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    auto SEW = vtype_SEW(vtype);
+    auto offset = (VLEN / SEW) * (microIdx % %(ext_div)d);
+    switch (SEW / %(ext_div)d) {
+      case 8: {
+        using vext  [[maybe_unused]] = int8_t;
+        using vextu [[maybe_unused]] = uint8_t;
+        %(op_decl)s;
+        %(op_rd)s;
+        %(vm_decl_rd)s;
+        %(copy_old_vd)s;
+        %(code)s;
+        %(op_wb)s;
+        break;
+      }
+      case 16: {
+        using vext  [[maybe_unused]] = int16_t;
+        using vextu [[maybe_unused]] = uint16_t;
+        %(op_decl)s;
+        %(op_rd)s;
+        %(vm_decl_rd)s;
+        %(copy_old_vd)s;
+        %(code)s;
+        %(op_wb)s;
+        break;
+      }
+      case 32: {
+        using vext  [[maybe_unused]] = int32_t;
+        using vextu [[maybe_unused]] = uint32_t;
+        %(op_decl)s;
+        %(op_rd)s;
+        %(vm_decl_rd)s;
+        %(copy_old_vd)s;
+        %(code)s;
+        %(op_wb)s;
+        break;
+      }
+      default:
+        // No supported source width: fault instead of silently doing nothing.
+        return std::make_shared<IllegalInstFault>(
+            "Unsupported source EEW", machInst);
+    }
+    return NoFault;
+}
+}};
+
+def template VectorIntDecodeBlock {{
+// Instantiate the class with the unsigned type matching vsew (8 to 64 bits).
+switch(machInst.vtype8.vsew) {
+case 0b000: return new %(class_name)s<uint8_t>(machInst);
+case 0b001: return new %(class_name)s<uint16_t>(machInst);
+case 0b010: return new %(class_name)s<uint32_t>(machInst);
+case 0b011: return new %(class_name)s<uint64_t>(machInst);
+default: GEM5_UNREACHABLE;
+}
+
+}};
+
+def template VectorIntWideningMacroDeclare {{
+// Widening integer macro-op class; disassembly is inherited from the base.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntWideningMacroConstructor {{
+// Widening macro-op: micro-ops take half of VLMAX each unless vlmul < 0.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const int64_t vlmul = vtype_vlmul(_machInst.vtype8);
+    // TODO: move this check to the decode template
+    panic_if(vlmul == 3, "LMUL=8 is illegal for widening inst");
+    // With LMUL set to m1 the work must be split into 2 micro-ops
+    const uint32_t num_microops = 1 << std::max<int64_t>(0, vlmul + 1);
+
+    int32_t tmp_vl = this->vl;
+    const int32_t t_micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    const int32_t micro_vlmax = vlmul < 0 ? t_micro_vlmax : t_micro_vlmax / 2;
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // When micro_vl is 0 the loop adds nothing, so push a single nop instead.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst, micro_vl, i);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorIntWideningMicroDeclare {{
+// Widening integer micro-op class; execute() is only declared here.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs1, vs2, vs3(old_vd), vm for *.vv, *.vx
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntWideningMicroConstructor {{
+// Stores vm, zeroes the operand counts, then expands the reg-index setup.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+        uint8_t _microVl, uint8_t _microIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorIntWideningMicroExecute {{
+// Widening execute(): odd micro-ops use an element offset of micro_vlmax.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    using vwu [[maybe_unused]] = typename double_width<vu>::type;
+    using vwi [[maybe_unused]] = typename double_width<vi>::type;
+    [[maybe_unused]] constexpr size_t sew = sizeof(vu) * 8;  // bits
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    const int64_t vlmul = vtype_vlmul(machInst.vtype8);
+    const int32_t t_micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const int32_t micro_vlmax = vlmul < 0 ? t_micro_vlmax : t_micro_vlmax / 2;
+    [[maybe_unused]] const size_t offset =
+        (this->microIdx % 2 == 0) ? 0 : micro_vlmax;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorIntNarrowingMicroExecute {{
+// Narrowing execute(): odd micro-ops use an element offset of micro_vlmax.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    using vwu [[maybe_unused]] = typename double_width<vu>::type;
+    using vwi [[maybe_unused]] = typename double_width<vi>::type;
+    [[maybe_unused]] constexpr size_t sew = sizeof(vu) * 8;  // bits
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    const int64_t vlmul = vtype_vlmul(machInst.vtype8);
+    const int32_t t_micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const int32_t micro_vlmax = vlmul < 0 ? t_micro_vlmax : t_micro_vlmax / 2;
+    [[maybe_unused]] const size_t offset =
+        (this->microIdx % 2 == 0) ? 0 : micro_vlmax;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorIntWideningDecodeBlock {{
+// Only 8/16/32-bit elements are decoded; other vsew values are unreachable.
+switch(machInst.vtype8.vsew) {
+case 0b000: return new %(class_name)s<uint8_t>(machInst);
+case 0b001: return new %(class_name)s<uint16_t>(machInst);
+case 0b010: return new %(class_name)s<uint32_t>(machInst);
+default: GEM5_UNREACHABLE;
+}
+
+}};
+
+def template VectorFloatMacroDeclare {{
+// Floating-point macro-op class; disassembly is inherited from the base.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorFloatMacroConstructor {{
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // One micro-op per micro_vlmax elements; a lone nop when micro_vl is 0.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst, micro_vl, i);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+}};
+
+def template VectorFloatMicroDeclare {{
+// Floating-point micro-op class; execute() is only declared here.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs1, vs2, vs3(old_vd), vm
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst,
+        uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorFloatMicroConstructor {{
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+                                         uint8_t _microVl, uint8_t _microIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;   // operand counts restart before the index setup below
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorFloatMicroExecute {{
+// FP execute(): checks vill and FRM, then runs the substituted code.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu [[maybe_unused]] = decltype(et::v);
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    VRM_REQUIRED;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
+
+def template VectorFloatDecodeBlock {{
+// Only vsew 0b010 (float32_t) and 0b011 (float64_t) are instantiated.
+switch(machInst.vtype8.vsew) {
+case 0b010: return new %(class_name)s<float32_t>(machInst);
+case 0b011: return new %(class_name)s<float64_t>(machInst);
+default: GEM5_UNREACHABLE;
+}
+
+}};
+
+def template VectorFloatCvtMacroDeclare {{
+// Macro-op class; prints mnemonic, dest 0, source 0, and v0.t when vm is 0.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    std::string generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const override
+    {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+            << registerName(srcRegIdx(0));
+        if (machInst.vm == 0) ss << ", v0.t";
+        return ss.str();
+    }
+};
+
+}};
+
+def template VectorFloatCvtMicroDeclare {{
+// Micro-op class; prints mnemonic, dest 0, source 0, and v0.t when vm is 0.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[3];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst,
+        uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    std::string generateDisassembly(Addr pc,
+        const loader::SymbolTable *symtab) const override
+    {
+        std::stringstream ss;
+        ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+            << registerName(srcRegIdx(0));
+        if (machInst.vm == 0) ss << ", v0.t";
+        return ss.str();
+    }
+};
+
+}};
+
+
+def template VectorFloatWideningMicroExecute {{
+// Widening FP execute(): odd micro-ops use an element offset of micro_vlmax.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu [[maybe_unused]] = decltype(et::v);
+    using ewt = typename double_width<et>::type;
+    using vwu = decltype(ewt::v);
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    VRM_REQUIRED;
+
+    const int64_t vlmul = vtype_vlmul(machInst.vtype8);
+    const int32_t t_micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const int32_t micro_vlmax = vlmul < 0 ? t_micro_vlmax : t_micro_vlmax / 2;
+    [[maybe_unused]] const size_t offset =
+        (this->microIdx % 2 == 0) ? 0 : micro_vlmax;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorFloatNarrowingMicroExecute {{
+// Narrowing FP execute(): odd micro-ops use an element offset of micro_vlmax.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu [[maybe_unused]] = decltype(et::v);
+    using ewt = typename double_width<et>::type;
+    using vwu = decltype(ewt::v);
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    VRM_REQUIRED;
+
+    const int64_t vlmul = vtype_vlmul(machInst.vtype8);
+    const int32_t t_micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const int32_t micro_vlmax = vlmul < 0 ? t_micro_vlmax : t_micro_vlmax / 2;
+    [[maybe_unused]] const size_t offset =
+        (this->microIdx % 2 == 0) ? 0 : micro_vlmax;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorFloatWideningDecodeBlock {{
+// Only vsew 0b010 (float32_t) is decoded; other values are unreachable.
+switch(machInst.vtype8.vsew) {
+case 0b010: return new %(class_name)s<float32_t>(machInst);
+default: GEM5_UNREACHABLE;
+}
+
+}};
+
+def template ViotaMacroDeclare {{
+// Viota macro-op class; it owns the cnt counter handed to its micro-ops.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    int cnt = 0;  // passed by pointer to every micro-op in the constructor
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+
+def template ViotaMacroConstructor {{
+// Every micro-op receives a pointer to the cnt member of this macro-op.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+
+    StaticInstPtr microop;
+
+    // Allow one empty micro op to hold IsLastMicroop flag
+    for (int i = 0; i < num_microops && micro_vl >= 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst, micro_vl, i,
+            &cnt);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template ViotaMicroDeclare {{
+// Viota micro-op class; it keeps the cnt pointer given to its constructor.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+    int* cnt;
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx, int* cnt);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template ViotaMicroConstructor {{
+// Stores vm and cnt, runs the reg-index setup, then appends vs2 as a source.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+    uint8_t _microVl, uint8_t _microIdx, int* cnt)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    this->cnt = cnt;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+    setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2]);
+}
+
+}};
+
+def template ViotaMicroExecute {{
+// execute(): faults when vill is set, otherwise runs the substituted code.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+
+def template Vector1Vs1VdMaskConstructor {{
+// Stores vm, then expands the dest, source and mask register-index setup.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+    %(set_vm_idx)s;
+}
+
+}};
+
+def template Vector1Vs1VdMaskExecute {{
+// execute(): vu is fixed to uint8_t here, whatever ElemType is.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu = uint8_t;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template Vector1Vs1RdMaskDeclare {{
+// Single (non-micro-op) instruction class with two source slots, one dest.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    RegId srcRegIdxArr[2];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template Vector1Vs1RdMaskConstructor {{
+// Stores vm, then expands the generated constructor body and mask setup.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    %(set_vm_idx)s;
+}
+
+}};
+
+def template Vector1Vs1RdMaskExecute {{
+// execute(): the scalar destination is declared here and starts at zero.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    %(op_rd)s;
+    uint64_t Rd = 0;
+    %(vm_decl_rd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorIntMaskMacroDeclare {{
+// Integer mask macro-op class; disassembly is inherited from the base.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntMaskMacroConstructor {{
+// Per-chunk micro-ops followed by one VMaskMergeMicroInst targeting vd.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // When micro_vl is 0 the loop adds nothing, so push a single nop instead.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst, micro_vl, i);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    microop = new VMaskMergeMicroInst<ElemType>(_machInst, _machInst.vd,
+        this->microops.size());
+    this->microops.push_back(microop);  // merge micro-op is always last
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorIntMaskMicroDeclare {{
+// Integer mask micro-op class; execute() is only declared here.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs1(rs1), vs2, old_vd, v0 for *.vv[m] or *.vx[m]
+    // vs2, old_vd, v0 for *.vi[m]
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst,
+                   uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VectorIntMaskMicroConstructor {{
+// Stores vm, zeroes the operand counts, then expands the reg-index setup.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+                                         uint8_t _microVl, uint8_t _microIdx)
+: %(base_class)s("%(mnemonic)s", _machInst,
+                 %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorIntMaskMicroExecute {{
+// execute(): faults if vill is set, else runs the generated op code.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    // offset = (VLENB / element size) * microIdx; presumably for the code below.
+    constexpr uint16_t bit_offset = VLENB / sizeof(ElemType);
+    const uint16_t offset = bit_offset * microIdx;
+
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorFloatMaskMacroDeclare {{
+// Macro-op class declaration: only a constructor, no execute() of its own.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;  // keep base overloads
+};
+
+}};
+
+def template VectorFloatMaskMacroConstructor {{
+// Macro-op constructor: expands the instruction into its micro-op list.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;  // elements not yet given to a micro-op
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // micro_vl == 0: push a single nop; the loop below then never runs.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst, micro_vl, i);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    microop = new VMaskMergeMicroInst<ElemType>(_machInst, _machInst.vd,
+        this->microops.size());
+    this->microops.push_back(microop);
+    // The merge micro-op is appended unconditionally, so it is always last.
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorFloatMaskMicroDeclare {{
+// Micro-op class: built from (machInst, microVl, microIdx); has execute().
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs1(rs1), vs2, old_vd, v0 for *.vv or *.vf
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst,
+                   uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;  // keep base overloads
+};
+
+}};
+
+def template VectorFloatMaskMicroConstructor {{
+// Micro-op constructor: stores vm, then runs the generated operand setup.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+                                         uint8_t _microVl, uint8_t _microIdx)
+: %(base_class)s("%(mnemonic)s", _machInst,
+                 %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;  // both counters are zeroed ahead of the generated setup
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorFloatMaskMicroExecute {{
+// execute(): faults if vill is set, else runs the generated op code.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu = decltype(et::v);  // type of ElemType's member v
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    // offset = (VLENB / element size) * microIdx; presumably for the code below.
+    constexpr uint16_t bit_offset = VLENB / sizeof(ElemType);
+    const uint16_t offset = bit_offset * microIdx;
+
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VMvWholeMacroDeclare {{
+// Macro-op class declaration: only a constructor, no execute() of its own.
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;  // keep base overloads
+};
+
+}};
+
+def template VMvWholeMacroConstructor {{
+// Macro-op constructor: creates simm3 + 1 micro-ops, one per index i.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = _machInst.simm3 + 1;
+    StaticInstPtr microop;
+    // Each micro-op is built with 0 as its second argument and i as its third.
+    for (int i = 0; i < num_microops; ++i) {
+        microop = new %(class_name)sMicro(_machInst, 0, i);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VMvWholeMicroDeclare {{
+// Micro-op class: one source register, one destination register.
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[1];
+    RegId destRegIdxArr[1];
+    bool vm;  // NOTE(review): not assigned in VMvWholeMicroConstructor; unused?
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;  // keep base overloads
+};
+
+}};
+
+def template VMvWholeMicroConstructor {{
+// Micro-op constructor: dest is vd + _microIdx, source is vs2 + _microIdx.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst,
+                               uint8_t _microVl, uint8_t _microIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;  // counters restart at zero and are bumped per operand
+    _numDestRegs = 0;
+    setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _microIdx]);
+    _numTypedDestRegs[VecRegClass]++;
+    setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2 + _microIdx]);
+}
+
+}};
+
+def template VMvWholeMicroExecute {{
+// execute(): runs the generated code once for each of VLEN / 64 values of i.
+Fault
+%(class_name)s::execute(ExecContext* xc, trace::InstRecord* traceData) const
+{
+    // TODO: Check register alignment.
+    // TODO: If vd is equal to vs2 the instruction is an architectural NOP.
+    %(op_decl)s;
+    %(op_rd)s;
+    for (size_t i = 0; i < (VLEN / 64); i++) {  // i is visible to the code
+        %(code)s;
+    }
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorMaskDeclare {{
+// Single-instruction class (no micro-ops): constructor plus execute().
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    RegId srcRegIdxArr[3];
+    RegId destRegIdxArr[1];
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;  // keep base overloads
+};
+
+}};
+
+def template VectorMaskConstructor {{
+// Constructor: only runs the generated register-index setup code.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorMaskExecute {{
+// execute(): faults if vill is set, else runs the generated op code.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu = uint8_t;  // fixed type, independent of ElemType
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    // TODO: remove it (presumably the old-vd copy below; confirm)
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
+
+def template VectorMaskDecodeBlock {{
+// Always instantiates with uint8_t as ElemType; no dispatch on vsew here.
+return new %(class_name)s<uint8_t>(machInst);
+
+}};
+
+def template VectorNonSplitDeclare {{
+// Single-instruction class (no micro-ops): constructor plus execute().
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    RegId srcRegIdxArr[2];
+    RegId destRegIdxArr[1];
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;  // keep base overloads
+};
+
+}};
+
+def template VectorNonSplitConstructor {{
+// Constructor: generated operand setup followed by the set_vm_idx snippet.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    %(set_vm_idx)s;
+}
+
+}};
+
+def template VectorIntNonSplitExecute {{
+// execute(): faults if vill is set, else runs the generated op code.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                    trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorFloatNonSplitExecute {{
+// execute(): faults if vill is set, else runs the generated op code.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                    trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu = decltype(et::v);  // type of ElemType's member v
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorReduceMacroDeclare {{
+// Macro-op class declaration: only a constructor, no execute() of its own.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;  // keep base overloads
+};
+
+}};
+
+def template VectorReduceMacroConstructor {{
+// Macro-op constructor: expands the instruction into its micro-op list.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;  // elements not yet given to a micro-op
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // micro_vl == 0: push a single nop; the loop below then never runs.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst, micro_vl, i);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorReduceMicroDeclare {{
+// Micro-op class: built from (machInst, microVl, microIdx); has execute().
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs2, vs1, vd, vm
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst,
+                   uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;  // keep base overloads
+};
+
+}};
+
+def template VectorReduceMicroConstructor {{
+// Micro-op constructor: stores vm, then runs the generated operand setup.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+                                         uint8_t _microVl, uint8_t _microIdx)
+: %(base_class)s("%(mnemonic)s", _machInst,
+                 %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;  // both counters are zeroed ahead of the generated setup
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorReduceIntMicroExecute {{
+// execute(): defines reduce_loop, then runs the generated op code.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    %(type_def)s;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    // Seed is Vs1[0] for micro-op 0, else old_Vd[0]; inactive elems skipped.
+    auto reduce_loop =
+        [&, this](const auto& f, const auto* _, const auto* vs2) {
+            ElemType microop_result = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
+            for (uint32_t i = 0; i < this->microVl; i++) {
+                uint32_t ei = i + vtype_VLMAX(vtype, true) * this->microIdx;
+                if (this->vm || elem_mask(v0, ei)) {
+                    microop_result = f(microop_result, Vs2[i]);
+                }
+            }
+            return microop_result;
+        };
+    // NOTE(review): the lambda's 2nd and 3rd params are unused; it reads Vs2.
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorReduceFloatMicroExecute {{
+// execute(): seeds Vd[0], defines reduce_loop, runs the generated op code.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    %(type_def)s;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    // Seed the accumulator: Vs1[0] on micro-op 0, old_Vd[0] afterwards.
+    Vd[0] = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
+    // Fold f over elements that are unmasked (vm) or enabled in v0.
+    auto reduce_loop =
+        [&, this](const auto& f, const auto* _, const auto* vs2) {
+            vu tmp_val = Vd[0];
+            for (uint32_t i = 0; i < this->microVl; i++) {
+                uint32_t ei = i + vtype_VLMAX(vtype, true) * this->microIdx;
+                if (this->vm || elem_mask(v0, ei)) {
+                    tmp_val = f(tmp_val, Vs2[i]).v;
+                }
+            }
+            return tmp_val;
+        };
+    // NOTE(review): the lambda's 2nd and 3rd params are unused; it reads Vs2.
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorReduceFloatWideningMicroExecute {{
+// execute(): seeds Vd[0], defines reduce_loop, runs the generated op code.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    %(type_def)s;
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    // Seed the accumulator: Vs1[0] on micro-op 0, old_Vd[0] afterwards.
+    Vd[0] = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
+    // Fold f over elements that are unmasked (vm) or enabled in v0.
+    auto reduce_loop =
+        [&, this](const auto& f, const auto* _, const auto* vs2) {
+            vwu tmp_val = Vd[0];
+            for (uint32_t i = 0; i < this->microVl; i++) {
+                uint32_t ei = i + vtype_VLMAX(vtype, true) * this->microIdx;
+                if (this->vm || elem_mask(v0, ei)) {
+                    tmp_val = f(tmp_val, Vs2[i]).v;
+                }
+            }
+            return tmp_val;
+        };
+    // NOTE(review): the lambda's 2nd and 3rd params are unused; it reads Vs2.
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorGatherMacroDeclare {{
+// Macro-op class declaration, templated on element and index types.
+template<typename ElemType, typename IndexType>
+class %(class_name)s : public %(base_class)s{
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;  // keep base overloads
+};
+
+}};
+
+def template VectorGatherMacroConstructor {{
+// Macro-op constructor: emits vs2_vregs micro-ops per chunk of elements.
+template<typename ElemType, typename IndexType>
+%(class_name)s<ElemType, IndexType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    constexpr uint32_t vd_eewb = sizeof(ElemType);
+    constexpr uint32_t vs2_eewb = sizeof(ElemType);
+    constexpr uint32_t vs1_eewb = sizeof(IndexType);
+    constexpr bool vs1_split = vd_eewb > vs1_eewb;
+    const int8_t lmul = vtype_vlmul(vtype);  // signed; used as a shift below
+    const int8_t vs1_emul = lmul +
+        (vs1_split ? -(vs2_eewb / vs1_eewb) : vs1_eewb / vs2_eewb);
+    const uint8_t vs2_vregs = lmul < 0 ? 1 : 1 << lmul;
+    const uint8_t vs1_vregs = vs1_emul < 0 ? 1 : 1 << vs1_emul;
+    const uint8_t vd_vregs = vs2_vregs;
+    const int32_t micro_vlmax = VLENB / std::max(vd_eewb, vs1_eewb);
+    int32_t remaining_vl = this->vl;
+    int32_t micro_vl = std::min(remaining_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // NOTE(review): vs1_emul adds the size ratio, not its log2 - confirm.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (uint8_t i = 0; i < std::max(vs1_vregs, vd_vregs) && micro_vl > 0;
+            i++) {
+        for (uint8_t j = 0; j < vs2_vregs; j++) {
+            microop = new %(class_name)sMicro<ElemType, IndexType>(
+                _machInst, micro_vl, i * vs2_vregs + j);
+            microop->setDelayedCommit();
+            this->microops.push_back(microop);
+        }
+        micro_vl = std::min(remaining_vl -= micro_vlmax, micro_vlmax);
+    }
+    // Mark the first and last micro-op of the sequence.
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorGatherMicroDeclare {{
+// Micro-op class: built from (machInst, microVl, microIdx); has execute().
+template<typename ElemType, typename IndexType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs2, vs1, vd, vm
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst,
+                   uint8_t _microVl, uint8_t _microIdx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;  // keep base overloads
+};
+
+}};
+
+def template VectorGatherMicroConstructor {{
+// Micro-op constructor: derives vs2/vs1/vd indices from _microIdx.
+template<typename ElemType, typename IndexType>
+%(class_name)s<ElemType, IndexType>::%(class_name)s(ExtMachInst _machInst,
+    uint8_t _microVl, uint8_t _microIdx)
+: %(base_class)s("%(mnemonic)s", _machInst,
+                 %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;  // both counters are zeroed ahead of the generated setup
+    _numDestRegs = 0;
+    [[maybe_unused]] constexpr uint32_t vd_eewb = sizeof(ElemType);
+    [[maybe_unused]] constexpr uint32_t vs2_eewb = sizeof(ElemType);
+    [[maybe_unused]] constexpr uint32_t vs1_eewb = sizeof(IndexType);
+    constexpr uint8_t vs1_split_num = (vd_eewb + vs1_eewb - 1) / vs1_eewb;
+    constexpr uint8_t vd_split_num = (vs1_eewb + vd_eewb - 1) / vd_eewb;
+    const int8_t lmul = vtype_vlmul(vtype);  // signed; used as a shift below
+    const uint8_t vs2_vregs = lmul < 0 ? 1 : 1 << lmul;
+    [[maybe_unused]] const uint8_t vs2_idx = _microIdx % vs2_vregs;
+    [[maybe_unused]] const uint8_t vs1_idx =
+        _microIdx / vs2_vregs / vs1_split_num;
+    [[maybe_unused]] const uint8_t vd_idx =
+        _microIdx / vs2_vregs / vd_split_num;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorGatherMicroExecute {{
+// execute(): faults if vill is set, else runs the generated op code.
+template <typename ElemType, typename IndexType>
+Fault
+%(class_name)s<ElemType, IndexType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    [[maybe_unused]] constexpr size_t sew = sizeof(vu) * 8;
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    const uint32_t vlmax = vtype_VLMAX(vtype);
+    constexpr uint8_t vd_eewb = sizeof(ElemType);
+    constexpr uint8_t vs1_eewb = sizeof(IndexType);
+    constexpr uint8_t vs2_eewb = sizeof(ElemType);
+    constexpr uint8_t vs1_split_num = (vd_eewb + vs1_eewb - 1) / vs1_eewb;
+    constexpr uint8_t vd_split_num = (vs1_eewb + vd_eewb - 1) / vd_eewb;
+    [[maybe_unused]] constexpr uint16_t vd_elems = VLENB / vd_eewb;
+    [[maybe_unused]] constexpr uint16_t vs1_elems = VLENB / vs1_eewb;
+    [[maybe_unused]] constexpr uint16_t vs2_elems = VLENB / vs2_eewb;
+    [[maybe_unused]] const int8_t lmul = vtype_vlmul(vtype);
+    [[maybe_unused]] const uint8_t vs2_vregs = lmul < 0 ? 1 : 1 << lmul;
+    [[maybe_unused]] const uint8_t vs2_idx = microIdx % vs2_vregs;
+    [[maybe_unused]] const uint8_t vs1_idx =
+        microIdx / vs2_vregs / vs1_split_num;
+    [[maybe_unused]] const uint8_t vd_idx =
+        microIdx / vs2_vregs / vd_split_num;
+    [[maybe_unused]] const uint16_t vs1_bias =
+        vs1_elems * (vd_idx % vs1_split_num) / vs1_split_num;
+    [[maybe_unused]] const uint16_t vd_bias =
+        vd_elems * (vs1_idx % vd_split_num) / vd_split_num;
+    // The locals above are presumably consumed by the generated code below.
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
+
+def template VectorGatherDecodeBlock {{
+// Selects ElemType from vsew: 0 -> 8-bit, 1 -> 16, 2 -> 32, 3 -> 64 bits.
+switch(machInst.vtype8.vsew) {
+    case 0b000: {
+        using elem_type [[maybe_unused]] = uint8_t;
+        return new %(class_name)s<uint8_t, %(idx_type)s>(machInst);
+    }
+    case 0b001: {
+        using elem_type [[maybe_unused]] = uint16_t;
+        return new %(class_name)s<uint16_t, %(idx_type)s>(machInst);
+    }
+    case 0b010: {
+        using elem_type [[maybe_unused]] = uint32_t;
+        return new %(class_name)s<uint32_t, %(idx_type)s>(machInst);
+    }
+    case 0b011: {
+        using elem_type [[maybe_unused]] = uint64_t;
+        return new %(class_name)s<uint64_t, %(idx_type)s>(machInst);
+    }
+    default: GEM5_UNREACHABLE;  // any other vsew encoding
+}
+
+}};
+
+def template VectorIntVxsatMacroDeclare {{
+// Macro-op class declaration; additionally owns a bool vxsat member.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s{
+private:
+    %(reg_idx_arr_decl)s;
+    bool vxsat = false;  // initially false
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;  // keep base overloads
+};
+
+}};
+
+def template VectorIntVxsatMacroConstructor {{
+// Macro-op constructor: compute micro-ops plus a trailing VxsatMicroInst.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;  // elements not yet given to a micro-op
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // micro_vl == 0: push a single nop; the loop below then never runs.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro<ElemType>(_machInst,
+            micro_vl, i, &vxsat);
+        microop->setDelayedCommit();
+        this->microops.push_back(microop);
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    // Final micro-op also gets &vxsat; flagged serializing, non-speculative.
+    microop = new VxsatMicroInst(&vxsat, _machInst);
+    microop->setFlag(StaticInst::IsSerializeAfter);
+    microop->setFlag(StaticInst::IsNonSpeculative);
+    this->microops.push_back(microop);
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+}};
+
+def template VectorIntVxsatMicroDeclare {{
+// Micro-op class: like the other micro-ops, plus a bool* vxsatptr member.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+    bool* vxsatptr;  // supplied by the caller of the constructor
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+                   uint8_t _microIdx, bool* vxsatptr);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;  // keep base overloads
+};
+
+}};
+
+def template VectorIntVxsatMicroConstructor {{
+// Micro-op constructor: stores vm and vxsatptr, then sets up operands.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+    uint8_t _microVl, uint8_t _microIdx, bool* vxsatptr)
+    : %(base_class)s("%(mnemonic)s", _machInst,
+                     %(op_class)s, _microVl, _microIdx)
+{
+    this->vm = _machInst.vm;
+    this->vxsatptr = vxsatptr;  // the pointer is stored as given, not copied
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;  // both counters are zeroed ahead of the generated setup
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorReduceIntWideningMicroExecute {{
+// execute(): seeds Vd[0], defines reduce_loop, runs the generated op code.
+template <typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+    using vwu [[maybe_unused]] = typename double_width<vu>::type;
+    using vwi [[maybe_unused]] = typename double_width<vi>::type;
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    // Seed the accumulator: Vs1[0] on micro-op 0, old_Vd[0] afterwards.
+    Vd[0] = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
+    // Fold f over elements that are unmasked (vm) or enabled in v0.
+    auto reduce_loop =
+        [&, this](const auto& f, const auto* _, const auto* vs2) {
+            vwu tmp_val = Vd[0];
+            for (uint32_t i = 0; i < this->microVl; i++) {
+                uint32_t ei = i + vtype_VLMAX(vtype, true) * this->microIdx;
+                if (this->vm || elem_mask(v0, ei)) {
+                    tmp_val = f(tmp_val, Vs2[i]);
+                }
+            }
+            return tmp_val;
+        };
+    // NOTE(review): the lambda's 2nd and 3rd params are unused; it reads Vs2.
+    %(code)s;
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VectorSlideMacroDeclare {{
+// Macro-op class declaration: only a constructor, no execute() of its own.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s {
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;  // keep base overloads
+};
+
+}};
+
+def template VectorSlideUpMacroConstructor {{
+// Macro-op constructor: one micro-op per index pair (i, j) with j <= i.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;  // elements not yet given to a micro-op
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // micro_vl == 0: push a single nop; the loops below then never run.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    // TODO: statically filter out useless micro-ops.
+    int micro_idx = 0;  // running micro-op index across both loops
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        for (int j = 0; j <= i; ++j) {
+            microop = new %(class_name)sMicro<ElemType>(
+                _machInst, micro_vl, micro_idx++, i, j);
+            microop->setDelayedCommit();
+            this->microops.push_back(microop);
+        }
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorSlideDownMacroConstructor {{
+// Macro-op constructor: one micro-op per index pair (i, j) with j >= i.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+    const uint32_t num_microops = vtype_regs_per_group(vtype);
+    int32_t tmp_vl = this->vl;  // elements not yet given to a micro-op
+    const int32_t micro_vlmax = vtype_VLMAX(_machInst.vtype8, true);
+    int32_t micro_vl = std::min(tmp_vl, micro_vlmax);
+    StaticInstPtr microop;
+    // micro_vl == 0: push a single nop; the loops below then never run.
+    if (micro_vl == 0) {
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    // TODO: statically filter out useless micro-ops.
+    int micro_idx = 0;  // running micro-op index across both loops
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        for (int j = i; j < num_microops; ++j) {
+            microop = new %(class_name)sMicro<ElemType>(
+                _machInst, micro_vl, micro_idx++, i, j);
+            microop->setDelayedCommit();
+            this->microops.push_back(microop);
+        }
+        micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
+    }
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VectorSlideMicroDeclare {{
+// Micro-op class: constructor additionally takes _vdIdx and _vs2Idx.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // vs2, vs1, vs3(old_vd), vm for *.vv, *.vx
+    // vs2, (old_vd), vm for *.vi
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+    bool vm;
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl,
+        uint8_t _microIdx, uint8_t _vdIdx, uint8_t _vs2Idx);
+    Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
+    using %(base_class)s::generateDisassembly;  // keep base overloads
+};
+
+}};
+
+def template VectorSlideMicroConstructor {{
+// Micro-op constructor: forwards _vdIdx/_vs2Idx to the base, stores vm.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+        uint8_t _microVl, uint8_t _microIdx, uint8_t _vdIdx, uint8_t _vs2Idx)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s, _microVl,
+        _microIdx, _vdIdx, _vs2Idx)
+{
+    this->vm = _machInst.vm;
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;  // both counters are zeroed ahead of the generated setup
+    _numDestRegs = 0;
+    %(set_dest_reg_idx)s;
+    %(set_src_reg_idx)s;
+}
+
+}};
+
+def template VectorSlideMicroExecute {{
+// execute(): faults if vill is set, else runs the generated op code.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using vu [[maybe_unused]] = std::make_unsigned_t<ElemType>;
+    using vi [[maybe_unused]] = std::make_signed_t<ElemType>;
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    [[maybe_unused]] const uint32_t vlmax = vtype_VLMAX(vtype);
+    // vlmax is declared ahead of the generated snippets, which may read it.
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
+
+def template VectorFloatSlideMicroExecute {{
+// execute(): faults if vill is set, else runs the generated op code.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext* xc,
+                                  trace::InstRecord* traceData) const
+{
+    using et = ElemType;
+    using vu = decltype(et::v);  // type of ElemType's member v
+
+    if (machInst.vill)
+        return std::make_shared<IllegalInstFault>("VILL is set", machInst);
+    [[maybe_unused]] const uint32_t vlmax = vtype_VLMAX(vtype);
+    // vlmax is declared ahead of the generated snippets, which may read it.
+    %(op_decl)s;
+    %(op_rd)s;
+    %(vm_decl_rd)s;
+    %(copy_old_vd)s;
+    %(code)s;
+    %(op_wb)s;
+
+    return NoFault;
+}
+
+}};
diff --git a/src/arch/riscv/isa/templates/vector_mem.isa b/src/arch/riscv/isa/templates/vector_mem.isa
new file mode 100644
index 0000000000..f8be1e555b
--- /dev/null
+++ b/src/arch/riscv/isa/templates/vector_mem.isa
@@ -0,0 +1,1377 @@
+// -*- mode:c++ -*-
+
+// Copyright (c) 2022 PLCT Lab
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met: redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer;
+// redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution;
+// neither the name of the copyright holders nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+def template VMemMacroDeclare {{
+// Class declaration of a vector memory macro instruction.
+class %(class_name)s : public %(base_class)s
+{
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;  // reuse the base disassembly
+};
+
+}};
+
+def template VMemTemplateMacroDeclare {{
+// Class declaration of a vector memory macro instruction templated on ElemType.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    %(reg_idx_arr_decl)s;
+public:
+    %(class_name)s(ExtMachInst _machInst);
+    using %(base_class)s::generateDisassembly;  // reuse the base disassembly
+};
+
+}};
+
+def template VleConstructor {{
+// Macro-op constructor: one load micro-op per micro_vlmax elements of vl.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    const int32_t micro_vlmax = VLEN / width_EEW(_machInst.width);
+    const uint32_t num_microops = (this->vl + micro_vlmax - 1) / micro_vlmax;
+    int32_t remaining_vl = this->vl;
+    int32_t micro_vl = std::min(remaining_vl, micro_vlmax);
+    StaticInstPtr microop;
+
+    if (micro_vl == 0) {  // vl == 0: a single nop keeps the list non-empty
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro(_machInst, micro_vl, i);
+        microop->setDelayedCommit();
+        microop->setFlag(IsLoad);
+        this->microops.push_back(microop);
+        micro_vl = std::min(remaining_vl -= micro_vlmax, micro_vlmax);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VleMicroDeclare {{
+// Micro-op of a contiguous vector load; writes register vd + _microIdx.
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[3];  // rs1, old vd, and v0 when masked
+    RegId destRegIdxArr[1];
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl, uint8_t _microIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s, _microVl,
+                     _microIdx)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _microIdx]);
+        _numTypedDestRegs[VecRegClass]++;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _microIdx]);
+        if (!_machInst.vm) {  // masked: v0 is appended as the last source
+            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+        }
+    }
+
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                      trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+
+};
+
+}};
+
+def template VleMicroExecute {{
+// Single-call path: readMem of microVl elements at EA, then memacc per element.
+Fault
+%(class_name)s::execute(ExecContext *xc, trace::InstRecord *traceData) const
+{
+    Addr EA;
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0;  // only assigned when the instruction is masked
+    if(!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    uint32_t mem_size = width_EEW(machInst.width) / 8 * this->microVl;
+    const std::vector<bool> byte_enable(mem_size, true);
+    Fault fault = xc->readMem(EA, Mem.as<uint8_t>(), mem_size, memAccessFlags,
+                              byte_enable);
+    if (fault != NoFault)
+        return fault;
+
+    const size_t micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const size_t micro_elems = VLEN / width_EEW(machInst.width);
+    size_t ei;  // i shifted by micro_vlmax * microIdx
+    for (size_t i = 0; i < micro_elems; i++) {
+        ei = i + micro_vlmax * microIdx;
+        %(memacc_code)s;
+    }
+
+    %(op_wb)s;
+    return fault;
+}
+
+}};
+
+def template VleMicroInitiateAcc {{
+// Starts the memory read of this micro-op; no register is written here.
+Fault
+%(class_name)s::initiateAcc(ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    Addr EA;
+
+    %(op_src_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+
+    uint32_t mem_size = width_EEW(this->machInst.width) / 8 * this->microVl;
+    const std::vector<bool> byte_enable(mem_size, true);  // every byte enabled
+    Fault fault = initiateMemRead(xc, EA, mem_size, memAccessFlags,
+                                  byte_enable);
+    return fault;
+}
+
+}};
+
+def template VleMicroCompleteAcc {{
+// Copies the packet payload into Mem, then runs memacc for every element.
+Fault
+%(class_name)s::completeAcc(PacketPtr pkt, ExecContext *xc,
+                            trace::InstRecord *traceData) const
+{
+    %(op_decl)s;
+    %(op_rd)s;
+
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0;  // only assigned when the instruction is masked
+    if(!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    memcpy(Mem.as<uint8_t>(), pkt->getPtr<uint8_t>(), pkt->getSize());
+
+    const size_t micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const size_t micro_elems = VLEN / width_EEW(machInst.width);
+    size_t ei;  // i shifted by micro_vlmax * microIdx
+    for (size_t i = 0; i < micro_elems; i++) {
+        ei = i + micro_vlmax * microIdx;
+        %(memacc_code)s;
+    }
+
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VseConstructor {{
+// Macro-op constructor: one store micro-op per micro_vlmax elements of vl.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    const int32_t micro_vlmax = VLEN / width_EEW(_machInst.width);
+    const uint32_t num_microops = (this->vl + micro_vlmax - 1) / micro_vlmax;
+    int32_t remaining_vl = this->vl;
+    int32_t micro_vl = std::min(remaining_vl, micro_vlmax);
+
+    StaticInstPtr microop;
+
+    if (micro_vl == 0) {  // vl == 0: a single nop keeps the list non-empty
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
+        microop = new %(class_name)sMicro(_machInst, micro_vl, i);
+        microop->setDelayedCommit();
+        microop->setFlag(IsStore);
+        this->microops.push_back(microop);
+        micro_vl = std::min(remaining_vl -= micro_vlmax, micro_vlmax);
+    }
+
+    this->microops.front()->setFlag(IsFirstMicroop);
+    this->microops.back()->setFlag(IsLastMicroop);
+}
+
+}};
+
+def template VseMicroDeclare {{
+// Micro-op of a contiguous vector store; reads register vs3 + _microIdx.
+class %(class_name)s : public %(base_class)s
+{
+private:
+    RegId srcRegIdxArr[3];  // rs1, vs3, and v0 when masked
+    RegId destRegIdxArr[0];
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl, uint8_t _microIdx)
+        : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
+                         _microVl, _microIdx)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _microIdx]);
+        if (!_machInst.vm) {  // masked: v0 is appended as the last source
+            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+        }
+        this->flags[IsVector] = true;
+        this->flags[IsStore] = true;
+    }
+
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                      trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VseMicroExecute {{
+// Builds a byte-enable mask from the active elements, then calls writeMem.
+Fault
+%(class_name)s::execute(ExecContext *xc, trace::InstRecord *traceData) const
+{
+    Addr EA;
+
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0;  // only assigned when the instruction is masked
+    if(!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+
+    const size_t micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const size_t eewb = width_EEW(machInst.width) / 8;  // bytes per element
+    const size_t mem_size = eewb * microVl;
+    std::vector<bool> byte_enable(mem_size, false);  // nothing enabled yet
+    size_t ei;
+    for (size_t i = 0; i < microVl; i++) {
+        ei = i + micro_vlmax * microIdx;
+        if (machInst.vm || elem_mask(v0, ei)) {
+            %(memacc_code)s;
+            auto it = byte_enable.begin() + i * eewb;
+            std::fill(it, it + eewb, true);  // enable the bytes of element i
+        }
+    }
+
+    Fault fault;
+    fault = xc->writeMem(Mem.as<uint8_t>(), mem_size, EA, memAccessFlags,
+                         nullptr, byte_enable);
+    return fault;
+}
+
+}};
+
+def template VseMicroInitiateAcc {{
+// Same steps as execute(): gather active elements, then issue writeMem.
+Fault
+%(class_name)s::initiateAcc(ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    Addr EA;
+
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0;  // only assigned when the instruction is masked
+    if(!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+
+    const size_t micro_vlmax = vtype_VLMAX(machInst.vtype8, true);
+    const size_t eewb = width_EEW(machInst.width) / 8;  // bytes per element
+    const size_t mem_size = eewb * microVl;
+    std::vector<bool> byte_enable(mem_size, false);  // nothing enabled yet
+    size_t ei;
+    for (size_t i = 0; i < microVl; i++) {
+        ei = i + micro_vlmax * microIdx;
+        if (machInst.vm || elem_mask(v0, ei)) {
+            %(memacc_code)s;
+            auto it = byte_enable.begin() + i * eewb;
+            std::fill(it, it + eewb, true);  // enable the bytes of element i
+        }
+    }
+
+    Fault fault;
+    fault = xc->writeMem(Mem.as<uint8_t>(), mem_size, EA, memAccessFlags,
+                         nullptr, byte_enable);
+    return fault;
+}
+
+}};
+
+def template VseMicroCompleteAcc {{
+// Nothing to do when the store response arrives; always returns NoFault.
+Fault
+%(class_name)s::completeAcc(PacketPtr pkt, ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    return NoFault;
+}
+
+}};
+
+def template VlmConstructor {{
+// Mask load: moves ceil(vl / 8) bytes with a single byte-wide micro-op.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    const uint32_t micro_vlmax = VLEN / width_EEW(_machInst.width);
+    int32_t micro_vl = std::min((this->vl + 7) / 8, micro_vlmax);
+    StaticInstPtr microop;
+
+    if (micro_vl == 0) {  // vl == 0: emit a nop instead of a load
+        microop = new VectorNopMicroInst(_machInst);
+    } else {
+        microop = new Vle8_vMicro(_machInst, micro_vl, 0);
+        microop->setDelayedCommit();
+        microop->setFlag(IsLoad);
+    }
+    this->microops.push_back(microop);
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VsmConstructor {{
+// Mask store: moves ceil(vl / 8) bytes with a single byte-wide micro-op.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    const uint32_t micro_vlmax = VLEN / width_EEW(_machInst.width);
+    int32_t micro_vl = std::min((this->vl + 7) / 8, micro_vlmax);
+
+    StaticInstPtr microop;
+    if (micro_vl == 0) {  // vl == 0: emit a nop instead of a store
+        microop = new VectorNopMicroInst(_machInst);
+    } else {
+        microop = new Vse8_vMicro(_machInst, micro_vl, 0);
+        microop->setDelayedCommit();
+        microop->setFlag(IsStore);
+    }
+    this->microops.push_back(microop);
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VsWholeConstructor {{
+// Whole-register store: one micro-op for each of the nf + 1 registers.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+  : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    size_t NFIELDS = machInst.nf + 1;  // number of micro-ops to create
+    const int32_t micro_vlmax = VLEN / width_EEW(_machInst.width);
+
+    StaticInstPtr microop;
+    for (int i = 0; i < NFIELDS; ++i) {
+        microop = new %(class_name)sMicro(_machInst, micro_vlmax, i);
+        microop->setDelayedCommit();
+        microop->setFlag(IsStore);
+        this->microops.push_back(microop);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VsWholeMicroDeclare {{
+// Micro-op of a whole-register store; reads register vs3 + _microIdx.
+class %(class_name)s: public %(base_class)s
+{
+private:
+    RegId destRegIdxArr[0];
+    RegId srcRegIdxArr[2];  // rs1 and vs3
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl, uint8_t _microIdx)
+        : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
+                         _microVl, _microIdx)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _microIdx]);
+        this->flags[IsVector] = true;
+        this->flags[IsStore] = true;
+    }
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                        trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VsWholeMicroExecute {{
+// Runs memacc for each of the VLENB bytes, then stores Mem via writeMemAtomicLE.
+Fault
+%(class_name)s::execute(ExecContext *xc, trace::InstRecord *traceData) const
+{
+    Addr EA;
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+
+    for (size_t i = 0; i < VLENB; i++) {  // i is a byte index
+        %(memacc_code)s;
+    }
+
+    Fault fault = writeMemAtomicLE(xc, traceData, *(vreg_t::Container*)(&Mem),
+                                   EA, memAccessFlags, nullptr);
+    return fault;
+}
+
+}};
+
+def template VsWholeMicroInitiateAcc {{
+// Runs memacc for each of the VLENB bytes, then stores Mem via writeMemTimingLE.
+Fault
+%(class_name)s::initiateAcc(ExecContext* xc,
+        trace::InstRecord* traceData) const
+{
+    Addr EA;
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+
+    for (size_t i = 0; i < VLENB; i++) {  // i is a byte index
+        %(memacc_code)s;
+    }
+
+    Fault fault = writeMemTimingLE(xc, traceData, *(vreg_t::Container*)(&Mem),
+                                   EA, memAccessFlags, nullptr);
+    return fault;
+}
+
+}};
+
+def template VsWholeMicroCompleteAcc {{
+// Nothing to do when the store response arrives; always returns NoFault.
+Fault
+%(class_name)s::completeAcc(PacketPtr pkt, ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    return NoFault;
+}
+
+}};
+
+def template VlWholeConstructor {{
+// Whole-register load: one micro-op for each of the nf + 1 registers.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    size_t NFIELDS = machInst.nf + 1;  // number of micro-ops to create
+    const int32_t micro_vlmax = VLEN / width_EEW(_machInst.width);
+
+    StaticInstPtr microop;
+    for (int i = 0; i < NFIELDS; ++i) {
+        microop = new %(class_name)sMicro(_machInst, micro_vlmax, i);
+        microop->setDelayedCommit();
+        microop->setFlag(IsLoad);
+        this->microops.push_back(microop);
+    }
+
+    this->microops.front()->setFirstMicroop();
+    this->microops.back()->setLastMicroop();
+}
+
+}};
+
+def template VlWholeMicroDeclare {{
+// Micro-op of a whole-register load; writes register vd + _microIdx.
+class %(class_name)s: public %(base_class)s
+{
+private:
+    RegId destRegIdxArr[1];
+    RegId srcRegIdxArr[1];  // rs1 only
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _microVl, uint8_t _microIdx)
+        : %(base_class)s("%(mnemonic)s_micro", _machInst, %(op_class)s,
+                         _microVl, _microIdx)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _microIdx]);
+        _numTypedDestRegs[VecRegClass]++;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        this->flags[IsVector] = true;
+        this->flags[IsLoad] = true;
+    }
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                        trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VlWholeMicroExecute {{
+// Reads Mem via readMemAtomicLE, then runs memacc once per register element.
+Fault
+%(class_name)s::execute(ExecContext *xc, trace::InstRecord *traceData) const
+{
+    Addr EA;
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+
+    Fault fault = readMemAtomicLE(xc, traceData, EA,
+                                  *(vreg_t::Container*)(&Mem), memAccessFlags);
+    if (fault != NoFault)  // skip the write-back when the read faulted
+        return fault;
+
+    size_t elem_per_reg = VLEN / width_EEW(machInst.width);
+    for (size_t i = 0; i < elem_per_reg; i++) {
+        %(memacc_code)s;
+    }
+
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VlWholeMicroInitiateAcc {{
+// Starts the memory read of this micro-op; no register is written here.
+Fault
+%(class_name)s::initiateAcc(ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    Addr EA;
+    %(op_src_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+
+    Fault fault = initiateMemRead(xc, traceData, EA, Mem, memAccessFlags);
+    return fault;
+}
+
+}};
+
+def template VlWholeMicroCompleteAcc {{
+// Copies the packet payload into Mem, then runs memacc per register element.
+Fault
+%(class_name)s::completeAcc(PacketPtr pkt, ExecContext* xc,
+        trace::InstRecord* traceData) const
+{
+    %(op_decl)s;
+    %(op_rd)s;
+
+    memcpy(Mem.as<uint8_t>(), pkt->getPtr<uint8_t>(), pkt->getSize());
+
+    size_t elem_per_reg = VLEN / width_EEW(machInst.width);
+    for (size_t i = 0; i < elem_per_reg; ++i) {
+        %(memacc_code)s;
+    }
+
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VlStrideConstructor {{
+// Strided load macro-op: creates one load micro-op per element of vl.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    const int32_t num_elems_per_vreg = VLEN / width_EEW(_machInst.width);
+    int32_t remaining_vl = this->vl;
+    // Num of elems in one vreg
+    int32_t micro_vl = std::min(remaining_vl, num_elems_per_vreg);
+    StaticInstPtr microop;
+
+    if (micro_vl == 0) {  // vl == 0: a single nop keeps the list non-empty
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; micro_vl > 0; ++i) {  // i: register, j: element in it
+        for (int j = 0; j < micro_vl; ++j) {
+            microop = new %(class_name)sMicro(machInst, i, j, micro_vl);
+            microop->setFlag(IsDelayedCommit);
+            microop->setFlag(IsLoad);
+            this->microops.push_back(microop);
+        }
+        remaining_vl -= num_elems_per_vreg;
+        micro_vl = std::min(remaining_vl, num_elems_per_vreg);
+    }
+
+    this->microops.front()->setFlag(IsFirstMicroop);
+    this->microops.back()->setFlag(IsLastMicroop);
+    this->flags[IsVector] = true;
+}
+
+}};
+
+def template VlStrideMicroDeclare {{
+// Micro-op of a strided load; targets register vd + _regIdx.
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // rs1, rs2, vd, vm
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _regIdx, uint8_t _microIdx,
+        uint8_t _microVl)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
+        _regIdx, _microIdx, _microVl)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _regIdx]);
+        _numTypedDestRegs[VecRegClass]++;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs2]);
+        // We treat agnostic as undisturbed
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _regIdx]);
+        if (!_machInst.vm) {  // masked: v0 is appended as the last source
+            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+        }
+        this->flags[IsLoad] = true;
+    }
+
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                      trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VlStrideMicroExecute {{
+// Single-call path: reads one element at EA when that element is active.
+Fault
+%(class_name)s::execute(ExecContext *xc, trace::InstRecord *traceData) const
+{
+    Fault fault = NoFault;
+    Addr EA;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    constexpr uint8_t elem_size = sizeof(Vd[0]);
+    %(ea_code)s; // ea_code depends on elem_size
+
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0;  // only assigned when the instruction is masked
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    uint32_t mem_size = elem_size;  // exactly one element is transferred
+    const std::vector<bool> byte_enable(mem_size, true);
+
+    size_t ei = this->regIdx * VLENB / elem_size + this->microIdx;
+    if (machInst.vm || elem_mask(v0, ei)) {
+        fault = xc->readMem(EA, Mem.as<uint8_t>(), mem_size,
+                                memAccessFlags, byte_enable);
+        if (fault != NoFault)
+            return fault;
+        %(memacc_code)s; /* Vd[this->microIdx] = Mem[0]; */
+    }
+
+    %(op_wb)s;
+    return fault;
+}
+
+}};
+
+def template VlStrideMicroInitiateAcc {{
+// Issues the one-element read; byte enables are all false when masked off.
+Fault
+%(class_name)s::initiateAcc(ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    Fault fault = NoFault;
+    Addr EA;
+
+    %(op_src_decl)s;
+    %(op_rd)s;
+    constexpr uint8_t elem_size = sizeof(Vd[0]);
+    %(ea_code)s; // ea_code depends on elem_size
+
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0;  // only assigned when the instruction is masked
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    uint32_t mem_size = elem_size;
+    size_t ei = this->regIdx * VLENB / elem_size + this->microIdx;
+    bool need_load = machInst.vm || elem_mask(v0, ei);
+    const std::vector<bool> byte_enable(mem_size, need_load);
+    fault = initiateMemRead(xc, EA, mem_size, memAccessFlags, byte_enable);
+    return fault;
+}
+
+}};
+
+def template VlStrideMicroCompleteAcc {{
+// Seeds Vd from the old vd value, then writes the loaded element if active.
+Fault
+%(class_name)s::completeAcc(PacketPtr pkt, ExecContext *xc,
+                            trace::InstRecord *traceData) const
+{
+    %(op_decl)s;
+    %(op_rd)s;
+
+    constexpr uint8_t elem_size = sizeof(Vd[0]);
+
+    RiscvISA::vreg_t old_vd;
+    decltype(Vd) old_Vd = nullptr;
+    // We treat agnostic as undisturbed
+    xc->getRegOperand(this, 2, &old_vd);  // source operand 2 into old_vd
+    old_Vd = old_vd.as<std::remove_reference_t<decltype(Vd[0])> >();
+
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0;  // only assigned when the instruction is masked
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    if (microIdx == 0) {  // both branches copy all VLENB bytes of old_Vd
+        // treat vma as vmu
+        // if (machInst.vtype8.vma == 0)
+        memcpy(Vd, old_Vd, microVl * elem_size);
+        // treat vta as vtu
+        // if (machInst.vtype8.vta == 0)
+        memcpy(Vd + microVl, old_Vd + microVl, VLENB - microVl * elem_size);
+    } else {
+        memcpy(Vd, old_Vd, VLENB);
+    }
+
+    size_t ei = this->regIdx * VLENB / sizeof(Vd[0]) + this->microIdx;
+    if (machInst.vm || elem_mask(v0, ei)) {
+        memcpy(Mem.as<uint8_t>(), pkt->getPtr<uint8_t>(), pkt->getSize());
+        %(memacc_code)s; /* Vd[this->microIdx] = Mem[0]; */
+    }
+
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VsStrideConstructor {{
+// Strided store macro-op: creates one store micro-op per element of vl.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    const int32_t num_elems_per_vreg = VLEN / width_EEW(_machInst.width);
+    int32_t remaining_vl = this->vl;
+    // Num of elems in one vreg
+    int32_t micro_vl = std::min(remaining_vl, num_elems_per_vreg);
+    StaticInstPtr microop;
+
+    if (micro_vl == 0) {  // vl == 0: a single nop keeps the list non-empty
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (int i = 0; micro_vl > 0; ++i) {  // i: register, j: element in it
+        for (int j = 0; j < micro_vl; ++j) {
+            microop = new %(class_name)sMicro(machInst, i, j, micro_vl);
+            microop->setFlag(IsDelayedCommit);
+            microop->setFlag(IsStore);
+            this->microops.push_back(microop);
+        }
+        remaining_vl -= num_elems_per_vreg;
+        micro_vl = std::min(remaining_vl, num_elems_per_vreg);
+    }
+
+    this->microops.front()->setFlag(IsFirstMicroop);
+    this->microops.back()->setFlag(IsLastMicroop);
+    this->flags[IsVector] = true;
+}
+
+}};
+
+def template VsStrideMicroDeclare {{
+// Micro-op of a strided store; reads register vs3 + _regIdx.
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // rs1, rs2, vs3, vm
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[0];
+public:
+    %(class_name)s(ExtMachInst _machInst, uint8_t _regIdx, uint8_t _microIdx,
+            uint8_t _microVl)
+        : %(base_class)s("%(mnemonic)s""_micro", _machInst, %(op_class)s,
+            _regIdx, _microIdx, _microVl)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs2]);
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _regIdx]);
+        if (!_machInst.vm) {  // masked: v0 is appended as the last source
+            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+        }
+        this->flags[IsStore] = true;
+    }
+
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                      trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VsStrideMicroExecute {{
+// Single-call path: writes one element to EA when that element is active.
+Fault
+%(class_name)s::execute(ExecContext *xc, trace::InstRecord *traceData) const
+{
+    Fault fault = NoFault;
+    Addr EA;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    constexpr uint8_t elem_size = sizeof(Vs3[0]);
+    %(ea_code)s;
+
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0;  // only assigned when the instruction is masked
+    if(!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    uint32_t mem_size = elem_size;  // exactly one element is transferred
+    const std::vector<bool> byte_enable(mem_size, true);
+
+    size_t ei = this->regIdx * VLENB / elem_size + this->microIdx;
+    if (machInst.vm || elem_mask(v0, ei)) {
+        %(memacc_code)s;
+        fault = xc->writeMem(Mem.as<uint8_t>(), mem_size, EA,
+                             memAccessFlags, nullptr, byte_enable);
+    }
+    return fault;
+}
+
+}};
+
+def template VsStrideMicroInitiateAcc {{
+// Issues the one-element write; nothing is sent when the element is inactive.
+Fault
+%(class_name)s::initiateAcc(ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    Fault fault = NoFault;
+    Addr EA;
+
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0;  // only assigned when the instruction is masked
+    if(!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    %(op_decl)s;
+    %(op_rd)s;
+    constexpr uint8_t elem_size = sizeof(Vs3[0]);
+    %(ea_code)s;
+
+    uint32_t mem_size = elem_size;
+    size_t ei = this->regIdx * VLENB / elem_size + this->microIdx;
+    bool need_store = machInst.vm || elem_mask(v0, ei);
+    if (need_store) {
+        const std::vector<bool> byte_enable(mem_size, need_store);
+        %(memacc_code)s;
+        fault = xc->writeMem(Mem.as<uint8_t>(), mem_size, EA,
+                            memAccessFlags, nullptr, byte_enable);
+    }
+    return fault;
+}
+
+}};
+
+def template VsStrideMicroCompleteAcc {{
+// Nothing to do when the store response arrives; always returns NoFault.
+Fault
+%(class_name)s::completeAcc(PacketPtr pkt, ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    return NoFault;
+}
+
+}};
+
+def template VlIndexConstructor {{
+// Indexed load macro-op: creates one load micro-op per element of vl.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    const uint32_t vd_eewb = sizeof(ElemType);  // bytes per vd element
+    const uint32_t vs2_eewb = width_EEW(_machInst.width) / 8;
+    const uint8_t vs2_split_num = (vd_eewb + vs2_eewb - 1) / vs2_eewb;
+    const uint8_t vd_split_num = (vs2_eewb + vd_eewb - 1) / vd_eewb;
+    const int32_t micro_vlmax = VLENB / std::max(vd_eewb, vs2_eewb);
+    int32_t remaining_vl = this->vl;
+    int32_t micro_vl = std::min(remaining_vl, micro_vlmax);
+    StaticInstPtr microop;
+
+    if (micro_vl == 0) {  // vl == 0: a single nop keeps the list non-empty
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (uint8_t i = 0; micro_vl > 0; i++) {  // i: chunk of micro_vlmax elems
+        for (uint8_t j = 0; j < micro_vl; ++j) {
+            uint8_t vdRegIdx = i / vd_split_num;
+            uint8_t vs2RegIdx = i / vs2_split_num;
+            uint8_t vdElemIdx = j + micro_vlmax * (i % vd_split_num);
+            uint8_t vs2ElemIdx = j + micro_vlmax * (i % vs2_split_num);
+            microop = new %(class_name)sMicro<ElemType>(machInst,
+                vdRegIdx, vdElemIdx, vs2RegIdx, vs2ElemIdx);
+            microop->setFlag(IsDelayedCommit);
+            microop->setFlag(IsLoad);
+            this->microops.push_back(microop);
+        }
+        remaining_vl -= micro_vlmax;
+        micro_vl = std::min(remaining_vl, micro_vlmax);
+    }
+
+    this->microops.front()->setFlag(IsFirstMicroop);
+    this->microops.back()->setFlag(IsLastMicroop);
+    this->flags[IsVector] = true;
+}
+
+}};
+
+def template VlIndexMicroDeclare {{
+// Micro-op of an indexed load; targets register vd + _vdRegIdx.
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s
+{
+private:
+    // rs1, vs2, vd, vm
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[1];
+public:
+    %(class_name)s(ExtMachInst _machInst,
+        uint8_t _vdRegIdx, uint8_t _vdElemIdx,
+        uint8_t _vs2RegIdx, uint8_t _vs2ElemIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
+        _vdRegIdx, _vdElemIdx, _vs2RegIdx, _vs2ElemIdx)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _vdRegIdx]);
+        _numTypedDestRegs[VecRegClass]++;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2 + _vs2RegIdx]);
+        // We treat agnostic as undisturbed
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _vdRegIdx]);
+        if (!_machInst.vm) {  // masked: v0 is appended as the last source
+            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+        }
+        this->flags[IsLoad] = true;
+    }
+
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                      trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VlIndexMicroExecute {{
+// Single-call path: reads one element at EA when that element is active.
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext *xc,
+    trace::InstRecord *traceData)const
+{
+    using vu = std::make_unsigned_t<ElemType>;
+    Fault fault = NoFault;
+    Addr EA;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+    constexpr uint8_t elem_size = sizeof(Vd[0]);
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0;  // only assigned when the instruction is masked
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0);
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    uint32_t mem_size = elem_size;  // exactly one element is transferred
+    const std::vector<bool> byte_enable(mem_size, true);
+
+    size_t ei = this->vdRegIdx * VLENB / elem_size + this->vdElemIdx;
+    if (machInst.vm || elem_mask(v0, ei)) {
+        fault = xc->readMem(EA, Mem.as<uint8_t>(), mem_size,
+                                memAccessFlags, byte_enable);
+        if (fault != NoFault)
+            return fault;
+        %(memacc_code)s; /* Vd[this->vdElemIdx] = Mem[0]; */
+    }
+
+    %(op_wb)s;
+    return fault;
+}
+
+}};
+
+def template VlIndexMicroInitiateAcc {{
+
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::initiateAcc(ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    using vu = std::make_unsigned_t<ElemType>;
+    Fault fault = NoFault;
+    Addr EA;
+
+    %(op_src_decl)s;
+    %(op_rd)s;
+    constexpr uint8_t elem_size = sizeof(Vd[0]); // bytes per vd element
+    %(ea_code)s; // ea_code depends on elem_size
+
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0; // assigned, and only read, when the op is masked (!vm)
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0); // mask: last source
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    uint32_t mem_size = elem_size; // one element is read per micro-op
+    size_t ei = this->vdRegIdx * VLENB / elem_size + this->vdElemIdx;
+    bool need_load = machInst.vm || elem_mask(v0, ei);
+    const std::vector<bool> byte_enable(mem_size, need_load); // all or none
+    fault = initiateMemRead(xc, EA, mem_size, memAccessFlags, byte_enable);
+    return fault;
+}
+
+}};
+
+def template VlIndexMicroCompleteAcc {{
+
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::completeAcc(PacketPtr pkt, ExecContext *xc,
+                            trace::InstRecord *traceData) const
+{
+    using vu = std::make_unsigned_t<ElemType>;
+    %(op_decl)s;
+    %(op_rd)s;
+
+    constexpr uint8_t elem_size = sizeof(Vd[0]); // bytes per vd element
+
+    RiscvISA::vreg_t old_vd;
+    decltype(Vd) old_Vd = nullptr;
+    // We treat agnostic as undisturbed: source 2 is the old vd value
+    xc->getRegOperand(this, 2, &old_vd);
+    old_Vd = old_vd.as<std::remove_reference_t<decltype(Vd[0])> >();
+
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0; // assigned, and only read, when the op is masked (!vm)
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0); // mask: last source
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    memcpy(Vd, old_Vd, VLENB); // start from the old register contents
+
+    size_t ei = this->vdRegIdx * VLENB / elem_size + this->vdElemIdx;
+    if (machInst.vm || elem_mask(v0, ei)) { // unmasked, or mask bit ei set
+        memcpy(Mem.as<uint8_t>(), pkt->getPtr<uint8_t>(), pkt->getSize());
+        %(memacc_code)s; /* Vd[this->vdElemIdx] = Mem[0]; */
+    }
+
+    %(op_wb)s;
+    return NoFault;
+}
+
+}};
+
+def template VsIndexConstructor {{
+
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s)
+{
+    %(set_reg_idx_arr)s;
+    %(constructor)s;
+
+    const uint32_t vs3_eewb = sizeof(ElemType); // vs3 element size, bytes
+    const uint32_t vs2_eewb = width_EEW(_machInst.width) / 8; // vs2, bytes
+    const uint8_t vs2_split_num = (vs3_eewb + vs2_eewb - 1) / vs2_eewb;
+    const uint8_t vs3_split_num = (vs2_eewb + vs3_eewb - 1) / vs3_eewb;
+    const int32_t micro_vlmax = VLENB / std::max(vs3_eewb, vs2_eewb);
+    int32_t remaining_vl = this->vl;
+    int32_t micro_vl = std::min(remaining_vl, micro_vlmax);
+    StaticInstPtr microop;
+
+    if (micro_vl == 0) { // vl == 0: the macro-op is a single nop micro-op
+        microop = new VectorNopMicroInst(_machInst);
+        this->microops.push_back(microop);
+    }
+    for (uint8_t i = 0; micro_vl > 0; i++) { // i: chunk of micro_vlmax elems
+        for (uint8_t j = 0; j < micro_vl; ++j) { // one micro-op per element
+            uint8_t vs3RegIdx = i / vs3_split_num;
+            uint8_t vs2RegIdx = i / vs2_split_num;
+            uint8_t vs3ElemIdx = j + micro_vlmax * (i % vs3_split_num);
+            uint8_t vs2ElemIdx = j + micro_vlmax * (i % vs2_split_num);
+            microop = new %(class_name)sMicro<ElemType>(machInst,
+                vs3RegIdx, vs3ElemIdx, vs2RegIdx, vs2ElemIdx);
+            microop->setFlag(IsDelayedCommit);
+            microop->setFlag(IsStore);
+            this->microops.push_back(microop);
+        }
+        remaining_vl -= micro_vlmax;
+        micro_vl = std::min(remaining_vl, micro_vlmax); // <= 0 ends the loop
+    }
+
+    this->microops.front()->setFlag(IsFirstMicroop);
+    this->microops.back()->setFlag(IsLastMicroop);
+    this->flags[IsVector] = true;
+}
+
+}};
+
+def template VsIndexMicroDeclare {{
+
+template<typename ElemType>
+class %(class_name)s : public %(base_class)s // indexed-store micro-op
+{
+private:
+    // Sources in order: rs1, vs2, vs3, v0 mask (only when masked)
+    RegId srcRegIdxArr[4];
+    RegId destRegIdxArr[0]; // NOTE(review): zero-length array, GNU extension
+public:
+    %(class_name)s(ExtMachInst _machInst,
+        uint8_t _vs3RegIdx, uint8_t _vs3ElemIdx,
+        uint8_t _vs2RegIdx, uint8_t _vs2ElemIdx)
+    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
+        _vs3RegIdx, _vs3ElemIdx, _vs2RegIdx, _vs2ElemIdx)
+    {
+        %(set_reg_idx_arr)s;
+        _numSrcRegs = 0;
+        _numDestRegs = 0;
+        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2 + _vs2RegIdx]);
+        // vs3 (the store data); agnostic is treated as undisturbed
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _vs3RegIdx]);
+        if (!_machInst.vm) { // masked: v0 becomes the last source
+            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+        }
+        this->flags[IsStore] = true;
+    }
+
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
+    Fault completeAcc(PacketPtr, ExecContext *,
+                      trace::InstRecord *) const override;
+    using %(base_class)s::generateDisassembly;
+};
+
+}};
+
+def template VsIndexMicroExecute {{
+
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::execute(ExecContext *xc,
+    trace::InstRecord *traceData)const
+{
+    using vu = std::make_unsigned_t<ElemType>;
+    Fault fault = NoFault;
+    Addr EA;
+
+    %(op_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+    constexpr uint8_t elem_size = sizeof(Vs3[0]); // bytes per vs3 element
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0; // assigned, and only read, when the op is masked (!vm)
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0); // mask: last source
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    uint32_t mem_size = elem_size; // one element is written per micro-op
+    const std::vector<bool> byte_enable(mem_size, true);
+
+    size_t ei = this->vs3RegIdx * VLENB / elem_size + this->vs3ElemIdx;
+    if (machInst.vm || elem_mask(v0, ei)) { // unmasked, or mask bit ei set
+        %(memacc_code)s; /* Mem[0] = Vs3[this->vs3ElemIdx] */
+        fault = xc->writeMem(Mem.as<uint8_t>(), mem_size, EA,
+                             memAccessFlags, nullptr, byte_enable);
+    }
+    return fault; // NoFault when the element is masked off
+}
+
+}};
+
+def template VsIndexMicroInitiateAcc {{
+
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::initiateAcc(ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    using vu = std::make_unsigned_t<ElemType>;
+    Fault fault = NoFault;
+    Addr EA;
+
+    %(op_src_decl)s;
+    %(op_rd)s;
+    %(ea_code)s;
+    constexpr uint8_t elem_size = sizeof(Vs3[0]); // bytes per vs3 element
+    RiscvISA::vreg_t tmp_v0;
+    uint8_t *v0; // assigned, and only read, when the op is masked (!vm)
+    if (!machInst.vm) {
+        xc->getRegOperand(this, _numSrcRegs-1, &tmp_v0); // mask: last source
+        v0 = tmp_v0.as<uint8_t>();
+    }
+
+    constexpr uint8_t mem_size = elem_size; // one element per micro-op
+    const std::vector<bool> byte_enable(mem_size, true);
+
+    size_t ei = this->vs3RegIdx * VLENB / elem_size + this->vs3ElemIdx;
+    if (machInst.vm || elem_mask(v0, ei)) { // unmasked, or mask bit ei set
+        %(memacc_code)s; /* Mem[0] = Vs3[this->vs3ElemIdx] */
+        fault = xc->writeMem(Mem.as<uint8_t>(), mem_size, EA,
+                             memAccessFlags, nullptr, byte_enable);
+    }
+    return fault; // NoFault when the element is masked off
+}
+
+}};
+
+def template VsIndexMicroCompleteAcc {{
+
+template<typename ElemType>
+Fault
+%(class_name)s<ElemType>::completeAcc(PacketPtr pkt, ExecContext* xc,
+                            trace::InstRecord* traceData) const
+{
+    return NoFault; // no work here: pkt and xc are not used
+}
+
+}};
+
+def template VMemTemplateDecodeBlock {{
+
+switch(machInst.vtype8.vsew) { // pick ElemType from the vsew encoding
+    case 0b000: { // 8-bit elements
+        return new %(class_name)s<uint8_t>(machInst);
+    }
+    case 0b001: { // 16-bit elements
+        return new %(class_name)s<uint16_t>(machInst);
+    }
+    case 0b010: { // 32-bit elements
+        return new %(class_name)s<uint32_t>(machInst);
+    }
+    case 0b011: { // 64-bit elements
+        return new %(class_name)s<uint64_t>(machInst);
+    }
+    default: GEM5_UNREACHABLE; // vsew encodings 0b1xx are not handled
+}
+
+}};
diff --git a/src/arch/riscv/pcstate.hh b/src/arch/riscv/pcstate.hh
index de07145dc3..1c04cb5109 100644
--- a/src/arch/riscv/pcstate.hh
+++ b/src/arch/riscv/pcstate.hh
@@ -58,12 +58,12 @@ class PCState : public GenericISA::UPCState<4>
 {
   private:
     bool _compressed = false;
-    RiscvType _rv_type = RV64;
+    RiscvType _rvType = RV64;
 
   public:
     PCState() = default;
     PCState(const PCState &other) = default;
-    PCState(Addr addr, RiscvType rv_type) : UPCState(addr), _rv_type(rv_type)
+    PCState(Addr addr, RiscvType rvType) : UPCState(addr), _rvType(rvType)
     {
     }
 
@@ -75,14 +75,14 @@ class PCState : public GenericISA::UPCState<4>
         Base::update(other);
         auto &pcstate = other.as<PCState>();
         _compressed = pcstate._compressed;
-        _rv_type = pcstate._rv_type;
+        _rvType = pcstate._rvType;
     }
 
     void compressed(bool c) { _compressed = c; }
     bool compressed() const { return _compressed; }
 
-    void rvType(RiscvType rv_type) { _rv_type = rv_type; }
-    RiscvType rvType() const { return _rv_type; }
+    void rvType(RiscvType rvType) { _rvType = rvType; }
+    RiscvType rvType() const { return _rvType; }
 
     bool
     branching() const override
diff --git a/src/arch/riscv/regs/float.hh b/src/arch/riscv/regs/float.hh
index 4809372070..cca9e1be2f 100644
--- a/src/arch/riscv/regs/float.hh
+++ b/src/arch/riscv/regs/float.hh
@@ -211,6 +211,20 @@ const std::vector<std::string> RegNames = {
 
 } // namespace float_reg
 
+inline float32_t
+fsgnj32(float32_t a, float32_t b, bool n, bool x) {
+    // Top bit from b (inverted if n, XORed with a's if x); the rest from a.
+    const auto sign_src = n ? ~b.v : (x ? (a.v ^ b.v) : b.v);
+    return f32(insertBits(sign_src, 30, 0, a.v));
+}
+
+inline float64_t
+fsgnj64(float64_t a, float64_t b, bool n, bool x) {
+    // Top bit from b (inverted if n, XORed with a's if x); the rest from a.
+    const auto sign_src = n ? ~b.v : (x ? (a.v ^ b.v) : b.v);
+    return f64(insertBits(sign_src, 62, 0, a.v));
+}
+
 } // namespace RiscvISA
 } // namespace gem5
 
diff --git a/src/arch/riscv/regs/misc.hh b/src/arch/riscv/regs/misc.hh
index 5ea3536141..64072c97e2 100644
--- a/src/arch/riscv/regs/misc.hh
+++ b/src/arch/riscv/regs/misc.hh
@@ -191,6 +191,14 @@ enum MiscRegIndex
     MISCREG_FFLAGS,
     MISCREG_FRM,
 
+    MISCREG_VSTART,
+    MISCREG_VXSAT,
+    MISCREG_VXRM,
+    MISCREG_VCSR,
+    MISCREG_VL,
+    MISCREG_VTYPE,
+    MISCREG_VLENB,
+
     // These registers are not in the standard, hence does not exist in the
     // CSRData map. These are mainly used to provide a minimal implementation
     // for non-maskable-interrupt in our simple cpu.
@@ -476,7 +484,15 @@ enum CSRIndex
     CSR_TDATA3 = 0x7A3,
     CSR_DCSR = 0x7B0,
     CSR_DPC = 0x7B1,
-    CSR_DSCRATCH = 0x7B2
+    CSR_DSCRATCH = 0x7B2,
+
+    CSR_VSTART       = 0x008,
+    CSR_VXSAT        = 0x009,
+    CSR_VXRM         = 0x00A,
+    CSR_VCSR         = 0x00F,
+    CSR_VL           = 0xC20,
+    CSR_VTYPE        = 0xC21,
+    CSR_VLENB        = 0xC22
 };
 
 struct CSRMetadata
@@ -718,7 +734,15 @@ const std::unordered_map<int, CSRMetadata> CSRData = {
     {CSR_TDATA3, {"tdata3", MISCREG_TDATA3, rvTypeFlags(RV64, RV32)}},
     {CSR_DCSR, {"dcsr", MISCREG_DCSR, rvTypeFlags(RV64, RV32)}},
     {CSR_DPC, {"dpc", MISCREG_DPC, rvTypeFlags(RV64, RV32)}},
-    {CSR_DSCRATCH, {"dscratch", MISCREG_DSCRATCH, rvTypeFlags(RV64, RV32)}}
+    {CSR_DSCRATCH, {"dscratch", MISCREG_DSCRATCH, rvTypeFlags(RV64, RV32)}},
+    // Vector extension CSRs; names are lower-case like every other entry.
+    {CSR_VSTART, {"vstart", MISCREG_VSTART, rvTypeFlags(RV64, RV32)}},
+    {CSR_VXSAT, {"vxsat", MISCREG_VXSAT, rvTypeFlags(RV64, RV32)}},
+    {CSR_VXRM, {"vxrm", MISCREG_VXRM, rvTypeFlags(RV64, RV32)}},
+    {CSR_VCSR, {"vcsr", MISCREG_VCSR, rvTypeFlags(RV64, RV32)}},
+    {CSR_VL, {"vl", MISCREG_VL, rvTypeFlags(RV64, RV32)}},
+    {CSR_VTYPE, {"vtype", MISCREG_VTYPE, rvTypeFlags(RV64, RV32)}},
+    {CSR_VLENB, {"vlenb", MISCREG_VLENB, rvTypeFlags(RV64, RV32)}}
 };
 
 /**
@@ -816,6 +840,7 @@ const off_t SBE_OFFSET[enums::Num_RiscvType] = {
 const off_t SXL_OFFSET = 34;
 const off_t UXL_OFFSET = 32;
 const off_t FS_OFFSET = 13;
+const off_t VS_OFFSET = 9;
 const off_t FRM_OFFSET = 5;
 
 const RegVal ISA_MXL_MASKS[enums::Num_RiscvType] = {
@@ -853,7 +878,7 @@ const RegVal STATUS_MPRV_MASK = 1ULL << 17;
 const RegVal STATUS_XS_MASK = 3ULL << 15;
 const RegVal STATUS_FS_MASK = 3ULL << FS_OFFSET;
 const RegVal STATUS_MPP_MASK = 3ULL << 11;
-const RegVal STATUS_VS_MASK = 3ULL << 9;
+const RegVal STATUS_VS_MASK = 3ULL << VS_OFFSET;
 const RegVal STATUS_SPP_MASK = 1ULL << 8;
 const RegVal STATUS_MPIE_MASK = 1ULL << 7;
 const RegVal STATUS_SPIE_MASK = 1ULL << 5;
diff --git a/src/arch/riscv/regs/vector.hh b/src/arch/riscv/regs/vector.hh
new file mode 100644
index 0000000000..d722c2d03a
--- /dev/null
+++ b/src/arch/riscv/regs/vector.hh
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 PLCT Lab
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef __ARCH_RISCV_REGS_VECTOR_HH__
+#define __ARCH_RISCV_REGS_VECTOR_HH__
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "arch/generic/vec_pred_reg.hh"
+#include "arch/generic/vec_reg.hh"
+#include "base/bitunion.hh"
+#include "cpu/reg_class.hh"
+#include "debug/VecRegs.hh"
+
+namespace gem5
+{
+
+namespace RiscvISA
+{
+
+constexpr unsigned ELEN = 64; // NOTE(review): presumably max element bits
+constexpr unsigned VLEN = 256; // vector register length in bits
+constexpr unsigned VLENB = VLEN / 8; // vector register length in bytes
+
+using VecRegContainer = gem5::VecRegContainer<VLENB>;
+using vreg_t = VecRegContainer;
+
+const int NumVecStandardRegs = 32; // v0..v31
+const int NumVecInternalRegs = 8; // Used by vector uop (vtmp0..vtmp7)
+const int NumVecRegs = NumVecStandardRegs + NumVecInternalRegs;
+
+const std::vector<std::string> VecRegNames = { // one name per NumVecRegs
+    "v0",   "v1",   "v2",   "v3",   "v4",   "v5",   "v6",   "v7",
+    "v8",   "v9",   "v10",  "v11",  "v12",  "v13",  "v14",  "v15",
+    "v16",  "v17",  "v18",  "v19",  "v20",  "v21",  "v22",  "v23",
+    "v24",  "v25",  "v26",  "v27",  "v28",  "v29",  "v30",  "v31",
+    "vtmp0", "vtmp1", "vtmp2", "vtmp3", "vtmp4", "vtmp5", "vtmp6", "vtmp7"
+};
+
+// Index of the first internal (vtmp) vector register
+const int VecMemInternalReg0 = NumVecStandardRegs;
+
+static inline TypedRegClassOps<RiscvISA::VecRegContainer> vecRegClassOps;
+
+inline constexpr RegClass vecRegClass =
+    RegClass(VecRegClass, VecRegClassName, NumVecRegs, debug::VecRegs).
+        ops(vecRegClassOps).
+        regType<VecRegContainer>();
+
+BitUnion32(VTYPE)
+    Bitfield<31> vill; // NOTE(review): vtype_set_vill() sets bit 63; confirm
+    Bitfield<7, 0> vtype8; // low byte: overlaps vma, vta, vsew and vlmul
+    Bitfield<7> vma;
+    Bitfield<6> vta;
+    Bitfield<5, 3> vsew;
+    Bitfield<2, 0> vlmul;
+EndBitUnion(VTYPE)
+
+} // namespace RiscvISA
+} // namespace gem5
+
+#endif // __ARCH_RISCV_REGS_VECTOR_HH__
diff --git a/src/arch/riscv/utility.hh b/src/arch/riscv/utility.hh
index 5fccc84c79..40054aec0f 100644
--- a/src/arch/riscv/utility.hh
+++ b/src/arch/riscv/utility.hh
@@ -51,6 +51,7 @@
 
 #include "arch/riscv/regs/float.hh"
 #include "arch/riscv/regs/int.hh"
+#include "arch/riscv/regs/vector.hh"
 #include "base/types.hh"
 #include "cpu/reg_class.hh"
 #include "cpu/static_inst.hh"
@@ -130,7 +131,14 @@ registerName(RegId reg)
             return str.str();
         }
         return float_reg::RegNames[reg.index()];
-    } else {
+    } else if (reg.is(VecRegClass)) {
+        if (reg.index() >= NumVecRegs) {
+            std::stringstream str;
+            str << "?? (v" << reg.index() << ')';
+            return str.str();
+        }
+        return VecRegNames[reg.index()];
+    } else  {
         /* It must be an InvalidRegClass, in RISC-V we should treat it as a
          * zero register for the disassembler to work correctly.
          */
@@ -233,6 +241,542 @@ remu(T rs1, T rs2)
     return (rs2 == 0) ? rs1 : rs1 % rs2;
 }
 
+// Vector extension functions
+inline uint64_t
+vtype_SEW(const uint64_t vtype)  // SEW in bits, from vsew = vtype[5:3]
+{
+    return 1 << (3 + bits(vtype, 5, 3));
+}
+
+/*
+* Encode LMUL to lmul as follows:
+*     LMUL    vlmul    lmul
+*      1       000       0
+*      2       001       1
+*      4       010       2
+*      8       011       3
+*      -       100       -
+*     1/8      101      -3
+*     1/4      110      -2
+*     1/2      111      -1
+*
+* then, we can calculate VLMAX = vlen >> (vsew + 3 - lmul)
+* e.g. vlen = 256 bits, SEW = 16, LMUL = 1/8
+*      => VLMAX = vlen >> (1 + 3 - (-3))
+*               = 256 >> 7
+*               = 2
+* Ref: https://github.com/qemu/qemu/blob/5e9d14f2/target/riscv/cpu.h
+*/
+inline uint64_t
+vtype_VLMAX(const uint64_t vtype, const bool per_reg = false)
+{
+    const int64_t vsew = bits(vtype, 5, 3);
+    const int64_t vlmul = (int64_t)sext<3>(bits(vtype, 2, 0));
+    const int64_t lmul = (per_reg && vlmul > 0) ? 0 : vlmul;  // cap at 1
+    return gem5::RiscvISA::VLEN >> (vsew + 3 - lmul);
+}
+
+inline int64_t
+vtype_vlmul(const uint64_t vtype)  // sign-extended vlmul field, vtype[2:0]
+{
+    return static_cast<int64_t>(sext<3>(bits(vtype, 2, 0)));
+}
+
+inline uint64_t
+vtype_regs_per_group(const uint64_t vtype)
+{
+    const auto lmul = static_cast<int64_t>(sext<3>(bits(vtype, 2, 0)));
+    return 1 << (lmul > 0 ? lmul : 0);  // fractional LMUL -> one register
+}
+
+inline void
+vtype_set_vill(uint64_t& vtype)  // vtype := only the top RegVal bit set
+{
+    vtype = (uint64_t)1 << (sizeof(RegVal) * 8 - 1);  // 64-bit 1, not 1UL
+}
+
+inline uint64_t
+width_EEW(uint64_t width)
+{
+    // Handled width encodings: 0b000 -> 8, 0b101 -> 16, 0b110 -> 32,
+    // 0b111 -> 64 (bits); anything else is treated as unreachable.
+    if (width == 0b000)
+        return 8;
+    if (width >= 0b101 && width <= 0b111)
+        return 8 << (width - 0b100);
+    GEM5_UNREACHABLE;
+}
+
+/*
+  *  Spec Section 4.5
+  *  Ref:
+  *  https://github.com/qemu/qemu/blob/c7d773ae/target/riscv/vector_helper.c
+*/
+template<typename T>
+inline int
+elem_mask(const T* vs, const int index)
+{
+    static_assert(std::is_integral_v<T>);
+    constexpr auto word_bits = sizeof(T) * 8;  // mask bits held per T word
+    const int word = index / word_bits, bit = index % word_bits;
+    return (vs[word] >> bit) & 1;
+}
+
+template<typename Type> struct double_width;  // Type -> its 2x-wide type
+template<> struct double_width<uint8_t>     { using type = uint16_t;};
+template<> struct double_width<uint16_t>    { using type = uint32_t;};
+template<> struct double_width<uint32_t>    { using type = uint64_t;};
+template<> struct double_width<int8_t>      { using type = int16_t; };
+template<> struct double_width<int16_t>     { using type = int32_t; };
+template<> struct double_width<int32_t>     { using type = int64_t; };
+template<> struct double_width<float32_t>   { using type = float64_t;};
+
+template<typename Type> struct double_widthf;  // 32-bit int -> float64_t
+template<> struct double_widthf<uint32_t>    { using type = float64_t;};
+template<> struct double_widthf<int32_t>     { using type = float64_t;};
+
+template<typename FloatType, typename IntType = decltype(FloatType::v)> auto
+ftype(IntType a) -> FloatType  // build FloatType from a via f32() / f64()
+{
+    constexpr bool bits32 = std::is_same_v<uint32_t, IntType>;
+    constexpr bool bits64 = std::is_same_v<uint64_t, IntType>;
+    if constexpr (bits32) { return f32(a); }
+    if constexpr (bits64) { return f64(a); }
+    GEM5_UNREACHABLE;  // IntType is neither uint32_t nor uint64_t
+}
+
+// TODO: Consolidate ftype_freg(freg_t a) and ftype(IntType a) into a
+// single function
+template<typename FloatType, typename IntType = decltype(FloatType::v)> auto
+ftype_freg(freg_t a) -> FloatType  // build FloatType from an freg_t
+{
+    constexpr bool bits32 = std::is_same_v<uint32_t, IntType>;
+    constexpr bool bits64 = std::is_same_v<uint64_t, IntType>;
+    if constexpr (bits32) { return f32(a); }
+    if constexpr (bits64) { return f64(a); }
+    GEM5_UNREACHABLE;  // IntType is neither uint32_t nor uint64_t
+}
+
+template<typename FloatType> FloatType
+fadd(FloatType a, FloatType b)  // dispatches to f32_add / f64_add
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32_add(a, b); }
+    if constexpr (is64) { return f64_add(a, b); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FloatType> FloatType
+fsub(FloatType a, FloatType b)  // dispatches to f32_sub / f64_sub
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32_sub(a, b); }
+    if constexpr (is64) { return f64_sub(a, b); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FloatType> FloatType
+fmin(FloatType a, FloatType b)  // dispatches to f32_min / f64_min
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32_min(a, b); }
+    if constexpr (is64) { return f64_min(a, b); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FloatType> FloatType
+fmax(FloatType a, FloatType b)  // dispatches to f32_max / f64_max
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32_max(a, b); }
+    if constexpr (is64) { return f64_max(a, b); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FloatType> FloatType
+fdiv(FloatType a, FloatType b)  // dispatches to f32_div / f64_div
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32_div(a, b); }
+    if constexpr (is64) { return f64_div(a, b); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FloatType> FloatType
+fmul(FloatType a, FloatType b)  // dispatches to f32_mul / f64_mul
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32_mul(a, b); }
+    if constexpr (is64) { return f64_mul(a, b); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FloatType> FloatType
+fsqrt(FloatType a)  // dispatches to f32_sqrt / f64_sqrt
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32_sqrt(a); }
+    if constexpr (is64) { return f64_sqrt(a); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FloatType> FloatType
+frsqrte7(FloatType a)  // dispatches to f32_rsqrte7 / f64_rsqrte7
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32_rsqrte7(a); }
+    if constexpr (is64) { return f64_rsqrte7(a); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FloatType> FloatType
+frecip7(FloatType a)  // dispatches to f32_recip7 / f64_recip7
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32_recip7(a); }
+    if constexpr (is64) { return f64_recip7(a); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FloatType> FloatType
+fclassify(FloatType a)  // f32/f64_classify result wrapped as FloatType
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32(f32_classify(a)); }
+    if constexpr (is64) { return f64(f64_classify(a)); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FloatType> FloatType
+fsgnj(FloatType a, FloatType b, bool n, bool x)  // fsgnj32 / fsgnj64
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return fsgnj32(a, b, n, x); }
+    if constexpr (is64) { return fsgnj64(a, b, n, x); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FloatType> bool
+fle(FloatType a, FloatType b)  // dispatches to f32_le / f64_le
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32_le(a, b); }
+    if constexpr (is64) { return f64_le(a, b); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FloatType> bool
+feq(FloatType a, FloatType b)  // dispatches to f32_eq / f64_eq
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32_eq(a, b); }
+    if constexpr (is64) { return f64_eq(a, b); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FloatType> bool
+flt(FloatType a, FloatType b)  // dispatches to f32_lt / f64_lt
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32_lt(a, b); }
+    if constexpr (is64) { return f64_lt(a, b); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FloatType> FloatType
+fmadd(FloatType a, FloatType b, FloatType c)  // f32_mulAdd / f64_mulAdd
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32_mulAdd(a, b, c); }
+    if constexpr (is64) { return f64_mulAdd(a, b, c); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FloatType> FloatType
+fneg(FloatType a)  // flips the top bit of a.v
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32(a.v ^ uint32_t(mask(31, 31))); }
+    if constexpr (is64) { return f64(a.v ^ mask(63, 63)); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<typename FT, typename WFT = typename double_width<FT>::type> WFT
+fwiden(FT a)  // float32_t -> float64_t via f32_to_f64
+{
+    constexpr bool from_f32 = std::is_same_v<float32_t, FT>;
+    if constexpr (from_f32) { return f32_to_f64(a); }
+    GEM5_UNREACHABLE;  // only float32_t is handled
+}
+
+template<typename FloatType, typename IntType = decltype(FloatType::v)> IntType
+f_to_ui(FloatType a, uint_fast8_t mode)  // f32_to_ui32 / f64_to_ui64
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return f32_to_ui32(a, mode, true); }
+    if constexpr (is64) { return f64_to_ui64(a, mode, true); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<
+    typename FloatType,
+    typename IntType = decltype(double_width<FloatType>::type::v)
+> IntType
+f_to_wui(FloatType a, uint_fast8_t mode)  // widening: f32_to_ui64
+{
+    constexpr bool from_f32 = std::is_same_v<float32_t, FloatType>;
+    if constexpr (from_f32) { return f32_to_ui64(a, mode, true); }
+    GEM5_UNREACHABLE;  // only float32_t is handled
+}
+
+template<
+    typename IntType,
+    typename FloatType = typename double_widthf<IntType>::type
+> IntType
+f_to_nui(FloatType a, uint_fast8_t mode)  // narrowing: f64_to_ui32
+{
+    constexpr bool from_f64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (from_f64) { return f64_to_ui32(a, mode, true); }
+    GEM5_UNREACHABLE;  // only float64_t is handled
+}
+
+template<typename FloatType, typename IntType = decltype(FloatType::v)> IntType
+f_to_i(FloatType a, uint_fast8_t mode)  // f32_to_i32 / f64_to_i64
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return (uint32_t)f32_to_i32(a, mode, true); }
+    if constexpr (is64) { return (uint64_t)f64_to_i64(a, mode, true); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<
+    typename FloatType,
+    typename IntType = decltype(double_width<FloatType>::type::v)
+> IntType
+f_to_wi(FloatType a, uint_fast8_t mode)  // widening: f32_to_i64
+{
+    constexpr bool from_f32 = std::is_same_v<float32_t, FloatType>;
+    if constexpr (from_f32) { return (uint64_t)f32_to_i64(a, mode, true); }
+    GEM5_UNREACHABLE;  // only float32_t is handled
+}
+
+template<
+    typename IntType,
+    typename FloatType = typename double_widthf<IntType>::type
+> IntType
+f_to_ni(FloatType a, uint_fast8_t mode)  // narrowing: f64_to_i32
+{
+    constexpr bool from_f64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (from_f64) { return (uint32_t)f64_to_i32(a, mode, true); }
+    GEM5_UNREACHABLE;  // only float64_t is handled
+}
+
+template<typename FloatType, typename IntType = decltype(FloatType::v)>
+FloatType
+ui_to_f(IntType a)  // dispatches to ui32_to_f32 / ui64_to_f64
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return ui32_to_f32(a); }
+    if constexpr (is64) { return ui64_to_f64(a); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<
+    typename IntType,
+    typename FloatType = typename double_widthf<IntType>::type
+> FloatType
+ui_to_wf(IntType a)  // widening: ui32_to_f64
+{
+    constexpr bool to_f64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (to_f64) { return ui32_to_f64(a); }
+    GEM5_UNREACHABLE;  // only a float64_t result is handled
+}
+
+template<
+    typename FloatType,
+    typename IntType = decltype(double_width<FloatType>::type::v)
+> FloatType
+ui_to_nf(IntType a)  // narrowing: ui64_to_f32
+{
+    constexpr bool to_f32 = std::is_same_v<float32_t, FloatType>;
+    if constexpr (to_f32) { return ui64_to_f32(a); }
+    GEM5_UNREACHABLE;  // only a float32_t result is handled
+}
+
+template<typename FloatType, typename IntType = decltype(FloatType::v)>
+FloatType
+i_to_f(IntType a)  // dispatches to i32_to_f32 / i64_to_f64
+{
+    constexpr bool is32 = std::is_same_v<float32_t, FloatType>;
+    constexpr bool is64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (is32) { return i32_to_f32((int32_t)a); }
+    if constexpr (is64) { return i64_to_f64((int64_t)a); }
+    GEM5_UNREACHABLE;  // any other FloatType is unsupported
+}
+
+template<
+    typename IntType,
+    typename FloatType = typename double_widthf<IntType>::type
+> FloatType
+i_to_wf(IntType a)  // widening: i32_to_f64
+{
+    constexpr bool to_f64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (to_f64) { return i32_to_f64((int32_t)a); }
+    GEM5_UNREACHABLE;  // only a float64_t result is handled
+}
+
+template<
+    typename FloatType,
+    typename IntType = std::make_signed_t<
+        decltype(double_width<FloatType>::type::v)
+    >
+> FloatType
+i_to_nf(IntType a)  // narrowing: i64_to_f32
+{
+    constexpr bool to_f32 = std::is_same_v<float32_t, FloatType>;
+    if constexpr (to_f32) { return i64_to_f32(a); }
+    GEM5_UNREACHABLE;  // only a float32_t result is handled
+}
+
+template<
+    typename FloatType,
+    typename FloatWType = typename double_width<FloatType>::type
+> FloatWType
+f_to_wf(FloatType a)  // widening: f32_to_f64
+{
+    constexpr bool from_f32 = std::is_same_v<float32_t, FloatType>;
+    if constexpr (from_f32) { return f32_to_f64(a); }
+    GEM5_UNREACHABLE;  // only float32_t is handled
+}
+
+template<
+    typename FloatNType,
+    typename FloatType = typename double_width<FloatNType>::type
+> FloatNType
+f_to_nf(FloatType a)  // narrowing: f64_to_f32
+{
+    constexpr bool from_f64 = std::is_same_v<float64_t, FloatType>;
+    if constexpr (from_f64) { return f64_to_f32(a); }
+    GEM5_UNREACHABLE;  // only float64_t is handled
+}
+
+//ref:  https://locklessinc.com/articles/sat_arithmetic/
+template<typename T> T
+sat_add(T x, T y, bool* sat)
+{
+    using UT = std::make_unsigned_t<T>;
+    constexpr int sign_bit = sizeof(T) * 8 - 1;
+    const UT ux = x;
+    const UT uy = y;
+    const UT sum = ux + uy;
+
+    // Saturation value: T's maximum for x >= 0, T's minimum for x < 0.
+    const UT limit = (ux >> sign_bit) + (((UT)0x1 << sign_bit) - 1);
+    // Overflow iff x and y share a sign and the sum's sign differs from it.
+    if ((T) ((limit ^ uy) | ~(uy ^ sum)) >= 0) {
+        *sat = true;  // sticky: never cleared here
+        return limit;
+    }
+    return sum;
+}
+
+template<typename T> T
+sat_sub(T x, T y, bool* sat)
+{
+    using UT = std::make_unsigned_t<T>;
+    constexpr int sign_bit = sizeof(T) * 8 - 1;
+    const UT ux = x;
+    const UT uy = y;
+    const UT diff = ux - uy;
+
+    // Saturation value: T's maximum for x >= 0, T's minimum for x < 0.
+    const UT limit = (ux >> sign_bit) + (((UT)0x1 << sign_bit) - 1);
+    // Overflow iff x and y differ in sign and the result's sign is not x's.
+    if ((T) ((limit ^ uy) & (limit ^ diff)) < 0) {
+        *sat = true;  // sticky: never cleared here
+        return limit;
+    }
+    return diff;
+}
+
+template<typename T> T
+sat_addu(T x, T y, bool* sat)
+{
+    const T sum = x + y;
+    // A wrapped unsigned sum ends up below the first operand.
+    const bool overflow = sum < x;
+
+    // The flag is sticky: an already-set *sat is left alone.
+    if (!*sat)
+        *sat = overflow;
+
+    return sum | -static_cast<int>(overflow);  // all ones on overflow
+}
+
+template<typename T> T
+sat_subu(T x, T y, bool* sat)
+{
+    const T diff = x - y;
+    // A wrapped unsigned difference ends up above the minuend.
+    const bool underflow = diff > x;
+
+    // The flag is sticky: an already-set *sat is left alone.
+    if (!*sat)
+        *sat = underflow;
+
+    // Clamp to zero on underflow, otherwise keep the difference.
+    return diff & -static_cast<int>(!underflow);
+}
+
+/**
+ * Ref:
+ * https://github.com/riscv-software-src/riscv-isa-sim
+ */
+template<typename T> T
+int_rounding(T result, uint8_t xrm, unsigned gb) {
+    const uint64_t lsb = 1ULL << gb;  // 64-bit even where long is 32 bits
+    const uint64_t lsb_half = lsb >> 1;  // weight of bit gb - 1
+    switch (xrm) {
+    case 0 /* RNU */:
+        result += lsb_half;
+        break;
+    case 1 /* RNE */:
+        if ((result & lsb_half) &&
+            ((result & (lsb_half - 1)) || (result & lsb)))
+            result += lsb;
+        break;
+    case 2 /* RDN */:
+        break;  // result is returned unchanged
+    case 3 /* ROD */:
+        if (result & (lsb - 1))  // any bit below gb set: force bit gb to 1
+            result |= lsb;
+        break;
+    default:
+        panic("Invalid xrm value %d", (int)xrm);
+    }
+
+    return result;  // not shifted here; bits below gb are still present
+}
+
 } // namespace RiscvISA
 } // namespace gem5
 
diff --git a/src/arch/x86/X86FsWorkload.py b/src/arch/x86/X86FsWorkload.py
index 294241b51c..277a37988e 100644
--- a/src/arch/x86/X86FsWorkload.py
+++ b/src/arch/x86/X86FsWorkload.py
@@ -65,6 +65,7 @@ class X86FsWorkload(KernelWorkload):
     acpi_description_table_pointer = Param.X86ACPIRSDP(
         X86ACPIRSDP(), "ACPI root description pointer structure"
     )
+    enable_osxsave = Param.Bool(False, "Enable OSXSAVE in CR4 register")
 
 
 class X86FsLinux(X86FsWorkload):
diff --git a/src/arch/x86/X86ISA.py b/src/arch/x86/X86ISA.py
index bb72c415e9..aa48d1aa6e 100644
--- a/src/arch/x86/X86ISA.py
+++ b/src/arch/x86/X86ISA.py
@@ -54,3 +54,73 @@ class X86ISA(BaseISA):
     vendor_string = Param.String(
         "HygonGenuine", "Vendor string for CPUID instruction"
     )
+    name_string = Param.String(
+        "Fake gem5 x86_64 CPU", "Processor name for CPUID instruction"
+    )
+
+    # For the functions that return numerical values we use a vector of ints.
+    # The order of the values is: EAX, EBX, EDX, ECX.
+    #
+    # If the CPU function can take an index, the index value is used as an
+    # offset into the vector and four numerical values are added for each
+    # possible index value. For example, if the function accepts 3 index
+    # values, there are 12 total ints in the vector param. In addition, the
+    # last values for functions which take an index must be all zeros. All
+    # zeros indicates to the KVM cpu / OS that there are no more index values
+    # to iterate over.
+    #
+    # A good resource for these values can be found here:
+    #     https://sandpile.org/x86/cpuid.htm
+    # 0000_0001h
+    FamilyModelStepping = VectorParam.UInt32(
+        [0x00020F51, 0x00000805, 0xEFDBFBFF, 0x00000209],
+        "type/family/model/stepping and feature flags",
+    )
+    # 0000_0004h
+    CacheParams = VectorParam.UInt32(
+        [0x00000000, 0x00000000, 0x00000000, 0x00000000],
+        "cache configuration descriptors",
+    )
+    # 0000_0007h
+    ExtendedFeatures = VectorParam.UInt32(
+        [0x00000000, 0x01800000, 0x00000000, 0x00000000], "feature flags"
+    )
+    # 0000_000Dh - This uses ECX index, so the last entry must be all zeros
+    ExtendedState = VectorParam.UInt32(
+        [
+            0x00000000,
+            0x00000000,
+            0x00000000,
+            0x00000000,
+            0x00000000,
+            0x00000000,
+            0x00000000,
+            0x00000000,
+        ],
+        "extended state enumeration",
+    )
+    # 8000_0001h
+    FamilyModelSteppingBrandFeatures = VectorParam.UInt32(
+        [0x00020F51, 0x00000405, 0xEBD3FBFF, 0x00020001],
+        "family/model/stepping and features flags",
+    )
+    # 8000_0005h
+    L1CacheAndTLB = VectorParam.UInt32(
+        [0xFF08FF08, 0xFF20FF20, 0x40020140, 0x40020140],
+        "L1 cache and L1 TLB configuration descriptors",
+    )
+    # 8000_0006h
+    L2L3CacheAndL2TLB = VectorParam.UInt32(
+        [0x00000000, 0x42004200, 0x00000000, 0x04008140],
+        "L2/L3 cache and L2 TLB configuration descriptors",
+    )
+    # 8000_0007h
+    APMInfo = VectorParam.UInt32(
+        [0x80000018, 0x68747541, 0x69746E65, 0x444D4163],
+        "processor feedback capabilities",
+    )
+    # 8000_0008h
+    LongModeAddressSize = VectorParam.UInt32(
+        [0x00003030, 0x00000000, 0x00000000, 0x00000000],
+        "miscellaneous information",
+    )
diff --git a/src/arch/x86/cpuid.cc b/src/arch/x86/cpuid.cc
index ac4709ce0e..2ce9ec9289 100644
--- a/src/arch/x86/cpuid.cc
+++ b/src/arch/x86/cpuid.cc
@@ -31,162 +31,135 @@
 #include "arch/x86/isa.hh"
 #include "base/bitfield.hh"
 #include "cpu/thread_context.hh"
+#include "debug/X86.hh"
 
 namespace gem5
 {
 
-namespace X86ISA {
-    enum StandardCpuidFunction
-    {
-        VendorAndLargestStdFunc,
-        FamilyModelStepping,
-        CacheAndTLB,
-        SerialNumber,
-        CacheParams,
-        MonitorMwait,
-        ThermalPowerMgmt,
-        ExtendedFeatures,
-        NumStandardCpuidFuncs
-    };
+namespace X86ISA
+{
 
-    enum ExtendedCpuidFunctions
-    {
-        VendorAndLargestExtFunc,
-        FamilyModelSteppingBrandFeatures,
-        NameString1,
-        NameString2,
-        NameString3,
-        L1CacheAndTLB,
-        L2L3CacheAndL2TLB,
-        APMInfo,
-        LongModeAddressSize,
+X86CPUID::X86CPUID(const std::string& vendor, const std::string& name)
+    : vendorString(vendor), nameString(name)
+{
+    fatal_if(vendorString.size() != 12,
+             "CPUID vendor string must be 12 characters\n");
+}
 
-        /*
-         * The following are defined by the spec but not yet implemented
-         */
-/*      // Function 9 is reserved
-        SVMInfo = 10,
-        // Functions 11-24 are reserved
-        TLB1GBPageInfo = 25,
-        PerformanceInfo,*/
+void
+X86CPUID::addStandardFunc(uint32_t func, std::vector<uint32_t> values)
+{
+    capabilities[func] = values;
+}
 
-        NumExtendedCpuidFuncs
-    };
+void
+X86CPUID::addExtendedFunc(uint32_t func, std::vector<uint32_t> values)
+{
+    // Extended functions start at 8000_0000h, but ExtendedCpuidFunctions is
+    // zero-based, so OR that base in to form the key doCpuid looks up.
+    capabilities[func | 0x80000000] = values;
+}
 
-    static const int nameStringSize = 48;
-    static const char nameString[nameStringSize] = "Fake M5 x86_64 CPU";
+bool
+X86CPUID::doCpuid(ThreadContext * tc, uint32_t function, uint32_t index,
+                  CpuidResult &result)
+{
+    constexpr uint32_t ext = 0x80000000;
 
-    uint64_t
-    stringToRegister(const char *str)
-    {
-        uint64_t reg = 0;
-        for (int pos = 3; pos >=0; pos--) {
-            reg <<= 8;
-            reg |= str[pos];
-        }
-        return reg;
-    }
+    DPRINTF(X86, "Calling CPUID function %x with index %d\n", function, index);
 
-    bool
-    doCpuid(ThreadContext * tc, uint32_t function,
-            uint32_t index, CpuidResult &result)
-    {
-        uint16_t family = bits(function, 31, 16);
-        uint16_t funcNum = bits(function, 15, 0);
-        if (family == 0x8000) {
-            // The extended functions
-            switch (funcNum) {
-              case VendorAndLargestExtFunc:
-                {
-                  ISA *isa = dynamic_cast<ISA *>(tc->getIsaPtr());
-                  auto vendor_string = isa->getVendorString();
-                  result = CpuidResult(
-                          0x80000000 + NumExtendedCpuidFuncs - 1,
-                          stringToRegister(vendor_string.c_str()),
-                          stringToRegister(vendor_string.c_str() + 4),
-                          stringToRegister(vendor_string.c_str() + 8));
-                }
-                break;
-              case FamilyModelSteppingBrandFeatures:
-                result = CpuidResult(0x00020f51, 0x00000405,
-                                     0xebd3fbff, 0x00020001);
-                break;
-              case NameString1:
-              case NameString2:
-              case NameString3:
-                {
-                    // Zero fill anything beyond the end of the string. This
-                    // should go away once the string is a vetted parameter.
-                    char cleanName[nameStringSize];
-                    memset(cleanName, '\0', nameStringSize);
-                    strncpy(cleanName, nameString, nameStringSize);
+    // Handle the string-related CPUID functions specially
+    if (function == VendorAndLargestStdFunc) {
+        result = CpuidResult(NumStandardCpuidFuncs - 1,
+                             stringToRegister(vendorString.c_str()),
+                             stringToRegister(vendorString.c_str() + 4),
+                             stringToRegister(vendorString.c_str() + 8));
 
-                    int offset = (funcNum - NameString1) * 16;
-                    assert(nameStringSize >= offset + 16);
-                    result = CpuidResult(
-                            stringToRegister(cleanName + offset + 0),
-                            stringToRegister(cleanName + offset + 4),
-                            stringToRegister(cleanName + offset + 12),
-                            stringToRegister(cleanName + offset + 8));
-                }
-                break;
-              case L1CacheAndTLB:
-                result = CpuidResult(0xff08ff08, 0xff20ff20,
-                                     0x40020140, 0x40020140);
-                break;
-              case L2L3CacheAndL2TLB:
-                result = CpuidResult(0x00000000, 0x42004200,
-                                     0x00000000, 0x04008140);
-                break;
-              case APMInfo:
-                result = CpuidResult(0x80000018, 0x68747541,
-                                     0x69746e65, 0x444d4163);
-                break;
-              case LongModeAddressSize:
-                result = CpuidResult(0x00003030, 0x00000000,
-                                     0x00000000, 0x00000000);
-                break;
-/*            case SVMInfo:
-              case TLB1GBPageInfo:
-              case PerformanceInfo:*/
-              default:
-                warn("x86 cpuid family 0x8000: unimplemented function %u",
-                    funcNum);
-                return false;
-            }
-        } else if (family == 0x0000) {
-            // The standard functions
-            switch (funcNum) {
-              case VendorAndLargestStdFunc:
-                {
-                  ISA *isa = dynamic_cast<ISA *>(tc->getIsaPtr());
-                  auto vendor_string = isa->getVendorString();
-                  result = CpuidResult(
-                          NumStandardCpuidFuncs - 1,
-                          stringToRegister(vendor_string.c_str()),
-                          stringToRegister(vendor_string.c_str() + 4),
-                          stringToRegister(vendor_string.c_str() + 8));
-                }
-                break;
-              case FamilyModelStepping:
-                result = CpuidResult(0x00020f51, 0x00000805,
-                                     0xefdbfbff, 0x00000209);
-                break;
-              case ExtendedFeatures:
-                result = CpuidResult(0x00000000, 0x01800000,
-                                     0x00000000, 0x00000000);
-                break;
-              default:
-                warn("x86 cpuid family 0x0000: unimplemented function %u",
-                    funcNum);
-                return false;
-            }
-        } else {
-            warn("x86 cpuid: unknown family %#x", family);
-            return false;
-        }
+        return true;
+    } else if (function == (ext | VendorAndLargestExtFunc)) {
+        result = CpuidResult(0x80000000 + NumExtendedCpuidFuncs - 1,
+                             stringToRegister(vendorString.c_str()),
+                             stringToRegister(vendorString.c_str() + 4),
+                             stringToRegister(vendorString.c_str() + 8));
+
+        return true;
+    } else if ((function == (ext | NameString1)) ||
+               (function == (ext | NameString2)) ||
+               (function == (ext | NameString3))) {
+        // Zero fill anything beyond the end of the string. This
+        // should go away once the string is a vetted parameter.
+        char cleanName[nameStringSize];
+        memset(cleanName, '\0', nameStringSize);
+        strncpy(cleanName, nameString.c_str(), nameStringSize-1);
+
+        int funcNum = bits(function, 15, 0);
+        int offset = (funcNum - NameString1) * 16;
+        assert(nameStringSize >= offset + 16);
+        result = CpuidResult(
+                stringToRegister(cleanName + offset + 0),
+                stringToRegister(cleanName + offset + 4),
+                stringToRegister(cleanName + offset + 12),
+                stringToRegister(cleanName + offset + 8));
 
         return true;
     }
+
+    // Ignore anything not in the map of supported CPUID functions.
+    // This is checked after the string-related functions as those are not
+    // in the capabilities map.
+    if (!capabilities.count(function)) {
+        return false;
+    }
+    const auto &cap_vec = capabilities[function];
+    // Ignore index values for functions that do not take index values; widen
+    // before scaling so a huge guest-supplied index cannot wrap around.
+    const uint64_t cap_offset =
+        hasSignificantIndex(function) ? uint64_t(index) * 4 : 0;
+
+    // The index is guest-controlled: report a sub-leaf beyond the configured
+    // entries as all zeros instead of reading out of bounds.
+    if (cap_vec.size() < cap_offset + 4) {
+        result = CpuidResult(0, 0, 0, 0);
+    } else {
+        result = CpuidResult(cap_vec[cap_offset + 0], cap_vec[cap_offset + 1],
+                             cap_vec[cap_offset + 2], cap_vec[cap_offset + 3]);
+    }
+    DPRINTF(X86, "CPUID function %x returning (%x, %x, %x, %x)\n",
+            function, result.rax, result.rbx, result.rdx, result.rcx);
+
+    return true;
+}
+
+uint64_t
+X86CPUID::stringToRegister(const char *str)
+{
+    uint64_t reg = 0;
+    for (int pos = 3; pos >= 0; pos--) {
+        // Packs str[0..3] with str[0] lowest; cast avoids sign extension.
+        reg = (reg << 8) | static_cast<unsigned char>(str[pos]);
+    }
+    return reg;
+}
+
+// Tell whether a CPUID function consumes the ECX index as an input AND
+// gem5 provides more than one index value for it.
+bool
+X86CPUID::hasSignificantIndex(uint32_t function)
+{
+    // The upper 16 bits select the family, the lower 16 bits the function
+    // number within that family.
+    const uint16_t cpuid_family = bits(function, 31, 16);
+    const uint16_t func_num = bits(function, 15, 0);
+
+    // Only family 0x0000 has any indexed function.
+    if (cpuid_family != 0x0000) {
+        return false;
+    }
+
+    // Within that family, ExtendedState is the only function treated as
+    // taking an index.
+    return func_num == ExtendedState;
+}
+
 } // namespace X86ISA
 } // namespace gem5
diff --git a/src/arch/x86/cpuid.hh b/src/arch/x86/cpuid.hh
index 5c1a8ccb16..1c932980d2 100644
--- a/src/arch/x86/cpuid.hh
+++ b/src/arch/x86/cpuid.hh
@@ -29,7 +29,10 @@
 #ifndef __ARCH_X86_CPUID_HH__
 #define __ARCH_X86_CPUID_HH__
 
+#include <unordered_map>
+
 #include "base/types.hh"
+#include "params/X86ISA.hh"
 
 namespace gem5
 {
@@ -38,28 +41,74 @@ class ThreadContext;
 
 namespace X86ISA
 {
-    struct CpuidResult
-    {
-        uint64_t rax;
-        uint64_t rbx;
-        uint64_t rcx;
-        uint64_t rdx;
 
-        // These are not in alphebetical order on purpose. The order reflects
-        // how the CPUID orders the registers when it returns results.
-        CpuidResult(uint64_t _rax, uint64_t _rbx,
-                    uint64_t _rdx, uint64_t _rcx) :
-            rax(_rax), rbx(_rbx), rcx(_rcx), rdx(_rdx)
-        {}
+enum StandardCpuidFunction
+{
+    VendorAndLargestStdFunc,  // 0000_0000h
+    FamilyModelStepping,      // 0000_0001h
+    CacheAndTLB,              // 0000_0002h
+    SerialNumber,             // 0000_0003h
+    CacheParams,              // 0000_0004h
+    MonitorMwait,             // 0000_0005h
+    ThermalPowerMgmt,         // 0000_0006h
+    ExtendedFeatures,         // 0000_0007h
+    ExtendedState = 0xD,      // 0000_000Dh; 8h-Ch have no enumerator here
+    NumStandardCpuidFuncs     // one past ExtendedState, i.e. 0xE
+};
 
-        CpuidResult()
-        {}
-    };
+enum ExtendedCpuidFunctions
+{
+    VendorAndLargestExtFunc,           // 8000_0000h
+    FamilyModelSteppingBrandFeatures,  // 8000_0001h
+    NameString1,                       // 8000_0002h
+    NameString2,                       // 8000_0003h
+    NameString3,                       // 8000_0004h
+    L1CacheAndTLB,                     // 8000_0005h
+    L2L3CacheAndL2TLB,                 // 8000_0006h
+    APMInfo,                           // 8000_0007h
+    LongModeAddressSize,               // 8000_0008h
+    NumExtendedCpuidFuncs              // count of the enumerators above (9)
+};
 
-    uint64_t stringToRegister(const char *str);
+constexpr int nameStringSize = 48;
+
+struct CpuidResult
+{
+    uint64_t rax;
+    uint64_t rbx;
+    uint64_t rcx;
+    uint64_t rdx;
+
+    // These are not in alphabetical order on purpose. The order reflects
+    // how the CPUID orders the registers when it returns results.
+    CpuidResult(uint64_t _rax, uint64_t _rbx,
+                uint64_t _rdx, uint64_t _rcx) :
+        rax(_rax), rbx(_rbx), rcx(_rcx), rdx(_rdx)
+    {}
+
+    // Zero-fill so a result no lookup has written to is never garbage.
+    CpuidResult() : rax(0), rbx(0), rcx(0), rdx(0) {}
+};
+
+class X86CPUID
+{
+  public:
+    X86CPUID(const std::string& vendor, const std::string& name);
+
+    void addStandardFunc(uint32_t func, std::vector<uint32_t> values);
+    void addExtendedFunc(uint32_t func, std::vector<uint32_t> values);
 
     bool doCpuid(ThreadContext * tc, uint32_t function,
-            uint32_t index, CpuidResult &result);
+                 uint32_t index, CpuidResult &result);
+    bool hasSignificantIndex(uint32_t function);
+
+  private:
+    const std::string vendorString;
+    const std::string nameString;
+    std::unordered_map<uint32_t, std::vector<uint32_t>> capabilities;
+
+    uint64_t stringToRegister(const char *str);
+};
 
 } // namespace X86ISA
 } // namespace gem5
diff --git a/src/arch/x86/fs_workload.cc b/src/arch/x86/fs_workload.cc
index 1a412380a6..88d7deed68 100644
--- a/src/arch/x86/fs_workload.cc
+++ b/src/arch/x86/fs_workload.cc
@@ -58,7 +58,8 @@ FsWorkload::FsWorkload(const Params &p) : KernelWorkload(p),
     smbiosTable(p.smbios_table),
     mpFloatingPointer(p.intel_mp_pointer),
     mpConfigTable(p.intel_mp_table),
-    rsdp(p.acpi_description_table_pointer)
+    rsdp(p.acpi_description_table_pointer),
+    enable_osxsave(p.enable_osxsave)
 {}
 
 void
@@ -295,6 +296,7 @@ FsWorkload::initState()
     CR4 cr4 = tc->readMiscRegNoEffect(misc_reg::Cr4);
     // Turn on pae.
     cr4.pae = 1;
+    cr4.osxsave = enable_osxsave;
     tc->setMiscReg(misc_reg::Cr4, cr4);
 
     // Point to the page tables.
diff --git a/src/arch/x86/fs_workload.hh b/src/arch/x86/fs_workload.hh
index 9d14f91bb5..81db414fb2 100644
--- a/src/arch/x86/fs_workload.hh
+++ b/src/arch/x86/fs_workload.hh
@@ -106,6 +106,9 @@ class FsWorkload : public KernelWorkload
             Addr &fpSize, Addr &tableSize, Addr table=0);
 
     void writeOutACPITables(Addr begin, Addr &size);
+
+  private:
+    bool enable_osxsave;
 };
 
 } // namespace X86ISA
diff --git a/src/arch/x86/isa.cc b/src/arch/x86/isa.cc
index 31efae3a43..9e6082a268 100644
--- a/src/arch/x86/isa.cc
+++ b/src/arch/x86/isa.cc
@@ -151,10 +151,20 @@ RegClass matRegClass(MatRegClass, MatRegClassName, 1, debug::MatRegs);
 
 } // anonymous namespace
 
-ISA::ISA(const X86ISAParams &p) : BaseISA(p), vendorString(p.vendor_string)
+ISA::ISA(const X86ISAParams &p)
+    : BaseISA(p), cpuid(new X86CPUID(p.vendor_string, p.name_string))
 {
-    fatal_if(vendorString.size() != 12,
-             "CPUID vendor string must be 12 characters\n");
+    cpuid->addStandardFunc(FamilyModelStepping, p.FamilyModelStepping);
+    cpuid->addStandardFunc(CacheParams, p.CacheParams);
+    cpuid->addStandardFunc(ExtendedFeatures, p.ExtendedFeatures);
+    cpuid->addStandardFunc(ExtendedState, p.ExtendedState);
+
+    cpuid->addExtendedFunc(FamilyModelSteppingBrandFeatures,
+                          p.FamilyModelSteppingBrandFeatures);
+    cpuid->addExtendedFunc(L1CacheAndTLB, p.L1CacheAndTLB);
+    cpuid->addExtendedFunc(L2L3CacheAndL2TLB, p.L2L3CacheAndL2TLB);
+    cpuid->addExtendedFunc(APMInfo, p.APMInfo);
+    cpuid->addExtendedFunc(LongModeAddressSize, p.LongModeAddressSize);
 
     _regClasses.push_back(&flatIntRegClass);
     _regClasses.push_back(&flatFloatRegClass);
diff --git a/src/arch/x86/isa.hh b/src/arch/x86/isa.hh
index f7ae210f96..9c6dcf0921 100644
--- a/src/arch/x86/isa.hh
+++ b/src/arch/x86/isa.hh
@@ -33,6 +33,7 @@
 #include <string>
 
 #include "arch/generic/isa.hh"
+#include "arch/x86/cpuid.hh"
 #include "arch/x86/pcstate.hh"
 #include "arch/x86/regs/ccr.hh"
 #include "arch/x86/regs/float.hh"
@@ -93,6 +94,8 @@ class ISA : public BaseISA
     void setThreadContext(ThreadContext *_tc) override;
 
     std::string getVendorString() const;
+
+    std::unique_ptr<X86CPUID> cpuid;
 };
 
 } // namespace X86ISA
diff --git a/src/arch/x86/isa/decoder/two_byte_opcodes.isa b/src/arch/x86/isa/decoder/two_byte_opcodes.isa
index 38937cb3e2..dac5706a06 100644
--- a/src/arch/x86/isa/decoder/two_byte_opcodes.isa
+++ b/src/arch/x86/isa/decoder/two_byte_opcodes.isa
@@ -690,8 +690,9 @@
             }
             0x2: CPUIDInst::CPUID({{
                 CpuidResult result;
-                bool success = doCpuid(xc->tcBase(), bits(Rax, 31, 0),
-                    bits(Rcx, 31, 0), result);
+                ISA *isa = dynamic_cast<ISA *>(xc->tcBase()->getIsaPtr());
+                bool success = isa->cpuid->doCpuid(xc->tcBase(),
+                    bits(Rax, 31, 0), bits(Rcx, 31, 0), result);
                 if (success) {
                     Rax = result.rax;
                     Rbx = result.rbx;
diff --git a/src/arch/x86/isa/includes.isa b/src/arch/x86/isa/includes.isa
index 6fc5f448a0..9445f2032b 100644
--- a/src/arch/x86/isa/includes.isa
+++ b/src/arch/x86/isa/includes.isa
@@ -63,6 +63,7 @@ output header {{
 #include "arch/x86/insts/microregop.hh"
 #include "arch/x86/insts/microspecop.hh"
 #include "arch/x86/insts/static_inst.hh"
+#include "arch/x86/isa.hh"
 #include "arch/x86/regs/ccr.hh"
 #include "arch/x86/regs/int.hh"
 #include "arch/x86/regs/misc.hh"
diff --git a/src/arch/x86/kvm/x86_cpu.cc b/src/arch/x86/kvm/x86_cpu.cc
index 7faa9159ab..e1c1b0dfc0 100644
--- a/src/arch/x86/kvm/x86_cpu.cc
+++ b/src/arch/x86/kvm/x86_cpu.cc
@@ -37,6 +37,7 @@
 #include "arch/x86/cpuid.hh"
 #include "arch/x86/faults.hh"
 #include "arch/x86/interrupts.hh"
+#include "arch/x86/isa.hh"
 #include "arch/x86/regs/float.hh"
 #include "arch/x86/regs/int.hh"
 #include "arch/x86/regs/msr.hh"
@@ -73,6 +74,13 @@ using namespace X86ISA;
 // data) is used to indicate that a segment has been accessed.
 #define SEG_TYPE_BIT_ACCESSED 1
 
+// Some Linux distros (e.g., RHEL7) define the KVM macros using "BIT" but do
+// not include the header that defines BIT, so define it here in that case.
+#ifndef BIT
+#define BIT(nr)         (1UL << (nr))
+#endif
+
+
 struct GEM5_PACKED FXSave
 {
     uint16_t fcw;
@@ -1419,12 +1427,12 @@ X86KvmCPU::ioctlRun()
 
 static struct kvm_cpuid_entry2
 makeKvmCpuid(uint32_t function, uint32_t index,
-             CpuidResult &result)
+             CpuidResult &result, uint32_t flags = 0)
 {
     struct kvm_cpuid_entry2 e;
     e.function = function;
     e.index = index;
-    e.flags = 0;
+    e.flags = flags;
     e.eax = (uint32_t)result.rax;
     e.ebx = (uint32_t)result.rbx;
     e.ecx = (uint32_t)result.rcx;
@@ -1437,33 +1445,74 @@ void
 X86KvmCPU::updateCPUID()
 {
     Kvm::CPUIDVector m5_supported;
-
-    /* TODO: We currently don't support any of the functions that
-     * iterate through data structures in the CPU using an index. It's
-     * currently not a problem since M5 doesn't expose any of them at
-     * the moment.
-     */
+    X86ISA::ISA *isa = dynamic_cast<X86ISA::ISA *>(tc->getIsaPtr());
 
     /* Basic features */
     CpuidResult func0;
-    X86ISA::doCpuid(tc, 0x0, 0, func0);
+    isa->cpuid->doCpuid(tc, 0x0, 0, func0);
     for (uint32_t function = 0; function <= func0.rax; ++function) {
         CpuidResult cpuid;
         uint32_t idx(0);
 
-        X86ISA::doCpuid(tc, function, idx, cpuid);
-        m5_supported.push_back(makeKvmCpuid(function, idx, cpuid));
+        if (!isa->cpuid->hasSignificantIndex(function)) {
+            isa->cpuid->doCpuid(tc, function, idx, cpuid);
+            m5_supported.push_back(makeKvmCpuid(function, idx, cpuid));
+        } else {
+            while (true) {
+                bool rv = isa->cpuid->doCpuid(tc, function, idx, cpuid);
+                assert(rv);
+
+                if (idx &&
+                    !cpuid.rax && !cpuid.rbx && !cpuid.rdx && !cpuid.rcx) {
+                    break;
+                }
+
+                /*
+                 * For functions in family 0, this flag tells Linux to compare
+                 * the index as well as the function number rather than only
+                 * the function number. Important: Do NOT set this flag if the
+                 * function does not take an index. Doing so will break SMP.
+                 */
+                uint32_t flag = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+                m5_supported.push_back(
+                    makeKvmCpuid(function, idx, cpuid, flag));
+                idx++;
+            }
+        }
     }
 
     /* Extended features */
     CpuidResult efunc0;
-    X86ISA::doCpuid(tc, 0x80000000, 0, efunc0);
+    isa->cpuid->doCpuid(tc, 0x80000000, 0, efunc0);
     for (uint32_t function = 0x80000000; function <= efunc0.rax; ++function) {
         CpuidResult cpuid;
         uint32_t idx(0);
 
-        X86ISA::doCpuid(tc, function, idx, cpuid);
-        m5_supported.push_back(makeKvmCpuid(function, idx, cpuid));
+        if (!isa->cpuid->hasSignificantIndex(function)) {
+            isa->cpuid->doCpuid(tc, function, idx, cpuid);
+            m5_supported.push_back(makeKvmCpuid(function, idx, cpuid));
+        } else {
+            while (true) {
+                bool rv = isa->cpuid->doCpuid(tc, function, idx, cpuid);
+                assert(rv);
+
+                if (idx &&
+                    !cpuid.rax && !cpuid.rbx && !cpuid.rdx && !cpuid.rcx) {
+                    break;
+                }
+
+                /*
+                 * For functions in family 0, this flag tells Linux to compare
+                 * the index as well as the function number rather than only
+                 * the function number. Important: Do NOT set this flag if the
+                 * function does not take an index. Doing so will break SMP.
+                 */
+                uint32_t flag = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+                m5_supported.push_back(
+                    makeKvmCpuid(function, idx, cpuid, flag));
+                idx++;
+            }
+        }
     }
 
     setCPUID(m5_supported);
diff --git a/src/cpu/testers/gpu_ruby_test/ProtocolTester.py b/src/cpu/testers/gpu_ruby_test/ProtocolTester.py
index 178376bee9..c314392520 100644
--- a/src/cpu/testers/gpu_ruby_test/ProtocolTester.py
+++ b/src/cpu/testers/gpu_ruby_test/ProtocolTester.py
@@ -73,7 +73,7 @@ class ProtocolTester(ClockedObject):
     random_seed = Param.Int(
         0,
         "Random seed number. Default value (0) means \
-                                using runtime-specific value.",
+                                using base/random.hh without seed.",
     )
     log_file = Param.String("Log file's name")
     system = Param.System(Parent.any, "System we belong to")
diff --git a/src/cpu/testers/gpu_ruby_test/address_manager.cc b/src/cpu/testers/gpu_ruby_test/address_manager.cc
index 049ba86e51..a0c0670a8f 100644
--- a/src/cpu/testers/gpu_ruby_test/address_manager.cc
+++ b/src/cpu/testers/gpu_ruby_test/address_manager.cc
@@ -33,7 +33,6 @@
 
 #include <algorithm>
 #include <climits>
-#include <random>
 
 #include "base/intmath.hh"
 #include "base/logging.hh"
@@ -101,7 +100,8 @@ AddressManager::getAddress(Location loc)
 AddressManager::Location
 AddressManager::getAtomicLoc()
 {
-    Location ret_atomic_loc = random() % numAtomicLocs;
+    Location ret_atomic_loc = \
+        random_mt.random<unsigned long>() % numAtomicLocs;
     atomicStructs[ret_atomic_loc]->startLocSelection();
     return ret_atomic_loc;
 }
@@ -206,7 +206,9 @@ AddressManager::AtomicStruct::getLoadLoc()
         // we can pick any location btw
         // locArray [firstMark : arraySize-1]
         int range_size = arraySize - firstMark;
-        Location ret_loc = locArray[firstMark + random() % range_size];
+        Location ret_loc = locArray[
+                firstMark + random_mt.random<unsigned int>() % range_size
+        ];
 
         // update loadStoreMap
         LdStMap::iterator it = loadStoreMap.find(ret_loc);
@@ -238,7 +240,9 @@ AddressManager::AtomicStruct::getStoreLoc()
     } else {
         // we can pick any location btw [firstMark : secondMark-1]
         int range_size = secondMark - firstMark;
-        Location ret_loc = locArray[firstMark + random() % range_size];
+        Location ret_loc = locArray[
+            firstMark + random_mt.random<unsigned int>() % range_size
+        ];
 
         // update loadStoreMap
         LdStMap::iterator it = loadStoreMap.find(ret_loc);
diff --git a/src/cpu/testers/gpu_ruby_test/episode.cc b/src/cpu/testers/gpu_ruby_test/episode.cc
index 6822049bbd..7e16b0ef07 100644
--- a/src/cpu/testers/gpu_ruby_test/episode.cc
+++ b/src/cpu/testers/gpu_ruby_test/episode.cc
@@ -34,6 +34,7 @@
 #include <fstream>
 #include <unordered_set>
 
+#include "base/random.hh"
 #include "cpu/testers/gpu_ruby_test/protocol_tester.hh"
 #include "cpu/testers/gpu_ruby_test/tester_thread.hh"
 
@@ -100,7 +101,7 @@ Episode::initActions()
     int num_loads = numLoads;
     int num_stores = numStores;
     while ((num_loads + num_stores) > 0) {
-        switch (random() % 2) {
+        switch (random_mt.random<unsigned int>() % 2) {
             case 0: // Load
                 if (num_loads > 0) {
                     actions.push_back(new Action(Action::Type::LOAD,
diff --git a/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc b/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
index ae4078ee6c..0fcfba7a37 100644
--- a/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
+++ b/src/cpu/testers/gpu_ruby_test/gpu_wavefront.cc
@@ -189,6 +189,7 @@ GpuWavefront::issueAtomicOps()
                                              AtomicOpFunctorPtr(amo_op));
         req->setPaddr(address);
         req->setReqInstSeqNum(tester->getActionSeqNum());
+        req->setCacheCoherenceFlags(Request::SLC_BIT);
         // set protocol-specific flags
         setExtraRequestFlags(req);
 
diff --git a/src/cpu/testers/gpu_ruby_test/protocol_tester.cc b/src/cpu/testers/gpu_ruby_test/protocol_tester.cc
index f2fd7f9600..6b3f9e19f1 100644
--- a/src/cpu/testers/gpu_ruby_test/protocol_tester.cc
+++ b/src/cpu/testers/gpu_ruby_test/protocol_tester.cc
@@ -34,8 +34,8 @@
 #include <algorithm>
 #include <ctime>
 #include <fstream>
-#include <random>
 
+#include "base/random.hh"
 #include "cpu/testers/gpu_ruby_test/cpu_thread.hh"
 #include "cpu/testers/gpu_ruby_test/dma_thread.hh"
 #include "cpu/testers/gpu_ruby_test/gpu_wavefront.hh"
@@ -141,11 +141,20 @@ ProtocolTester::ProtocolTester(const Params &p)
 
     sentExitSignal = false;
 
-    // set random seed number
+    // Set the random seed number, if specified.
+    // Note: random_mt will use a fixed seed if random_seed is not set.
+    // This ensures reproducible runs.
     if (p.random_seed != 0) {
-        srand(p.random_seed);
+        random_mt.init(p.random_seed);
     } else {
-        srand(time(NULL));
+        warn(
+            "If `random_seed == 0` (or `random_seed` is unset) "
+            "ProtocolTester does not seed the RNG. This will NOT result in "
+            "the RNG generating different results each run. In this case the "
+            "RNG is seeded by a default value. This differs from behavior in "
+            "previous versions of gem5. Setting `random_seed` to a non-zero "
+            "value is strongly recommended."
+        );
     }
 
     actionCount = 0;
diff --git a/src/cpu/testers/gpu_ruby_test/tester_thread.cc b/src/cpu/testers/gpu_ruby_test/tester_thread.cc
index 760f8c2d87..ce3a1bccc6 100644
--- a/src/cpu/testers/gpu_ruby_test/tester_thread.cc
+++ b/src/cpu/testers/gpu_ruby_test/tester_thread.cc
@@ -33,6 +33,7 @@
 
 #include <fstream>
 
+#include "base/random.hh"
 #include "debug/ProtocolTest.hh"
 
 namespace gem5
@@ -144,7 +145,8 @@ TesterThread::attachTesterThreadToPorts(ProtocolTester *_tester,
 void
 TesterThread::issueNewEpisode()
 {
-    int num_reg_loads = random() % tester->getEpisodeLength();
+    int num_reg_loads = \
+        random_mt.random<unsigned int>() % tester->getEpisodeLength();
     int num_reg_stores = tester->getEpisodeLength() - num_reg_loads;
 
     // create a new episode
diff --git a/src/dev/amdgpu/amdgpu_device.cc b/src/dev/amdgpu/amdgpu_device.cc
index d1058f1606..44a1c9d394 100644
--- a/src/dev/amdgpu/amdgpu_device.cc
+++ b/src/dev/amdgpu/amdgpu_device.cc
@@ -291,6 +291,7 @@ AMDGPUDevice::readFrame(PacketPtr pkt, Addr offset)
     system->getDeviceMemory(readPkt)->access(readPkt);
 
     pkt->setUintX(readPkt->getUintX(ByteOrder::little), ByteOrder::little);
+    delete readPkt;
 }
 
 void
diff --git a/src/dev/amdgpu/sdma_engine.cc b/src/dev/amdgpu/sdma_engine.cc
index e99d694634..0202f583e6 100644
--- a/src/dev/amdgpu/sdma_engine.cc
+++ b/src/dev/amdgpu/sdma_engine.cc
@@ -510,9 +510,12 @@ SDMAEngine::decodeHeader(SDMAQueue *q, uint32_t header)
         dmaReadVirt(q->rptr(), sizeof(sdmaAtomic), cb, dmaBuffer);
         } break;
       case SDMA_OP_CONST_FILL: {
-        q->incRptr(sizeof(sdmaConstFill));
-        warn("SDMA_OP_CONST_FILL not implemented");
-        decodeNext(q);
+        DPRINTF(SDMAEngine, "SDMA Constant fill packet\n");
+        dmaBuffer = new sdmaConstFill();
+        cb = new DmaVirtCallback<uint64_t>(
+            [ = ] (const uint64_t &)
+                { constFill(q, (sdmaConstFill *)dmaBuffer, header); });
+        dmaReadVirt(q->rptr(), sizeof(sdmaConstFill), cb, dmaBuffer);
         } break;
       case SDMA_OP_PTEPDE: {
         DPRINTF(SDMAEngine, "SDMA PTEPDE packet\n");
@@ -1026,6 +1029,68 @@ SDMAEngine::atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
     decodeNext(q);
 }
 
+void
+SDMAEngine::constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header)
+{
+    q->incRptr(sizeof(sdmaConstFill));
+    // View the raw header word through the bitfield union to read its fields.
+    sdmaConstFillHeader fill_header;
+    fill_header.ordinal = header;
+
+    DPRINTF(SDMAEngine, "ConstFill %lx srcData %x count %d size %d sw %d\n",
+            pkt->addr, pkt->srcData, pkt->count, fill_header.fillsize,
+            fill_header.sw);
+
+    // Count is number of <size> elements - 1. Size is log2 of byte size.
+    int fill_bytes = (pkt->count + 1) * (1 << fill_header.fillsize);
+    uint8_t *fill_data = new uint8_t[fill_bytes];
+    // NOTE(review): memset() only replicates the low byte of srcData.
+    memset(fill_data, pkt->srcData, fill_bytes);
+
+    Addr device_addr = getDeviceAddress(pkt->addr);
+    if (device_addr) {
+        DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to device at %lx\n",
+                fill_bytes, pkt->srcData, pkt->addr);
+        // cb copies fill_data here, before the loop below advances it.
+        auto cb = new EventFunctionWrapper(
+            [ = ]{ constFillDone(q, pkt, fill_data); }, name());
+
+        // Copy the minimum page size at a time in case the physical addresses
+        // are not contiguous.
+        ChunkGenerator gen(pkt->addr, fill_bytes, AMDGPU_MMHUB_PAGE_SIZE);
+        for (; !gen.done(); gen.next()) {
+            Addr chunk_addr = getDeviceAddress(gen.addr());
+            assert(chunk_addr);
+
+            DPRINTF(SDMAEngine, "Copying chunk of %d bytes from %#lx (%#lx)\n",
+                    gen.size(), gen.addr(), chunk_addr);
+
+            gpuDevice->getMemMgr()->writeRequest(chunk_addr, fill_data,
+                                                 gen.size(), 0,
+                                                 gen.last() ? cb : nullptr);
+            fill_data += gen.size();
+        }
+    } else {
+        DPRINTF(SDMAEngine, "ConstFill %d bytes of %x to host at %lx\n",
+                fill_bytes, pkt->srcData, pkt->addr);
+        // Host destination: one virtual DMA write covers the whole buffer.
+        auto cb = new DmaVirtCallback<uint64_t>(
+            [ = ] (const uint64_t &)
+                { constFillDone(q, pkt, fill_data); });
+        dmaWriteVirt(pkt->addr, fill_bytes, cb, (void *)fill_data);
+    }
+}
+
+void
+SDMAEngine::constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data)
+{
+    DPRINTF(SDMAEngine, "ConstFill to %lx done\n", pkt->addr);
+    // fill_data comes from new[] in constFill(), so it must use delete [].
+    delete [] fill_data;
+    delete pkt;
+    decodeNext(q);
+}
+
 AddrRangeList
 SDMAEngine::getAddrRanges() const
 {
diff --git a/src/dev/amdgpu/sdma_engine.hh b/src/dev/amdgpu/sdma_engine.hh
index bcbd497e8a..5abe63fcc6 100644
--- a/src/dev/amdgpu/sdma_engine.hh
+++ b/src/dev/amdgpu/sdma_engine.hh
@@ -245,6 +245,8 @@ class SDMAEngine : public DmaVirtDevice
                     uint64_t *dmaBuffer);
     void atomicDone(SDMAQueue *q, sdmaAtomicHeader *header, sdmaAtomic *pkt,
                     uint64_t *dmaBuffer);
+    void constFill(SDMAQueue *q, sdmaConstFill *pkt, uint32_t header);
+    void constFillDone(SDMAQueue *q, sdmaConstFill *pkt, uint8_t *fill_data);
 
     /**
      * Methods for getting SDMA MMIO base address and size. These are set by
diff --git a/src/dev/amdgpu/sdma_packets.hh b/src/dev/amdgpu/sdma_packets.hh
index 52a47d3a2d..07d3f12600 100644
--- a/src/dev/amdgpu/sdma_packets.hh
+++ b/src/dev/amdgpu/sdma_packets.hh
@@ -37,7 +37,7 @@ namespace gem5
 {
 
 /**
- * SDMA packets
+ * SDMA packets - see src/core/inc/sdma_registers.h in ROCR-Runtime
  */
 typedef struct GEM5_PACKED
 {
@@ -80,6 +80,23 @@ typedef struct GEM5_PACKED
 }  sdmaConstFill;
 static_assert(sizeof(sdmaConstFill) == 16);
 
+typedef struct GEM5_PACKED
+{
+    union
+    {
+        struct
+        {
+            uint32_t op : 8;
+            uint32_t sub_op : 8;
+            uint32_t sw : 2;
+            uint32_t res0 : 12;
+            uint32_t fillsize : 2;  // log2 of the element size in bytes
+        };
+        uint32_t ordinal;  // the whole 32-bit header as a single word
+    };
+}  sdmaConstFillHeader;
+static_assert(sizeof(sdmaConstFillHeader) == 4);
+
 typedef struct GEM5_PACKED
 {
     uint32_t key0;
diff --git a/src/learning_gem5/part2/hello_object.hh b/src/learning_gem5/part2/hello_object.hh
index c34dde304d..05c6dde4d6 100644
--- a/src/learning_gem5/part2/hello_object.hh
+++ b/src/learning_gem5/part2/hello_object.hh
@@ -69,7 +69,7 @@ class HelloObject : public SimObject
      * SimObjects have been constructed. It is called after the user calls
      * simulate() for the first time.
      */
-    void startup();
+    void startup() override;
 };
 
 } // namespace gem5
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index ed7a94f4fb..ad05b72828 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -1156,7 +1156,7 @@ class Packet : public Printable, public Extensible<Packet>
   public:
     /**
      * @{
-     * @name Data accessor mehtods
+     * @name Data accessor methods
      */
 
     /**
diff --git a/src/python/gem5/resources/client.py b/src/python/gem5/resources/client.py
index ab8262bf92..571a8254e0 100644
--- a/src/python/gem5/resources/client.py
+++ b/src/python/gem5/resources/client.py
@@ -30,7 +30,7 @@ import os
 from typing import Optional, Dict, List
 from .client_api.client_wrapper import ClientWrapper
 from gem5.gem5_default_config import config
-from m5.util import inform
+from m5.util import inform, warn
 from _m5 import core
 
 
@@ -53,8 +53,38 @@ clientwrapper = None
 def _get_clientwrapper():
     global clientwrapper
     if clientwrapper is None:
+        if (
+            "GEM5_RESOURCE_JSON" in os.environ
+            and "GEM5_RESOURCE_JSON_APPEND" in os.environ
+        ):
+            raise Exception(
+                "Both GEM5_RESOURCE_JSON and GEM5_RESOURCE_JSON_APPEND are set. Please set only one of them."
+            )
+        gem5_config = {}
+        # If the GEM5_RESOURCE_JSON is set, use it as the only source
+        if "GEM5_RESOURCE_JSON" in os.environ:
+            json_source = {
+                "url": os.environ["GEM5_RESOURCE_JSON"],
+                "isMongo": False,
+            }
+            gem5_config["sources"] = {"GEM5_RESOURCE_JSON": json_source}
+            if "GEM5_CONFIG" in os.environ:
+                warn(
+                    f"Both GEM5_CONFIG and GEM5_RESOURCE_JSON are set.\n"
+                    f"GEM5_CONFIG will be ignored in favor of the GEM5_RESOURCE_JSON environment variable."
+                )
+            elif (Path().cwd().resolve() / "gem5-config.json").exists():
+                warn(
+                    f"Both gem5-config.json and GEM5_RESOURCE_JSON are set.\n"
+                    f"gem5-config.json will be ignored in favor of the GEM5_RESOURCE_JSON environment variable."
+                )
+            else:
+                warn(
+                    f"GEM5_RESOURCE_JSON is set.\n"
+                    f"gem5-default-config will be ignored in favor of the GEM5_RESOURCE_JSON environment variable."
+                )
         # First check if the config file path is provided in the environment variable
-        if "GEM5_CONFIG" in os.environ:
+        elif "GEM5_CONFIG" in os.environ:
             config_file_path = Path(os.environ["GEM5_CONFIG"])
             gem5_config = getFileContent(config_file_path)
             inform("Using config file specified by $GEM5_CONFIG")
@@ -68,6 +98,20 @@ def _get_clientwrapper():
         else:
             gem5_config = config
             inform("Using default config")
+
+        # If the GEM5_RESOURCE_JSON_APPEND is set, append the resources to the gem5_config
+        if "GEM5_RESOURCE_JSON_APPEND" in os.environ:
+            json_source = {
+                "url": os.environ["GEM5_RESOURCE_JSON_APPEND"],
+                "isMongo": False,
+            }
+            gem5_config["sources"].update(
+                {"GEM5_RESOURCE_JSON_APPEND": json_source}
+            )
+            inform(
+                f"Appending resources from {os.environ['GEM5_RESOURCE_JSON_APPEND']}"
+            )
+
         clientwrapper = ClientWrapper(gem5_config)
     return clientwrapper
 
diff --git a/src/python/gem5/resources/downloader.py b/src/python/gem5/resources/downloader.py
index 1aeb487e61..b4f7a2f016 100644
--- a/src/python/gem5/resources/downloader.py
+++ b/src/python/gem5/resources/downloader.py
@@ -280,17 +280,21 @@ def get_resource(
         # string-based way of doing things. It can be refactored away over
         # time:
         # https://gem5-review.googlesource.com/c/public/gem5-resources/+/51168
-        if isinstance(resource_json["is_zipped"], str):
-            run_unzip = unzip and resource_json["is_zipped"].lower() == "true"
-        elif isinstance(resource_json["is_zipped"], bool):
-            run_unzip = unzip and resource_json["is_zipped"]
-        else:
-            raise Exception(
-                "The resource.json entry for '{}' has a value for the "
-                "'is_zipped' field which is neither a string or a boolean.".format(
-                    resource_name
+        run_unzip = False
+        if "is_zipped" in resource_json:
+            if isinstance(resource_json["is_zipped"], str):
+                run_unzip = (
+                    unzip and resource_json["is_zipped"].lower() == "true"
+                )
+            elif isinstance(resource_json["is_zipped"], bool):
+                run_unzip = unzip and resource_json["is_zipped"]
+            else:
+                raise Exception(
+                    "The resource.json entry for '{}' has a value for the "
+                    "'is_zipped' field which is neither a string or a boolean.".format(
+                        resource_name
+                    )
                 )
-            )
 
         run_tar_extract = (
             untar
diff --git a/tests/gem5/gem5_resources/configs/download_check.py b/tests/gem5/gem5_resources/configs/download_check.py
index e3b06a578d..20081a46d5 100644
--- a/tests/gem5/gem5_resources/configs/download_check.py
+++ b/tests/gem5/gem5_resources/configs/download_check.py
@@ -127,9 +127,14 @@ for id in ids:
                 + f"({md5(Path(download_path))}) differs to that recorded in "
                 + f" gem5-resources ({resource_json['md5sum']}).{os.linesep}"
             )
+        # Remove the downloaded resource, be it a file or a directory.
+        if os.path.isfile(download_path):
+            os.remove(download_path)
+        elif os.path.isdir(download_path):
+            shutil.rmtree(download_path, ignore_errors=True)
+        else:
+            raise Exception(f"{download_path} is not a file or directory.")
 
-# Remove the downloaded resource.
-shutil.rmtree(args.download_directory, ignore_errors=True)
 
 # If errors exist, raise an exception highlighting them.
 if errors:
diff --git a/util/cpt_upgraders/riscv-vext.py b/util/cpt_upgraders/riscv-vext.py
new file mode 100644
index 0000000000..ada492fe1e
--- /dev/null
+++ b/util/cpt_upgraders/riscv-vext.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2023 Barcelona Supercomputing Center (BSC)
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+def upgrader(cpt):
+    """
+    Update the checkpoint to support the initial RVV implementation.
+    The upgrader takes the following steps.
+
+    1) Set vector registers to occupy 1280 bytes (40regs * 32bytes)
+    2) Clear vector_element, vector_predicate and matrix registers
+    3) Add RVV misc registers in the checkpoint
+    """
+
+    for sec in cpt.sections():
+        import re
+
+        # Search for all XC sections (raw string keeps "\." a regex escape)
+        if re.search(r".*processor.*\.core.*\.xc.*", sec):
+
+            # Updating RVV vector registers (dummy values)
+            # Assuming VLEN = 256 bits (32 bytes)
+            mr = cpt.get(sec, "regs.vector").split()
+            if len(mr) <= 8:
+                cpt.set(sec, "regs.vector", " ".join("0" for _ in range(1280)))
+
+            # Updating RVV vector element (dummy values)
+            cpt.set(sec, "regs.vector_element", "")
+
+            # Updating RVV vector predicate (dummy values)
+            cpt.set(sec, "regs.vector_predicate", "")
+
+            # Updating RVV matrix (dummy values)
+            cpt.set(sec, "regs.matrix", "")
+
+        # Search for all ISA sections (raw string keeps "\." a regex escape)
+        if re.search(r".*processor.*\.core.*\.isa$", sec):
+
+            # Updating RVV misc registers (dummy values)
+            mr = cpt.get(sec, "miscRegFile").split()
+            if len(mr) == 164:
+                print(
+                    "MISCREG_* RVV registers already seem to be inserted."
+                )
+            else:
+                # Add dummy value for MISCREG_VSTART
+                mr.insert(121, 0)
+                # Add dummy value for MISCREG_VXSAT
+                mr.insert(121, 0)
+                # Add dummy value for MISCREG_VXRM
+                mr.insert(121, 0)
+                # Add dummy value for MISCREG_VCSR
+                mr.insert(121, 0)
+                # Add dummy value for MISCREG_VL
+                mr.insert(121, 0)
+                # Add dummy value for MISCREG_VTYPE
+                mr.insert(121, 0)
+                # Add dummy value for MISCREG_VLENB
+                mr.insert(121, 0)
+                cpt.set(sec, "miscRegFile", " ".join(str(x) for x in mr))
+
+
+legacy_version = 17
diff --git a/util/style/verifiers.py b/util/style/verifiers.py
index dbcce1c764..7a88e59789 100644
--- a/util/style/verifiers.py
+++ b/util/style/verifiers.py
@@ -424,7 +424,11 @@ class LineLength(LineVerifier):
     test_name = "line length"
     opt_name = "length"
 
-    def check_line(self, line, **kwargs):
+    def check_line(self, line, language, **kwargs):
+        # Skip the line length check for C/C++ #include directives.
+        if language in {"C", "C++"}:
+            if line.startswith("#include"):
+                return True
         return style.normalized_len(line) <= 79
 
     def fix(self, filename, regions=all_regions, **kwargs):