From 605ec6899e6b2983e4ec9ea8e257447b19eb9a7a Mon Sep 17 00:00:00 2001
From: Roger Chang <rogerycchang@google.com>
Date: Tue, 29 Aug 2023 19:53:26 +0800
Subject: [PATCH 1/3] arch-riscv: Move VMem implementation from header to
 source

Move the VMem implementation from header_output to
decoder_output and exec_output respectively.

Change-Id: I699e197f37f22a59ecb9f92a64b5e296d2e9f5fa
---
 src/arch/riscv/isa/formats/vector_mem.isa   |  28 +-
 src/arch/riscv/isa/includes.isa             |   2 -
 src/arch/riscv/isa/templates/vector_mem.isa | 322 ++++++++++++--------
 3 files changed, 216 insertions(+), 136 deletions(-)

diff --git a/src/arch/riscv/isa/formats/vector_mem.isa b/src/arch/riscv/isa/formats/vector_mem.isa
index da53d80d0a..3b3309797c 100644
--- a/src/arch/riscv/isa/formats/vector_mem.isa
+++ b/src/arch/riscv/isa/formats/vector_mem.isa
@@ -34,6 +34,14 @@ def setVlen():
 def setVlenb():
         return "uint32_t vlenb = VlenbBits;\n"
 
+def declareVMemTemplate(class_name):  # explicit uint8..uint64 instantiations
+    return f'''
+    template class {class_name}<uint8_t>;
+    template class {class_name}<uint16_t>;
+    template class {class_name}<uint32_t>;
+    template class {class_name}<uint64_t>;
+    '''
+
 def VMemBase(name, Name, ea_code, memacc_code, mem_flags,
                    inst_flags, base_class, postacc_code='',
                    declare_template_base=VMemMacroDeclare,
@@ -47,22 +55,20 @@ def VMemBase(name, Name, ea_code, memacc_code, mem_flags,
     iop = InstObjParams(name, Name, base_class,
         {'ea_code': ea_code,
          'memacc_code': memacc_code,
-         'postacc_code': postacc_code },
+         'postacc_code': postacc_code,
+         'declare_vmem_template': declareVMemTemplate(Name)},
         inst_flags)
 
     constructTemplate = eval(exec_template_base + 'Constructor')
 
     header_output   = declare_template_base.subst(iop)
-    decoder_output  = ''
-    if declare_template_base is not VMemTemplateMacroDeclare:
-        decoder_output  += constructTemplate.subst(iop)
-    else:
-        header_output   += constructTemplate.subst(iop)
+    decoder_output  = constructTemplate.subst(iop)
     decode_block    = decode_template.subst(iop)
     exec_output     = ''
     if not is_macroop:
         return (header_output, decoder_output, decode_block, exec_output)
 
+    micro_class_name = exec_template_base + 'MicroInst'
     microiop = InstObjParams(name + '_micro',
         Name + 'Micro',
         exec_template_base + 'MicroInst',
@@ -70,7 +76,8 @@ def VMemBase(name, Name, ea_code, memacc_code, mem_flags,
          'memacc_code': memacc_code,
          'postacc_code': postacc_code,
          'set_vlenb': setVlenb(),
-         'set_vlen': setVlen()},
+         'set_vlen': setVlen(),
+         'declare_vmem_template': declareVMemTemplate(Name + 'Micro')},
         inst_flags)
 
     if mem_flags:
@@ -79,17 +86,16 @@ def VMemBase(name, Name, ea_code, memacc_code, mem_flags,
         microiop.constructor += s
 
     microDeclTemplate = eval(exec_template_base + 'Micro' + 'Declare')
+    microConsTemplate = eval(exec_template_base + 'Micro' + 'Constructor')
     microExecTemplate = eval(exec_template_base + 'Micro' + 'Execute')
     microInitTemplate = eval(exec_template_base + 'Micro' + 'InitiateAcc')
     microCompTemplate = eval(exec_template_base + 'Micro' + 'CompleteAcc')
     header_output = microDeclTemplate.subst(microiop) + header_output
+    decoder_output = microConsTemplate.subst(microiop) + decoder_output
     micro_exec_output = (microExecTemplate.subst(microiop) +
         microInitTemplate.subst(microiop) +
         microCompTemplate.subst(microiop))
-    if declare_template_base is not VMemTemplateMacroDeclare:
-        exec_output += micro_exec_output
-    else:
-        header_output += micro_exec_output
+    exec_output += micro_exec_output
 
     return (header_output, decoder_output, decode_block, exec_output)
 
diff --git a/src/arch/riscv/isa/includes.isa b/src/arch/riscv/isa/includes.isa
index 76f2388faf..b37e62bca8 100644
--- a/src/arch/riscv/isa/includes.isa
+++ b/src/arch/riscv/isa/includes.isa
@@ -46,8 +46,6 @@ output header {{
 #include <softfloat.h>
 #include <specialize.h>
 
-#include "arch/generic/memhelpers.hh"
-#include "arch/riscv/decoder.hh"
 #include "arch/riscv/insts/amo.hh"
 #include "arch/riscv/insts/bs.hh"
 #include "arch/riscv/insts/compressed.hh"
diff --git a/src/arch/riscv/isa/templates/vector_mem.isa b/src/arch/riscv/isa/templates/vector_mem.isa
index 8cbab044ec..84cee9af73 100644
--- a/src/arch/riscv/isa/templates/vector_mem.isa
+++ b/src/arch/riscv/isa/templates/vector_mem.isa
@@ -96,22 +96,8 @@ private:
     RegId srcRegIdxArr[3];
     RegId destRegIdxArr[1];
 public:
-    %(class_name)s(ExtMachInst _machInst, uint32_t _microVl,
-        uint8_t _microIdx, uint32_t _vlen)
-    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
-                     _microVl, _microIdx, _vlen)
-    {
-        %(set_reg_idx_arr)s;
-        _numSrcRegs = 0;
-        _numDestRegs = 0;
-        setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _microIdx]);
-        _numTypedDestRegs[VecRegClass]++;
-        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
-        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _microIdx]);
-        if (!_machInst.vm) {
-            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
-        }
-    }
+    %(class_name)s(ExtMachInst _machInst, uint32_t _microVl,
+        uint8_t _microIdx, uint32_t _vlen);
 
     Fault execute(ExecContext *, trace::InstRecord *) const override;
     Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
@@ -123,6 +109,27 @@ public:
 
 }};
 
+def template VleMicroConstructor {{
+// Out-of-line ctor; _microVl stays uint32_t so large counts aren't narrowed.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst, uint32_t _microVl,
+    uint8_t _microIdx, uint32_t _vlen)
+  : %(base_class)s(
+        "%(mnemonic)s", _machInst, %(op_class)s, _microVl, _microIdx, _vlen)
+{
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _microIdx]);
+    _numTypedDestRegs[VecRegClass]++;
+    setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+    setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _microIdx]);
+    if (!_machInst.vm) {
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+    }
+}
+
+}};
+
 def template VleMicroExecute {{
 
 Fault
@@ -293,21 +300,7 @@ private:
     RegId destRegIdxArr[0];
 public:
     %(class_name)s(ExtMachInst _machInst,
-        uint32_t _microVl, uint8_t _microIdx, uint32_t _vlen)
-    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
-                        _microVl, _microIdx, _vlen)
-    {
-        %(set_reg_idx_arr)s;
-        _numSrcRegs = 0;
-        _numDestRegs = 0;
-        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
-        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _microIdx]);
-        if (!_machInst.vm) {
-            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
-        }
-        this->flags[IsVector] = true;
-        this->flags[IsStore] = true;
-    }
+        uint32_t _microVl, uint8_t _microIdx, uint32_t _vlen);
 
     Fault execute(ExecContext *, trace::InstRecord *) const override;
     Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
@@ -318,6 +311,27 @@ public:
 
 }};
 
+def template VseMicroConstructor {{
+// Out-of-line ctor; _microVl stays uint32_t so large counts aren't narrowed.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst,
+    uint32_t _microVl, uint8_t _microIdx, uint32_t _vlen)
+  : %(base_class)s(
+        "%(mnemonic)s", _machInst, %(op_class)s, _microVl, _microIdx, _vlen)
+{
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+    setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _microIdx]);
+    if (!_machInst.vm) {
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+    }
+    this->flags[IsVector] = true;
+    this->flags[IsStore] = true;
+}
+
+}};
+
 def template VseMicroExecute {{
 
 Fault
@@ -518,18 +532,8 @@ private:
     RegId srcRegIdxArr[2];
 public:
     %(class_name)s(ExtMachInst _machInst,
-        uint32_t _microVl, uint8_t _microIdx, uint32_t _vlen)
-        : %(base_class)s("%(mnemonic)s", _machInst,
-                         %(op_class)s, _microVl, _microIdx, _vlen)
-    {
-        %(set_reg_idx_arr)s;
-        _numSrcRegs = 0;
-        _numDestRegs = 0;
-        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
-        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _microIdx]);
-        this->flags[IsVector] = true;
-        this->flags[IsStore] = true;
-    }
+        uint32_t _microVl, uint8_t _microIdx, uint32_t _vlen);
+
     Fault execute(ExecContext *, trace::InstRecord *) const override;
     Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
     Fault completeAcc(PacketPtr, ExecContext *,
@@ -539,6 +543,24 @@ public:
 
 }};
 
+def template VsWholeMicroConstructor {{
+// Out-of-line ctor; _microVl stays uint32_t so large counts aren't narrowed.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst,
+    uint32_t _microVl, uint8_t _microIdx, uint32_t _vlen)
+  : %(base_class)s(
+        "%(mnemonic)s", _machInst, %(op_class)s, _microVl, _microIdx, _vlen)
+{
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+    setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _microIdx]);
+    this->flags[IsVector] = true;
+    this->flags[IsStore] = true;
+}
+
+}};
+
 def template VsWholeMicroExecute {{
 
 Fault
@@ -644,19 +666,8 @@ private:
     RegId srcRegIdxArr[1];
 public:
     %(class_name)s(ExtMachInst _machInst,
-        uint32_t _microVl, uint8_t _microIdx, uint32_t _vlen)
-        : %(base_class)s("%(mnemonic)s_micro", _machInst,
-                         %(op_class)s, _microVl, _microIdx, _vlen)
-    {
-        %(set_reg_idx_arr)s;
-        _numSrcRegs = 0;
-        _numDestRegs = 0;
-        setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _microIdx]);
-        _numTypedDestRegs[VecRegClass]++;
-        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
-        this->flags[IsVector] = true;
-        this->flags[IsLoad] = true;
-    }
+        uint32_t _microVl, uint8_t _microIdx, uint32_t _vlen);
+
     Fault execute(ExecContext *, trace::InstRecord *) const override;
     Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
     Fault completeAcc(PacketPtr, ExecContext *,
@@ -666,6 +677,25 @@ public:
 
 }};
 
+def template VlWholeMicroConstructor {{
+// Out-of-line ctor; _microVl stays uint32_t so large counts aren't narrowed.
+%(class_name)s::%(class_name)s(ExtMachInst _machInst,
+    uint32_t _microVl, uint8_t _microIdx, uint32_t _vlen)
+  : %(base_class)s("%(mnemonic)s_micro", _machInst, %(op_class)s, _microVl,
+      _microIdx, _vlen)
+{
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _microIdx]);
+    _numTypedDestRegs[VecRegClass]++;
+    setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+    this->flags[IsVector] = true;
+    this->flags[IsLoad] = true;
+}
+
+}};
+
 def template VlWholeMicroExecute {{
 
 Fault
@@ -803,24 +833,7 @@ private:
     RegId destRegIdxArr[1];
 public:
     %(class_name)s(ExtMachInst _machInst, uint8_t _regIdx, uint8_t _microIdx,
-        uint32_t _microVl)
-    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
-        _regIdx, _microIdx, _microVl)
-    {
-        %(set_reg_idx_arr)s;
-        _numSrcRegs = 0;
-        _numDestRegs = 0;
-        setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _regIdx]);
-        _numTypedDestRegs[VecRegClass]++;
-        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
-        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs2]);
-        // We treat agnostic as undistrubed
-        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _regIdx]);
-        if (!_machInst.vm) {
-            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
-        }
-        this->flags[IsLoad] = true;
-    }
+        uint32_t _microVl);
 
     Fault execute(ExecContext *, trace::InstRecord *) const override;
     Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
@@ -831,6 +844,31 @@ public:
 
 }};
 
+def template VlStrideMicroConstructor {{
+// Dest: vd+_regIdx. Srcs: rs1, rs2, vd+_regIdx, and vecRegClass[0] if !vm.
+%(class_name)s::%(class_name)s(
+    ExtMachInst _machInst, uint8_t _regIdx, uint8_t _microIdx,
+    uint32_t _microVl)
+  : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
+        _regIdx, _microIdx, _microVl)
+{
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _regIdx]);
+    _numTypedDestRegs[VecRegClass]++;
+    setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+    setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs2]);
+    // We treat agnostic as undisturbed
+    setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _regIdx]);
+    if (!_machInst.vm) {
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+    }
+    this->flags[IsLoad] = true;
+}
+
+}};
+
 def template VlStrideMicroExecute {{
 
 Fault
@@ -1019,21 +1057,7 @@ private:
     RegId destRegIdxArr[0];
 public:
     %(class_name)s(ExtMachInst _machInst, uint8_t _regIdx, uint8_t _microIdx,
-            uint32_t _microVl)
-        : %(base_class)s("%(mnemonic)s""_micro", _machInst, %(op_class)s,
-            _regIdx, _microIdx, _microVl)
-    {
-        %(set_reg_idx_arr)s;
-        _numSrcRegs = 0;
-        _numDestRegs = 0;
-        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
-        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs2]);
-        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _regIdx]);
-        if (!_machInst.vm) {
-            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
-        }
-        this->flags[IsStore] = true;
-    }
+            uint32_t _microVl);
 
     Fault execute(ExecContext *, trace::InstRecord *) const override;
     Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
@@ -1044,6 +1068,28 @@ public:
 
 }};
 
+def template VsStrideMicroConstructor {{
+// Srcs: rs1, rs2, vs3+_regIdx, and vecRegClass[0] if !vm. Sets IsStore.
+%(class_name)s::%(class_name)s(
+    ExtMachInst _machInst, uint8_t _regIdx, uint8_t _microIdx,
+    uint32_t _microVl)
+  : %(base_class)s("%(mnemonic)s""_micro", _machInst, %(op_class)s,
+      _regIdx, _microIdx, _microVl)
+{
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+    setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs2]);
+    setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _regIdx]);
+    if (!_machInst.vm) {
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+    }
+    this->flags[IsStore] = true;
+}
+
+}};
+
 def template VsStrideMicroExecute {{
 
 Fault
@@ -1185,6 +1231,8 @@ template<typename ElemType>
     this->flags[IsVector] = true;
 }
 
+%(declare_vmem_template)s;
+
 }};
 
 def template VlIndexMicroDeclare {{
@@ -1199,24 +1247,7 @@ private:
 public:
     %(class_name)s(ExtMachInst _machInst,
         uint8_t _vdRegIdx, uint8_t _vdElemIdx,
-        uint8_t _vs2RegIdx, uint8_t _vs2ElemIdx)
-    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
-        _vdRegIdx, _vdElemIdx, _vs2RegIdx, _vs2ElemIdx)
-    {
-        %(set_reg_idx_arr)s;
-        _numSrcRegs = 0;
-        _numDestRegs = 0;
-        setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _vdRegIdx]);
-        _numTypedDestRegs[VecRegClass]++;
-        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
-        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2 + _vs2RegIdx]);
-        // We treat agnostic as undistrubed
-        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _vdRegIdx]);
-        if (!_machInst.vm) {
-            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
-        }
-        this->flags[IsLoad] = true;
-    }
+        uint8_t _vs2RegIdx, uint8_t _vs2ElemIdx);
 
     Fault execute(ExecContext *, trace::InstRecord *) const override;
     Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
@@ -1227,6 +1258,34 @@ public:
 
 }};
 
+def template VlIndexMicroConstructor {{
+// Dest: vd+_vdRegIdx. Srcs: rs1, vs2+_vs2RegIdx, vd+_vdRegIdx, v0 if !vm.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(
+    ExtMachInst _machInst,uint8_t _vdRegIdx, uint8_t _vdElemIdx,
+    uint8_t _vs2RegIdx, uint8_t _vs2ElemIdx)
+  : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
+      _vdRegIdx, _vdElemIdx, _vs2RegIdx, _vs2ElemIdx)
+{
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _vdRegIdx]);
+    _numTypedDestRegs[VecRegClass]++;
+    setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+    setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2 + _vs2RegIdx]);
+    // We treat agnostic as undisturbed
+    setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _vdRegIdx]);
+    if (!_machInst.vm) {
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+    }
+    this->flags[IsLoad] = true;
+}
+
+%(declare_vmem_template)s;
+
+}};
+
 def template VlIndexMicroExecute {{
 
 template<typename ElemType>
@@ -1364,6 +1423,8 @@ Fault
     return NoFault;
 }
 
+%(declare_vmem_template)s;
+
 }};
 
 def template VsIndexConstructor {{
@@ -1410,6 +1471,8 @@ template<typename ElemType>
     this->flags[IsVector] = true;
 }
 
+%(declare_vmem_template)s;
+
 }};
 
 def template VsIndexMicroDeclare {{
@@ -1424,22 +1487,7 @@ private:
 public:
     %(class_name)s(ExtMachInst _machInst,
         uint8_t _vs3RegIdx, uint8_t _vs3ElemIdx,
-        uint8_t _vs2RegIdx, uint8_t _vs2ElemIdx)
-    : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
-        _vs3RegIdx, _vs3ElemIdx, _vs2RegIdx, _vs2ElemIdx)
-    {
-        %(set_reg_idx_arr)s;
-        _numSrcRegs = 0;
-        _numDestRegs = 0;
-        setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
-        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2 + _vs2RegIdx]);
-        // We treat agnostic as undistrubed
-        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _vs3RegIdx]);
-        if (!_machInst.vm) {
-            setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
-        }
-        this->flags[IsStore] = true;
-    }
+        uint8_t _vs2RegIdx, uint8_t _vs2ElemIdx);
 
     Fault execute(ExecContext *, trace::InstRecord *) const override;
     Fault initiateAcc(ExecContext *, trace::InstRecord *) const override;
@@ -1450,6 +1498,32 @@ public:
 
 }};
 
+def template VsIndexMicroConstructor {{
+// Srcs: rs1, vs2+_vs2RegIdx, vs3+_vs3RegIdx, v0 if !vm. Sets IsStore.
+template<typename ElemType>
+%(class_name)s<ElemType>::%(class_name)s(ExtMachInst _machInst,
+    uint8_t _vs3RegIdx, uint8_t _vs3ElemIdx,
+    uint8_t _vs2RegIdx, uint8_t _vs2ElemIdx)
+  : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s,
+      _vs3RegIdx, _vs3ElemIdx, _vs2RegIdx, _vs2ElemIdx)
+{
+    %(set_reg_idx_arr)s;
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+    setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
+    setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2 + _vs2RegIdx]);
+    // We treat agnostic as undisturbed
+    setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs3 + _vs3RegIdx]);
+    if (!_machInst.vm) {
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
+    }
+    this->flags[IsStore] = true;
+}
+
+%(declare_vmem_template)s;
+
+}};
+
 def template VsIndexMicroExecute {{
 
 template<typename ElemType>
@@ -1548,6 +1622,8 @@ Fault
     return NoFault;
 }
 
+%(declare_vmem_template)s;
+
 }};
 
 def template VMemBaseDecodeBlock {{

From 62af678d5c51b1fa15ec40c5d4ff4e36971e34f1 Mon Sep 17 00:00:00 2001
From: Roger Chang <rogerycchang@google.com>
Date: Tue, 24 Oct 2023 14:47:04 +0800
Subject: [PATCH 2/3] arch-riscv: Move VArith implementations from header to
 source

Move VArith implementations from header_output to decoder_output
and exec_output respectively

Change-Id: I406eedbd9dd625aa939ec0e20aa29ef4f18ba79c
---
 src/arch/riscv/isa/formats/vector_arith.isa   | 564 +++++++++++-------
 src/arch/riscv/isa/templates/vector_arith.isa | 145 ++++-
 2 files changed, 487 insertions(+), 222 deletions(-)

diff --git a/src/arch/riscv/isa/formats/vector_arith.isa b/src/arch/riscv/isa/formats/vector_arith.isa
index 0d5055ea8f..1ddf323f04 100644
--- a/src/arch/riscv/isa/formats/vector_arith.isa
+++ b/src/arch/riscv/isa/formats/vector_arith.isa
@@ -121,6 +121,28 @@ let {{
         softfloat_exceptionFlags = 0;
         xc->setMiscReg(MISCREG_FFLAGS, FFLAGS);
         '''
+    # Explicit instantiations of class_name for sizes in [min_size, max_size].
+    def declareVArithTemplate(
+        class_name, type_name='uint', min_size=8, max_size=64):
+        sizes = [8, 16, 32, 64]
+        code = ''
+        for size in sizes:
+            if size < min_size or size > max_size:
+                continue
+            code += f'template class {class_name}<{type_name}{size}_t>;\n'
+        return code
+    # Instantiations per element size; 'elem_type' makes the index type match.
+    def declareGatherTemplate(class_name, index_type):
+        sizes = [8, 16, 32, 64]
+        code = ''
+        for size in sizes:
+            if index_type == 'elem_type':
+                idx_type = f'uint{size}_t'
+            else:
+                idx_type = index_type
+            code += ('template class'
+                     f' {class_name}<uint{size}_t, {idx_type}>;\n')
+        return code
 }};
 
 
@@ -132,8 +154,14 @@ def format VectorIntFormat(code, category, *flags) {{
         macroop_class_name = 'VectorVMUNARY0MacroInst'
         microp_class_name = 'VectorVMUNARY0MicroInst'
 
-    iop = InstObjParams(name, Name, macroop_class_name, {'code': code},
-                        flags)
+    iop = InstObjParams(
+        name,
+        Name,
+        macroop_class_name,
+        {'code': code,
+         'declare_varith_template': declareVArithTemplate(Name)},
+        flags
+    )
     inst_name, inst_suffix = name.split("_", maxsplit=1)
     v0_required = inst_name not in ["vmv"]
     mask_cond = v0_required and (inst_suffix not in ['vvm', 'vxm', 'vim'])
@@ -192,25 +220,30 @@ def format VectorIntFormat(code, category, *flags) {{
          'set_src_reg_idx': set_src_reg_idx,
          'set_vlenb' : set_vlenb,
          'vm_decl_rd': vm_decl_rd,
-         'copy_old_vd': copyOldVd(old_vd_idx)},
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         'declare_varith_template': declareVArithTemplate(Name + "Micro")},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorIntMicroDeclare.subst(microiop) + \
+        VectorIntMacroDeclare.subst(iop)
+    decoder_output = \
         VectorIntMicroConstructor.subst(microiop) + \
-        VectorIntMicroExecute.subst(microiop) + \
-        VectorIntMacroDeclare.subst(iop) + \
         VectorIntMacroConstructor.subst(iop)
-
+    exec_output = VectorIntMicroExecute.subst(microiop)
     decode_block = VectorIntDecodeBlock.subst(iop)
 }};
 
 
 def format VectorIntExtFormat(code, category, *flags) {{
-    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
-                        flags)
+    iop = InstObjParams(
+        name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code,
+         'declare_varith_template': declareVArithTemplate(Name)},
+        flags
+    )
     inst_name, inst_suffix = name.split("_", maxsplit=1)
     ext_div = int(inst_suffix[-1])
 
@@ -245,24 +278,31 @@ def format VectorIntExtFormat(code, category, *flags) {{
          'set_vlen': set_vlen,
          'vm_decl_rd': vm_decl_rd,
          'copy_old_vd': copyOldVd(old_vd_idx),
-         'ext_div': ext_div},
+         'ext_div': ext_div,
+         'declare_varith_template': declareVArithTemplate(Name + "Micro")},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorIntExtMicroDeclare.subst(microiop) + \
+        VectorIntExtMacroDeclare.subst(iop)
+    decoder_output = \
         VectorIntMicroConstructor.subst(microiop) + \
-        VectorIntExtMicroExecute.subst(microiop) + \
-        VectorIntExtMacroDeclare.subst(iop) + \
         VectorIntMacroConstructor.subst(iop)
-
+    exec_output = \
+        VectorIntExtMicroExecute.subst(microiop) + \
+        VectorIntExtMacroExecute.subst(iop)
     decode_block = VectorIntDecodeBlock.subst(iop)
 }};
 
 def format VectorIntWideningFormat(code, category, *flags) {{
-    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
-                        flags)
+    iop = InstObjParams(
+        name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code,
+         'declare_varith_template': declareVArithTemplate(Name, max_size=32)},
+        flags
+    )
     inst_name, inst_suffix = name.split("_", maxsplit=1)
     v0_required = True
     mask_cond = v0_required
@@ -308,6 +348,7 @@ def format VectorIntWideningFormat(code, category, *flags) {{
     set_vlenb = setVlenb();
     set_vlen = setVlen();
 
+    varith_micro_declare = declareVArithTemplate(Name + "Micro", max_size=32)
     microiop = InstObjParams(name + "_micro",
         Name + "Micro",
         'VectorArithMicroInst',
@@ -317,24 +358,29 @@ def format VectorIntWideningFormat(code, category, *flags) {{
          'set_vlenb': set_vlenb,
          'set_vlen': set_vlen,
          'vm_decl_rd': vm_decl_rd,
-         'copy_old_vd': copyOldVd(old_vd_idx)},
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         'declare_varith_template': varith_micro_declare},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorIntWideningMicroDeclare.subst(microiop) + \
+        VectorIntWideningMacroDeclare.subst(iop)
+    decoder_output = \
         VectorIntWideningMicroConstructor.subst(microiop) + \
-        VectorIntWideningMicroExecute.subst(microiop) + \
-        VectorIntWideningMacroDeclare.subst(iop) + \
         VectorIntWideningMacroConstructor.subst(iop)
-
+    exec_output = VectorIntWideningMicroExecute.subst(microiop)
     decode_block = VectorIntWideningDecodeBlock.subst(iop)
 }};
 
 def format VectorIntNarrowingFormat(code, category, *flags) {{
-    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
-                        flags)
+    iop = InstObjParams(
+        name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code,
+         'declare_varith_template': declareVArithTemplate(Name, max_size=32)},
+        flags
+    )
     mask_cond = True
     need_elem_idx = True
 
@@ -368,6 +414,7 @@ def format VectorIntNarrowingFormat(code, category, *flags) {{
     set_vlenb = setVlenb();
     set_vlen = setVlen();
 
+    varith_micro_declare = declareVArithTemplate(Name + "Micro", max_size=32)
     microiop = InstObjParams(name + "_micro",
         Name + "Micro",
         'VectorArithMicroInst',
@@ -378,18 +425,17 @@ def format VectorIntNarrowingFormat(code, category, *flags) {{
          'set_vlen': set_vlen,
          'vm_decl_rd': vm_decl_rd,
          'copy_old_vd': copyOldVd(old_vd_idx),
-         },
+         'declare_varith_template': varith_micro_declare
+        },
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorIntWideningMicroDeclare.subst(microiop) + \
+        VectorIntWideningMacroDeclare.subst(iop)
+    decoder_output = \
         VectorIntWideningMicroConstructor.subst(microiop) + \
-        VectorIntNarrowingMicroExecute.subst(microiop) + \
-        VectorIntWideningMacroDeclare.subst(iop) + \
         VectorIntWideningMacroConstructor.subst(iop)
-
+    exec_output = VectorIntNarrowingMicroExecute.subst(microiop)
     decode_block = VectorIntWideningDecodeBlock.subst(iop)
 }};
 
@@ -397,7 +443,8 @@ def format VectorIntMaskFormat(code, category, *flags) {{
     iop = InstObjParams(name,
         Name,
         'VectorArithMacroInst',
-        {'code': code},
+        {'code': code,
+         'declare_varith_template': declareVArithTemplate(Name)},
         flags)
     inst_name, inst_suffix = name.split("_", maxsplit=1)
     v0_required = not (inst_name in ["vmadc", "vmsbc"] \
@@ -448,17 +495,17 @@ def format VectorIntMaskFormat(code, category, *flags) {{
          'set_src_reg_idx': set_src_reg_idx,
          'set_vlenb': set_vlenb,
          'vm_decl_rd': vm_decl_rd,
-         'copy_old_vd': copyOldVd(old_vd_idx)},
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         'declare_varith_template': declareVArithTemplate(Name + "Micro")},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorIntMaskMicroDeclare.subst(microiop) + \
+        VectorIntMaskMacroDeclare.subst(iop)
+    decoder_output = \
         VectorIntMaskMicroConstructor.subst(microiop) + \
-        VectorIntMaskMicroExecute.subst(microiop) + \
-        VectorIntMaskMacroDeclare.subst(iop) + \
         VectorIntMaskMacroConstructor.subst(iop)
+    exec_output = VectorIntMaskMicroExecute.subst(microiop)
     decode_block = VectorIntDecodeBlock.subst(iop)
 }};
 
@@ -470,7 +517,8 @@ def format VectorGatherFormat(code, category, *flags) {{
         idx_type = "elem_type"
     iop = InstObjParams(name, Name, 'VectorArithMacroInst',
         {'idx_type': idx_type,
-         'code': code},
+         'code': code,
+         'declare_varith_template': declareGatherTemplate(Name, idx_type)},
         flags)
     old_vd_idx = 2
     dest_reg_id = "vecRegClass[_machInst.vd + vd_idx]"
@@ -502,6 +550,7 @@ def format VectorGatherFormat(code, category, *flags) {{
     set_vlenb = setVlenb();
     set_vlen = setVlen();
 
+    varith_micro_declare = declareGatherTemplate(Name + "Micro", idx_type)
     microiop = InstObjParams(name + "_micro",
         Name + "Micro",
         'VectorArithMicroInst',
@@ -512,25 +561,30 @@ def format VectorGatherFormat(code, category, *flags) {{
          'set_vlen': set_vlen,
          'vm_decl_rd': vm_decl_rd,
          'copy_old_vd': copyOldVd(old_vd_idx),
-         'idx_type': idx_type},
+         'idx_type': idx_type,
+         'declare_varith_template': varith_micro_declare},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorGatherMicroDeclare.subst(microiop) + \
+        VectorGatherMacroDeclare.subst(iop)
+    decoder_output = \
         VectorGatherMicroConstructor.subst(microiop) + \
-        VectorGatherMicroExecute.subst(microiop) + \
-        VectorGatherMacroDeclare.subst(iop) + \
         VectorGatherMacroConstructor.subst(iop)
-
+    exec_output = VectorGatherMicroExecute.subst(microiop)
     decode_block = VectorGatherDecodeBlock.subst(iop)
 
 }};
 
 def format VectorFloatFormat(code, category, *flags) {{
-    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
-                        flags)
+    iop = InstObjParams(
+        name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code,
+         'declare_varith_template': declareVArithTemplate(Name, 'float', 32)},
+        flags
+    )
     inst_name, inst_suffix = name.split("_", maxsplit=1)
     v0_required = inst_name not in ["vfmv"]
     mask_cond = v0_required and (inst_suffix not in ['vvm', 'vfm'])
@@ -569,6 +623,7 @@ def format VectorFloatFormat(code, category, *flags) {{
 
     set_vlenb = setVlenb();
 
+    varith_micro_declare = declareVArithTemplate(Name + "Micro", 'float', 32)
     microiop = InstObjParams(name + "_micro",
         Name + "Micro",
         'VectorArithMicroInst',
@@ -577,24 +632,29 @@ def format VectorFloatFormat(code, category, *flags) {{
          'set_src_reg_idx': set_src_reg_idx,
          'set_vlenb': set_vlenb,
          'vm_decl_rd': vm_decl_rd,
-         'copy_old_vd': copyOldVd(2)},
+         'copy_old_vd': copyOldVd(2),
+         'declare_varith_template': varith_micro_declare},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorFloatMicroDeclare.subst(microiop) + \
+        VectorFloatMacroDeclare.subst(iop)
+    decoder_output = \
         VectorFloatMicroConstructor.subst(microiop) + \
-        VectorFloatMicroExecute.subst(microiop) + \
-        VectorFloatMacroDeclare.subst(iop) + \
         VectorFloatMacroConstructor.subst(iop)
-
+    exec_output = VectorFloatMicroExecute.subst(microiop)
     decode_block = VectorFloatDecodeBlock.subst(iop)
 }};
 
 def format VectorFloatCvtFormat(code, category, *flags) {{
-    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
-                        flags)
+    iop = InstObjParams(
+        name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code,
+         'declare_varith_template': declareVArithTemplate(Name, 'float', 32)},
+        flags
+    )
 
     old_vd_idx = 1
     dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
@@ -616,6 +676,7 @@ def format VectorFloatCvtFormat(code, category, *flags) {{
 
     set_vlenb = setVlenb();
 
+    varith_micro_declare = declareVArithTemplate(Name + "Micro", 'float', 32)
     microiop = InstObjParams(name + "_micro",
         Name + "Micro",
         'VectorArithMicroInst',
@@ -624,24 +685,30 @@ def format VectorFloatCvtFormat(code, category, *flags) {{
          'set_src_reg_idx': set_src_reg_idx,
          'set_vlenb': set_vlenb,
          'vm_decl_rd': vm_decl_rd,
-         'copy_old_vd': copyOldVd(old_vd_idx)},
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         'declare_varith_template': varith_micro_declare},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorFloatCvtMicroDeclare.subst(microiop) + \
+        VectorFloatCvtMacroDeclare.subst(iop)
+    decoder_output = \
         VectorFloatMicroConstructor.subst(microiop) + \
-        VectorFloatMicroExecute.subst(microiop) + \
-        VectorFloatCvtMacroDeclare.subst(iop) + \
         VectorFloatMacroConstructor.subst(iop)
-
+    exec_output = VectorFloatMicroExecute.subst(microiop)
     decode_block = VectorFloatDecodeBlock.subst(iop)
 }};
 
 def format VectorFloatWideningFormat(code, category, *flags) {{
-    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
-                        flags)
+    varith_macro_declare = declareVArithTemplate(Name, 'float', 32, 32)
+    iop = InstObjParams(
+        name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code,
+         'declare_varith_template': varith_macro_declare},
+        flags
+    )
     inst_name, inst_suffix = name.split("_", maxsplit=1)
     v0_required = True
     mask_cond = v0_required
@@ -688,6 +755,8 @@ def format VectorFloatWideningFormat(code, category, *flags) {{
     set_vlenb = setVlenb();
     set_vlen = setVlen();
 
+    varith_micro_declare = declareVArithTemplate(
+        Name + "Micro", 'float', 32, 32)
     microiop = InstObjParams(name + "_micro",
         Name + "Micro",
         'VectorArithMicroInst',
@@ -697,24 +766,30 @@ def format VectorFloatWideningFormat(code, category, *flags) {{
          'set_vlenb': set_vlenb,
          'set_vlen': set_vlen,
          'vm_decl_rd': vm_decl_rd,
-         'copy_old_vd': copyOldVd(2)},
+         'copy_old_vd': copyOldVd(2),
+         'declare_varith_template': varith_micro_declare},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorIntWideningMicroDeclare.subst(microiop) + \
+        VectorIntWideningMacroDeclare.subst(iop)
+    decoder_output = \
         VectorIntWideningMicroConstructor.subst(microiop) + \
-        VectorFloatWideningMicroExecute.subst(microiop) + \
-        VectorIntWideningMacroDeclare.subst(iop) + \
         VectorIntWideningMacroConstructor.subst(iop)
-
+    exec_output = VectorFloatWideningMicroExecute.subst(microiop)
     decode_block = VectorFloatWideningDecodeBlock.subst(iop)
 }};
 
 def format VectorFloatWideningCvtFormat(code, category, *flags) {{
-    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
-                        flags)
+    varith_macro_declare = declareVArithTemplate(Name, 'float', 32, 32)
+    iop = InstObjParams(
+        name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code,
+         'declare_varith_template': varith_macro_declare},
+        flags
+    )
 
     old_vd_idx = 1
     dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
@@ -737,6 +812,8 @@ def format VectorFloatWideningCvtFormat(code, category, *flags) {{
     set_vlenb = setVlenb();
     set_vlen = setVlen();
 
+    varith_micro_declare = declareVArithTemplate(
+        Name + "Micro", 'float', 32, 32)
     microiop = InstObjParams(name + "_micro",
         Name + "Micro",
         'VectorArithMicroInst',
@@ -746,24 +823,30 @@ def format VectorFloatWideningCvtFormat(code, category, *flags) {{
          'set_vlenb': set_vlenb,
          'set_vlen': set_vlen,
          'vm_decl_rd': vm_decl_rd,
-         'copy_old_vd': copyOldVd(old_vd_idx)},
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         'declare_varith_template': varith_micro_declare},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorFloatCvtMicroDeclare.subst(microiop) + \
+        VectorFloatCvtMacroDeclare.subst(iop)
+    decoder_output = \
         VectorFloatMicroConstructor.subst(microiop) + \
-        VectorFloatWideningMicroExecute.subst(microiop) + \
-        VectorFloatCvtMacroDeclare.subst(iop) + \
         VectorIntWideningMacroConstructor.subst(iop)
-
+    exec_output = VectorFloatWideningMicroExecute.subst(microiop)
     decode_block = VectorFloatWideningDecodeBlock.subst(iop)
 }};
 
 def format VectorFloatNarrowingCvtFormat(code, category, *flags) {{
-    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
-                        flags)
+    varith_macro_declare = declareVArithTemplate(Name, 'float', 32, 32)
+    iop = InstObjParams(
+        name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code,
+         'declare_varith_template': varith_macro_declare},
+        flags
+    )
 
     old_vd_idx = 1
     dest_reg_id = "vecRegClass[_machInst.vd + _microIdx / 2]"
@@ -787,6 +870,8 @@ def format VectorFloatNarrowingCvtFormat(code, category, *flags) {{
     set_vlenb = setVlenb();
     set_vlen = setVlen();
 
+    varith_micro_declare = declareVArithTemplate(
+        Name + "Micro", 'float', 32, 32)
     microiop = InstObjParams(name + "_micro",
         Name + "Micro",
         'VectorArithMicroInst',
@@ -796,18 +881,17 @@ def format VectorFloatNarrowingCvtFormat(code, category, *flags) {{
          'set_vlenb': set_vlenb,
          'set_vlen': set_vlen,
          'vm_decl_rd': vm_decl_rd,
-         'copy_old_vd': copyOldVd(old_vd_idx)},
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         'declare_varith_template': varith_micro_declare},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorFloatCvtMicroDeclare.subst(microiop) + \
+        VectorFloatCvtMacroDeclare.subst(iop)
+    decoder_output = \
         VectorFloatMicroConstructor.subst(microiop) + \
-        VectorFloatNarrowingMicroExecute.subst(microiop) + \
-        VectorFloatCvtMacroDeclare.subst(iop) + \
         VectorIntWideningMacroConstructor.subst(iop)
-
+    exec_output = VectorFloatNarrowingMicroExecute.subst(microiop)
     decode_block = VectorFloatWideningDecodeBlock.subst(iop)
 }};
 
@@ -815,8 +899,10 @@ def format VectorFloatMaskFormat(code, category, *flags) {{
     iop = InstObjParams(name,
         Name,
         'VectorArithMacroInst',
-        {'code': code},
-        flags)
+        {'code': code,
+         'declare_varith_template': declareVArithTemplate(Name, 'float', 32)},
+        flags
+    )
     dest_reg_id = "vecRegClass[VecMemInternalReg0 + _microIdx]"
     src1_reg_id = ""
     if category == "OPFVV":
@@ -841,6 +927,7 @@ def format VectorFloatMaskFormat(code, category, *flags) {{
     code = loopWrapper(code)
     code = fflags_wrapper(code)
 
+    varith_micro_declare = declareVArithTemplate(Name + "Micro", 'float', 32)
     microiop = InstObjParams(name + "_micro",
         Name + "Micro",
         'VectorArithMicroInst',
@@ -849,17 +936,17 @@ def format VectorFloatMaskFormat(code, category, *flags) {{
          'set_src_reg_idx': set_src_reg_idx,
          'set_vlenb': set_vlenb,
          'vm_decl_rd': vm_decl_rd,
-         'copy_old_vd': copyOldVd(2)},
+         'copy_old_vd': copyOldVd(2),
+         'declare_varith_template': varith_micro_declare},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorFloatMaskMicroDeclare.subst(microiop) + \
+        VectorFloatMaskMacroDeclare.subst(iop)
+    decoder_output = \
         VectorFloatMaskMicroConstructor.subst(microiop) + \
-        VectorFloatMaskMicroExecute.subst(microiop) + \
-        VectorFloatMaskMacroDeclare.subst(iop) + \
         VectorFloatMaskMacroConstructor.subst(iop)
+    exec_output = VectorFloatMaskMicroExecute.subst(microiop)
     decode_block = VectorFloatDecodeBlock.subst(iop)
 }};
 
@@ -884,8 +971,14 @@ def format VMvWholeFormat(code, category, *flags) {{
 }};
 
 def format ViotaFormat(code, category, *flags){{
-    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
-                        flags)
+    iop = InstObjParams(
+        name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code,
+         'declare_varith_template': declareVArithTemplate(Name)},
+        flags
+    )
 
     inst_name, inst_suffix = name.split("_", maxsplit=1)
     dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
@@ -912,17 +1005,17 @@ def format ViotaFormat(code, category, *flags){{
          'set_vlenb': set_vlenb,
          'vm_decl_rd': vm_decl_rd,
          'set_vm_idx': set_vm_idx,
-         'copy_old_vd': copyOldVd(1)},
+         'copy_old_vd': copyOldVd(1),
+         'declare_varith_template': declareVArithTemplate(Name + "Micro")},
         flags)
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
+
     header_output = \
         ViotaMicroDeclare.subst(microiop) + \
+        ViotaMacroDeclare.subst(iop)
+    decoder_output = \
         ViotaMicroConstructor.subst(microiop) + \
-        ViotaMicroExecute.subst(microiop)+\
-        ViotaMacroDeclare.subst(iop) + \
         ViotaMacroConstructor.subst(iop)
-
+    exec_output = ViotaMicroExecute.subst(microiop)
     decode_block = VectorIntDecodeBlock.subst(iop)
 
 }};
@@ -951,15 +1044,14 @@ def format Vector1Vs1VdMaskFormat(code, category, *flags){{
          'set_vlenb': set_vlenb,
          'vm_decl_rd': vm_decl_rd,
          'set_vm_idx': set_vm_idx,
-         'copy_old_vd': copyOldVd(1)},
+         'copy_old_vd': copyOldVd(1),
+         'declare_varith_template': declareVArithTemplate(Name, 'uint', 8, 8),
+         },
         flags)
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
-    header_output = \
-        Vector1Vs1RdMaskDeclare.subst(iop) + \
-        Vector1Vs1VdMaskConstructor.subst(iop) + \
-        Vector1Vs1VdMaskExecute.subst(iop)
 
+    header_output = Vector1Vs1RdMaskDeclare.subst(iop)
+    decoder_output = Vector1Vs1VdMaskConstructor.subst(iop)
+    exec_output = Vector1Vs1VdMaskExecute.subst(iop)
     decode_block = VectorMaskDecodeBlock.subst(iop)
 }};
 
@@ -972,15 +1064,14 @@ def format Vector1Vs1RdMaskFormat(code, category, *flags){{
         'VectorNonSplitInst',
         {'code': code,
          'vm_decl_rd': vm_decl_rd,
-         'set_vm_idx': set_vm_idx},
+         'set_vm_idx': set_vm_idx,
+         'declare_varith_template': declareVArithTemplate(Name, 'uint', 8, 8)
+        },
         flags)
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
-    header_output = \
-        Vector1Vs1RdMaskDeclare.subst(iop) + \
-        Vector1Vs1RdMaskConstructor.subst(iop) + \
-        Vector1Vs1RdMaskExecute.subst(iop)
 
+    header_output = Vector1Vs1RdMaskDeclare.subst(iop)
+    decoder_output = Vector1Vs1RdMaskConstructor.subst(iop)
+    exec_output = Vector1Vs1RdMaskExecute.subst(iop)
     decode_block = VectorMaskDecodeBlock.subst(iop)
 }};
 
@@ -993,31 +1084,36 @@ def format VectorNonSplitFormat(code, category, *flags) {{
     if inst_name == "vfmv" :
         code = fflags_wrapper(code)
 
-    iop = InstObjParams(name,
-        Name,
-        'VectorNonSplitInst',
-        {'code': code,
-         'vm_decl_rd': vm_decl_rd,
-         'set_vm_idx': set_vm_idx},
-        flags)
-
-
     if inst_name == "vfmv" :
-        execute_block = VectorFloatNonSplitExecute.subst(iop)
+        varith_template = declareVArithTemplate(Name, 'float', 32)
+        iop = InstObjParams(name,
+            Name,
+            'VectorNonSplitInst',
+            {'code': code,
+             'vm_decl_rd': vm_decl_rd,
+             'set_vm_idx': set_vm_idx,
+             'declare_varith_template': varith_template},
+            flags)
+        header_output = VectorNonSplitDeclare.subst(iop)
+        decoder_output = VectorNonSplitConstructor.subst(iop)
+        exec_output = VectorFloatNonSplitExecute.subst(iop)
         decode_block = VectorFloatNonSplitDecodeBlock.subst(iop)
     elif inst_name == "vmv" :
-        execute_block = VectorIntNonSplitExecute.subst(iop)
+        iop = InstObjParams(name,
+            Name,
+            'VectorNonSplitInst',
+            {'code': code,
+             'vm_decl_rd': vm_decl_rd,
+             'set_vm_idx': set_vm_idx,
+             'declare_varith_template': declareVArithTemplate(Name)},
+            flags)
+        header_output = VectorNonSplitDeclare.subst(iop)
+        decoder_output = VectorNonSplitConstructor.subst(iop)
+        exec_output = VectorIntNonSplitExecute.subst(iop)
         decode_block = VectorIntNonSplitDecodeBlock.subst(iop)
     else :
         error("Unsupported inst for VectorNonSplitFormat: %s" % inst_name)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
-    header_output = \
-        VectorNonSplitDeclare.subst(iop) + \
-        VectorNonSplitConstructor.subst(iop) + \
-        execute_block
-
 }};
 
 def format VectorMaskFormat(code, category, *flags) {{
@@ -1053,21 +1149,26 @@ def format VectorMaskFormat(code, category, *flags) {{
          'set_dest_reg_idx': set_dest_reg_idx,
          'set_src_reg_idx': set_src_reg_idx,
          'set_vlenb': set_vlenb,
-         'copy_old_vd': copyOldVd(old_vd_idx)},
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         'declare_varith_template': declareVArithTemplate(Name, 'uint', 8, 8)
+        },
         flags)
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
-    header_output = \
-        VectorMaskDeclare.subst(iop) + \
-        VectorMaskConstructor.subst(iop) + \
-        VectorMaskExecute.subst(iop)
 
+    header_output = VectorMaskDeclare.subst(iop)
+    decoder_output = VectorMaskConstructor.subst(iop)
+    exec_output = VectorMaskExecute.subst(iop)
     decode_block = VectorMaskDecodeBlock.subst(iop)
 }};
 
 def format VectorReduceIntFormat(code, category, *flags) {{
-    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
-                        flags)
+    iop = InstObjParams(
+        name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code,
+         'declare_varith_template': declareVArithTemplate(Name)},
+        flags
+    )
     inst_name, inst_suffix = name.split("_", maxsplit=1)
     dest_reg_id = "vecRegClass[_machInst.vd]"
     src1_reg_id = "vecRegClass[_machInst.vs1]"
@@ -1098,23 +1199,29 @@ def format VectorReduceIntFormat(code, category, *flags) {{
          'set_vlen' : set_vlen,
          'vm_decl_rd': vm_decl_rd,
          'type_def': type_def,
-         'copy_old_vd': copyOldVd(2)},
+         'copy_old_vd': copyOldVd(2),
+         'declare_varith_template': declareVArithTemplate(Name + "Micro")},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorReduceMicroDeclare.subst(microiop) + \
+        VectorReduceMacroDeclare.subst(iop)
+    decoder_output = \
         VectorReduceMicroConstructor.subst(microiop) + \
-        VectorReduceIntMicroExecute.subst(microiop) + \
-        VectorReduceMacroDeclare.subst(iop) + \
         VectorReduceMacroConstructor.subst(iop)
+    exec_output = VectorReduceIntMicroExecute.subst(microiop)
     decode_block = VectorIntDecodeBlock.subst(iop)
 }};
 
 def format VectorReduceFloatFormat(code, category, *flags) {{
-    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
-                        flags)
+    iop = InstObjParams(
+        name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code,
+         'declare_varith_template': declareVArithTemplate(Name, 'float', 32)},
+        flags
+    )
     inst_name, inst_suffix = name.split("_", maxsplit=1)
     dest_reg_id = "vecRegClass[_machInst.vd]"
     src1_reg_id = "vecRegClass[_machInst.vs1]"
@@ -1138,6 +1245,7 @@ def format VectorReduceFloatFormat(code, category, *flags) {{
 
     code = fflags_wrapper(code)
 
+    varith_micro_declare = declareVArithTemplate(Name + "Micro", 'float', 32)
     microiop = InstObjParams(name + "_micro",
         Name + "Micro",
         'VectorArithMicroInst',
@@ -1148,23 +1256,30 @@ def format VectorReduceFloatFormat(code, category, *flags) {{
          'set_vlen': set_vlen,
          'vm_decl_rd': vm_decl_rd,
          'type_def': type_def,
-         'copy_old_vd': copyOldVd(2)},
+         'copy_old_vd': copyOldVd(2),
+         'declare_varith_template': varith_micro_declare},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorReduceMicroDeclare.subst(microiop) + \
+        VectorReduceMacroDeclare.subst(iop)
+    decoder_output = \
         VectorReduceMicroConstructor.subst(microiop) + \
-        VectorReduceFloatMicroExecute.subst(microiop) + \
-        VectorReduceMacroDeclare.subst(iop) + \
         VectorReduceMacroConstructor.subst(iop)
+    exec_output = VectorReduceFloatMicroExecute.subst(microiop)
     decode_block = VectorFloatDecodeBlock.subst(iop)
 }};
 
 def format VectorReduceFloatWideningFormat(code, category, *flags) {{
-    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
-                        flags)
+    varith_macro_declare = declareVArithTemplate(Name, 'float', 32, 32)
+    iop = InstObjParams(
+        name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code,
+         'declare_varith_template': varith_macro_declare},
+        flags
+    )
     inst_name, inst_suffix = name.split("_", maxsplit=1)
     dest_reg_id = "vecRegClass[_machInst.vd]"
     src1_reg_id = "vecRegClass[_machInst.vs1]"
@@ -1186,6 +1301,9 @@ def format VectorReduceFloatWideningFormat(code, category, *flags) {{
         using ewt = typename double_width<et>::type;
         using vwu = decltype(ewt::v);
     '''
+
+    varith_micro_declare = declareVArithTemplate(
+        Name + "Micro", 'float', 32, 32)
     microiop = InstObjParams(name + "_micro",
         Name + "Micro",
         'VectorArithMicroInst',
@@ -1196,23 +1314,29 @@ def format VectorReduceFloatWideningFormat(code, category, *flags) {{
          'set_vlen': set_vlen,
          'vm_decl_rd': vm_decl_rd,
          'type_def': type_def,
-         'copy_old_vd': copyOldVd(2)},
+         'copy_old_vd': copyOldVd(2),
+         'declare_varith_template': varith_micro_declare},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorReduceMicroDeclare.subst(microiop) + \
+        VectorReduceMacroDeclare.subst(iop)
+    decoder_output = \
         VectorReduceMicroConstructor.subst(microiop) + \
-        VectorReduceFloatWideningMicroExecute.subst(microiop) + \
-        VectorReduceMacroDeclare.subst(iop) + \
         VectorReduceMacroConstructor.subst(iop)
+    exec_output = VectorReduceFloatWideningMicroExecute.subst(microiop)
     decode_block = VectorFloatWideningDecodeBlock.subst(iop)
 }};
 
 def format VectorIntVxsatFormat(code, category, *flags) {{
-    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
-                        flags)
+    iop = InstObjParams(
+        name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code,
+         'declare_varith_template': declareVArithTemplate(Name)},
+        flags
+    )
     inst_name, inst_suffix = name.split("_", maxsplit=1)
     old_vd_idx = 2
     dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
@@ -1251,24 +1375,29 @@ def format VectorIntVxsatFormat(code, category, *flags) {{
          'set_src_reg_idx': set_src_reg_idx,
          'set_vlenb': set_vlenb,
          'vm_decl_rd': vm_decl_rd,
-         'copy_old_vd': copyOldVd(old_vd_idx)},
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         'declare_varith_template': declareVArithTemplate(Name + "Micro")},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorIntVxsatMicroDeclare.subst(microiop) + \
+        VectorIntVxsatMacroDeclare.subst(iop)
+    decoder_output = \
         VectorIntVxsatMicroConstructor.subst(microiop) + \
-        VectorIntMicroExecute.subst(microiop) + \
-        VectorIntVxsatMacroDeclare.subst(iop) + \
         VectorIntVxsatMacroConstructor.subst(iop)
-
+    exec_output = VectorIntMicroExecute.subst(microiop)
     decode_block = VectorIntDecodeBlock.subst(iop)
 }};
 
 def format VectorReduceIntWideningFormat(code, category, *flags) {{
-    iop = InstObjParams(name, Name, 'VectorArithMacroInst', {'code': code},
-                        flags)
+    iop = InstObjParams(
+        name,
+        Name,
+        'VectorArithMacroInst',
+        {'code': code,
+         'declare_varith_template': declareVArithTemplate(Name, max_size=32)},
+        flags
+    )
     inst_name, inst_suffix = name.split("_", maxsplit=1)
     dest_reg_id = "vecRegClass[_machInst.vd]"
     src1_reg_id = "vecRegClass[_machInst.vs1]"
@@ -1284,6 +1413,8 @@ def format VectorReduceIntWideningFormat(code, category, *flags) {{
     vm_decl_rd = vmDeclAndReadData()
     set_vlenb = setVlenb()
     set_vlen = setVlen()
+
+    varith_micro_declare = declareVArithTemplate(Name + "Micro", max_size=32)
     microiop = InstObjParams(name + "_micro",
         Name + "Micro",
         'VectorArithMicroInst',
@@ -1293,17 +1424,17 @@ def format VectorReduceIntWideningFormat(code, category, *flags) {{
          'set_vlenb': set_vlenb,
          'set_vlen': set_vlen,
          'vm_decl_rd': vm_decl_rd,
-         'copy_old_vd': copyOldVd(2)},
+         'copy_old_vd': copyOldVd(2),
+         'declare_varith_template': varith_micro_declare},
         flags)
 
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
     header_output = \
         VectorReduceMicroDeclare.subst(microiop) + \
+        VectorReduceMacroDeclare.subst(iop)
+    decoder_output = \
         VectorReduceMicroConstructor.subst(microiop) + \
-        VectorReduceIntWideningMicroExecute.subst(microiop) + \
-        VectorReduceMacroDeclare.subst(iop) + \
         VectorReduceMacroConstructor.subst(iop)
+    exec_output = VectorReduceIntWideningMicroExecute.subst(microiop)
     decode_block = VectorIntWideningDecodeBlock.subst(iop)
 }};
 
@@ -1315,8 +1446,20 @@ def VectorSlideBase(name, Name, category, code, flags, macro_construtor,
     microop_class_name = 'VectorSlideMicroInst'
     # Make sure flags are in lists (convert to lists if not).
     flags = makeList(flags)
-    iop = InstObjParams(name, Name, macroop_class_name, {'code': code},
-                        flags)
+
+    # Pick the declare_varith_template text that matches the decode template.
+    if decode_template is VectorIntDecodeBlock:
+        varith_macro_declare = declareVArithTemplate(Name)
+    elif decode_template is VectorFloatDecodeBlock:
+        varith_macro_declare = declareVArithTemplate(Name, 'float', 32)
+    else:
+        # Otherwise varith_macro_declare would be unbound when used below.
+        raise ValueError("VectorSlideBase: unsupported decode_template")
+
+    iop = InstObjParams(name, Name, macroop_class_name,
+        {'code': code,
+         'declare_varith_template': varith_macro_declare},
+        flags)
     inst_name, inst_suffix = name.split("_", maxsplit=1)
     dest_reg_id = "vecRegClass[_machInst.vd + vdIdx]"
     src2_reg_id = "vecRegClass[_machInst.vs2 + vs2Idx]"
@@ -1345,6 +1488,13 @@ def VectorSlideBase(name, Name, category, code, flags, macro_construtor,
     set_src_reg_idx += setSrcVm()
     set_vlenb = setVlenb()
     set_vlen = setVlen()
+
+    if decode_template is VectorIntDecodeBlock:
+        varith_micro_declare = declareVArithTemplate(Name + "Micro")
+    elif decode_template is VectorFloatDecodeBlock:
+        varith_micro_declare = declareVArithTemplate(
+            Name + "Micro", 'float', 32)
+
     microiop = InstObjParams(name + "_micro",
         Name + "Micro",
         microop_class_name,
@@ -1354,52 +1504,54 @@ def VectorSlideBase(name, Name, category, code, flags, macro_construtor,
          'set_vlenb': set_vlenb,
          'set_vlen': set_vlen,
          'vm_decl_rd': vm_decl_rd,
-         'copy_old_vd': copyOldVd(old_vd_idx)},
+         'copy_old_vd': copyOldVd(old_vd_idx),
+         'declare_varith_template': varith_micro_declare},
         flags)
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
-    # Because of the use of templates, we had to put all parts in header to
-    # keep the compiler happy.
+
     header_output = \
         VectorSlideMicroDeclare.subst(microiop) + \
+        VectorSlideMacroDeclare.subst(iop)
+    decoder_output = \
         VectorSlideMicroConstructor.subst(microiop) + \
-        micro_execute_template.subst(microiop) + \
-        VectorSlideMacroDeclare.subst(iop) + \
         macro_construtor.subst(iop)
-
+    exec_output = micro_execute_template.subst(microiop)
     decode_block = decode_template.subst(iop)
-    return (header_output, decode_block)
+    return (header_output, decoder_output, decode_block, exec_output)
 
 }};
 
 def format VectorSlideUpFormat(code, category, *flags) {{
-    (header_output, decode_block) = VectorSlideBase(name, Name, category, code,
-        flags,
-        macro_construtor = VectorSlideUpMacroConstructor,
-        decode_template = VectorIntDecodeBlock,
-        micro_execute_template = VectorSlideMicroExecute)
+    # Unpack the four output sections returned by VectorSlideBase.
+    (header_output, decoder_output, decode_block, exec_output) = \
+        VectorSlideBase(name, Name, category, code, flags,
+            micro_execute_template = VectorSlideMicroExecute,
+            decode_template = VectorIntDecodeBlock,
+            macro_construtor = VectorSlideUpMacroConstructor)
 }};
 
 def format VectorSlideDownFormat(code, category, *flags) {{
-    (header_output, decode_block) = VectorSlideBase(name, Name, category, code,
-        flags,
-        macro_construtor = VectorSlideDownMacroConstructor,
-        decode_template = VectorIntDecodeBlock,
-        micro_execute_template = VectorSlideMicroExecute)
+    # Same shared builder, with the slide-down macro constructor.
+    (header_output, decoder_output, decode_block, exec_output) = \
+        VectorSlideBase(name, Name, category, code, flags,
+            micro_execute_template = VectorSlideMicroExecute,
+            decode_template = VectorIntDecodeBlock,
+            macro_construtor = VectorSlideDownMacroConstructor)
 }};
 
 def format VectorFloatSlideUpFormat(code, category, *flags) {{
-    (header_output, decode_block) = VectorSlideBase(name, Name, category, code,
-        flags,
-        macro_construtor = VectorSlideUpMacroConstructor,
-        decode_template = VectorFloatDecodeBlock,
-        micro_execute_template = VectorFloatSlideMicroExecute)
+    # Float variant: float decode block and float slide execute template.
+    (header_output, decoder_output, decode_block, exec_output) = \
+        VectorSlideBase(name, Name, category, code, flags,
+            micro_execute_template = VectorFloatSlideMicroExecute,
+            decode_template = VectorFloatDecodeBlock,
+            macro_construtor = VectorSlideUpMacroConstructor)
 }};
 
 def format VectorFloatSlideDownFormat(code, category, *flags) {{
-    (header_output, decode_block) = VectorSlideBase(name, Name, category, code,
-        flags,
-        macro_construtor = VectorSlideDownMacroConstructor,
-        decode_template = VectorFloatDecodeBlock,
-        micro_execute_template = VectorFloatSlideMicroExecute)
+    # Float variant using the slide-down macro constructor.
+    (header_output, decoder_output, decode_block, exec_output) = \
+        VectorSlideBase(name, Name, category, code, flags,
+            micro_execute_template = VectorFloatSlideMicroExecute,
+            decode_template = VectorFloatDecodeBlock,
+            macro_construtor = VectorSlideDownMacroConstructor)
 }};
diff --git a/src/arch/riscv/isa/templates/vector_arith.isa b/src/arch/riscv/isa/templates/vector_arith.isa
index 306b1c53f1..3a528f1198 100644
--- a/src/arch/riscv/isa/templates/vector_arith.isa
+++ b/src/arch/riscv/isa/templates/vector_arith.isa
@@ -107,6 +107,9 @@ template<typename ElemType>
     this->microops.front()->setFirstMicroop();
     this->microops.back()->setLastMicroop();
 }
+
+%(declare_varith_template)s;
+
 }};
 
 def template VectorIntMicroDeclare {{
@@ -145,6 +148,8 @@ template<typename ElemType>
     %(set_src_reg_idx)s;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorIntMicroExecute {{
@@ -182,6 +187,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorIntExtMacroDeclare {{
@@ -193,14 +200,7 @@ private:
 public:
     %(class_name)s(ExtMachInst _machInst, uint32_t _vlen);
     std::string generateDisassembly(Addr pc,
-        const loader::SymbolTable *symtab) const override
-    {
-        std::stringstream ss;
-        ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
-            << registerName(srcRegIdx(0));
-        if (machInst.vm == 0) ss << ", v0.t";
-        return ss.str();
-    }
+        const loader::SymbolTable *symtab) const override;
 };
 
 }};
@@ -219,14 +219,7 @@ public:
                    uint8_t _microIdx);
     Fault execute(ExecContext* xc, trace::InstRecord* traceData)const override;
     std::string generateDisassembly(Addr pc,
-        const loader::SymbolTable *symtab) const override
-    {
-        std::stringstream ss;
-        ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
-            << registerName(srcRegIdx(0));
-        if (machInst.vm == 0) ss << ", v0.t";
-        return ss.str();
-    }
+        const loader::SymbolTable *symtab) const override;
 };
 
 }};
@@ -303,6 +296,38 @@ Fault
     return NoFault;
 }
 
+// Prints "<mnemonic> <dest>, <src>", then ", v0.t" if machInst.vm == 0.
+template <typename ElemType>
+std::string
+%(class_name)s<ElemType>::generateDisassembly(Addr pc,
+    const loader::SymbolTable *symtab) const
+{
+    std::stringstream text;
+    text << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+         << registerName(srcRegIdx(0)) << (machInst.vm == 0 ? ", v0.t" : "");
+    return text.str();
+}
+
+%(declare_varith_template)s;
+
+}};
+
+def template VectorIntExtMacroExecute {{
+
+// Prints "<mnemonic> <dest>, <src>", then ", v0.t" if machInst.vm == 0.
+template <typename ElemType>
+std::string
+%(class_name)s<ElemType>::generateDisassembly(Addr pc,
+    const loader::SymbolTable *symtab) const
+{
+    std::stringstream text;
+    text << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "
+         << registerName(srcRegIdx(0)) << (machInst.vm == 0 ? ", v0.t" : "");
+    return text.str();
+}
+
+%(declare_varith_template)s;
+
 }};
 
 def template VectorIntDecodeBlock {{
@@ -365,6 +390,8 @@ template<typename ElemType>
     this->microops.back()->setLastMicroop();
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorIntWideningMicroDeclare {{
@@ -402,6 +429,8 @@ template<typename ElemType>
     %(set_src_reg_idx)s;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorIntWideningMicroExecute {{
@@ -447,6 +476,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorIntNarrowingMicroExecute {{
@@ -493,6 +524,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorIntWideningDecodeBlock {{
@@ -546,6 +579,9 @@ template<typename ElemType>
     this->microops.front()->setFirstMicroop();
     this->microops.back()->setLastMicroop();
 }
+
+%(declare_varith_template)s;
+
 }};
 
 def template VectorFloatMicroDeclare {{
@@ -582,6 +618,8 @@ template<typename ElemType>
     %(set_src_reg_idx)s;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorFloatMicroExecute {{
@@ -620,6 +658,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorFloatDecodeBlock {{
@@ -725,6 +765,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorFloatNarrowingMicroExecute {{
@@ -772,6 +814,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorFloatWideningDecodeBlock {{
@@ -826,6 +870,8 @@ template<typename ElemType>
     this->microops.back()->setLastMicroop();
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template ViotaMicroDeclare {{
@@ -865,6 +911,8 @@ template<typename ElemType>
     setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2]);
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template ViotaMicroExecute {{
@@ -899,6 +947,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 
@@ -915,6 +965,8 @@ template<typename ElemType>
     %(set_vm_idx)s;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template Vector1Vs1VdMaskExecute {{
@@ -948,6 +1000,8 @@ Fault
     return NoFault;
 };
 
+%(declare_varith_template)s;
+
 }};
 
 def template Vector1Vs1RdMaskDeclare {{
@@ -978,6 +1032,8 @@ template<typename ElemType>
     %(set_vm_idx)s;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template Vector1Vs1RdMaskExecute {{
@@ -1010,6 +1066,8 @@ Fault
     return NoFault;
 };
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorIntMaskMacroDeclare {{
@@ -1057,6 +1115,8 @@ template<typename ElemType>
     this->microops.back()->setLastMicroop();
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorIntMaskMicroDeclare {{
@@ -1095,6 +1155,8 @@ template<typename ElemType>
     %(set_src_reg_idx)s;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorIntMaskMicroExecute {{
@@ -1133,6 +1195,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorFloatMaskMacroDeclare {{
@@ -1180,6 +1244,8 @@ template<typename ElemType>
     this->microops.back()->setLastMicroop();
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorFloatMaskMicroDeclare {{
@@ -1217,6 +1283,8 @@ template<typename ElemType>
     %(set_src_reg_idx)s;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorFloatMaskMicroExecute {{
@@ -1255,6 +1323,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VMvWholeMacroDeclare {{
@@ -1381,6 +1451,8 @@ template<typename ElemType>
     %(set_src_reg_idx)s;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorMaskExecute {{
@@ -1415,6 +1487,8 @@ Fault
     return NoFault;
 };
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorMaskDecodeBlock {{
@@ -1449,6 +1523,8 @@ template<typename ElemType>
     %(set_vm_idx)s;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorIntNonSplitExecute {{
@@ -1481,6 +1557,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorFloatNonSplitExecute {{
@@ -1513,6 +1591,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorFloatNonSplitDecodeBlock {{
@@ -1578,6 +1658,8 @@ template<typename ElemType>
     this->microops.back()->setLastMicroop();
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorReduceMicroDeclare {{
@@ -1615,6 +1697,8 @@ template<typename ElemType>
     %(set_src_reg_idx)s;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorReduceIntMicroExecute {{
@@ -1664,6 +1748,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorReduceFloatMicroExecute {{
@@ -1715,6 +1801,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorReduceFloatWideningMicroExecute {{
@@ -1765,6 +1853,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorGatherMacroDeclare {{
@@ -1824,6 +1914,8 @@ template<typename ElemType, typename IndexType>
     this->microops.back()->setLastMicroop();
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorGatherMicroDeclare {{
@@ -1873,6 +1965,8 @@ template<typename ElemType, typename IndexType>
     %(set_src_reg_idx)s;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorGatherMicroExecute {{
@@ -1930,6 +2024,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorGatherDecodeBlock {{
@@ -2003,6 +2099,9 @@ template<typename ElemType>
     this->microops.front()->setFirstMicroop();
     this->microops.back()->setLastMicroop();
 }
+
+%(declare_varith_template)s;
+
 }};
 
 def template VectorIntVxsatMicroDeclare {{
@@ -2041,6 +2140,8 @@ template<typename ElemType>
     %(set_src_reg_idx)s;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorReduceIntWideningMicroExecute {{
@@ -2094,6 +2195,8 @@ Fault
     return NoFault;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorSlideMacroDeclare {{
@@ -2142,6 +2245,8 @@ template<typename ElemType>
     this->microops.back()->setLastMicroop();
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorSlideDownMacroConstructor {{
@@ -2177,6 +2282,8 @@ template<typename ElemType>
     this->microops.back()->setLastMicroop();
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorSlideMicroDeclare {{
@@ -2215,6 +2322,8 @@ template<typename ElemType>
     %(set_src_reg_idx)s;
 }
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorSlideMicroExecute {{
@@ -2254,6 +2363,8 @@ Fault
     return NoFault;
 };
 
+%(declare_varith_template)s;
+
 }};
 
 def template VectorFloatSlideMicroExecute {{
@@ -2293,4 +2404,6 @@ Fault
     return NoFault;
 };
 
+%(declare_varith_template)s;
+
 }};

From e561f3b6f149f45f291c8ad7b310f157bfcba154 Mon Sep 17 00:00:00 2001
From: Roger Chang <rogerycchang@google.com>
Date: Tue, 24 Oct 2023 15:29:40 +0800
Subject: [PATCH 3/3] arch-riscv: Move insts/vector from header to source

Move the implementation of the following classes
- VMaskMergeMicroInst
- VxsatMicroInst

Change-Id: I42ec45681064a0f599c3b2313c2125da7cfc849b
---
 src/arch/riscv/insts/vector.cc                | 93 ++++++++++++++++++
 src/arch/riscv/insts/vector.hh                | 94 ++-----------------
 src/arch/riscv/isa/templates/vector_arith.isa |  8 +-
 3 files changed, 106 insertions(+), 89 deletions(-)

diff --git a/src/arch/riscv/insts/vector.cc b/src/arch/riscv/insts/vector.cc
index c99e806e9b..7f17bb055e 100644
--- a/src/arch/riscv/insts/vector.cc
+++ b/src/arch/riscv/insts/vector.cc
@@ -32,6 +32,9 @@
 #include <string>
 
 #include "arch/riscv/insts/static_inst.hh"
+#include "arch/riscv/isa.hh"
+#include "arch/riscv/regs/misc.hh"
+#include "arch/riscv/regs/vector.hh"
 #include "arch/riscv/utility.hh"
 #include "cpu/static_inst.hh"
 
@@ -408,5 +411,95 @@ VMvWholeMicroInst::generateDisassembly(Addr pc,
     return ss.str();
 }
 
+// Sets up the "vmask_mv_micro" micro-op: vector register _dstReg is the only
+// destination; the _numSrcs sources start at VecMemInternalReg0.
+VMaskMergeMicroInst::VMaskMergeMicroInst(ExtMachInst extMachInst,
+    uint8_t _dstReg, uint8_t _numSrcs, uint32_t _vlen, size_t _elemSize)
+    : VectorArithMicroInst("vmask_mv_micro", extMachInst,
+                            VectorIntegerArithOp, 0, 0),
+      vlen(_vlen), elemSize(_elemSize)
+{
+    using Self = std::remove_pointer_t<decltype(this)>;
+    setRegIdxArrays(
+        reinterpret_cast<RegIdArrayPtr>(&Self::srcRegIdxArr),
+        reinterpret_cast<RegIdArrayPtr>(&Self::destRegIdxArr));
+
+    _numSrcRegs = 0;
+    _numDestRegs = 0;
+
+    setDestRegIdx(_numDestRegs++, vecRegClass[_dstReg]);
+    _numTypedDestRegs[VecRegClass]++;
+    for (uint8_t src = 0; src < _numSrcs; ++src) {
+        setSrcRegIdx(_numSrcRegs++, vecRegClass[VecMemInternalReg0 + src]);
+    }
+}
+
+// Source i contributes elems_per_vreg bits of Vd at bit i * elems_per_vreg.
+Fault
+VMaskMergeMicroInst::execute(ExecContext* xc,
+    trace::InstRecord* traceData) const
+{
+    vreg_t& tmp_d0 = *(vreg_t *)xc->getWritableRegOperand(this, 0);
+    PCStateBase *pc_ptr = xc->tcBase()->pcState().clone();
+    auto Vd = tmp_d0.as<uint8_t>();
+    uint32_t vlenb = pc_ptr->as<PCState>().vlenb();
+    delete pc_ptr;  // caller owns the clone; free it once vlenb is read
+    const uint32_t elems_per_vreg = vlenb / elemSize;
+    size_t bit_cnt = elems_per_vreg;
+    vreg_t tmp_s;
+    xc->getRegOperand(this, 0, &tmp_s);
+    auto s = tmp_s.as<uint8_t>();
+    // Start from a full copy of source 0 (first slice plus remaining bytes).
+    memcpy(Vd, s, vlenb);
+    for (uint8_t i = 1; i < this->_numSrcRegs; i++) {
+        xc->getRegOperand(this, i, &tmp_s);
+        s = tmp_s.as<uint8_t>();
+        if (elems_per_vreg < 8) {
+            const uint32_t m = (1 << elems_per_vreg) - 1;
+            const uint32_t mask = m << (i * elems_per_vreg % 8);
+            // Sub-byte slice: replace only the masked bits of the shared byte.
+            Vd[bit_cnt/8] = (Vd[bit_cnt/8] & ~mask) | (s[bit_cnt/8] & mask);
+            bit_cnt += elems_per_vreg;
+        } else {
+            const uint32_t byte_offset = elems_per_vreg / 8;
+            memcpy(Vd + i * byte_offset, s + i * byte_offset, byte_offset);
+        }
+    }
+    if (traceData) traceData->setData(vecRegClass, &tmp_d0);
+    return NoFault;
+}
+
+// Lists the destination, every source, then "offset:" (vlen / 8) / elemSize.
+std::string
+VMaskMergeMicroInst::generateDisassembly(Addr pc,
+    const loader::SymbolTable *symtab) const
+{
+    std::stringstream out;
+    out << mnemonic << ' ' << registerName(destRegIdx(0));
+    for (uint8_t src = 0; src < _numSrcRegs; ++src)
+        out << ", " << registerName(srcRegIdx(src));
+    const unsigned vlenb = vlen >> 3;
+    out << ", offset:" << vlenb / elemSize;
+    return out.str();
+}
+
+Fault
+VxsatMicroInst::execute(ExecContext* xc, trace::InstRecord* traceData) const
+{
+    xc->setMiscReg(MISCREG_VXSAT, *vxsat);  // publish the flag in VXSAT
+    const auto prev_vcsr = xc->readMiscReg(MISCREG_VCSR);
+    xc->setMiscReg(MISCREG_VCSR, (prev_vcsr & ~1) | *vxsat);  // bit 0 only
+    return NoFault;
+}
+
+// Emits "<mnemonic> VXSAT, " followed by "0x1" if *vxsat is set, else "0x0".
+std::string
+VxsatMicroInst::generateDisassembly(Addr, const loader::SymbolTable *) const
+{
+    std::stringstream out;
+    out << mnemonic << ' ' << "VXSAT" << ", " << (*vxsat ? "0x1" : "0x0");
+    return out.str();
+}
+
 } // namespace RiscvISA
 } // namespace gem5
diff --git a/src/arch/riscv/insts/vector.hh b/src/arch/riscv/insts/vector.hh
index c986c99c72..4127060e4a 100644
--- a/src/arch/riscv/insts/vector.hh
+++ b/src/arch/riscv/insts/vector.hh
@@ -34,7 +34,6 @@
 #include "arch/riscv/insts/static_inst.hh"
 #include "arch/riscv/isa.hh"
 #include "arch/riscv/regs/misc.hh"
-#include "arch/riscv/regs/vector.hh"
 #include "arch/riscv/utility.hh"
 #include "cpu/exec_context.hh"
 #include "cpu/static_inst.hh"
@@ -539,7 +538,7 @@ class VMvWholeMicroInst : public VectorArithMicroInst
             Addr pc, const loader::SymbolTable *symtab) const override;
 };
 
-template<typename ElemType>
+
 class VMaskMergeMicroInst : public VectorArithMicroInst
 {
   private:
@@ -548,75 +547,12 @@ class VMaskMergeMicroInst : public VectorArithMicroInst
 
   public:
     uint32_t vlen;
+    size_t elemSize;
     VMaskMergeMicroInst(ExtMachInst extMachInst,
-        uint8_t _dstReg, uint8_t _numSrcs, uint32_t _vlen)
-        : VectorArithMicroInst("vmask_mv_micro", extMachInst,
-                               VectorIntegerArithOp, 0, 0),
-          vlen(_vlen)
-    {
-        setRegIdxArrays(
-            reinterpret_cast<RegIdArrayPtr>(
-                &std::remove_pointer_t<decltype(this)>::srcRegIdxArr),
-            reinterpret_cast<RegIdArrayPtr>(
-                &std::remove_pointer_t<decltype(this)>::destRegIdxArr));
-
-        _numSrcRegs = 0;
-        _numDestRegs = 0;
-
-        setDestRegIdx(_numDestRegs++, vecRegClass[_dstReg]);
-        _numTypedDestRegs[VecRegClass]++;
-        for (uint8_t i=0; i<_numSrcs; i++) {
-            setSrcRegIdx(_numSrcRegs++, vecRegClass[VecMemInternalReg0 + i]);
-        }
-    }
-
-    Fault
-    execute(ExecContext* xc, trace::InstRecord* traceData) const override
-    {
-        vreg_t& tmp_d0 = *(vreg_t *)xc->getWritableRegOperand(this, 0);
-        PCStateBase *pc_ptr = xc->tcBase()->pcState().clone();
-        auto Vd = tmp_d0.as<uint8_t>();
-        uint32_t vlenb = pc_ptr->as<PCState>().vlenb();
-        const uint32_t elems_per_vreg = vlenb / sizeof(ElemType);
-        size_t bit_cnt = elems_per_vreg;
-        vreg_t tmp_s;
-        xc->getRegOperand(this, 0, &tmp_s);
-        auto s = tmp_s.as<uint8_t>();
-        // cp the first result and tail
-        memcpy(Vd, s, vlenb);
-        for (uint8_t i = 1; i < this->_numSrcRegs; i++) {
-            xc->getRegOperand(this, i, &tmp_s);
-            s = tmp_s.as<uint8_t>();
-            if (elems_per_vreg < 8) {
-                const uint32_t m = (1 << elems_per_vreg) - 1;
-                const uint32_t mask = m << (i * elems_per_vreg % 8);
-                // clr & ext bits
-                Vd[bit_cnt/8] ^= Vd[bit_cnt/8] & mask;
-                Vd[bit_cnt/8] |= s[bit_cnt/8] & mask;
-                bit_cnt += elems_per_vreg;
-            } else {
-                const uint32_t byte_offset = elems_per_vreg / 8;
-                memcpy(Vd + i * byte_offset, s + i * byte_offset, byte_offset);
-            }
-        }
-        if (traceData)
-            traceData->setData(vecRegClass, &tmp_d0);
-        return NoFault;
-    }
-
-    std::string
-    generateDisassembly(Addr pc, const loader::SymbolTable *symtab)
-        const override
-    {
-        std::stringstream ss;
-        ss << mnemonic << ' ' << registerName(destRegIdx(0));
-        for (uint8_t i = 0; i < this->_numSrcRegs; i++) {
-            ss << ", " << registerName(srcRegIdx(i));
-        }
-        unsigned vlenb = vlen >> 3;
-        ss << ", offset:" << vlenb / sizeof(ElemType);
-        return ss.str();
-    }
+        uint8_t _dstReg, uint8_t _numSrcs, uint32_t _vlen, size_t _elemSize);
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    std::string generateDisassembly(Addr,
+        const loader::SymbolTable *) const override;
 };
 
 class VxsatMicroInst : public VectorArithMicroInst
@@ -630,21 +566,9 @@ class VxsatMicroInst : public VectorArithMicroInst
     {
         vxsat = Vxsat;
     }
-    Fault
-    execute(ExecContext* xc, trace::InstRecord* traceData) const override
-    {
-        xc->setMiscReg(MISCREG_VXSAT,*vxsat);
-        auto vcsr = xc->readMiscReg(MISCREG_VCSR);
-        xc->setMiscReg(MISCREG_VCSR, ((vcsr&~1)|*vxsat));
-        return NoFault;
-    }
-    std::string generateDisassembly(Addr pc, const loader::SymbolTable *symtab)
-        const override
-    {
-        std::stringstream ss;
-        ss << mnemonic << ' ' << "VXSAT" << ", " << (*vxsat ? "0x1" : "0x0");
-        return ss.str();
-    }
+    Fault execute(ExecContext *, trace::InstRecord *) const override;
+    std::string generateDisassembly(Addr, const loader::SymbolTable *)
+        const override;
 };
 
 } // namespace RiscvISA
diff --git a/src/arch/riscv/isa/templates/vector_arith.isa b/src/arch/riscv/isa/templates/vector_arith.isa
index 3a528f1198..364639a716 100644
--- a/src/arch/riscv/isa/templates/vector_arith.isa
+++ b/src/arch/riscv/isa/templates/vector_arith.isa
@@ -1107,8 +1107,8 @@ template<typename ElemType>
         this->microops.push_back(microop);
         micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
     }
-    microop = new VMaskMergeMicroInst<ElemType>(_machInst, _machInst.vd,
-        this->microops.size(), _vlen);
+    microop = new VMaskMergeMicroInst(_machInst, _machInst.vd,
+        this->microops.size(), _vlen, sizeof(ElemType));
     this->microops.push_back(microop);
 
     this->microops.front()->setFirstMicroop();
@@ -1236,8 +1236,8 @@ template<typename ElemType>
         this->microops.push_back(microop);
         micro_vl = std::min(tmp_vl -= micro_vlmax, micro_vlmax);
     }
-    microop = new VMaskMergeMicroInst<ElemType>(_machInst, _machInst.vd,
-        this->microops.size(), _vlen);
+    microop = new VMaskMergeMicroInst(_machInst, _machInst.vd,
+        this->microops.size(), _vlen, sizeof(ElemType));
     this->microops.push_back(microop);
 
     this->microops.front()->setFirstMicroop();