From f6c61836b3cbe06eb1da56a28dc0b542ad052c0f Mon Sep 17 00:00:00 2001 From: Ivan Fernandez Date: Wed, 6 Mar 2024 20:27:06 +0100 Subject: [PATCH] arch-riscv: adding vector unit-stride segment loads to RISC-V (#851) This commit adds support for vector unit-stride segment load operations for RISC-V (the vlseg2e8.v ... vlseg8e64.v family). This implementation is based on two types of microops: - VlSeg microops that load data as it is organized in memory in structs of several fields. - VlSegDeIntrlv microops that properly deinterleave structs into destination registers. gem5 issue: https://github.com/gem5/gem5/issues/382 --- src/arch/riscv/insts/vector.cc | 97 +++++++ src/arch/riscv/insts/vector.hh | 59 ++++ src/arch/riscv/isa/decoder.isa | 296 ++++++++++++++++++-- src/arch/riscv/isa/formats/vector_mem.isa | 13 + src/arch/riscv/isa/templates/vector_mem.isa | 232 +++++++++++++++ src/arch/riscv/utility.hh | 11 + src/cpu/FuncUnit.py | 1 + src/cpu/minor/BaseMinorCPU.py | 1 + src/cpu/op_class.hh | 1 + 9 files changed, 683 insertions(+), 28 deletions(-) diff --git a/src/arch/riscv/insts/vector.cc b/src/arch/riscv/insts/vector.cc index b6d052ce4b..c044752ac8 100644 --- a/src/arch/riscv/insts/vector.cc +++ b/src/arch/riscv/insts/vector.cc @@ -588,5 +588,102 @@ VlFFTrimVlMicroOp::generateDisassembly(Addr pc, return ss.str(); } +std::string VlSegMacroInst::generateDisassembly(Addr pc, + const loader::SymbolTable *symtab) const +{ + std::stringstream ss; + ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", " << + '(' << registerName(srcRegIdx(0)) << ')' << + ", " << registerName(srcRegIdx(1)); + if (!machInst.vm) + ss << ", v0.t"; + return ss.str(); +} + +std::string VlSegMicroInst::generateDisassembly(Addr pc, + const loader::SymbolTable *symtab) const +{ + std::stringstream ss; + ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", " << + '(' << registerName(srcRegIdx(0)) << ')' << + ", "<< registerName(srcRegIdx(1)); + if (microIdx != 0 || machInst.vtype8.vma == 0 || machInst.vtype8.vta == 0) 
+ ss << ", " << registerName(srcRegIdx(2)); + if (!machInst.vm) + ss << ", v0.t"; + return ss.str(); +} + +VlSegDeIntrlvMicroInst::VlSegDeIntrlvMicroInst(ExtMachInst extMachInst, uint32_t _micro_vl, + uint32_t _dstReg, uint32_t _numSrcs, + uint32_t _microIdx, uint32_t _numMicroops, + uint32_t _field, uint32_t _vlen, uint32_t _sizeOfElement) + : VectorArithMicroInst("vlseg_deintrlv_micro", extMachInst, + VectorIntegerArithOp, 0, 0), + vlen(_vlen) +{ + setRegIdxArrays( + reinterpret_cast( + &std::remove_pointer_t::srcRegIdxArr), + reinterpret_cast( + &std::remove_pointer_t::destRegIdxArr)); + + _numSrcRegs = 0; + _numDestRegs = 0; + numSrcs = _numSrcs; + numMicroops = _numMicroops; + field =_field; + sizeOfElement = _sizeOfElement; + microIdx = _microIdx; + micro_vl = _micro_vl; + + setDestRegIdx(_numDestRegs++, vecRegClass[_dstReg]); + _numTypedDestRegs[VecRegClass]++; + for (uint32_t i=0; i < _numSrcs; i++) { + uint32_t index = VecMemInternalReg0 + i + (microIdx * _numSrcs); + setSrcRegIdx(_numSrcRegs++, vecRegClass[index]); + } +} + +Fault +VlSegDeIntrlvMicroInst::execute(ExecContext* xc, trace::InstRecord* traceData) const +{ + vreg_t& tmp_d0 = *(vreg_t *)xc->getWritableRegOperand(this, 0); + auto Vd = tmp_d0.as(); + const uint32_t elems_per_vreg = micro_vl; + vreg_t tmp_s; + auto s = tmp_s.as(); + uint32_t elem = 0; + uint32_t index = field; + for (uint32_t i = 0; i < numSrcs; i++) { + xc->getRegOperand(this, i, &tmp_s); + s = tmp_s.as(); + while(index < (i + 1) * elems_per_vreg) + { + memcpy(Vd + (elem * sizeOfElement), + s + ((index % elems_per_vreg) * sizeOfElement), + sizeOfElement); + index += numSrcs; + elem++; + } + } + if (traceData) + traceData->setData(vecRegClass, &tmp_d0); + return NoFault; +} + +std::string +VlSegDeIntrlvMicroInst::generateDisassembly(Addr pc, const loader::SymbolTable *symtab) + const +{ + std::stringstream ss; + ss << mnemonic << ' ' << registerName(destRegIdx(0)); + for (uint8_t i = 0; i < this->_numSrcRegs; i++) { + ss << ", " 
<< registerName(srcRegIdx(i)); + } + ss << ", field: " << field; + return ss.str(); +} + } // namespace RiscvISA } // namespace gem5 diff --git a/src/arch/riscv/insts/vector.hh b/src/arch/riscv/insts/vector.hh index fd891dad42..db562cf55c 100644 --- a/src/arch/riscv/insts/vector.hh +++ b/src/arch/riscv/insts/vector.hh @@ -596,6 +596,65 @@ class VlFFTrimVlMicroOp : public VectorMicroInst const override; }; +class VlSegMacroInst : public VectorMemMacroInst +{ + protected: + VlSegMacroInst(const char* mnem, ExtMachInst _machInst, + OpClass __opClass, uint32_t _vlen) + : VectorMemMacroInst(mnem, _machInst, __opClass, _vlen) + {} + + std::string generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const override; +}; + +class VlSegMicroInst : public VectorMicroInst +{ + protected: + Request::Flags memAccessFlags; + uint8_t regIdx; + + VlSegMicroInst(const char *mnem, ExtMachInst _machInst, + OpClass __opClass, uint32_t _microVl, + uint32_t _microIdx, uint32_t _numMicroops, + uint32_t _field, uint32_t _numFields, + uint32_t _vlen) + : VectorMicroInst(mnem, _machInst, __opClass, _microVl, + _microIdx, _vlen) + { + this->flags[IsLoad] = true; + } + + std::string generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const override; +}; + +class VlSegDeIntrlvMicroInst : public VectorArithMicroInst +{ + private: + RegId srcRegIdxArr[NumVecInternalRegs]; + RegId destRegIdxArr[1]; + uint32_t numSrcs; + uint32_t numMicroops; + uint32_t field; + uint32_t sizeOfElement; + uint32_t micro_vl; + + public: + uint32_t vlen; + + VlSegDeIntrlvMicroInst(ExtMachInst extMachInst, uint32_t _micro_vl, + uint32_t _dstReg, uint32_t _numSrcs, + uint32_t _microIdx, uint32_t _numMicroops, + uint32_t _field, uint32_t _vlen, + uint32_t _sizeOfElement); + + Fault execute(ExecContext *, trace::InstRecord *) const override; + + std::string generateDisassembly(Addr, + const loader::SymbolTable *) const override; +}; + } // namespace RiscvISA } // namespace gem5 diff --git 
a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa index 58468d7400..cdcb0bc6eb 100644 --- a/src/arch/riscv/isa/decoder.isa +++ b/src/arch/riscv/isa/decoder.isa @@ -599,14 +599,74 @@ decode QUADRANT default Unknown::unknown() { 0x0: decode MOP { 0x0: decode LUMOP { - 0x00: VleOp::vle8_v({{ - if ((machInst.vm || elem_mask(v0, ei)) && - i < this->microVl) { - Vd_ub[i] = Mem_vc.as()[i]; - } else { - Vd_ub[i] = Vs2_ub[i]; + 0x00: decode NF { + 0x00: VleOp::vle8_v({{ + if ((machInst.vm || elem_mask(v0, ei)) && + i < this->microVl) { + Vd_ub[i] = Mem_vc.as()[i]; + } else { + Vd_ub[i] = Vs2_ub[i]; + } + }}, inst_flags=VectorUnitStrideLoadOp); + format VlSegOp { + 0x01: vlseg2e8_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 2)) && + i < this->microVl) { + Vd_ub[i] = Mem_vc.as()[i]; + } else { + Vd_ub[i] = Vs2_ub[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x02: vlseg3e8_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 3)) && + i < this->microVl) { + Vd_ub[i] = Mem_vc.as()[i]; + } else { + Vd_ub[i] = Vs2_ub[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x03: vlseg4e8_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 4)) && + i < this->microVl) { + Vd_ub[i] = Mem_vc.as()[i]; + } else { + Vd_ub[i] = Vs2_ub[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x04: vlseg5e8_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 5)) && + i < this->microVl) { + Vd_ub[i] = Mem_vc.as()[i]; + } else { + Vd_ub[i] = Vs2_ub[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x05: vlseg6e8_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 6)) && + i < this->microVl) { + Vd_ub[i] = Mem_vc.as()[i]; + } else { + Vd_ub[i] = Vs2_ub[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x06: vlseg7e8_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 7)) && + i < 
this->microVl) { + Vd_ub[i] = Mem_vc.as()[i]; + } else { + Vd_ub[i] = Vs2_ub[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x07: vlseg8e8_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 8)) && + i < this->microVl) { + Vd_ub[i] = Mem_vc.as()[i]; + } else { + Vd_ub[i] = Vs2_ub[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); } - }}, inst_flags=VectorUnitStrideLoadOp); + } 0x08: decode NF { format VlWholeOp { 0x0: vl1re8_v({{ @@ -651,14 +711,74 @@ decode QUADRANT default Unknown::unknown() { } 0x5: decode MOP { 0x0: decode LUMOP { - 0x00: VleOp::vle16_v({{ - if ((machInst.vm || elem_mask(v0, ei)) && - i < this->microVl) { - Vd_uh[i] = Mem_vc.as()[i]; - } else { - Vd_uh[i] = Vs2_uh[i]; + 0x00: decode NF { + 0x00: VleOp::vle16_v({{ + if ((machInst.vm || elem_mask(v0, ei)) && + i < this->microVl) { + Vd_uh[i] = Mem_vc.as()[i]; + } else { + Vd_uh[i] = Vs2_uh[i]; + } + }}, inst_flags=VectorUnitStrideLoadOp); + format VlSegOp { + 0x01: vlseg2e16_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 2)) && + i < this->microVl) { + Vd_uh[i] = Mem_vc.as()[i]; + } else { + Vd_uh[i] = Vs2_uh[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x02: vlseg3e16_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 3)) && + i < this->microVl) { + Vd_uh[i] = Mem_vc.as()[i]; + } else { + Vd_uh[i] = Vs2_uh[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x03: vlseg4e16_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 4)) && + i < this->microVl) { + Vd_uh[i] = Mem_vc.as()[i]; + } else { + Vd_uh[i] = Vs2_uh[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x04: vlseg5e16_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 5)) && + i < this->microVl) { + Vd_uh[i] = Mem_vc.as()[i]; + } else { + Vd_uh[i] = Vs2_uh[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x05: vlseg6e16_v({{ + if ((machInst.vm || 
elem_mask_vlseg(v0, ei + (field * micro_elems), 6)) && + i < this->microVl) { + Vd_uh[i] = Mem_vc.as()[i]; + } else { + Vd_uh[i] = Vs2_uh[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x06: vlseg7e16_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 7)) && + i < this->microVl) { + Vd_uh[i] = Mem_vc.as()[i]; + } else { + Vd_uh[i] = Vs2_uh[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x07: vlseg8e16_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 8)) && + i < this->microVl) { + Vd_uh[i] = Mem_vc.as()[i]; + } else { + Vd_uh[i] = Vs2_uh[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); } - }}, inst_flags=VectorUnitStrideLoadOp); + } 0x08: decode NF { format VlWholeOp { 0x0: vl1re16_v({{ @@ -700,14 +820,74 @@ decode QUADRANT default Unknown::unknown() { } 0x6: decode MOP { 0x0: decode LUMOP { - 0x00: VleOp::vle32_v({{ - if ((machInst.vm || elem_mask(v0, ei)) && - i < this->microVl) { - Vd_uw[i] = Mem_vc.as()[i]; - } else { - Vd_uw[i] = Vs2_uw[i]; + 0x00: decode NF { + 0x00: VleOp::vle32_v({{ + if ((machInst.vm || elem_mask(v0, ei)) && + i < this->microVl) { + Vd_uw[i] = Mem_vc.as()[i]; + } else { + Vd_uw[i] = Vs2_uw[i]; + } + }}, inst_flags=VectorUnitStrideLoadOp); + format VlSegOp { + 0x01: vlseg2e32_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 2)) && + i < this->microVl) { + Vd_uw[i] = Mem_vc.as()[i]; + } else { + Vd_uw[i] = Vs2_uw[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x02: vlseg3e32_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 3)) && + i < this->microVl) { + Vd_uw[i] = Mem_vc.as()[i]; + } else { + Vd_uw[i] = Vs2_uw[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x03: vlseg4e32_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 4)) && + i < this->microVl) { + Vd_uw[i] = Mem_vc.as()[i]; + } else { + Vd_uw[i] = Vs2_uw[i]; + } + }}, 
inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x04: vlseg5e32_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 5)) && + i < this->microVl) { + Vd_uw[i] = Mem_vc.as()[i]; + } else { + Vd_uw[i] = Vs2_uw[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x05: vlseg6e32_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 6)) && + i < this->microVl) { + Vd_uw[i] = Mem_vc.as()[i]; + } else { + Vd_uw[i] = Vs2_uw[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x06: vlseg7e32_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 7)) && + i < this->microVl) { + Vd_uw[i] = Mem_vc.as()[i]; + } else { + Vd_uw[i] = Vs2_uw[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x07: vlseg8e32_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 8)) && + i < this->microVl) { + Vd_uw[i] = Mem_vc.as()[i]; + } else { + Vd_uw[i] = Vs2_uw[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); } - }}, inst_flags=VectorUnitStrideLoadOp); + } 0x08: decode NF { format VlWholeOp { 0x0: vl1re32_v({{ @@ -749,14 +929,74 @@ decode QUADRANT default Unknown::unknown() { } 0x7: decode MOP { 0x0: decode LUMOP { - 0x00: VleOp::vle64_v({{ - if ((machInst.vm || elem_mask(v0, ei)) && - i < this->microVl) { - Vd_ud[i] = Mem_vc.as()[i]; - } else { - Vd_ud[i] = Vs2_ud[i]; + 0x00: decode NF { + 0x00: VleOp::vle64_v({{ + if ((machInst.vm || elem_mask(v0, ei)) && + i < this->microVl) { + Vd_ud[i] = Mem_vc.as()[i]; + } else { + Vd_ud[i] = Vs2_ud[i]; + } + }}, inst_flags=VectorUnitStrideLoadOp); + format VlSegOp { + 0x01: vlseg2e64_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 2)) && + i < this->microVl) { + Vd_ud[i] = Mem_vc.as()[i]; + } else { + Vd_ud[i] = Vs2_ud[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x02: vlseg3e64_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 3)) && + i < this->microVl) { + Vd_ud[i] = 
Mem_vc.as()[i]; + } else { + Vd_ud[i] = Vs2_ud[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x03: vlseg4e64_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 4)) && + i < this->microVl) { + Vd_ud[i] = Mem_vc.as()[i]; + } else { + Vd_ud[i] = Vs2_ud[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x04: vlseg5e64_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 5)) && + i < this->microVl) { + Vd_ud[i] = Mem_vc.as()[i]; + } else { + Vd_ud[i] = Vs2_ud[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x05: vlseg6e64_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 6)) && + i < this->microVl) { + Vd_ud[i] = Mem_vc.as()[i]; + } else { + Vd_ud[i] = Vs2_ud[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x06: vlseg7e64_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 7)) && + i < this->microVl) { + Vd_ud[i] = Mem_vc.as()[i]; + } else { + Vd_ud[i] = Vs2_ud[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); + 0x07: vlseg8e64_v({{ + if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 8)) && + i < this->microVl) { + Vd_ud[i] = Mem_vc.as()[i]; + } else { + Vd_ud[i] = Vs2_ud[i]; + } + }}, inst_flags=VectorUnitStrideSegmentedLoadOp); } - }}, inst_flags=VectorUnitStrideLoadOp); + } 0x08: decode NF { format VlWholeOp { 0x0: vl1re64_v({{ diff --git a/src/arch/riscv/isa/formats/vector_mem.isa b/src/arch/riscv/isa/formats/vector_mem.isa index 77123fb7ef..b86c7a37c4 100644 --- a/src/arch/riscv/isa/formats/vector_mem.isa +++ b/src/arch/riscv/isa/formats/vector_mem.isa @@ -245,3 +245,16 @@ def format VsIndexOp( decode_template=VMemSplitTemplateDecodeBlock ) }}; + +def format VlSegOp( + memacc_code, + ea_code={{ + EA = Rs1 + mem_size * (microIdx + (field * numMicroops)); + }}, + mem_flags=[], + inst_flags=[] +) {{ + (header_output, decoder_output, decode_block, exec_output) = \ + VMemBase(name, Name, ea_code, 
memacc_code, mem_flags, inst_flags, + 'VlSegMacroInst', exec_template_base='VlSeg') +}}; diff --git a/src/arch/riscv/isa/templates/vector_mem.isa b/src/arch/riscv/isa/templates/vector_mem.isa index 4013b2a5d0..e691c86d95 100644 --- a/src/arch/riscv/isa/templates/vector_mem.isa +++ b/src/arch/riscv/isa/templates/vector_mem.isa @@ -1636,6 +1636,238 @@ Fault }}; +def template VlSegConstructor {{ + +%(class_name)s::%(class_name)s(ExtMachInst _machInst, uint32_t _vlen) + : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s, _vlen) +{ + %(set_reg_idx_arr)s; + %(constructor)s; + + const int32_t micro_vlmax = vlen / width_EEW(_machInst.width); + const uint32_t num_microops = ceil((float) this->vl / (micro_vlmax)); + int32_t remaining_vl = this->vl; + int32_t micro_vl = std::min(remaining_vl, micro_vlmax); + size_t NFIELDS = machInst.nf + 1; + StaticInstPtr microop; + uint32_t size_per_elem = width_EEW(_machInst.width) / 8; + + if (micro_vl == 0) { + microop = new VectorNopMicroInst(_machInst); + this->microops.push_back(microop); + } else { + for (int f = 0; f < NFIELDS; ++f) { + remaining_vl = this->vl; + micro_vl = std::min(remaining_vl, micro_vlmax); + for (int i = 0; i < num_microops && micro_vl > 0; ++i) { + microop = new %(class_name)sMicro(_machInst, micro_vl, i, num_microops, f, NFIELDS, vlen); + microop->setDelayedCommit(); + microop->setFlag(IsLoad); + this->microops.push_back(microop); + micro_vl = std::min(remaining_vl -= micro_vlmax, micro_vlmax); + } + } + for (int f = 0; f < NFIELDS; ++f) { + remaining_vl = this->vl; + micro_vl = std::min(remaining_vl, micro_vlmax); + for (int i = 0; i < num_microops && micro_vl > 0; ++i) { + microop = new VlSegDeIntrlvMicroInst(_machInst, micro_vl, _machInst.vd + i + (f * num_microops), + NFIELDS, i, num_microops, f, vlen, size_per_elem); + this->microops.push_back(microop); + micro_vl = std::min(remaining_vl -= micro_vlmax, micro_vlmax); + } + } + } + + this->microops.front()->setFlag(IsFirstMicroop); + 
this->microops.back()->setFlag(IsLastMicroop); + this->flags[IsVector] = true; +} +}}; + +def template VlSegMicroDeclare {{ + +class %(class_name)s : public %(base_class)s +{ +private: + // rs1, rs2, vd, vm + RegId srcRegIdxArr[4]; + RegId destRegIdxArr[1]; + uint32_t field; + uint32_t numFields; + uint32_t numMicroops; +public: + %(class_name)s(ExtMachInst _machInst, uint32_t _microVl, uint32_t _microIdx, uint32_t _numMicroops, uint32_t _field, uint32_t _numFields, uint32_t _vlen); + + Fault execute(ExecContext *, trace::InstRecord *) const override; + Fault initiateAcc(ExecContext *, trace::InstRecord *) const override; + Fault completeAcc(PacketPtr, ExecContext *, + trace::InstRecord *) const override; + using %(base_class)s::generateDisassembly; +}; + +}}; + +def template VlSegMicroConstructor {{ + + %(class_name)s::%(class_name)s(ExtMachInst _machInst, uint32_t _microVl, uint32_t _microIdx, uint32_t _numMicroops, uint32_t _field, uint32_t _numFields, uint32_t _vlen) + : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s, _microVl, _microIdx , _numMicroops, _field, _numFields, _vlen) +{ + %(set_reg_idx_arr)s; + + _numSrcRegs = 0; + _numDestRegs = 0; + field = _field; + numFields = _numFields; + numMicroops = _numMicroops; + setDestRegIdx(_numDestRegs++, vecRegClass[VecMemInternalReg0 + _microIdx + + (field * numMicroops)]); + _numTypedDestRegs[VecRegClass]++; + setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]); + setSrcRegIdx(_numSrcRegs++, vecRegClass[VecMemInternalReg0 + _microIdx + + (field * numMicroops)]); + if (!_machInst.vm) { + setSrcRegIdx(_numSrcRegs++, vecRegClass[0]); + } +} + +}}; + +def template VlSegMicroExecute {{ + +Fault +%(class_name)s::execute(ExecContext *xc, trace::InstRecord *traceData) const +{ + Addr EA; + uint32_t mem_size = width_EEW(machInst.width) / 8 * microVl; + + %(op_decl)s; + %(op_rd)s; + %(set_vlen)s; + %(ea_code)s; + + RiscvISA::vreg_t tmp_v0; + uint8_t *v0; + MISA misa = xc->readMiscReg(MISCREG_ISA); + STATUS 
status = xc->readMiscReg(MISCREG_STATUS); + + if (!misa.rvv || status.vs == VPUStatus::OFF) { + return std::make_shared( + "RVV is disabled or VPU is off", machInst); + } + + if (machInst.vill) + return std::make_shared("VILL is set", machInst); + + const int64_t vlmul = vtype_vlmul(machInst.vtype8); + + panic_if((pow(2, vlmul) * this->numFields) > 8, "LMUL value is illegal for vlseg inst"); + + status.vs = VPUStatus::DIRTY; + xc->setMiscReg(MISCREG_STATUS, status); + + if(!machInst.vm) { + xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0); + v0 = tmp_v0.as(); + } + + const std::vector byte_enable(mem_size, true); + Fault fault = xc->readMem(EA, Mem.as(), mem_size, memAccessFlags, + byte_enable); + + if (fault != NoFault) + return fault; + + const size_t micro_vlmax = vtype_VLMAX(machInst.vtype8, vlen, true); + const size_t micro_elems = vlen / width_EEW(machInst.width); + + size_t ei; + + for (size_t i = 0; i < micro_elems; i++) { + ei = i + micro_vlmax * microIdx; + %(memacc_code)s; + } + + %(op_wb)s; + return fault; +} + +}}; + +def template VlSegMicroInitiateAcc {{ + +Fault +%(class_name)s::initiateAcc(ExecContext* xc, + trace::InstRecord* traceData) const +{ + + Addr EA; + uint32_t mem_size = width_EEW(this->machInst.width) / 8 * this->microVl; + + %(op_decl)s; + %(op_rd)s; + %(ea_code)s; + + MISA misa = xc->readMiscReg(MISCREG_ISA); + STATUS status = xc->readMiscReg(MISCREG_STATUS); + if (!misa.rvv || status.vs == VPUStatus::OFF) { + return std::make_shared( + "RVV is disabled or VPU is off", machInst); + } + if (machInst.vill) + return std::make_shared("VILL is set", machInst); + + const int64_t vlmul = vtype_vlmul(machInst.vtype8); + + panic_if((pow(2, vlmul) * this->numFields) > 8, "LMUL value is illegal for vlseg inst"); + + const std::vector byte_enable(mem_size, true); + Fault fault = initiateMemRead(xc, EA, mem_size, memAccessFlags, + byte_enable); + return fault; +} + +}}; + +def template VlSegMicroCompleteAcc {{ + +Fault 
+%(class_name)s::completeAcc(PacketPtr pkt, ExecContext *xc, + trace::InstRecord *traceData) const +{ + %(op_decl)s; + %(op_rd)s; + %(set_vlen)s; + + STATUS status = xc->readMiscReg(MISCREG_STATUS); + status.vs = VPUStatus::DIRTY; + xc->setMiscReg(MISCREG_STATUS, status); + + RiscvISA::vreg_t tmp_v0; + uint8_t *v0; + if(!machInst.vm) { + xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0); + v0 = tmp_v0.as(); + } + + memcpy(Mem.as(), pkt->getPtr(), pkt->getSize()); + + const size_t micro_vlmax = vtype_VLMAX(machInst.vtype8, vlen, true); + const size_t micro_elems = vlen / width_EEW(machInst.width); + + size_t ei; + for (size_t i = 0; i < micro_elems; i++) { + ei = i + micro_vlmax * microIdx; + %(memacc_code)s; + } + + %(op_wb)s; + return NoFault; +} + +}}; + + def template VMemBaseDecodeBlock {{ return new %(class_name)s(machInst, vlen); }}; diff --git a/src/arch/riscv/utility.hh b/src/arch/riscv/utility.hh index cf5620d250..cd1379b8d2 100644 --- a/src/arch/riscv/utility.hh +++ b/src/arch/riscv/utility.hh @@ -306,6 +306,17 @@ elem_mask(const T* vs, const int index) return (vs[idx] >> pos) & 1; } +template +inline int +elem_mask_vlseg(const T* vs, const int elem, const int num_fields) +{ + int index = floor(elem / num_fields); + static_assert(std::is_integral_v); + int idx = index / (sizeof(T)*8); + int pos = index % (sizeof(T)*8); + return (vs[idx] >> pos) & 1; +} + template auto ftype(IntType a) -> FloatType { diff --git a/src/cpu/FuncUnit.py b/src/cpu/FuncUnit.py index cba3eda878..a32138a29c 100644 --- a/src/cpu/FuncUnit.py +++ b/src/cpu/FuncUnit.py @@ -112,6 +112,7 @@ class OpClass(Enum): "VectorWholeRegisterLoad", "VectorWholeRegisterStore", "VectorIntegerArith", + "VectorUnitStrideSegmentedLoad", "VectorFloatArith", "VectorFloatConvert", "VectorIntegerReduce", diff --git a/src/cpu/minor/BaseMinorCPU.py b/src/cpu/minor/BaseMinorCPU.py index 7110caac2c..6369981c57 100644 --- a/src/cpu/minor/BaseMinorCPU.py +++ b/src/cpu/minor/BaseMinorCPU.py @@ -261,6 +261,7 @@ 
class MinorDefaultVecFU(MinorFU): "VectorIndexedLoad", "VectorIndexedStore", "VectorUnitStrideFaultOnlyFirstLoad", + "VectorUnitStrideSegmentedLoad", "VectorWholeRegisterLoad", "VectorWholeRegisterStore", "VectorIntegerArith", diff --git a/src/cpu/op_class.hh b/src/cpu/op_class.hh index 0151df06a9..6690edcae5 100644 --- a/src/cpu/op_class.hh +++ b/src/cpu/op_class.hh @@ -133,6 +133,7 @@ static const OpClass VectorIntegerReduceOp = enums::VectorIntegerReduce; static const OpClass VectorFloatReduceOp = enums::VectorFloatReduce; static const OpClass VectorMiscOp = enums::VectorMisc; static const OpClass VectorIntegerExtensionOp = enums::VectorIntegerExtension; +static const OpClass VectorUnitStrideSegmentedLoadOp = enums::VectorUnitStrideSegmentedLoad; static const OpClass VectorConfigOp = enums::VectorConfig; static const OpClass Num_OpClasses = enums::Num_OpClass;