From 1e743fd85ab55ea75301f2fed20243387806fe72 Mon Sep 17 00:00:00 2001 From: Ivan Fernandez Date: Fri, 22 Mar 2024 23:45:58 +0100 Subject: [PATCH] arch-riscv: adding vector unit-stride segment stores to RISC-V (#913) This commit adds support for vector unit-stride segment store operations for RISC-V (vssegXeXX). This implementation is based in two types of microops: - VsSegIntrlv microops that properly interleave source registers into structs. - VsSeg microops that store data in memory as contiguous structs of several fields. Change-Id: Id80dd4e781743a60eb76c18b6a28061f8e9f723d Gem5 issue: https://github.com/gem5/gem5/issues/382 --- src/arch/riscv/insts/vector.cc | 111 +++++++++- src/arch/riscv/insts/vector.hh | 59 +++++ src/arch/riscv/isa/decoder.isa | 180 +++++++++++---- src/arch/riscv/isa/formats/vector_mem.isa | 13 ++ src/arch/riscv/isa/templates/vector_mem.isa | 229 ++++++++++++++++++++ src/arch/riscv/utility.hh | 2 +- src/cpu/FuncUnit.py | 1 + src/cpu/op_class.hh | 2 + 8 files changed, 555 insertions(+), 42 deletions(-) diff --git a/src/arch/riscv/insts/vector.cc b/src/arch/riscv/insts/vector.cc index c044752ac8..c4942f6d51 100644 --- a/src/arch/riscv/insts/vector.cc +++ b/src/arch/riscv/insts/vector.cc @@ -501,7 +501,6 @@ VxsatMicroInst::generateDisassembly(Addr pc, return ss.str(); } - VlFFTrimVlMicroOp::VlFFTrimVlMicroOp(ExtMachInst _machInst, uint32_t _microVl, uint32_t _microIdx, uint32_t _vlen, std::vector& _microops) : VectorMicroInst("vlff_trimvl_v_micro", _machInst, VectorConfigOp, @@ -685,5 +684,115 @@ VlSegDeIntrlvMicroInst::generateDisassembly(Addr pc, const loader::SymbolTable * return ss.str(); } +std::string VsSegMacroInst::generateDisassembly(Addr pc, + const loader::SymbolTable *symtab) const +{ + std::stringstream ss; + ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", " << + '(' << registerName(srcRegIdx(0)) << ')' << + ", " << registerName(srcRegIdx(1)); + if (!machInst.vm) + ss << ", v0.t"; + return ss.str(); +} + +std::string VsSegMicroInst::generateDisassembly(Addr pc, + const loader::SymbolTable *symtab) const +{ + std::stringstream ss; + ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", " << + '(' << registerName(srcRegIdx(0)) << ')' << + ", "<< registerName(srcRegIdx(1)); + if (microIdx != 0 || machInst.vtype8.vma == 0 || machInst.vtype8.vta == 0) + ss << ", " << registerName(srcRegIdx(2)); + if (!machInst.vm) + ss << ", v0.t"; + return ss.str(); +} + +VsSegIntrlvMicroInst::VsSegIntrlvMicroInst(ExtMachInst extMachInst, uint32_t _micro_vl, + uint32_t _dstReg, uint32_t _numSrcs, + uint32_t _microIdx, uint32_t _numMicroops, + uint32_t _field, uint32_t _vlen, uint32_t _sizeOfElement) + : VectorArithMicroInst("vsseg_reintrlv_micro", extMachInst, + VectorIntegerArithOp, 0, 0), + vlen(_vlen) +{ + setRegIdxArrays( + reinterpret_cast( + &std::remove_pointer_t::srcRegIdxArr), + reinterpret_cast( + &std::remove_pointer_t::destRegIdxArr)); + + _numSrcRegs = 0; + _numDestRegs = 0; + numSrcs = _numSrcs; + numMicroops = _numMicroops; + field =_field; + sizeOfElement = _sizeOfElement; + microIdx = _microIdx; + micro_vl = _micro_vl; + + setDestRegIdx(_numDestRegs++, vecRegClass[VecMemInternalReg0 + field + + (_microIdx * numSrcs)]); + + _numTypedDestRegs[VecRegClass]++; + for (uint8_t i=0; i<_numSrcs; i++) { + setSrcRegIdx(_numSrcRegs++, vecRegClass[_dstReg + (i * numMicroops) + + (microIdx)]); + } +} + +Fault +VsSegIntrlvMicroInst::execute(ExecContext* xc, + trace::InstRecord* traceData) const +{ + const uint32_t elems_per_vreg = micro_vl; + vreg_t& tmp_d0 = *(vreg_t *)xc->getWritableRegOperand(this, 0); + auto Vd = tmp_d0.as(); + + vreg_t tmp_s; + auto s = tmp_s.as(); + xc->getRegOperand(this, 0, &tmp_s); + s = tmp_s.as(); + + uint32_t indexVd = 0; + uint32_t srcReg = (field * elems_per_vreg) % numSrcs; + uint32_t indexs = (field * elems_per_vreg) / numSrcs; + + while (indexVd < elems_per_vreg) { + xc->getRegOperand(this, srcReg, &tmp_s); + s = tmp_s.as(); + + memcpy(Vd + (indexVd * sizeOfElement), + s + (indexs * sizeOfElement), + sizeOfElement); + + indexVd++; + srcReg++; + if (srcReg >= numSrcs) { + srcReg = 0; + indexs++; + } + } + + if (traceData) + traceData->setData(vecRegClass, &tmp_d0); + return NoFault; +} + +std::string +VsSegIntrlvMicroInst::generateDisassembly(Addr pc, + const loader::SymbolTable *symtab) const +{ + std::stringstream ss; + ss << mnemonic << ' ' << registerName(destRegIdx(0)); + for (uint8_t i = 0; i < this->_numSrcRegs; i++) { + ss << ", " << registerName(srcRegIdx(i)); + } + ss << ", field: " << field; + return ss.str(); +} + } // namespace RiscvISA } // namespace gem5 diff --git a/src/arch/riscv/insts/vector.hh b/src/arch/riscv/insts/vector.hh index db562cf55c..4fcbb410d5 100644 --- a/src/arch/riscv/insts/vector.hh +++ b/src/arch/riscv/insts/vector.hh @@ -655,6 +655,65 @@ class VlSegDeIntrlvMicroInst : public VectorArithMicroInst const loader::SymbolTable *) const override; }; +class VsSegMacroInst : public VectorMemMacroInst +{ + protected: + VsSegMacroInst(const char* mnem, ExtMachInst _machInst, + OpClass __opClass, uint32_t _vlen) + : VectorMemMacroInst(mnem, _machInst, __opClass, _vlen) + {} + + std::string generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const override; +}; + +class VsSegMicroInst : public VectorMicroInst +{ + protected: + Request::Flags memAccessFlags; + uint8_t regIdx; + + VsSegMicroInst(const char *mnem, ExtMachInst _machInst, + OpClass __opClass, uint32_t _microVl, + uint32_t _microIdx, uint32_t _numMicroops, + uint32_t _field, uint32_t _numFields, + uint32_t _vlen) + : VectorMicroInst(mnem, _machInst, __opClass, _microVl, + _microIdx, _vlen) + { + this->flags[IsStore] = true; + } + + std::string generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const override; +}; + +class VsSegIntrlvMicroInst : public VectorArithMicroInst +{ + private: + RegId srcRegIdxArr[NumVecInternalRegs]; + RegId destRegIdxArr[1]; + uint32_t numSrcs; + uint32_t numMicroops; + uint32_t field; + uint32_t sizeOfElement; + uint32_t micro_vl; + + public: + uint32_t vlen; + + VsSegIntrlvMicroInst(ExtMachInst extMachInst, uint32_t _micro_vl, + uint32_t _dstReg, uint32_t _numSrcs, + uint32_t _microIdx, uint32_t _numMicroops, + uint32_t _field, uint32_t _vlen, + uint32_t _sizeOfElement); + + Fault execute(ExecContext *, trace::InstRecord *) const override; + + std::string generateDisassembly(Addr, + const loader::SymbolTable *) const override; +}; + } // namespace RiscvISA } // namespace gem5 diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa index cdcb0bc6eb..2147d8701b 100644 --- a/src/arch/riscv/isa/decoder.isa +++ b/src/arch/riscv/isa/decoder.isa @@ -610,7 +610,7 @@ decode QUADRANT default Unknown::unknown() { }}, inst_flags=VectorUnitStrideLoadOp); format VlSegOp { 0x01: vlseg2e8_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 2)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 2)) && i < this->microVl) { Vd_ub[i] = Mem_vc.as()[i]; } else { @@ -618,7 +618,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x02: vlseg3e8_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 3)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 3)) && i < this->microVl) { Vd_ub[i] = Mem_vc.as()[i]; } else { @@ -626,7 +626,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x03: vlseg4e8_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 4)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 4)) && i < this->microVl) { Vd_ub[i] = Mem_vc.as()[i]; } else { @@ -634,7 +634,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x04: vlseg5e8_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 5)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 5)) && i < this->microVl) { Vd_ub[i] = Mem_vc.as()[i]; } else { @@ -642,7 +642,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x05: vlseg6e8_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 6)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 6)) && i < this->microVl) { Vd_ub[i] = Mem_vc.as()[i]; } else { @@ -650,7 +650,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x06: vlseg7e8_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 7)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 7)) && i < this->microVl) { Vd_ub[i] = Mem_vc.as()[i]; } else { @@ -658,7 +658,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x07: vlseg8e8_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 8)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 8)) && i < this->microVl) { Vd_ub[i] = Mem_vc.as()[i]; } else { @@ -722,7 +722,7 @@ decode QUADRANT default Unknown::unknown() { }}, inst_flags=VectorUnitStrideLoadOp); format VlSegOp { 0x01: vlseg2e16_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 2)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 2)) && i < this->microVl) { Vd_uh[i] = Mem_vc.as()[i]; } else { @@ -730,7 +730,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x02: vlseg3e16_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 3)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 3)) && i < this->microVl) { Vd_uh[i] = Mem_vc.as()[i]; } else { @@ -738,7 +738,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x03: vlseg4e16_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 4)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 4)) && i < this->microVl) { Vd_uh[i] = Mem_vc.as()[i]; } else { @@ -746,7 +746,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x04: vlseg5e16_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 5)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 5)) && i < this->microVl) { Vd_uh[i] = Mem_vc.as()[i]; } else { @@ -754,7 +754,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x05: vlseg6e16_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 6)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 6)) && i < this->microVl) { Vd_uh[i] = Mem_vc.as()[i]; } else { @@ -762,7 +762,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x06: vlseg7e16_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 7)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 7)) && i < this->microVl) { Vd_uh[i] = Mem_vc.as()[i]; } else { @@ -770,7 +770,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x07: vlseg8e16_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 8)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 8)) && i < this->microVl) { Vd_uh[i] = Mem_vc.as()[i]; } else { @@ -831,7 +831,7 @@ decode QUADRANT default Unknown::unknown() { }}, inst_flags=VectorUnitStrideLoadOp); format VlSegOp { 0x01: vlseg2e32_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 2)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 2)) && i < this->microVl) { Vd_uw[i] = Mem_vc.as()[i]; } else { @@ -839,7 +839,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x02: vlseg3e32_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 3)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 3)) && i < this->microVl) { Vd_uw[i] = Mem_vc.as()[i]; } else { @@ -847,7 +847,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x03: vlseg4e32_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 4)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 4)) && i < this->microVl) { Vd_uw[i] = Mem_vc.as()[i]; } else { @@ -855,7 +855,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x04: vlseg5e32_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 5)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 5)) && i < this->microVl) { Vd_uw[i] = Mem_vc.as()[i]; } else { @@ -863,7 +863,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x05: vlseg6e32_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 6)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 6)) && i < this->microVl) { Vd_uw[i] = Mem_vc.as()[i]; } else { @@ -871,7 +871,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x06: vlseg7e32_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 7)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 7)) && i < this->microVl) { Vd_uw[i] = Mem_vc.as()[i]; } else { @@ -879,7 +879,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x07: vlseg8e32_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 8)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 8)) && i < this->microVl) { Vd_uw[i] = Mem_vc.as()[i]; } else { @@ -940,7 +940,7 @@ decode QUADRANT default Unknown::unknown() { }}, inst_flags=VectorUnitStrideLoadOp); format VlSegOp { 0x01: vlseg2e64_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 2)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 2)) && i < this->microVl) { Vd_ud[i] = Mem_vc.as()[i]; } else { @@ -948,7 +948,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x02: vlseg3e64_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 3)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 3)) && i < this->microVl) { Vd_ud[i] = Mem_vc.as()[i]; } else { @@ -956,7 +956,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x03: vlseg4e64_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 4)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 4)) && i < this->microVl) { Vd_ud[i] = Mem_vc.as()[i]; } else { @@ -964,7 +964,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x04: vlseg5e64_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 5)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 5)) && i < this->microVl) { Vd_ud[i] = Mem_vc.as()[i]; } else { @@ -972,7 +972,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x05: vlseg6e64_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 6)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 6)) && i < this->microVl) { Vd_ud[i] = Mem_vc.as()[i]; } else { @@ -980,7 +980,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x06: vlseg7e64_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 7)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 7)) && i < this->microVl) { Vd_ud[i] = Mem_vc.as()[i]; } else { @@ -988,7 +988,7 @@ decode QUADRANT default Unknown::unknown() { } }}, inst_flags=VectorUnitStrideSegmentedLoadOp); 0x07: vlseg8e64_v({{ - if ((machInst.vm || elem_mask_vlseg(v0, ei + (field * micro_elems), 8)) && + if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 8)) && i < this->microVl) { Vd_ud[i] = Mem_vc.as()[i]; } else { @@ -1363,9 +1363,34 @@ decode QUADRANT default Unknown::unknown() { 0x0: decode MOP { 0x0: decode SUMOP { - 0x00: VseOp::vse8_v({{ - Mem_vc.as()[i] = Vs3_ub[i]; - }}, inst_flags=VectorUnitStrideStoreOp); + 0x00: decode NF { + 0x00: VseOp::vse8_v({{ + Mem_vc.as()[i] = Vs3_ub[i]; + }}, inst_flags=VectorUnitStrideStoreOp); + format VsSegOp { + 0x01: vsseg2e8_v({{ + Mem_vc.as()[i] = Vs3_ub[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x02: vsseg3e8_v({{ + Mem_vc.as()[i] = Vs3_ub[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x03: vsseg4e8_v({{ + Mem_vc.as()[i] = Vs3_ub[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x04: vsseg5e8_v({{ + Mem_vc.as()[i] = Vs3_ub[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x05: vsseg6e8_v({{ + Mem_vc.as()[i] = Vs3_ub[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x06: vsseg7e8_v({{ + Mem_vc.as()[i] = Vs3_ub[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x07: vsseg8e8_v({{ + Mem_vc.as()[i] = Vs3_ub[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + } + } format VsWholeOp { 0x8: decode NF { 0x0: vs1r_v({{ @@ -1402,9 +1427,34 @@ decode QUADRANT default Unknown::unknown() { } 0x5: decode MOP { 0x0: decode SUMOP { - 0x00: VseOp::vse16_v({{ - Mem_vc.as()[i] = Vs3_uh[i]; - }}, inst_flags=VectorUnitStrideStoreOp); + 0x00: decode NF { + 0x00: VseOp::vse16_v({{ + Mem_vc.as()[i] = Vs3_uh[i]; + }}, inst_flags=VectorUnitStrideStoreOp); + format VsSegOp { + 0x01: vsseg2e16_v({{ + Mem_vc.as()[i] = Vs3_uh[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x02: vsseg3e16_v({{ + Mem_vc.as()[i] = Vs3_uh[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x03: vsseg4e16_v({{ + Mem_vc.as()[i] = Vs3_uh[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x04: vsseg5e16_v({{ + Mem_vc.as()[i] = Vs3_uh[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x05: vsseg6e16_v({{ + Mem_vc.as()[i] = Vs3_uh[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x06: vsseg7e16_v({{ + Mem_vc.as()[i] = Vs3_uh[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x07: vsseg8e16_v({{ + Mem_vc.as()[i] = Vs3_uh[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + } + } } 0x1: VsIndexOp::vsuxei16_v({{ Mem_vc.as()[0] = Vs3_vu[vs3ElemIdx]; @@ -1422,9 +1472,34 @@ decode QUADRANT default Unknown::unknown() { } 0x6: decode MOP { 0x0: decode SUMOP { - 0x00: VseOp::vse32_v({{ - Mem_vc.as()[i] = Vs3_uw[i]; - }}, inst_flags=VectorUnitStrideStoreOp); + 0x00: decode NF { + 0x00: VseOp::vse32_v({{ + Mem_vc.as()[i] = Vs3_uw[i]; + }}, inst_flags=VectorUnitStrideStoreOp); + format VsSegOp { + 0x01: vsseg2e32_v({{ + Mem_vc.as()[i] = Vs3_uw[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x02: vsseg3e32_v({{ + Mem_vc.as()[i] = Vs3_uw[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x03: vsseg4e32_v({{ + Mem_vc.as()[i] = Vs3_uw[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x04: vsseg5e32_v({{ + Mem_vc.as()[i] = Vs3_uw[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x05: vsseg6e32_v({{ + Mem_vc.as()[i] = Vs3_uw[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x06: vsseg7e32_v({{ + Mem_vc.as()[i] = Vs3_uw[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x07: vsseg8e32_v({{ + Mem_vc.as()[i] = Vs3_uw[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + } + } } 0x1: VsIndexOp::vsuxei32_v({{ Mem_vc.as()[0] = Vs3_vu[vs3ElemIdx]; @@ -1442,9 +1517,34 @@ decode QUADRANT default Unknown::unknown() { } 0x7: decode MOP { 0x0: decode SUMOP { - 0x00: VseOp::vse64_v({{ - Mem_vc.as()[i] = Vs3_ud[i]; - }}, inst_flags=VectorUnitStrideStoreOp); + 0x00: decode NF { + 0x00: VseOp::vse64_v({{ + Mem_vc.as()[i] = Vs3_ud[i]; + }}, inst_flags=VectorUnitStrideStoreOp); + format VsSegOp { + 0x01: vsseg2e64_v({{ + Mem_vc.as()[i] = Vs3_ud[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x02: vsseg3e64_v({{ + Mem_vc.as()[i] = Vs3_ud[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x03: vsseg4e64_v({{ + Mem_vc.as()[i] = Vs3_ud[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x04: vsseg5e64_v({{ + Mem_vc.as()[i] = Vs3_ud[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x05: vsseg6e64_v({{ + Mem_vc.as()[i] = Vs3_ud[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x06: vsseg7e64_v({{ + Mem_vc.as()[i] = Vs3_ud[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + 0x07: vsseg8e64_v({{ + Mem_vc.as()[i] = Vs3_ud[i]; + }}, inst_flags=VectorUnitStrideSegmentedStoreOp); + } + } } 0x1: VsIndexOp::vsuxei64_v({{ Mem_vc.as()[0] = Vs3_vu[vs3ElemIdx]; diff --git a/src/arch/riscv/isa/formats/vector_mem.isa b/src/arch/riscv/isa/formats/vector_mem.isa index b86c7a37c4..560673f4cb 100644 --- a/src/arch/riscv/isa/formats/vector_mem.isa +++ b/src/arch/riscv/isa/formats/vector_mem.isa @@ -258,3 +258,16 @@ def format VlSegOp( VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags, 'VlSegMacroInst', exec_template_base='VlSeg') }}; + +def format VsSegOp( + memacc_code, + ea_code={{ + EA = Rs1 + mem_size * (microIdx + (field * numMicroops)); + }}, + mem_flags=[], + inst_flags=[] +) {{ + (header_output, decoder_output, decode_block, exec_output) = \ + VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags, + 'VsSegMacroInst', exec_template_base='VsSeg') +}}; diff --git a/src/arch/riscv/isa/templates/vector_mem.isa b/src/arch/riscv/isa/templates/vector_mem.isa index e691c86d95..6393906043 100644 --- a/src/arch/riscv/isa/templates/vector_mem.isa +++ b/src/arch/riscv/isa/templates/vector_mem.isa @@ -1867,6 +1867,235 @@ Fault }}; +def template VsSegConstructor {{ + +%(class_name)s::%(class_name)s(ExtMachInst _machInst, uint32_t _vlen) + : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s, _vlen) +{ + %(set_reg_idx_arr)s; + %(constructor)s; + + const int32_t micro_vlmax = vlen / width_EEW(_machInst.width); + const uint32_t num_microops = ceil((float) this->vl / (micro_vlmax)); + int32_t remaining_vl = this->vl; + int32_t micro_vl = std::min(remaining_vl, micro_vlmax); + size_t NFIELDS = machInst.nf + 1; + StaticInstPtr microop; + uint32_t size_per_elem = width_EEW(_machInst.width) / 8; + + if (micro_vl == 0) { + microop = new VectorNopMicroInst(_machInst); + this->microops.push_back(microop); + } else { + for (int f = 0; f < NFIELDS; ++f) { + remaining_vl = this->vl; + micro_vl = std::min(remaining_vl, micro_vlmax); + for (int i = 0; i < num_microops && micro_vl > 0; ++i) { + microop = new VsSegIntrlvMicroInst(_machInst, micro_vl, + _machInst.vs3, NFIELDS, i, num_microops, f, vlen, + size_per_elem); + this->microops.push_back(microop); + micro_vl = std::min(remaining_vl -= micro_vlmax, micro_vlmax); + } + } + for (int f = 0; f < NFIELDS; ++f) { + remaining_vl = this->vl; + micro_vl = std::min(remaining_vl, micro_vlmax); + for (int i = 0; i < num_microops && micro_vl > 0; ++i) { + microop = new %(class_name)sMicro(_machInst, micro_vl, i, + num_microops, f, NFIELDS, vlen); + microop->setDelayedCommit(); + microop->setFlag(IsStore); + this->microops.push_back(microop); + micro_vl = std::min(remaining_vl -= micro_vlmax, micro_vlmax); + } + } + } + + this->microops.front()->setFlag(IsFirstMicroop); + this->microops.back()->setFlag(IsLastMicroop); + this->flags[IsVector] = true; +} +}}; + +def template VsSegMicroDeclare {{ + +class %(class_name)s : public %(base_class)s +{ +private: + // rs1, rs2, vd, vm + RegId srcRegIdxArr[4]; + RegId destRegIdxArr[1]; + uint32_t field; + uint32_t numFields; + uint32_t numMicroops; +public: + %(class_name)s(ExtMachInst _machInst, uint32_t _microVl, + uint32_t _microIdx, uint32_t _numMicroops, uint32_t _field, + uint32_t _numFields, uint32_t _vlen); + + Fault execute(ExecContext *, trace::InstRecord *) const override; + Fault initiateAcc(ExecContext *, trace::InstRecord *) const override; + Fault completeAcc(PacketPtr, ExecContext *, + trace::InstRecord *) const override; + using %(base_class)s::generateDisassembly; +}; + +}}; + +def template VsSegMicroConstructor {{ + + %(class_name)s::%(class_name)s(ExtMachInst _machInst, uint32_t _microVl, + uint32_t _microIdx, uint32_t _numMicroops, uint32_t _field, + uint32_t _numFields, uint32_t _vlen) + : %(base_class)s("%(mnemonic)s", _machInst, %(op_class)s, _microVl, + _microIdx, _numMicroops, _field, _numFields, _vlen) +{ + %(set_reg_idx_arr)s; + + _numSrcRegs = 0; + _numDestRegs = 0; + field = _field; + numFields = _numFields; + numMicroops = _numMicroops; + + setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]); + setSrcRegIdx(_numSrcRegs++, vecRegClass[VecMemInternalReg0 + _microIdx + + (field * numMicroops)]); + + if (!_machInst.vm) { + setSrcRegIdx(_numSrcRegs++, vecRegClass[0]); + } + this->flags[IsVector] = true; + this->flags[IsStore] = true; +} + +}}; + +def template VsSegMicroExecute {{ + +Fault +%(class_name)s::execute(ExecContext *xc, trace::InstRecord *traceData) const +{ + Addr EA; + + const size_t eewb = width_EEW(machInst.width) / 8; + const size_t mem_size = eewb * microVl; + + RiscvISA::vreg_t tmp_v0; + uint8_t *v0; + MISA misa = xc->readMiscReg(MISCREG_ISA); + STATUS status = xc->readMiscReg(MISCREG_STATUS); + if (!misa.rvv || status.vs == VPUStatus::OFF) { + return std::make_shared( + "RVV is disabled or VPU is off", machInst); + } + if (machInst.vill) + return std::make_shared("VILL is set", machInst); + if(!machInst.vm) { + xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0); + v0 = tmp_v0.as(); + } + + %(op_decl)s; + %(op_rd)s; + %(set_vlen)s; + %(ea_code)s; + + const int64_t vlmul = vtype_vlmul(machInst.vtype8); + panic_if((pow(2, vlmul) * this->numFields) > 8, + "LMUL value is illegal for vsseg inst"); + + const size_t micro_vlmax = vlen / width_EEW(machInst.width); + + std::vector byte_enable(mem_size, false); + size_t ei; + for (size_t i = 0; i < microVl; i++) { + ei = i + micro_vlmax * microIdx; + if (machInst.vm || elem_mask_vseg(v0, ei + (field * microVl), + this->numFields)) { + %(memacc_code)s; + auto it = byte_enable.begin() + i * eewb; + std::fill(it, it + eewb, true); + } + } + + Fault fault; + fault = xc->writeMem(Mem.as(), mem_size, EA, memAccessFlags, + nullptr, byte_enable); + return fault; +} + +}}; + +def template VsSegMicroInitiateAcc {{ + +Fault +%(class_name)s::initiateAcc(ExecContext* xc, + trace::InstRecord* traceData) const +{ + + Addr EA; + + const size_t eewb = width_EEW(machInst.width) / 8; + const size_t mem_size = eewb * microVl; + + RiscvISA::vreg_t tmp_v0; + uint8_t *v0; + MISA misa = xc->readMiscReg(MISCREG_ISA); + STATUS status = xc->readMiscReg(MISCREG_STATUS); + if (!misa.rvv || status.vs == VPUStatus::OFF) { + return std::make_shared( + "RVV is disabled or VPU is off", machInst); + } + if (machInst.vill) + return std::make_shared("VILL is set", machInst); + if(!machInst.vm) { + xc->getRegOperand(this, _numSrcRegs - 1, &tmp_v0); + v0 = tmp_v0.as(); + } + + %(op_decl)s; + %(op_rd)s; + %(ea_code)s; + + const int64_t vlmul = vtype_vlmul(machInst.vtype8); + panic_if((pow(2, vlmul) * this->numFields) > 8, + "LMUL value is illegal for vsseg inst"); + + + const size_t micro_vlmax = vlen / width_EEW(machInst.width); + + std::vector byte_enable(mem_size, false); + size_t ei; + for (size_t i = 0; i < microVl; i++) { + ei = i + micro_vlmax * microIdx; + if (machInst.vm || elem_mask_vseg(v0, ei + (field * microVl), + this->numFields)) { + %(memacc_code)s; + auto it = byte_enable.begin() + i * eewb; + std::fill(it, it + eewb, true); + } + } + + Fault fault; + fault = xc->writeMem(Mem.as(), mem_size, EA, memAccessFlags, + nullptr, byte_enable); + return fault; +} + +}}; + +def template VsSegMicroCompleteAcc {{ + +Fault +%(class_name)s::completeAcc(PacketPtr pkt, ExecContext *xc, + trace::InstRecord *traceData) const +{ + return NoFault; +} + +}}; def template VMemBaseDecodeBlock {{ return new %(class_name)s(machInst, vlen); diff --git a/src/arch/riscv/utility.hh b/src/arch/riscv/utility.hh index cd1379b8d2..73cd7126ce 100644 --- a/src/arch/riscv/utility.hh +++ b/src/arch/riscv/utility.hh @@ -308,7 +308,7 @@ elem_mask(const T* vs, const int index) template inline int -elem_mask_vlseg(const T* vs, const int elem, const int num_fields) +elem_mask_vseg(const T* vs, const int elem, const int num_fields) { int index = floor(elem / num_fields); static_assert(std::is_integral_v); diff --git a/src/cpu/FuncUnit.py b/src/cpu/FuncUnit.py index a32138a29c..e6a5040b9c 100644 --- a/src/cpu/FuncUnit.py +++ b/src/cpu/FuncUnit.py @@ -102,6 +102,7 @@ class OpClass(Enum): "InstPrefetch", "VectorUnitStrideLoad", "VectorUnitStrideStore", + "VectorUnitStrideSegmentedStore", "VectorUnitStrideMaskLoad", "VectorUnitStrideMaskStore", "VectorStridedLoad", diff --git a/src/cpu/op_class.hh b/src/cpu/op_class.hh index 6690edcae5..54e9c272db 100644 --- a/src/cpu/op_class.hh +++ b/src/cpu/op_class.hh @@ -135,6 +135,8 @@ static const OpClass VectorMiscOp = enums::VectorMisc; static const OpClass VectorIntegerExtensionOp = enums::VectorIntegerExtension; static const OpClass VectorUnitStrideSegmentedLoadOp = enums::VectorUnitStrideSegmentedLoad; static const OpClass VectorConfigOp = enums::VectorConfig; +static const OpClass VectorUnitStrideSegmentedStoreOp + = enums::VectorUnitStrideSegmentedStore; static const OpClass Num_OpClasses = enums::Num_OpClass; } // namespace gem5