diff --git a/src/arch/generic/vec_reg.hh b/src/arch/generic/vec_reg.hh index d643c9db5b..e399aa2ab7 100644 --- a/src/arch/generic/vec_reg.hh +++ b/src/arch/generic/vec_reg.hh @@ -140,8 +140,11 @@ class VecRegContainer VecRegContainer() {} VecRegContainer(const VecRegContainer &) = default; + /** Set the container. */ + void set(uint8_t val) { memset(container.data(), val, SIZE); } + /** Zero the container. */ - void zero() { memset(container.data(), 0, SIZE); } + void zero() { set(0); } /** Assignment operators. */ /** @{ */ diff --git a/src/arch/riscv/insts/vector.cc b/src/arch/riscv/insts/vector.cc index d73fb93882..5bcfa05c84 100644 --- a/src/arch/riscv/insts/vector.cc +++ b/src/arch/riscv/insts/vector.cc @@ -444,14 +444,14 @@ VMaskMergeMicroInst::execute(ExecContext* xc, uint32_t vlenb = pc_ptr->as().vlenb(); const uint32_t elems_per_vreg = vlenb / elemSize; size_t bit_cnt = elems_per_vreg; + + // mask tails are always treated as agnostic: writing 1s + tmp_d0.set(0xff); + vreg_t tmp_s; - xc->getRegOperand(this, 0, &tmp_s); - auto s = tmp_s.as(); - // cp the first result and tail - memcpy(Vd, s, vlenb); - for (uint8_t i = 1; i < this->_numSrcRegs; i++) { + for (uint8_t i = 0; i < this->_numSrcRegs; i++) { xc->getRegOperand(this, i, &tmp_s); - s = tmp_s.as(); + auto s = tmp_s.as(); if (elems_per_vreg < 8) { const uint32_t m = (1 << elems_per_vreg) - 1; const uint32_t mask = m << (i * elems_per_vreg % 8); @@ -658,7 +658,13 @@ VlSegDeIntrlvMicroInst::execute(ExecContext* xc, trace::InstRecord* traceData) c for (uint32_t i = 0; i < numSrcs; i++) { xc->getRegOperand(this, i, &tmp_s); s = tmp_s.as(); - while(index < (i + 1) * elems_per_vreg) + + // copy tail/inactive elements from vtmp src + if (i == field) { + tmp_d0 = tmp_s; + } + + while (index < (i + 1) * elems_per_vreg) { memcpy(Vd + (elem * sizeOfElement), s + ((index % elems_per_vreg) * sizeOfElement), @@ -705,8 +711,6 @@ std::string VsSegMicroInst::generateDisassembly(Addr pc, ss << mnemonic << ' ' << 
registerName(destRegIdx(0)) << ", " << '(' << registerName(srcRegIdx(0)) << ')' << ", "<< registerName(srcRegIdx(1)); - if (microIdx != 0 || machInst.vtype8.vma == 0 || machInst.vtype8.vta == 0) - ss << ", " << registerName(srcRegIdx(2)); if (!machInst.vm) ss << ", v0.t"; return ss.str(); @@ -797,5 +801,134 @@ VsSegIntrlvMicroInst::generateDisassembly(Addr pc, return ss.str(); } +VCpyVsMicroInst::VCpyVsMicroInst(ExtMachInst _machInst, uint32_t _microIdx, + uint8_t _vsRegIdx) + : VectorArithMicroInst("vcpyvs_v_micro", _machInst, SimdMiscOp, 0, + _microIdx) +{ + setRegIdxArrays( + reinterpret_cast( + &std::remove_pointer_t::srcRegIdxArr), + reinterpret_cast( + &std::remove_pointer_t::destRegIdxArr)); + + _numSrcRegs = 0; + _numDestRegs = 0; + setDestRegIdx(_numDestRegs++, vecRegClass[VecMemInternalReg0 + _microIdx]); + _numTypedDestRegs[VecRegClass]++; + setSrcRegIdx(_numSrcRegs++, vecRegClass[_vsRegIdx + _microIdx]); +} + +Fault +VCpyVsMicroInst::execute(ExecContext* xc, trace::InstRecord* traceData) const +{ + MISA misa = xc->readMiscReg(MISCREG_ISA); + STATUS status = xc->readMiscReg(MISCREG_STATUS); + + if (!misa.rvv || status.vs == VPUStatus::OFF) { + return std::make_shared( + "RVV is disabled or VPU is off", machInst); + } + + status.vs = VPUStatus::DIRTY; + xc->setMiscReg(MISCREG_STATUS, status); + + // copy vector source reg to vtmp + vreg_t& vtmp = *(vreg_t *)xc->getWritableRegOperand(this, 0); + vreg_t vs; + xc->getRegOperand(this, 0, &vs); + vtmp = vs; + + if (traceData) { + traceData->setData(vecRegClass, &vtmp); + } + + return NoFault; +} + +std::string +VCpyVsMicroInst::generateDisassembly(Addr pc, + const loader::SymbolTable *symtab) const +{ + std::stringstream ss; + ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", " + << registerName(srcRegIdx(0)); + return ss.str(); +} + +VPinVdMicroInst::VPinVdMicroInst(ExtMachInst _machInst, uint32_t _microIdx, + uint32_t _numVdPins, bool _hasVdOffset) + : VectorArithMicroInst("vpinvd_v_micro", 
_machInst, SimdMiscOp, 0, + _microIdx) + , hasVdOffset(_hasVdOffset) +{ + setRegIdxArrays( + reinterpret_cast( + &std::remove_pointer_t::srcRegIdxArr), + reinterpret_cast( + &std::remove_pointer_t::destRegIdxArr)); + + _numSrcRegs = 0; + _numDestRegs = 0; + setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _microIdx]); + _numTypedDestRegs[VecRegClass]++; + if (!_machInst.vtype8.vta || (!_machInst.vm && !_machInst.vtype8.vma) + || hasVdOffset) { + setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _microIdx]); + } + RegId Vd = destRegIdx(0); + Vd.setNumPinnedWrites(_numVdPins); + setDestRegIdx(0, Vd); +} + +Fault +VPinVdMicroInst::execute(ExecContext* xc, trace::InstRecord* traceData) const +{ + MISA misa = xc->readMiscReg(MISCREG_ISA); + STATUS status = xc->readMiscReg(MISCREG_STATUS); + + if (!misa.rvv || status.vs == VPUStatus::OFF) { + return std::make_shared( + "RVV is disabled or VPU is off", machInst); + } + + status.vs = VPUStatus::DIRTY; + xc->setMiscReg(MISCREG_STATUS, status); + + // tail/mask policy: both undisturbed if one is, 1s if none + vreg_t& vd = *(vreg_t *)xc->getWritableRegOperand(this, 0); + if (!machInst.vtype8.vta || (!machInst.vm && !machInst.vtype8.vma) + || hasVdOffset) { + vreg_t old_vd; + xc->getRegOperand(this, 0, &old_vd); + vd = old_vd; + } else { + vd.set(0xff); + } + + if (traceData) { + traceData->setData(vecRegClass, &vd); + } + + return NoFault; +} + +std::string +VPinVdMicroInst::generateDisassembly(Addr pc, + const loader::SymbolTable *symtab) const +{ + std::stringstream ss; + ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", "; + + if (!machInst.vtype8.vta || (!machInst.vm && !machInst.vtype8.vma) + || hasVdOffset) { + ss << registerName(srcRegIdx(0)); + } else { + ss << "~0"; + } + + return ss.str(); +} + } // namespace RiscvISA } // namespace gem5 diff --git a/src/arch/riscv/insts/vector.hh b/src/arch/riscv/insts/vector.hh index 07084fab36..e0c669f532 100644 --- a/src/arch/riscv/insts/vector.hh +++ 
b/src/arch/riscv/insts/vector.hh @@ -714,6 +714,35 @@ class VsSegIntrlvMicroInst : public VectorArithMicroInst const loader::SymbolTable *) const override; }; +class VCpyVsMicroInst : public VectorArithMicroInst +{ + private: + RegId srcRegIdxArr[1]; + RegId destRegIdxArr[1]; + + public: + VCpyVsMicroInst(ExtMachInst _machInst, uint32_t _microIdx, + uint8_t _vsRegIdx); + Fault execute(ExecContext *, trace::InstRecord *) const override; + std::string generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const override; +}; + +class VPinVdMicroInst : public VectorArithMicroInst +{ + private: + RegId srcRegIdxArr[1]; + RegId destRegIdxArr[1]; + bool hasVdOffset; + + public: + VPinVdMicroInst(ExtMachInst _machInst, uint32_t _microIdx, + uint32_t _numVdPins, bool _hasVdOffset=false); + Fault execute(ExecContext *, trace::InstRecord *) const override; + std::string generateDisassembly( + Addr pc, const loader::SymbolTable *symtab) const override; +}; + } // namespace RiscvISA } // namespace gem5 diff --git a/src/arch/riscv/isa/decoder.isa b/src/arch/riscv/isa/decoder.isa index 0eb10d89b2..98eedc102d 100644 --- a/src/arch/riscv/isa/decoder.isa +++ b/src/arch/riscv/isa/decoder.isa @@ -607,8 +607,6 @@ decode QUADRANT default Unknown::unknown() { if ((machInst.vm || elem_mask(v0, ei)) && i < this->microVl) { Vd_ub[i] = Mem_vc.as()[i]; - } else { - Vd_ub[i] = Vs2_ub[i]; } }}, inst_flags=SimdUnitStrideLoadOp); format VlSegOp { @@ -616,56 +614,42 @@ decode QUADRANT default Unknown::unknown() { if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 2)) && i < this->microVl) { Vd_ub[i] = Mem_vc.as()[i]; - } else { - Vd_ub[i] = Vs2_ub[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x02: vlseg3e8_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 3)) && i < this->microVl) { Vd_ub[i] = Mem_vc.as()[i]; - } else { - Vd_ub[i] = Vs2_ub[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x03: vlseg4e8_v({{ if ((machInst.vm || 
elem_mask_vseg(v0, ei + (field * micro_elems), 4)) && i < this->microVl) { Vd_ub[i] = Mem_vc.as()[i]; - } else { - Vd_ub[i] = Vs2_ub[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x04: vlseg5e8_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 5)) && i < this->microVl) { Vd_ub[i] = Mem_vc.as()[i]; - } else { - Vd_ub[i] = Vs2_ub[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x05: vlseg6e8_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 6)) && i < this->microVl) { Vd_ub[i] = Mem_vc.as()[i]; - } else { - Vd_ub[i] = Vs2_ub[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x06: vlseg7e8_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 7)) && i < this->microVl) { Vd_ub[i] = Mem_vc.as()[i]; - } else { - Vd_ub[i] = Vs2_ub[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x07: vlseg8e8_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 8)) && i < this->microVl) { Vd_ub[i] = Mem_vc.as()[i]; - } else { - Vd_ub[i] = Vs2_ub[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); } @@ -693,8 +677,6 @@ decode QUADRANT default Unknown::unknown() { if ((machInst.vm || elem_mask(v0, ei)) && i < this->microVl && i < this->faultIdx) { Vd_ub[i] = Mem_vc.as()[i]; - } else { - Vd_ub[i] = Vs2_ub[i]; } }}, inst_flags=SimdUnitStrideFaultOnlyFirstLoadOp); } @@ -719,8 +701,6 @@ decode QUADRANT default Unknown::unknown() { if ((machInst.vm || elem_mask(v0, ei)) && i < this->microVl) { Vd_uh[i] = Mem_vc.as()[i]; - } else { - Vd_uh[i] = Vs2_uh[i]; } }}, inst_flags=SimdUnitStrideLoadOp); format VlSegOp { @@ -728,56 +708,42 @@ decode QUADRANT default Unknown::unknown() { if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 2)) && i < this->microVl) { Vd_uh[i] = Mem_vc.as()[i]; - } else { - Vd_uh[i] = Vs2_uh[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x02: vlseg3e16_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 3)) && i < this->microVl) { 
Vd_uh[i] = Mem_vc.as()[i]; - } else { - Vd_uh[i] = Vs2_uh[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x03: vlseg4e16_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 4)) && i < this->microVl) { Vd_uh[i] = Mem_vc.as()[i]; - } else { - Vd_uh[i] = Vs2_uh[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x04: vlseg5e16_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 5)) && i < this->microVl) { Vd_uh[i] = Mem_vc.as()[i]; - } else { - Vd_uh[i] = Vs2_uh[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x05: vlseg6e16_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 6)) && i < this->microVl) { Vd_uh[i] = Mem_vc.as()[i]; - } else { - Vd_uh[i] = Vs2_uh[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x06: vlseg7e16_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 7)) && i < this->microVl) { Vd_uh[i] = Mem_vc.as()[i]; - } else { - Vd_uh[i] = Vs2_uh[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x07: vlseg8e16_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 8)) && i < this->microVl) { Vd_uh[i] = Mem_vc.as()[i]; - } else { - Vd_uh[i] = Vs2_uh[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); } @@ -802,8 +768,6 @@ decode QUADRANT default Unknown::unknown() { if ((machInst.vm || elem_mask(v0, ei)) && i < this->microVl && i < this->faultIdx) { Vd_uh[i] = Mem_vc.as()[i]; - } else { - Vd_uh[i] = Vs2_uh[i]; } }}, inst_flags=SimdUnitStrideFaultOnlyFirstLoadOp); } @@ -828,8 +792,6 @@ decode QUADRANT default Unknown::unknown() { if ((machInst.vm || elem_mask(v0, ei)) && i < this->microVl) { Vd_uw[i] = Mem_vc.as()[i]; - } else { - Vd_uw[i] = Vs2_uw[i]; } }}, inst_flags=SimdUnitStrideLoadOp); format VlSegOp { @@ -837,56 +799,42 @@ decode QUADRANT default Unknown::unknown() { if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 2)) && i < this->microVl) { Vd_uw[i] = Mem_vc.as()[i]; - } else { - Vd_uw[i] = Vs2_uw[i]; } }}, 
inst_flags=SimdUnitStrideSegmentedLoadOp); 0x02: vlseg3e32_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 3)) && i < this->microVl) { Vd_uw[i] = Mem_vc.as()[i]; - } else { - Vd_uw[i] = Vs2_uw[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x03: vlseg4e32_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 4)) && i < this->microVl) { Vd_uw[i] = Mem_vc.as()[i]; - } else { - Vd_uw[i] = Vs2_uw[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x04: vlseg5e32_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 5)) && i < this->microVl) { Vd_uw[i] = Mem_vc.as()[i]; - } else { - Vd_uw[i] = Vs2_uw[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x05: vlseg6e32_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 6)) && i < this->microVl) { Vd_uw[i] = Mem_vc.as()[i]; - } else { - Vd_uw[i] = Vs2_uw[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x06: vlseg7e32_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 7)) && i < this->microVl) { Vd_uw[i] = Mem_vc.as()[i]; - } else { - Vd_uw[i] = Vs2_uw[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x07: vlseg8e32_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 8)) && i < this->microVl) { Vd_uw[i] = Mem_vc.as()[i]; - } else { - Vd_uw[i] = Vs2_uw[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); } @@ -911,8 +859,6 @@ decode QUADRANT default Unknown::unknown() { if ((machInst.vm || elem_mask(v0, ei)) && i < this->microVl && i < this->faultIdx) { Vd_uw[i] = Mem_vc.as()[i]; - } else { - Vd_uw[i] = Vs2_uw[i]; } }}, inst_flags=SimdUnitStrideFaultOnlyFirstLoadOp); } @@ -937,8 +883,6 @@ decode QUADRANT default Unknown::unknown() { if ((machInst.vm || elem_mask(v0, ei)) && i < this->microVl) { Vd_ud[i] = Mem_vc.as()[i]; - } else { - Vd_ud[i] = Vs2_ud[i]; } }}, inst_flags=SimdUnitStrideLoadOp); format VlSegOp { @@ -946,56 +890,42 @@ decode QUADRANT default Unknown::unknown() { if 
((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 2)) && i < this->microVl) { Vd_ud[i] = Mem_vc.as()[i]; - } else { - Vd_ud[i] = Vs2_ud[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x02: vlseg3e64_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 3)) && i < this->microVl) { Vd_ud[i] = Mem_vc.as()[i]; - } else { - Vd_ud[i] = Vs2_ud[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x03: vlseg4e64_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 4)) && i < this->microVl) { Vd_ud[i] = Mem_vc.as()[i]; - } else { - Vd_ud[i] = Vs2_ud[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x04: vlseg5e64_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 5)) && i < this->microVl) { Vd_ud[i] = Mem_vc.as()[i]; - } else { - Vd_ud[i] = Vs2_ud[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x05: vlseg6e64_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 6)) && i < this->microVl) { Vd_ud[i] = Mem_vc.as()[i]; - } else { - Vd_ud[i] = Vs2_ud[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x06: vlseg7e64_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 7)) && i < this->microVl) { Vd_ud[i] = Mem_vc.as()[i]; - } else { - Vd_ud[i] = Vs2_ud[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); 0x07: vlseg8e64_v({{ if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 8)) && i < this->microVl) { Vd_ud[i] = Mem_vc.as()[i]; - } else { - Vd_ud[i] = Vs2_ud[i]; } }}, inst_flags=SimdUnitStrideSegmentedLoadOp); } @@ -1020,8 +950,6 @@ decode QUADRANT default Unknown::unknown() { if ((machInst.vm || elem_mask(v0, ei)) && i < this->microVl && i < this->faultIdx) { Vd_ud[i] = Mem_vc.as()[i]; - } else { - Vd_ud[i] = Vs2_ud[i]; } }}, inst_flags=SimdUnitStrideFaultOnlyFirstLoadOp); } diff --git a/src/arch/riscv/isa/formats/vector_mem.isa b/src/arch/riscv/isa/formats/vector_mem.isa index 560673f4cb..4782278b9c 100644 --- 
a/src/arch/riscv/isa/formats/vector_mem.isa +++ b/src/arch/riscv/isa/formats/vector_mem.isa @@ -32,7 +32,7 @@ let {{ def setVlen(): return "uint32_t vlen = VlenbBits * 8;\n" def setVlenb(): - return "uint32_t vlenb = VlenbBits;\n" + return "[[maybe_unused]] uint32_t vlenb = VlenbBits;\n" def declareVMemTemplate(class_name): return f''' @@ -55,6 +55,17 @@ def getFaultCode(): } ''' +def getTailMaskPolicyCode(): + return ''' + if (!machInst.vtype8.vta || (!machInst.vm && !machInst.vtype8.vma)) { + RiscvISA::vreg_t old_vd; + xc->getRegOperand(this, 1, &old_vd); + tmp_d0 = old_vd; + } else { + tmp_d0.set(0xff); + } + ''' + def VMemBase(name, Name, ea_code, memacc_code, mem_flags, inst_flags, base_class, postacc_code='', declare_template_base=VMemMacroDeclare, @@ -94,7 +105,8 @@ def VMemBase(name, Name, ea_code, memacc_code, mem_flags, 'set_vlenb': setVlenb(), 'set_vlen': setVlen(), 'declare_vmem_template': declareVMemTemplate(Name + 'Micro'), - 'fault_code': getFaultCode() if fault_only_first else ''}, + 'fault_code': getFaultCode() if fault_only_first else '', + 'tail_mask_policy_code': getTailMaskPolicyCode()}, inst_flags) if mem_flags: diff --git a/src/arch/riscv/isa/templates/vector_mem.isa b/src/arch/riscv/isa/templates/vector_mem.isa index 77418664dd..1d9ce70589 100644 --- a/src/arch/riscv/isa/templates/vector_mem.isa +++ b/src/arch/riscv/isa/templates/vector_mem.isa @@ -128,7 +128,9 @@ def template VleMicroConstructor {{ setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _microIdx]); _numTypedDestRegs[VecRegClass]++; setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]); - setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _microIdx]); + if (!_machInst.vtype8.vta || (!_machInst.vm && !_machInst.vtype8.vma)) { + setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _microIdx]); + } if (!_machInst.vm) { setSrcRegIdx(_numSrcRegs++, vecRegClass[0]); } @@ -245,6 +247,9 @@ Fault status.vs = VPUStatus::DIRTY; xc->setMiscReg(MISCREG_STATUS, status); + // 
tail/mask policy: both undisturbed if one is, 1s if none + %(tail_mask_policy_code)s + RiscvISA::vreg_t tmp_v0; uint8_t *v0; if(!machInst.vm) { @@ -815,6 +820,17 @@ def template VlStrideConstructor {{ microop = new VectorNopMicroInst(_machInst); this->microops.push_back(microop); } + + const uint8_t num_pinvd_microops = ceil((float) this->vl / + num_elems_per_vreg); + for (uint32_t i = 0; i < num_pinvd_microops; i++) { + uint32_t vdNumElems = (vl >= num_elems_per_vreg*(i+1)) + ? num_elems_per_vreg : vl-num_elems_per_vreg*i; + microop = new VPinVdMicroInst(machInst, i, vdNumElems); + microop->setFlag(IsDelayedCommit); + this->microops.push_back(microop); + } + for (int i = 0; micro_vl > 0; ++i) { for (int j = 0; j < micro_vl; ++j) { microop = new %(class_name)sMicro(machInst, i, j, micro_vl); @@ -838,7 +854,7 @@ def template VlStrideMicroDeclare {{ class %(class_name)s : public %(base_class)s { private: - // rs1, rs2, vd, vm + // rs1, rs2, vtmp0, vm RegId srcRegIdxArr[4]; RegId destRegIdxArr[1]; public: @@ -869,8 +885,8 @@ def template VlStrideMicroConstructor {{ _numTypedDestRegs[VecRegClass]++; setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]); setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs2]); - // We treat agnostic as undistrubed - setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _regIdx]); + // vtmp0 as dummy src reg to create dependency with pin vd micro + setSrcRegIdx(_numSrcRegs++, vecRegClass[VecMemInternalReg0]); if (!_machInst.vm) { setSrcRegIdx(_numSrcRegs++, vecRegClass[0]); } @@ -983,14 +999,6 @@ Fault status.vs = VPUStatus::DIRTY; xc->setMiscReg(MISCREG_STATUS, status); - constexpr uint8_t elem_size = sizeof(Vd[0]); - - RiscvISA::vreg_t old_vd; - decltype(Vd) old_Vd = nullptr; - // We treat agnostic as undistrubed - xc->getRegOperand(this, 2, &old_vd); - old_Vd = old_vd.as >(); - RiscvISA::vreg_t tmp_v0; uint8_t *v0; if (!machInst.vm) { @@ -998,17 +1006,6 @@ Fault v0 = tmp_v0.as(); } - if (microIdx == 0) { - // treat vma as vmu - // 
if (machInst.vtype8.vma == 0) - memcpy(Vd, old_Vd, microVl * elem_size); - // treat vta as vtu - // if (machInst.vtype8.vta == 0) - memcpy(Vd + microVl, old_Vd + microVl, vlenb - microVl * elem_size); - } else { - memcpy(Vd, old_Vd, vlenb); - } - size_t ei = this->regIdx * vlenb / sizeof(Vd[0]) + this->microIdx; if (machInst.vm || elem_mask(v0, ei)) { memcpy(Mem.as(), pkt->getPtr(), pkt->getSize()); @@ -1220,6 +1217,21 @@ template microop = new VectorNopMicroInst(_machInst); this->microops.push_back(microop); } + + const uint32_t vd_vlmax = vlenb / vd_eewb; + const uint8_t num_pinvdcpyvs_microops = ceil((float) this->vl/vd_vlmax); + for (uint32_t i = 0; i < num_pinvdcpyvs_microops; i++) { + uint32_t vdNumElems = (vl >= vd_vlmax*(i+1)) ? vd_vlmax:vl-vd_vlmax*i; + + microop = new VCpyVsMicroInst(machInst, i, machInst.vs2); + microop->setFlag(IsDelayedCommit); + this->microops.push_back(microop); + + microop = new VPinVdMicroInst(machInst, i, vdNumElems); + microop->setFlag(IsDelayedCommit); + this->microops.push_back(microop); + } + for (uint32_t i = 0; micro_vl > 0; i++) { for (uint32_t j = 0; j < micro_vl; ++j) { uint32_t vdRegIdx = i / vd_split_num; @@ -1251,8 +1263,8 @@ template class %(class_name)s : public %(base_class)s { private: - // rs1, vs2, vd, vm - RegId srcRegIdxArr[4]; + // rs1, vs2, vm + RegId srcRegIdxArr[3]; RegId destRegIdxArr[1]; public: %(class_name)s(ExtMachInst _machInst, @@ -1283,9 +1295,7 @@ template setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _vdRegIdx]); _numTypedDestRegs[VecRegClass]++; setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]); - setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2 + _vs2RegIdx]); - // We treat agnostic as undistrubed - setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _vdRegIdx]); + setSrcRegIdx(_numSrcRegs++, vecRegClass[VecMemInternalReg0 + _vs2RegIdx]); if (!_machInst.vm) { setSrcRegIdx(_numSrcRegs++, vecRegClass[0]); } @@ -1408,12 +1418,6 @@ Fault constexpr uint8_t elem_size = 
sizeof(Vd[0]); - RiscvISA::vreg_t old_vd;; - decltype(Vd) old_Vd = nullptr; - // We treat agnostic as undistrubed - xc->getRegOperand(this, 2, &old_vd); - old_Vd = old_vd.as >(); - RiscvISA::vreg_t tmp_v0; uint8_t *v0; if (!machInst.vm) { @@ -1421,8 +1425,6 @@ Fault v0 = tmp_v0.as(); } - memcpy(Vd, old_Vd, vlenb); - size_t ei = this->vdRegIdx * vlenb / elem_size + this->vdElemIdx; if (machInst.vm || elem_mask(v0, ei)) { memcpy(Mem.as(), pkt->getPtr(), pkt->getSize()); @@ -1690,8 +1692,8 @@ def template VlSegMicroDeclare {{ class %(class_name)s : public %(base_class)s { private: - // rs1, rs2, vd, vm - RegId srcRegIdxArr[4]; + // rs1, vd, vm + RegId srcRegIdxArr[3]; RegId destRegIdxArr[1]; uint32_t field; uint32_t numFields; @@ -1724,8 +1726,10 @@ def template VlSegMicroConstructor {{ (field * numMicroops)]); _numTypedDestRegs[VecRegClass]++; setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]); - setSrcRegIdx(_numSrcRegs++, vecRegClass[VecMemInternalReg0 + _microIdx + - (field * numMicroops)]); + if (!_machInst.vtype8.vta || (!_machInst.vm && !_machInst.vtype8.vma)) { + setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _microIdx + + (field * numMicroops)]); + } if (!_machInst.vm) { setSrcRegIdx(_numSrcRegs++, vecRegClass[0]); } @@ -1843,6 +1847,9 @@ Fault status.vs = VPUStatus::DIRTY; xc->setMiscReg(MISCREG_STATUS, status); + // tail/mask policy: both undisturbed if one is, 1s if none + %(tail_mask_policy_code)s + RiscvISA::vreg_t tmp_v0; uint8_t *v0; if(!machInst.vm) {