arch-riscv: add agnostic option to vector tail/mask policy for mem and arith instructions (#1135)

These two commits add agnostic capability for both tail/mask policies,
for vector memory and arithmetic instructions respectively. The common
policy for instructions is to act as undisturbed if either policy (tail or
mask) requests undisturbed, and to write all 1s otherwise.

For those instructions in which multiple micro instructions are
instantiated to write to the same register (`VlStride` and `VlIndex` for
memory, and `VectorGather`, `VectorSlideUp` and `VectorSlideDown` for
arithmetic), a (new) micro instruction named `VPinVdCpyVsMicroInst` has
been used to pin the destination register so that there's no need to
copy the partial results between them. This idea is similar to what's on
ARM's SVE code. This micro-op also implements the tail/mask policy for
these cases.

Finally, it's worth noting that while now using an agnostic policy for
both tail/mask should remove all dependencies with old destination
registers, there's an exception with `VectorSlideUp`. The
`vslideup_{vx,vi}` instructions need the elements in the offset to be
unchanged. The current implementation overrides the current vta/vma and
makes them act as undisturbed, since they require the old destination
register anyway. There's a minor issue with this, though: the
`v{,f}slide1up` variants do not need this, but since they share the same
constructor, they will all act the same.

Related issue #997.
This commit is contained in:
Jason Lowe-Power
2024-07-08 11:47:11 -07:00
committed by GitHub
8 changed files with 363 additions and 249 deletions

View File

@@ -140,8 +140,11 @@ class VecRegContainer
VecRegContainer() {}
VecRegContainer(const VecRegContainer &) = default;
/** Set the container. */
void set(uint8_t val) { memset(container.data(), val, SIZE); }
/** Zero the container. */
void zero() { memset(container.data(), 0, SIZE); }
void zero() { set(0); }
/** Assignment operators. */
/** @{ */

View File

@@ -444,14 +444,14 @@ VMaskMergeMicroInst::execute(ExecContext* xc,
uint32_t vlenb = pc_ptr->as<PCState>().vlenb();
const uint32_t elems_per_vreg = vlenb / elemSize;
size_t bit_cnt = elems_per_vreg;
// mask tails are always treated as agnostic: writing 1s
tmp_d0.set(0xff);
vreg_t tmp_s;
xc->getRegOperand(this, 0, &tmp_s);
auto s = tmp_s.as<uint8_t>();
// cp the first result and tail
memcpy(Vd, s, vlenb);
for (uint8_t i = 1; i < this->_numSrcRegs; i++) {
for (uint8_t i = 0; i < this->_numSrcRegs; i++) {
xc->getRegOperand(this, i, &tmp_s);
s = tmp_s.as<uint8_t>();
auto s = tmp_s.as<uint8_t>();
if (elems_per_vreg < 8) {
const uint32_t m = (1 << elems_per_vreg) - 1;
const uint32_t mask = m << (i * elems_per_vreg % 8);
@@ -658,7 +658,13 @@ VlSegDeIntrlvMicroInst::execute(ExecContext* xc, trace::InstRecord* traceData) c
for (uint32_t i = 0; i < numSrcs; i++) {
xc->getRegOperand(this, i, &tmp_s);
s = tmp_s.as<uint8_t>();
while(index < (i + 1) * elems_per_vreg)
// copy tail/inactive elements from vtmp src
if (i == field) {
tmp_d0 = tmp_s;
}
while (index < (i + 1) * elems_per_vreg)
{
memcpy(Vd + (elem * sizeOfElement),
s + ((index % elems_per_vreg) * sizeOfElement),
@@ -705,8 +711,6 @@ std::string VsSegMicroInst::generateDisassembly(Addr pc,
ss << mnemonic << ' ' << registerName(destRegIdx(0)) << ", " <<
'(' << registerName(srcRegIdx(0)) << ')' <<
", "<< registerName(srcRegIdx(1));
if (microIdx != 0 || machInst.vtype8.vma == 0 || machInst.vtype8.vta == 0)
ss << ", " << registerName(srcRegIdx(2));
if (!machInst.vm)
ss << ", v0.t";
return ss.str();
@@ -797,5 +801,134 @@ VsSegIntrlvMicroInst::generateDisassembly(Addr pc,
return ss.str();
}
// Micro-op that snapshots one vector source register into an internal
// scratch register (VecMemInternalReg0 + microIdx), so sibling micro-ops
// can depend on a stable copy of the source instead of re-reading the
// architectural register.
VCpyVsMicroInst::VCpyVsMicroInst(ExtMachInst _machInst, uint32_t _microIdx,
        uint8_t _vsRegIdx)
    : VectorArithMicroInst("vcpyvs_v_micro", _machInst, SimdMiscOp, 0,
                           _microIdx)
{
    // Point the base class at this subclass' register-index storage
    // before any register indices are recorded.
    setRegIdxArrays(
        reinterpret_cast<RegIdArrayPtr>(
            &std::remove_pointer_t<decltype(this)>::srcRegIdxArr),
        reinterpret_cast<RegIdArrayPtr>(
            &std::remove_pointer_t<decltype(this)>::destRegIdxArr));

    _numSrcRegs = 0;
    _numDestRegs = 0;
    // Destination: the internal scratch vector register for this micro-op.
    setDestRegIdx(_numDestRegs++, vecRegClass[VecMemInternalReg0 + _microIdx]);
    _numTypedDestRegs[VecRegClass]++;
    // Source: the vector register being copied.
    setSrcRegIdx(_numSrcRegs++, vecRegClass[_vsRegIdx + _microIdx]);
}
Fault
VCpyVsMicroInst::execute(ExecContext* xc, trace::InstRecord* traceData) const
{
    // Fault unless the V extension is implemented and the VPU is enabled.
    MISA misaReg = xc->readMiscReg(MISCREG_ISA);
    STATUS statusReg = xc->readMiscReg(MISCREG_STATUS);
    const bool vecUsable = misaReg.rvv && statusReg.vs != VPUStatus::OFF;
    if (!vecUsable) {
        return std::make_shared<IllegalInstFault>(
            "RVV is disabled or VPU is off", machInst);
    }

    // Touching vector state dirties the vector context.
    statusReg.vs = VPUStatus::DIRTY;
    xc->setMiscReg(MISCREG_STATUS, statusReg);

    // Copy the vector source operand into the scratch destination (vtmp).
    vreg_t& scratch = *(vreg_t *)xc->getWritableRegOperand(this, 0);
    vreg_t srcVal;
    xc->getRegOperand(this, 0, &srcVal);
    scratch = srcVal;

    if (traceData) {
        traceData->setData(vecRegClass, &scratch);
    }
    return NoFault;
}
std::string
VCpyVsMicroInst::generateDisassembly(Addr pc,
        const loader::SymbolTable *symtab) const
{
    // Renders as "<mnemonic> <scratch dest>, <vector src>".
    std::stringstream out;
    out << mnemonic << ' ';
    out << registerName(destRegIdx(0));
    out << ", " << registerName(srcRegIdx(0));
    return out.str();
}
// Micro-op that pins the destination register vd for _numVdPins writes,
// so multiple micro-ops can write the same physical register without
// copying partial results between them. It also applies the tail/mask
// policy: old vd is needed (undisturbed) when either vta or vma is
// undisturbed, or when the instruction keeps an offset prefix of vd
// (hasVdOffset, e.g. vslideup).
VPinVdMicroInst::VPinVdMicroInst(ExtMachInst _machInst, uint32_t _microIdx,
        uint32_t _numVdPins, bool _hasVdOffset)
    : VectorArithMicroInst("vpinvd_v_micro", _machInst, SimdMiscOp, 0,
                           _microIdx)
    , hasVdOffset(_hasVdOffset)
{
    // Point the base class at this subclass' register-index storage
    // before any register indices are recorded.
    setRegIdxArrays(
        reinterpret_cast<RegIdArrayPtr>(
            &std::remove_pointer_t<decltype(this)>::srcRegIdxArr),
        reinterpret_cast<RegIdArrayPtr>(
            &std::remove_pointer_t<decltype(this)>::destRegIdxArr));

    _numSrcRegs = 0;
    _numDestRegs = 0;
    setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _microIdx]);
    _numTypedDestRegs[VecRegClass]++;
    // Only depend on the old vd when the undisturbed policy applies;
    // fully agnostic destinations carry no such dependency.
    if (!_machInst.vtype8.vta || (!_machInst.vm && !_machInst.vtype8.vma)
        || hasVdOffset) {
        setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _microIdx]);
    }
    // Re-register the destination with its pinned-write count so the
    // renamer keeps the same physical register across _numVdPins writes.
    RegId Vd = destRegIdx(0);
    Vd.setNumPinnedWrites(_numVdPins);
    setDestRegIdx(0, Vd);
}
Fault
VPinVdMicroInst::execute(ExecContext* xc, trace::InstRecord* traceData) const
{
    // Fault unless the V extension is implemented and the VPU is enabled.
    MISA misaReg = xc->readMiscReg(MISCREG_ISA);
    STATUS statusReg = xc->readMiscReg(MISCREG_STATUS);
    if (!misaReg.rvv || statusReg.vs == VPUStatus::OFF) {
        return std::make_shared<IllegalInstFault>(
            "RVV is disabled or VPU is off", machInst);
    }

    // Touching vector state dirties the vector context.
    statusReg.vs = VPUStatus::DIRTY;
    xc->setMiscReg(MISCREG_STATUS, statusReg);

    // Tail/mask policy: keep the old vd (undisturbed) when either policy
    // is undisturbed or the instruction needs an offset prefix preserved;
    // otherwise write all 1s (agnostic).
    const bool keepOldVd = !machInst.vtype8.vta
        || (!machInst.vm && !machInst.vtype8.vma) || hasVdOffset;

    vreg_t& destReg = *(vreg_t *)xc->getWritableRegOperand(this, 0);
    if (keepOldVd) {
        vreg_t prevVd;
        xc->getRegOperand(this, 0, &prevVd);
        destReg = prevVd;
    } else {
        destReg.set(0xff);
    }

    if (traceData) {
        traceData->setData(vecRegClass, &destReg);
    }
    return NoFault;
}
std::string
VPinVdMicroInst::generateDisassembly(Addr pc,
        const loader::SymbolTable *symtab) const
{
    // Renders "<mnemonic> <vd>, <old vd>" when the undisturbed policy
    // applies, and "<mnemonic> <vd>, ~0" for the all-1s agnostic fill.
    const bool keepOldVd = !machInst.vtype8.vta
        || (!machInst.vm && !machInst.vtype8.vma) || hasVdOffset;

    std::stringstream out;
    out << mnemonic << ' ' << registerName(destRegIdx(0)) << ", ";
    if (keepOldVd) {
        out << registerName(srcRegIdx(0));
    } else {
        out << "~0";
    }
    return out.str();
}
} // namespace RiscvISA
} // namespace gem5

View File

@@ -714,6 +714,35 @@ class VsSegIntrlvMicroInst : public VectorArithMicroInst
const loader::SymbolTable *) const override;
};
/**
 * Micro-op that copies one vector source register into an internal
 * scratch register, giving later micro-ops a stable snapshot of the
 * source to depend on.
 */
class VCpyVsMicroInst : public VectorArithMicroInst
{
  private:
    RegId srcRegIdxArr[1];  // vector source register
    RegId destRegIdxArr[1]; // internal scratch (vtmp) register
  public:
    VCpyVsMicroInst(ExtMachInst _machInst, uint32_t _microIdx,
                    uint8_t _vsRegIdx);
    Fault execute(ExecContext *, trace::InstRecord *) const override;
    std::string generateDisassembly(
        Addr pc, const loader::SymbolTable *symtab) const override;
};
/**
 * Micro-op that pins the destination register vd across several
 * micro-op writes and applies the vector tail/mask policy to it
 * (old value kept when undisturbed, all 1s when agnostic).
 */
class VPinVdMicroInst : public VectorArithMicroInst
{
  private:
    RegId srcRegIdxArr[1];  // old vd, present only under the undisturbed policy
    RegId destRegIdxArr[1]; // pinned destination register vd
    // True when an offset prefix of vd must be preserved (e.g. vslideup),
    // which forces the undisturbed behavior regardless of vta/vma.
    bool hasVdOffset;
  public:
    VPinVdMicroInst(ExtMachInst _machInst, uint32_t _microIdx,
                    uint32_t _numVdPins, bool _hasVdOffset=false);
    Fault execute(ExecContext *, trace::InstRecord *) const override;
    std::string generateDisassembly(
        Addr pc, const loader::SymbolTable *symtab) const override;
};
} // namespace RiscvISA
} // namespace gem5

View File

@@ -607,8 +607,6 @@ decode QUADRANT default Unknown::unknown() {
if ((machInst.vm || elem_mask(v0, ei)) &&
i < this->microVl) {
Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
} else {
Vd_ub[i] = Vs2_ub[i];
}
}}, inst_flags=SimdUnitStrideLoadOp);
format VlSegOp {
@@ -616,56 +614,42 @@ decode QUADRANT default Unknown::unknown() {
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 2)) &&
i < this->microVl) {
Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
} else {
Vd_ub[i] = Vs2_ub[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x02: vlseg3e8_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 3)) &&
i < this->microVl) {
Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
} else {
Vd_ub[i] = Vs2_ub[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x03: vlseg4e8_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 4)) &&
i < this->microVl) {
Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
} else {
Vd_ub[i] = Vs2_ub[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x04: vlseg5e8_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 5)) &&
i < this->microVl) {
Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
} else {
Vd_ub[i] = Vs2_ub[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x05: vlseg6e8_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 6)) &&
i < this->microVl) {
Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
} else {
Vd_ub[i] = Vs2_ub[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x06: vlseg7e8_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 7)) &&
i < this->microVl) {
Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
} else {
Vd_ub[i] = Vs2_ub[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x07: vlseg8e8_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 8)) &&
i < this->microVl) {
Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
} else {
Vd_ub[i] = Vs2_ub[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
}
@@ -693,8 +677,6 @@ decode QUADRANT default Unknown::unknown() {
if ((machInst.vm || elem_mask(v0, ei)) &&
i < this->microVl && i < this->faultIdx) {
Vd_ub[i] = Mem_vc.as<uint8_t>()[i];
} else {
Vd_ub[i] = Vs2_ub[i];
}
}}, inst_flags=SimdUnitStrideFaultOnlyFirstLoadOp);
}
@@ -719,8 +701,6 @@ decode QUADRANT default Unknown::unknown() {
if ((machInst.vm || elem_mask(v0, ei)) &&
i < this->microVl) {
Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
} else {
Vd_uh[i] = Vs2_uh[i];
}
}}, inst_flags=SimdUnitStrideLoadOp);
format VlSegOp {
@@ -728,56 +708,42 @@ decode QUADRANT default Unknown::unknown() {
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 2)) &&
i < this->microVl) {
Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
} else {
Vd_uh[i] = Vs2_uh[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x02: vlseg3e16_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 3)) &&
i < this->microVl) {
Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
} else {
Vd_uh[i] = Vs2_uh[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x03: vlseg4e16_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 4)) &&
i < this->microVl) {
Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
} else {
Vd_uh[i] = Vs2_uh[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x04: vlseg5e16_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 5)) &&
i < this->microVl) {
Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
} else {
Vd_uh[i] = Vs2_uh[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x05: vlseg6e16_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 6)) &&
i < this->microVl) {
Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
} else {
Vd_uh[i] = Vs2_uh[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x06: vlseg7e16_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 7)) &&
i < this->microVl) {
Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
} else {
Vd_uh[i] = Vs2_uh[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x07: vlseg8e16_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 8)) &&
i < this->microVl) {
Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
} else {
Vd_uh[i] = Vs2_uh[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
}
@@ -802,8 +768,6 @@ decode QUADRANT default Unknown::unknown() {
if ((machInst.vm || elem_mask(v0, ei)) &&
i < this->microVl && i < this->faultIdx) {
Vd_uh[i] = Mem_vc.as<uint16_t>()[i];
} else {
Vd_uh[i] = Vs2_uh[i];
}
}}, inst_flags=SimdUnitStrideFaultOnlyFirstLoadOp);
}
@@ -828,8 +792,6 @@ decode QUADRANT default Unknown::unknown() {
if ((machInst.vm || elem_mask(v0, ei)) &&
i < this->microVl) {
Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
} else {
Vd_uw[i] = Vs2_uw[i];
}
}}, inst_flags=SimdUnitStrideLoadOp);
format VlSegOp {
@@ -837,56 +799,42 @@ decode QUADRANT default Unknown::unknown() {
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 2)) &&
i < this->microVl) {
Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
} else {
Vd_uw[i] = Vs2_uw[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x02: vlseg3e32_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 3)) &&
i < this->microVl) {
Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
} else {
Vd_uw[i] = Vs2_uw[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x03: vlseg4e32_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 4)) &&
i < this->microVl) {
Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
} else {
Vd_uw[i] = Vs2_uw[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x04: vlseg5e32_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 5)) &&
i < this->microVl) {
Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
} else {
Vd_uw[i] = Vs2_uw[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x05: vlseg6e32_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 6)) &&
i < this->microVl) {
Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
} else {
Vd_uw[i] = Vs2_uw[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x06: vlseg7e32_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 7)) &&
i < this->microVl) {
Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
} else {
Vd_uw[i] = Vs2_uw[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x07: vlseg8e32_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 8)) &&
i < this->microVl) {
Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
} else {
Vd_uw[i] = Vs2_uw[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
}
@@ -911,8 +859,6 @@ decode QUADRANT default Unknown::unknown() {
if ((machInst.vm || elem_mask(v0, ei)) &&
i < this->microVl && i < this->faultIdx) {
Vd_uw[i] = Mem_vc.as<uint32_t>()[i];
} else {
Vd_uw[i] = Vs2_uw[i];
}
}}, inst_flags=SimdUnitStrideFaultOnlyFirstLoadOp);
}
@@ -937,8 +883,6 @@ decode QUADRANT default Unknown::unknown() {
if ((machInst.vm || elem_mask(v0, ei)) &&
i < this->microVl) {
Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
} else {
Vd_ud[i] = Vs2_ud[i];
}
}}, inst_flags=SimdUnitStrideLoadOp);
format VlSegOp {
@@ -946,56 +890,42 @@ decode QUADRANT default Unknown::unknown() {
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 2)) &&
i < this->microVl) {
Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
} else {
Vd_ud[i] = Vs2_ud[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x02: vlseg3e64_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 3)) &&
i < this->microVl) {
Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
} else {
Vd_ud[i] = Vs2_ud[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x03: vlseg4e64_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 4)) &&
i < this->microVl) {
Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
} else {
Vd_ud[i] = Vs2_ud[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x04: vlseg5e64_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 5)) &&
i < this->microVl) {
Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
} else {
Vd_ud[i] = Vs2_ud[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x05: vlseg6e64_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 6)) &&
i < this->microVl) {
Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
} else {
Vd_ud[i] = Vs2_ud[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x06: vlseg7e64_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 7)) &&
i < this->microVl) {
Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
} else {
Vd_ud[i] = Vs2_ud[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
0x07: vlseg8e64_v({{
if ((machInst.vm || elem_mask_vseg(v0, ei + (field * micro_elems), 8)) &&
i < this->microVl) {
Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
} else {
Vd_ud[i] = Vs2_ud[i];
}
}}, inst_flags=SimdUnitStrideSegmentedLoadOp);
}
@@ -1020,8 +950,6 @@ decode QUADRANT default Unknown::unknown() {
if ((machInst.vm || elem_mask(v0, ei)) &&
i < this->microVl && i < this->faultIdx) {
Vd_ud[i] = Mem_vc.as<uint64_t>()[i];
} else {
Vd_ud[i] = Vs2_ud[i];
}
}}, inst_flags=SimdUnitStrideFaultOnlyFirstLoadOp);
}
@@ -2817,10 +2745,10 @@ decode QUADRANT default Unknown::unknown() {
if (this->vm || elem_mask(v0, ei)) {
const uint64_t idx = Vs1_vu[i]
- vs2_elems * vs2_idx;
auto res = (Vs1_vu[i] >= vlmax) ? 0
: (idx < vs2_elems) ? Vs2_vu[idx]
: Vs3_vu[i];
Vd_vu[i] = res;
if (Vs1_vu[i] >= vlmax)
Vd_vu[i] = 0;
else if (idx < vs2_elems)
Vd_vu[i] = Vs2_vu[idx];
}
}
}}, OPIVV, SimdMiscOp);
@@ -2830,10 +2758,10 @@ decode QUADRANT default Unknown::unknown() {
if (this->vm || elem_mask(v0, ei)) {
const uint32_t idx = Vs1_uh[i + vs1_bias]
- vs2_elems * vs2_idx;
auto res = (Vs1_uh[i + vs1_bias] >= vlmax) ? 0
: (idx < vs2_elems) ? Vs2_vu[idx]
: Vs3_vu[i + vd_bias];
Vd_vu[i + vd_bias] = res;
if (Vs1_uh[i + vs1_bias] >= vlmax)
Vd_vu[i + vd_bias] = 0;
else if (idx < vs2_elems)
Vd_vu[i + vd_bias] = Vs2_vu[idx];
}
}
}}, OPIVV, SimdMiscOp);
@@ -3736,9 +3664,10 @@ decode QUADRANT default Unknown::unknown() {
uint64_t zextImm = rvZext(SIMM5);
if (this->vm || elem_mask(v0, ei)) {
const uint64_t idx = zextImm - vs2_elems * vs2_idx;
Vd_vu[i] = (zextImm >= vlmax) ? 0
: (idx < vs2_elems) ? Vs2_vu[idx]
: Vs3_vu[i];
if (zextImm >= vlmax)
Vd_vu[i] = 0;
else if (idx < vs2_elems)
Vd_vu[i] = Vs2_vu[idx];
}
}
}}, OPIVI, SimdMiscOp);
@@ -4071,9 +4000,10 @@ decode QUADRANT default Unknown::unknown() {
uint64_t zextRs1 = rvZext(Rs1);
if (this->vm || elem_mask(v0, ei)) {
const uint64_t idx = zextRs1 - vs2_elems * vs2_idx;
Vd_vu[i] = (zextRs1 >= vlmax) ? 0
: (idx < vs2_elems) ? Vs2_vu[idx]
: Vs3_vu[i];
if (zextRs1 >= vlmax)
Vd_vu[i] = 0;
else if (idx < vs2_elems)
Vd_vu[i] = Vs2_vu[idx];
}
}
}}, OPIVX, SimdMiscOp);

View File

@@ -31,12 +31,17 @@ let {{
def setVlen():
return "uint32_t vlen = VlenbBits * 8;\n"
def setVlenb():
return "uint32_t vlenb = VlenbBits;\n"
return "[[maybe_unused]] uint32_t vlenb = VlenbBits;\n"
def setDestWrapper(destRegId):
return "setDestRegIdx(_numDestRegs++, " + destRegId + ");\n" + \
"_numTypedDestRegs[VecRegClass]++;\n"
def setSrcWrapper(srcRegId):
return "setSrcRegIdx(_numSrcRegs++, " + srcRegId + ");\n"
def tailMaskCondSetSrcWrapper(setSrcRegCode):
return f'''
if (!_machInst.vtype8.vta || (!_machInst.vm && !_machInst.vtype8.vma))
{setSrcRegCode}
'''
def setSrcVm():
return "if (!this->vm)\n" + \
" setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);"
@@ -164,6 +169,7 @@ def format VectorIntFormat(code, category, *flags) {{
v0_required = inst_name not in ["vmv"]
mask_cond = v0_required and (inst_suffix not in ['vvm', 'vxm', 'vim'])
need_elem_idx = mask_cond or code.find("ei") != -1
is_destructive_fused = iop.op_class == "SimdMultAccOp"
dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
@@ -185,7 +191,6 @@ def format VectorIntFormat(code, category, *flags) {{
error("not supported category for VectorIntFormat: %s" % category)
old_vd_idx = num_src_regs
src3_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
set_dest_reg_idx = setDestWrapper(dest_reg_id)
@@ -193,7 +198,12 @@ def format VectorIntFormat(code, category, *flags) {{
if category != "OPIVI":
set_src_reg_idx += setSrcWrapper(src1_reg_id)
set_src_reg_idx += setSrcWrapper(src2_reg_id)
set_src_reg_idx += setSrcWrapper(src3_reg_id)
dest_set_src_reg_idx = setSrcWrapper(dest_reg_id)
if not is_destructive_fused:
dest_set_src_reg_idx = tailMaskCondSetSrcWrapper(dest_set_src_reg_idx)
set_src_reg_idx += dest_set_src_reg_idx
if v0_required:
set_src_reg_idx += setSrcVm()
@@ -247,17 +257,17 @@ def format VectorIntExtFormat(code, category, *flags) {{
inst_name, inst_suffix = name.split("_", maxsplit=1)
ext_div = int(inst_suffix[-1])
old_vd_idx = 1
dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx / " + \
str(ext_div) + "]"
src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
old_vd_idx = 1
set_dest_reg_idx = setDestWrapper(dest_reg_id)
set_src_reg_idx = ""
set_src_reg_idx += setSrcWrapper(src2_reg_id)
set_src_reg_idx += setSrcWrapper(src3_reg_id)
set_src_reg_idx += tailMaskCondSetSrcWrapper(setSrcWrapper(dest_reg_id))
set_src_reg_idx += setSrcVm()
code = maskCondWrapper(code)
@@ -307,6 +317,8 @@ def format VectorIntWideningFormat(code, category, *flags) {{
v0_required = True
mask_cond = v0_required
need_elem_idx = mask_cond or code.find("ei") != -1
is_destructive_fused = iop.op_class == "SimdMultAccOp"
old_vd_idx = 2
dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
src1_reg_id = ""
@@ -321,14 +333,18 @@ def format VectorIntWideningFormat(code, category, *flags) {{
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx / 2]"
elif inst_suffix in ["wv", "wx"]:
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
set_dest_reg_idx = setDestWrapper(dest_reg_id)
set_src_reg_idx = ""
set_src_reg_idx += setSrcWrapper(src1_reg_id)
set_src_reg_idx += setSrcWrapper(src2_reg_id)
set_src_reg_idx += setSrcWrapper(src3_reg_id)
dest_set_src_reg_idx = setSrcWrapper(dest_reg_id)
if not is_destructive_fused:
dest_set_src_reg_idx = tailMaskCondSetSrcWrapper(dest_set_src_reg_idx)
set_src_reg_idx += dest_set_src_reg_idx
if v0_required:
set_src_reg_idx += setSrcVm()
@@ -395,14 +411,13 @@ def format VectorIntNarrowingFormat(code, category, *flags) {{
else:
error("not supported category for VectorIntFormat: %s" % category)
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
old_dest_reg_id = "vecRegClass[_machInst.vs3 + _microIdx / 2]"
set_dest_reg_idx = setDestWrapper(dest_reg_id)
set_src_reg_idx = ""
if category != "OPIVI":
set_src_reg_idx += setSrcWrapper(src1_reg_id)
set_src_reg_idx += setSrcWrapper(src2_reg_id)
set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
set_src_reg_idx += tailMaskCondSetSrcWrapper(setSrcWrapper(dest_reg_id))
set_src_reg_idx += setSrcVm()
# code
code = maskCondWrapper(code)
@@ -452,7 +467,6 @@ def format VectorIntMaskFormat(code, category, *flags) {{
mask_cond = inst_name not in ['vmadc', 'vmsbc']
need_elem_idx = mask_cond or code.find("ei") != -1
old_vd_idx = 2
dest_reg_id = "vecRegClass[VecMemInternalReg0 + _microIdx]"
src1_reg_id = ""
if category == "OPIVV":
@@ -460,17 +474,15 @@ def format VectorIntMaskFormat(code, category, *flags) {{
elif category == "OPIVX":
src1_reg_id = "intRegClass[_machInst.rs1]"
elif category == "OPIVI":
old_vd_idx = 1
pass
else:
error("not supported category for VectorIntFormat: %s" % category)
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
old_dest_reg_id = "vecRegClass[_machInst.vd]"
set_dest_reg_idx = setDestWrapper(dest_reg_id)
set_src_reg_idx = ""
if category != "OPIVI":
set_src_reg_idx += setSrcWrapper(src1_reg_id)
set_src_reg_idx += setSrcWrapper(src2_reg_id)
set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
if v0_required:
set_src_reg_idx += setSrcVm()
@@ -497,7 +509,6 @@ def format VectorIntMaskFormat(code, category, *flags) {{
'set_vlenb': set_vlenb,
'set_vlen': set_vlen,
'vm_decl_rd': vm_decl_rd,
'copy_old_vd': copyOldVd(old_vd_idx),
'declare_varith_template': declareVArithTemplate(Name + "Micro")},
flags)
@@ -522,7 +533,6 @@ def format VectorGatherFormat(code, category, *flags) {{
'code': code,
'declare_varith_template': declareGatherTemplate(Name, idx_type)},
flags)
old_vd_idx = 2
dest_reg_id = "vecRegClass[_machInst.vd + vd_idx]"
src1_reg_id = ""
if category in ["OPIVV"]:
@@ -534,7 +544,10 @@ def format VectorGatherFormat(code, category, *flags) {{
else:
error("not supported category for VectorIntFormat: %s" % category)
src2_reg_id = "vecRegClass[_machInst.vs2 + vs2_idx]"
src3_reg_id = "vecRegClass[_machInst.vs3 + vd_idx]"
src2_reg_id = "vecRegClass[_machInst.vs2 + vs2_idx]"
# vtmp0 as dummy src reg to create dependency with pin vd micro
src3_reg_id = "vecRegClass[VecMemInternalReg0 + vd_idx]"
set_dest_reg_idx = setDestWrapper(dest_reg_id)
@@ -562,7 +575,6 @@ def format VectorGatherFormat(code, category, *flags) {{
'set_vlenb': set_vlenb,
'set_vlen': set_vlen,
'vm_decl_rd': vm_decl_rd,
'copy_old_vd': copyOldVd(old_vd_idx),
'idx_type': idx_type,
'declare_varith_template': varith_micro_declare},
flags)
@@ -575,7 +587,6 @@ def format VectorGatherFormat(code, category, *flags) {{
VectorGatherMacroConstructor.subst(iop)
exec_output = VectorGatherMicroExecute.subst(microiop)
decode_block = VectorGatherDecodeBlock.subst(iop)
}};
def format VectorFloatFormat(code, category, *flags) {{
@@ -591,6 +602,7 @@ def format VectorFloatFormat(code, category, *flags) {{
v0_required = inst_name not in ["vfmv"]
mask_cond = v0_required and (inst_suffix not in ['vvm', 'vfm'])
need_elem_idx = mask_cond or code.find("ei") != -1
is_destructive_fused = iop.op_class == "SimdFloatMultAccOp"
dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
src1_reg_id = ""
@@ -601,16 +613,21 @@ def format VectorFloatFormat(code, category, *flags) {{
else:
error("not supported category for VectorFloatFormat: %s" % category)
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
set_dest_reg_idx = setDestWrapper(dest_reg_id)
set_src_reg_idx = ""
set_src_reg_idx += setSrcWrapper(src1_reg_id)
set_src_reg_idx += setSrcWrapper(src2_reg_id)
set_src_reg_idx += setSrcWrapper(src3_reg_id)
dest_set_src_reg_idx = setSrcWrapper(dest_reg_id)
if not is_destructive_fused:
dest_set_src_reg_idx = tailMaskCondSetSrcWrapper(dest_set_src_reg_idx)
set_src_reg_idx += dest_set_src_reg_idx
if v0_required:
set_src_reg_idx += setSrcVm()
# code
if mask_cond:
code = maskCondWrapper(code)
@@ -663,13 +680,12 @@ def format VectorFloatCvtFormat(code, category, *flags) {{
old_vd_idx = 1
dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
set_dest_reg_idx = setDestWrapper(dest_reg_id)
set_src_reg_idx = ""
set_src_reg_idx += setSrcWrapper(src2_reg_id)
set_src_reg_idx += setSrcWrapper(src3_reg_id)
set_src_reg_idx += tailMaskCondSetSrcWrapper(setSrcWrapper(dest_reg_id))
set_src_reg_idx += setSrcVm()
code = maskCondWrapper(code)
code = eiDeclarePrefix(code)
@@ -719,6 +735,7 @@ def format VectorFloatWideningFormat(code, category, *flags) {{
v0_required = True
mask_cond = v0_required
need_elem_idx = mask_cond or code.find("ei") != -1
is_destructive_fused = iop.op_class == "SimdFloatMultAccOp"
dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
src1_reg_id = ""
@@ -733,14 +750,18 @@ def format VectorFloatWideningFormat(code, category, *flags) {{
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx / 2]"
elif inst_suffix in ["wv", "wf"]:
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
set_dest_reg_idx = setDestWrapper(dest_reg_id)
set_src_reg_idx = ""
set_src_reg_idx += setSrcWrapper(src1_reg_id)
set_src_reg_idx += setSrcWrapper(src2_reg_id)
set_src_reg_idx += setSrcWrapper(src3_reg_id)
dest_set_src_reg_idx = setSrcWrapper(dest_reg_id)
if not is_destructive_fused:
dest_set_src_reg_idx = tailMaskCondSetSrcWrapper(dest_set_src_reg_idx)
set_src_reg_idx += dest_set_src_reg_idx
if v0_required:
set_src_reg_idx += setSrcVm()
@@ -800,13 +821,12 @@ def format VectorFloatWideningCvtFormat(code, category, *flags) {{
old_vd_idx = 1
dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx / 2]"
src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
set_dest_reg_idx = setDestWrapper(dest_reg_id)
set_src_reg_idx = ""
set_src_reg_idx += setSrcWrapper(src2_reg_id)
set_src_reg_idx += setSrcWrapper(src3_reg_id)
set_src_reg_idx += tailMaskCondSetSrcWrapper(setSrcWrapper(dest_reg_id))
set_src_reg_idx += setSrcVm()
code = maskCondWrapper(code)
code = eiDeclarePrefix(code, widening=True)
@@ -857,13 +877,12 @@ def format VectorFloatNarrowingCvtFormat(code, category, *flags) {{
old_vd_idx = 1
dest_reg_id = "vecRegClass[_machInst.vd + _microIdx / 2]"
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx / 2]"
set_dest_reg_idx = setDestWrapper(dest_reg_id)
set_src_reg_idx = ""
set_src_reg_idx += setSrcWrapper(src2_reg_id)
set_src_reg_idx += setSrcWrapper(src3_reg_id)
set_src_reg_idx += tailMaskCondSetSrcWrapper(setSrcWrapper(dest_reg_id))
set_src_reg_idx += setSrcVm()
code = maskCondWrapper(code)
code = eiDeclarePrefix(code, widening=True)
@@ -923,7 +942,6 @@ def format VectorFloatMaskFormat(code, category, *flags) {{
set_src_reg_idx = ""
set_src_reg_idx += setSrcWrapper(src1_reg_id)
set_src_reg_idx += setSrcWrapper(src2_reg_id)
set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
set_src_reg_idx += setSrcVm()
vm_decl_rd = vmDeclAndReadData()
set_vlenb = setVlenb()
@@ -944,7 +962,6 @@ def format VectorFloatMaskFormat(code, category, *flags) {{
'set_vlenb': set_vlenb,
'set_vlen': set_vlen,
'vm_decl_rd': vm_decl_rd,
'copy_old_vd': copyOldVd(2),
'declare_varith_template': varith_micro_declare},
flags)
@@ -991,14 +1008,10 @@ def format ViotaFormat(code, category, *flags){{
inst_name, inst_suffix = name.split("_", maxsplit=1)
dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
src2_reg_id = "vecRegClass[_machInst.vs2]"
# The tail of vector mask inst should be treated as tail-agnostic.
# We treat it with tail-undisturbed policy, since
# the test suits only support undisturbed policy.
old_dest_reg_id = "vecRegClass[_machInst.vd + _microIdx]"
set_src_reg_idx = ""
set_src_reg_idx += setSrcWrapper(src2_reg_id)
set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
set_src_reg_idx += tailMaskCondSetSrcWrapper(setSrcWrapper(dest_reg_id))
set_src_reg_idx += setSrcVm()
set_dest_reg_idx = setDestWrapper(dest_reg_id)
vm_decl_rd = vmDeclAndReadData()
@@ -1036,13 +1049,8 @@ def format Vector1Vs1VdMaskFormat(code, category, *flags){{
inst_name, inst_suffix = name.split("_", maxsplit=1)
dest_reg_id = "vecRegClass[_machInst.vd]"
src2_reg_id = "vecRegClass[_machInst.vs2]"
# The tail of vector mask inst should be treated as tail-agnostic.
# We treat it with tail-undisturbed policy, since
# the test suits only support undisturbed policy.
old_dest_reg_id = "vecRegClass[_machInst.vd]"
set_src_reg_idx = ""
set_src_reg_idx += setSrcWrapper(src2_reg_id)
set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
set_dest_reg_idx = setDestWrapper(dest_reg_id)
vm_decl_rd = vmDeclAndReadData()
set_vm_idx = setSrcVm()
@@ -1056,7 +1064,6 @@ def format Vector1Vs1VdMaskFormat(code, category, *flags){{
'set_vlenb': set_vlenb,
'vm_decl_rd': vm_decl_rd,
'set_vm_idx': set_vm_idx,
'copy_old_vd': copyOldVd(1),
'declare_varith_template': declareVArithTemplate(Name, 'uint', 8, 8),
},
flags)
@@ -1130,23 +1137,15 @@ def format VectorNonSplitFormat(code, category, *flags) {{
def format VectorMaskFormat(code, category, *flags) {{
inst_name, inst_suffix = name.split("_", maxsplit=1)
old_vd_idx = 2
if category not in ["OPMVV"]:
error("not supported category for VectorIntFormat: %s" % category)
dest_reg_id = "vecRegClass[_machInst.vd]"
src1_reg_id = "vecRegClass[_machInst.vs1]"
src2_reg_id = "vecRegClass[_machInst.vs2]"
# The tail of vector mask inst should be treated as tail-agnostic.
# We treat it with tail-undisturbed policy, since
# the test suits only support undisturbed policy.
# TODO: remove it
old_dest_reg_id = "vecRegClass[_machInst.vd]"
set_src_reg_idx = ""
set_src_reg_idx += setSrcWrapper(src1_reg_id)
set_src_reg_idx += setSrcWrapper(src2_reg_id)
set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
set_dest_reg_idx = setDestWrapper(dest_reg_id)
@@ -1161,7 +1160,6 @@ def format VectorMaskFormat(code, category, *flags) {{
'set_dest_reg_idx': set_dest_reg_idx,
'set_src_reg_idx': set_src_reg_idx,
'set_vlenb': set_vlenb,
'copy_old_vd': copyOldVd(old_vd_idx),
'declare_varith_template': declareVArithTemplate(Name, 'uint', 8, 8)
},
flags)
@@ -1185,13 +1183,10 @@ def format VectorReduceIntFormat(code, category, *flags) {{
dest_reg_id = "vecRegClass[_machInst.vd]"
src1_reg_id = "vecRegClass[_machInst.vs1]"
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
old_dest_reg_id = "vecRegClass[_machInst.vd]"
set_dest_reg_idx = setDestWrapper(dest_reg_id)
set_src_reg_idx = setSrcWrapper(src1_reg_id)
set_src_reg_idx += setSrcWrapper(src2_reg_id)
# Treat tail undisturbed/agnostic as the same
# We always need old rd as src vreg
set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
set_src_reg_idx += tailMaskCondSetSrcWrapper(setSrcWrapper(dest_reg_id))
set_src_reg_idx += setSrcVm()
vm_decl_rd = vmDeclAndReadData()
set_vlenb = setVlenb()
@@ -1238,13 +1233,10 @@ def format VectorReduceFloatFormat(code, category, *flags) {{
dest_reg_id = "vecRegClass[_machInst.vd]"
src1_reg_id = "vecRegClass[_machInst.vs1]"
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
old_dest_reg_id = "vecRegClass[_machInst.vd]"
set_dest_reg_idx = setDestWrapper(dest_reg_id)
set_src_reg_idx = setSrcWrapper(src1_reg_id)
set_src_reg_idx += setSrcWrapper(src2_reg_id)
# Treat tail undisturbed/agnostic as the same
# We always need old rd as src vreg
set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
set_src_reg_idx += tailMaskCondSetSrcWrapper(setSrcWrapper(dest_reg_id))
set_src_reg_idx += setSrcVm()
vm_decl_rd = vmDeclAndReadData()
set_vlenb = setVlenb()
@@ -1296,13 +1288,10 @@ def format VectorReduceFloatWideningFormat(code, category, *flags) {{
dest_reg_id = "vecRegClass[_machInst.vd]"
src1_reg_id = "vecRegClass[_machInst.vs1]"
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
old_dest_reg_id = "vecRegClass[_machInst.vd]"
set_dest_reg_idx = setDestWrapper(dest_reg_id)
set_src_reg_idx = setSrcWrapper(src1_reg_id)
set_src_reg_idx += setSrcWrapper(src2_reg_id)
# Treat tail undisturbed/agnostic as the same
# We always need old rd as src vreg
set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
set_src_reg_idx += tailMaskCondSetSrcWrapper(setSrcWrapper(dest_reg_id))
set_src_reg_idx += setSrcVm()
vm_decl_rd = vmDeclAndReadData()
set_vlenb = setVlenb()
@@ -1362,14 +1351,13 @@ def format VectorIntVxsatFormat(code, category, *flags) {{
else:
error("not supported category for VectorIntVxsatFormat: %s" % category)
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
src3_reg_id = "vecRegClass[_machInst.vs3 + _microIdx]"
set_dest_reg_idx = setDestWrapper(dest_reg_id)
set_src_reg_idx = ""
if category != "OPIVI":
set_src_reg_idx += setSrcWrapper(src1_reg_id)
set_src_reg_idx += setSrcWrapper(src2_reg_id)
set_src_reg_idx += setSrcWrapper(src3_reg_id)
set_src_reg_idx += tailMaskCondSetSrcWrapper(setSrcWrapper(dest_reg_id))
set_src_reg_idx += setSrcVm()
vm_decl_rd = vmDeclAndReadData()
@@ -1416,13 +1404,10 @@ def format VectorReduceIntWideningFormat(code, category, *flags) {{
dest_reg_id = "vecRegClass[_machInst.vd]"
src1_reg_id = "vecRegClass[_machInst.vs1]"
src2_reg_id = "vecRegClass[_machInst.vs2 + _microIdx]"
old_dest_reg_id = "vecRegClass[_machInst.vd]"
set_dest_reg_idx = setDestWrapper(dest_reg_id)
set_src_reg_idx = setSrcWrapper(src1_reg_id)
set_src_reg_idx += setSrcWrapper(src2_reg_id)
# Treat tail undisturbed/agnostic as the same
# We always need old rd as src vreg
set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
set_src_reg_idx += tailMaskCondSetSrcWrapper(setSrcWrapper(dest_reg_id))
set_src_reg_idx += setSrcVm()
vm_decl_rd = vmDeclAndReadData()
set_vlenb = setVlenb()
@@ -1480,12 +1465,8 @@ def VectorSlideBase(name, Name, category, code, flags, macro_construtor,
src1_ireg_id = "intRegClass[_machInst.rs1]"
src1_freg_id = "floatRegClass[_machInst.rs1]"
# The tail of vector mask inst should be treated as tail-agnostic.
# We treat it with tail-undisturbed policy, since
# the test suits only support undisturbed policy.
num_src_regs = 0
old_dest_reg_id = "vecRegClass[_machInst.vd + vdIdx]"
set_src_reg_idx = ""
if category in ["OPIVX", "OPMVX"]:
set_src_reg_idx += setSrcWrapper(src1_ireg_id)
@@ -1495,8 +1476,6 @@ def VectorSlideBase(name, Name, category, code, flags, macro_construtor,
num_src_regs += 1
set_src_reg_idx += setSrcWrapper(src2_reg_id)
num_src_regs += 1
old_vd_idx = num_src_regs
set_src_reg_idx += setSrcWrapper(old_dest_reg_id)
set_dest_reg_idx = setDestWrapper(dest_reg_id)
vm_decl_rd = vmDeclAndReadData()
set_src_reg_idx += setSrcVm()
@@ -1518,7 +1497,6 @@ def VectorSlideBase(name, Name, category, code, flags, macro_construtor,
'set_vlenb': set_vlenb,
'set_vlen': set_vlen,
'vm_decl_rd': vm_decl_rd,
'copy_old_vd': copyOldVd(old_vd_idx),
'declare_varith_template': varith_micro_declare},
flags)

View File

@@ -32,7 +32,7 @@ let {{
def setVlen():
return "uint32_t vlen = VlenbBits * 8;\n"
def setVlenb():
return "uint32_t vlenb = VlenbBits;\n"
return "[[maybe_unused]] uint32_t vlenb = VlenbBits;\n"
def declareVMemTemplate(class_name):
return f'''
@@ -55,6 +55,17 @@ def getFaultCode():
}
'''
def getTailMaskPolicyCode():
return '''
if (!machInst.vtype8.vta || (!machInst.vm && !machInst.vtype8.vma)) {
RiscvISA::vreg_t old_vd;
xc->getRegOperand(this, 1, &old_vd);
tmp_d0 = old_vd;
} else {
tmp_d0.set(0xff);
}
'''
def VMemBase(name, Name, ea_code, memacc_code, mem_flags,
inst_flags, base_class, postacc_code='',
declare_template_base=VMemMacroDeclare,
@@ -94,7 +105,8 @@ def VMemBase(name, Name, ea_code, memacc_code, mem_flags,
'set_vlenb': setVlenb(),
'set_vlen': setVlen(),
'declare_vmem_template': declareVMemTemplate(Name + 'Micro'),
'fault_code': getFaultCode() if fault_only_first else ''},
'fault_code': getFaultCode() if fault_only_first else '',
'tail_mask_policy_code': getTailMaskPolicyCode()},
inst_flags)
if mem_flags:

View File

@@ -31,12 +31,14 @@ output header {{
#define ASSIGN_VD_BIT(idx, bit) \
((Vd[(idx)/8] & ~(1 << (idx)%8)) | ((bit) << (idx)%8))
#define COPY_OLD_VD(idx) \
[[maybe_unused]] RiscvISA::vreg_t old_vd; \
[[maybe_unused]] decltype(Vd) old_Vd = nullptr; \
xc->getRegOperand(this, (idx), &old_vd); \
old_Vd = old_vd.as<std::remove_reference_t<decltype(Vd[0])> >(); \
memcpy(Vd, old_Vd, vlenb);
#define COPY_OLD_VD(idx) \
if (!machInst.vtype8.vta || (!machInst.vm && !machInst.vtype8.vma)) { \
RiscvISA::vreg_t old_vd; \
xc->getRegOperand(this, idx, &old_vd); \
tmp_d0 = old_vd; \
} else { \
tmp_d0.set(0xff); \
} \
#define VRM_REQUIRED \
uint_fast8_t frm = xc->readMiscReg(MISCREG_FRM); \
@@ -987,7 +989,7 @@ def template Vector1Vs1VdMaskDeclare {{
template<typename ElemType>
class %(class_name)s : public %(base_class)s {
private:
RegId srcRegIdxArr[3];
RegId srcRegIdxArr[2];
RegId destRegIdxArr[1];
bool vm;
public:
@@ -1040,7 +1042,6 @@ Fault
%(op_rd)s;
%(set_vlenb)s;
%(vm_decl_rd)s;
%(copy_old_vd)s;
%(code)s;
%(op_wb)s;
return NoFault;
@@ -1167,9 +1168,9 @@ template<typename ElemType>
class %(class_name)s : public %(base_class)s
{
private:
// vs1(rs1), vs2, old_vd, v0 for *.vv[m] or *.vx[m]
// vs2, old_vd, v0 for *.vi[m]
RegId srcRegIdxArr[4];
// vs1(rs1), vs2, v0 for *.vv[m] or *.vx[m]
// vs2, v0 for *.vi[m]
RegId srcRegIdxArr[3];
RegId destRegIdxArr[1];
bool vm;
public:
@@ -1228,7 +1229,6 @@ Fault
%(set_vlenb)s;
%(set_vlen)s;
%(vm_decl_rd)s;
%(copy_old_vd)s;
const uint32_t bit_offset = vlenb / sizeof(ElemType);
const uint32_t offset = bit_offset * microIdx;
@@ -1293,8 +1293,8 @@ template<typename ElemType>
class %(class_name)s : public %(base_class)s
{
private:
// vs1(rs1), vs2, old_vd, v0 for *.vv or *.vf
RegId srcRegIdxArr[4];
// vs1(rs1), vs2, v0 for *.vv or *.vf
RegId srcRegIdxArr[3];
RegId destRegIdxArr[1];
bool vm;
public:
@@ -1353,7 +1353,6 @@ Fault
%(set_vlenb)s;
%(set_vlen)s;
%(vm_decl_rd)s;
%(copy_old_vd)s;
const uint32_t bit_offset = vlenb / sizeof(ElemType);
const uint32_t offset = bit_offset * microIdx;
@@ -1470,7 +1469,7 @@ def template VectorMaskDeclare {{
template<typename ElemType>
class %(class_name)s : public %(base_class)s {
private:
RegId srcRegIdxArr[3];
RegId srcRegIdxArr[2];
RegId destRegIdxArr[1];
public:
%(class_name)s(ExtMachInst _machInst);
@@ -1516,11 +1515,14 @@ Fault
status.vs = VPUStatus::DIRTY;
xc->setMiscReg(MISCREG_STATUS, status);
%(op_decl)s;
%(op_rd)s;
// TODO: remove it
%(set_vlenb)s;
%(copy_old_vd)s;
// mask tails are always treated as agnostic: writting 1s
tmp_d0.set(0xff);
%(code)s;
%(op_wb)s;
@@ -1773,7 +1775,7 @@ Fault
auto reduce_loop =
[&, this](const auto& f, const auto* _, const auto* vs2) {
ElemType microop_result = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
ElemType microop_result = Vs1[0];
for (uint32_t i = 0; i < this->microVl; i++) {
uint32_t ei = i + vtype_VLMAX(vtype, vlen, true) *
this->microIdx;
@@ -1822,8 +1824,6 @@ Fault
%(vm_decl_rd)s;
%(copy_old_vd)s;
Vd[0] = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
auto reduce_loop =
[&, this](const auto& f, const auto* _, const auto* vs2) {
vu tmp_val = Vd[0];
@@ -1874,8 +1874,6 @@ Fault
%(vm_decl_rd)s;
%(copy_old_vd)s;
Vd[0] = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
auto reduce_loop =
[&, this](const auto& f, const auto* _, const auto* vs2) {
vwu tmp_val = Vd[0];
@@ -1923,10 +1921,9 @@ template<typename ElemType, typename IndexType>
constexpr uint32_t vd_eewb = sizeof(ElemType);
constexpr uint32_t vs2_eewb = sizeof(ElemType);
constexpr uint32_t vs1_eewb = sizeof(IndexType);
constexpr bool vs1_split = vd_eewb > vs1_eewb;
const int8_t lmul = vtype_vlmul(vtype);
const int8_t vs1_emul = lmul +
(vs1_split ? -(vs2_eewb / vs1_eewb) : vs1_eewb / vs2_eewb);
const int8_t vs1_emul = lmul + __builtin_ctz(vs1_eewb)
- __builtin_ctz(vs2_eewb);
const uint8_t vs2_vregs = lmul < 0 ? 1 : 1 << lmul;
const uint8_t vs1_vregs = vs1_emul < 0 ? 1 : 1 << vs1_emul;
const uint8_t vd_vregs = vs2_vregs;
@@ -1940,6 +1937,21 @@ template<typename ElemType, typename IndexType>
microop = new VectorNopMicroInst(_machInst);
this->microops.push_back(microop);
}
uint32_t vd_vlmax = vlenb / vd_eewb;
uint32_t vs1_vlmax = vlenb / vs1_eewb;
for (uint32_t i = 0; i < ceil((float) this->vl / vd_vlmax); i++) {
uint32_t pinvd_micro_vl = (vd_vlmax*(i+1) <= remaining_vl)
? vd_vlmax : remaining_vl;
uint8_t num_vd_pins = ceil((float) pinvd_micro_vl/vs1_vlmax)*vs2_vregs;
microop = new VPinVdMicroInst(machInst, i, num_vd_pins);
microop->setFlag(IsDelayedCommit);
this->microops.push_back(microop);
remaining_vl -= pinvd_micro_vl;
}
remaining_vl = this->vl;
for (uint32_t i = 0; i < std::max(vs1_vregs, vd_vregs) && micro_vl > 0;
i++) {
for (uint8_t j = 0; j < vs2_vregs; j++) {
@@ -1965,7 +1977,7 @@ template<typename ElemType, typename IndexType>
class %(class_name)s : public %(base_class)s
{
private:
// vs2, vs1, vd, vm
// vs2, vs1, vtmp0, vm
RegId srcRegIdxArr[4];
RegId destRegIdxArr[1];
bool vm;
@@ -2037,7 +2049,6 @@ Fault
%(set_vlenb)s;
%(set_vlen)s;
%(vm_decl_rd)s;
%(copy_old_vd)s;
const uint32_t vlmax = vtype_VLMAX(vtype,vlen);
constexpr uint32_t vd_eewb = sizeof(ElemType);
constexpr uint32_t vs1_eewb = sizeof(IndexType);
@@ -2059,6 +2070,7 @@ Fault
[[maybe_unused]] const uint32_t vd_bias =
vd_elems * (vs1_idx % vd_split_num) / vd_split_num;
%(code)s;
%(op_wb)s;
@@ -2216,8 +2228,6 @@ Fault
%(vm_decl_rd)s;
%(copy_old_vd)s;
Vd[0] = this->microIdx != 0 ? old_Vd[0] : Vs1[0];
auto reduce_loop =
[&, this](const auto& f, const auto* _, const auto* vs2) {
vwu tmp_val = Vd[0];
@@ -2271,6 +2281,13 @@ template<typename ElemType>
microop = new VectorNopMicroInst(_machInst);
this->microops.push_back(microop);
}
for (uint32_t i = 0; i < ceil((float) this->vl/micro_vlmax); i++) {
microop = new VPinVdMicroInst(machInst, i, i+1, true);
microop->setFlag(IsDelayedCommit);
this->microops.push_back(microop);
}
// Todo static filter out useless uop
int micro_idx = 0;
for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
@@ -2308,6 +2325,13 @@ template<typename ElemType>
microop = new VectorNopMicroInst(_machInst);
this->microops.push_back(microop);
}
for (uint32_t i = 0; i < ceil((float) this->vl / micro_vlmax); i++) {
microop = new VPinVdMicroInst(machInst, i, num_microops-i, false);
microop->setFlag(IsDelayedCommit);
this->microops.push_back(microop);
}
// Todo static filter out useless uop
int micro_idx = 0;
for (int i = 0; i < num_microops && micro_vl > 0; ++i) {
@@ -2333,9 +2357,9 @@ template<typename ElemType>
class %(class_name)s : public %(base_class)s
{
private:
// vs2, vs1, vs3(old_vd), vm for *.vv, *.vx
// vs2, (old_vd), vm for *.vi
RegId srcRegIdxArr[4];
// vs2, vs1, vm for *.vv, *.vx
// vs2, vm for *.vi
RegId srcRegIdxArr[3];
RegId destRegIdxArr[1];
bool vm;
public:
@@ -2398,7 +2422,6 @@ Fault
[[maybe_unused]]const uint32_t vlmax = vtype_VLMAX(vtype, vlen);
%(vm_decl_rd)s;
%(copy_old_vd)s;
%(code)s;
%(op_wb)s;
@@ -2439,7 +2462,6 @@ Fault
[[maybe_unused]]const uint32_t vlmax = vtype_VLMAX(vtype, vlen);
%(vm_decl_rd)s;
%(copy_old_vd)s;
%(code)s;
%(op_wb)s;

View File

@@ -128,7 +128,9 @@ def template VleMicroConstructor {{
setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _microIdx]);
_numTypedDestRegs[VecRegClass]++;
setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _microIdx]);
if (!_machInst.vtype8.vta || (!_machInst.vm && !_machInst.vtype8.vma)) {
setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _microIdx]);
}
if (!_machInst.vm) {
setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
}
@@ -245,6 +247,9 @@ Fault
status.vs = VPUStatus::DIRTY;
xc->setMiscReg(MISCREG_STATUS, status);
// tail/mask policy: both undisturbed if one is, 1s if none
%(tail_mask_policy_code)s
RiscvISA::vreg_t tmp_v0;
uint8_t *v0;
if(!machInst.vm) {
@@ -815,6 +820,17 @@ def template VlStrideConstructor {{
microop = new VectorNopMicroInst(_machInst);
this->microops.push_back(microop);
}
const uint8_t num_pinvd_microops = ceil((float) this->vl /
num_elems_per_vreg);
for (uint32_t i = 0; i < num_pinvd_microops; i++) {
uint32_t vdNumElems = (vl >= num_elems_per_vreg*(i+1))
? num_elems_per_vreg : vl-num_elems_per_vreg*i;
microop = new VPinVdMicroInst(machInst, i, vdNumElems);
microop->setFlag(IsDelayedCommit);
this->microops.push_back(microop);
}
for (int i = 0; micro_vl > 0; ++i) {
for (int j = 0; j < micro_vl; ++j) {
microop = new %(class_name)sMicro(machInst, i, j, micro_vl);
@@ -838,7 +854,7 @@ def template VlStrideMicroDeclare {{
class %(class_name)s : public %(base_class)s
{
private:
// rs1, rs2, vd, vm
// rs1, rs2, vtmp0, vm
RegId srcRegIdxArr[4];
RegId destRegIdxArr[1];
public:
@@ -869,8 +885,8 @@ def template VlStrideMicroConstructor {{
_numTypedDestRegs[VecRegClass]++;
setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs2]);
// We treat agnostic as undistrubed
setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _regIdx]);
// vtmp0 as dummy src reg to create dependency with pin vd micro
setSrcRegIdx(_numSrcRegs++, vecRegClass[VecMemInternalReg0]);
if (!_machInst.vm) {
setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
}
@@ -983,14 +999,6 @@ Fault
status.vs = VPUStatus::DIRTY;
xc->setMiscReg(MISCREG_STATUS, status);
constexpr uint8_t elem_size = sizeof(Vd[0]);
RiscvISA::vreg_t old_vd;
decltype(Vd) old_Vd = nullptr;
// We treat agnostic as undistrubed
xc->getRegOperand(this, 2, &old_vd);
old_Vd = old_vd.as<std::remove_reference_t<decltype(Vd[0])> >();
RiscvISA::vreg_t tmp_v0;
uint8_t *v0;
if (!machInst.vm) {
@@ -998,17 +1006,6 @@ Fault
v0 = tmp_v0.as<uint8_t>();
}
if (microIdx == 0) {
// treat vma as vmu
// if (machInst.vtype8.vma == 0)
memcpy(Vd, old_Vd, microVl * elem_size);
// treat vta as vtu
// if (machInst.vtype8.vta == 0)
memcpy(Vd + microVl, old_Vd + microVl, vlenb - microVl * elem_size);
} else {
memcpy(Vd, old_Vd, vlenb);
}
size_t ei = this->regIdx * vlenb / sizeof(Vd[0]) + this->microIdx;
if (machInst.vm || elem_mask(v0, ei)) {
memcpy(Mem.as<uint8_t>(), pkt->getPtr<uint8_t>(), pkt->getSize());
@@ -1220,6 +1217,21 @@ template<typename ElemType>
microop = new VectorNopMicroInst(_machInst);
this->microops.push_back(microop);
}
const uint32_t vd_vlmax = vlenb / vd_eewb;
const uint8_t num_pinvdcpyvs_microops = ceil((float) this->vl/vd_vlmax);
for (uint32_t i = 0; i < num_pinvdcpyvs_microops; i++) {
uint32_t vdNumElems = (vl >= vd_vlmax*(i+1)) ? vd_vlmax:vl-vd_vlmax*i;
microop = new VCpyVsMicroInst(machInst, i, machInst.vs2);
microop->setFlag(IsDelayedCommit);
this->microops.push_back(microop);
microop = new VPinVdMicroInst(machInst, i, vdNumElems);
microop->setFlag(IsDelayedCommit);
this->microops.push_back(microop);
}
for (uint32_t i = 0; micro_vl > 0; i++) {
for (uint32_t j = 0; j < micro_vl; ++j) {
uint32_t vdRegIdx = i / vd_split_num;
@@ -1251,8 +1263,8 @@ template<typename ElemType>
class %(class_name)s : public %(base_class)s
{
private:
// rs1, vs2, vd, vm
RegId srcRegIdxArr[4];
// rs1, vs2, vm
RegId srcRegIdxArr[3];
RegId destRegIdxArr[1];
public:
%(class_name)s(ExtMachInst _machInst,
@@ -1283,9 +1295,7 @@ template<typename ElemType>
setDestRegIdx(_numDestRegs++, vecRegClass[_machInst.vd + _vdRegIdx]);
_numTypedDestRegs[VecRegClass]++;
setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vs2 + _vs2RegIdx]);
// We treat agnostic as undistrubed
setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _vdRegIdx]);
setSrcRegIdx(_numSrcRegs++, vecRegClass[VecMemInternalReg0 + _vs2RegIdx]);
if (!_machInst.vm) {
setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
}
@@ -1408,12 +1418,6 @@ Fault
constexpr uint8_t elem_size = sizeof(Vd[0]);
RiscvISA::vreg_t old_vd;;
decltype(Vd) old_Vd = nullptr;
// We treat agnostic as undistrubed
xc->getRegOperand(this, 2, &old_vd);
old_Vd = old_vd.as<std::remove_reference_t<decltype(Vd[0])> >();
RiscvISA::vreg_t tmp_v0;
uint8_t *v0;
if (!machInst.vm) {
@@ -1421,8 +1425,6 @@ Fault
v0 = tmp_v0.as<uint8_t>();
}
memcpy(Vd, old_Vd, vlenb);
size_t ei = this->vdRegIdx * vlenb / elem_size + this->vdElemIdx;
if (machInst.vm || elem_mask(v0, ei)) {
memcpy(Mem.as<uint8_t>(), pkt->getPtr<uint8_t>(), pkt->getSize());
@@ -1690,8 +1692,8 @@ def template VlSegMicroDeclare {{
class %(class_name)s : public %(base_class)s
{
private:
// rs1, rs2, vd, vm
RegId srcRegIdxArr[4];
// rs1, vd, vm
RegId srcRegIdxArr[3];
RegId destRegIdxArr[1];
uint32_t field;
uint32_t numFields;
@@ -1724,8 +1726,10 @@ def template VlSegMicroConstructor {{
(field * numMicroops)]);
_numTypedDestRegs[VecRegClass]++;
setSrcRegIdx(_numSrcRegs++, intRegClass[_machInst.rs1]);
setSrcRegIdx(_numSrcRegs++, vecRegClass[VecMemInternalReg0 + _microIdx +
(field * numMicroops)]);
if (!_machInst.vtype8.vta || (!_machInst.vm && !_machInst.vtype8.vma)) {
setSrcRegIdx(_numSrcRegs++, vecRegClass[_machInst.vd + _microIdx
+ (field * numMicroops)]);
}
if (!_machInst.vm) {
setSrcRegIdx(_numSrcRegs++, vecRegClass[0]);
}
@@ -1843,6 +1847,9 @@ Fault
status.vs = VPUStatus::DIRTY;
xc->setMiscReg(MISCREG_STATUS, status);
// tail/mask policy: both undisturbed if one is, 1s if none
%(tail_mask_policy_code)s
RiscvISA::vreg_t tmp_v0;
uint8_t *v0;
if(!machInst.vm) {