32 lines
923 B
Rust
32 lines
923 B
Rust
pub fn execute<const X16R: usize, const X16C: usize, const R: usize>(
|
|
matrix: &Matrix<X16R, X16C>,
|
|
input_vector: &Vector<X16C>,
|
|
output_partial_sum_vector: &mut SVector<F16x16, R>,
|
|
dummy: &impl PimOperand,
|
|
) {
|
|
// Load input vector into GRF-A registers
|
|
for chunk in input_vector.0.iter() {
|
|
chunk.execute_read();
|
|
}
|
|
|
|
// Execute the MAC instructions without memory barriers
|
|
for sub_matrix in matrix.0.iter() {
|
|
for column_block in sub_matrix.fixed_rows::<1>(0).iter() {
|
|
column_block.execute_read_async();
|
|
}
|
|
}
|
|
|
|
// Verify all memory accesses have finished
|
|
barrier::dsb(barrier::SY);
|
|
|
|
// Copy the partial sums into the bank
|
|
for chunk in output_partial_sum_vector
|
|
.fixed_rows_with_step_mut::<X16R>(0, 16)
|
|
.iter_mut() {
|
|
chunk.execute_write();
|
|
}
|
|
|
|
// Execute the EXIT instruction
|
|
dummy.execute_read();
|
|
}
|