pub fn execute( matrix: &Matrix, input_vector: &Vector, output_partial_sum_vector: &mut SVector, dummy: &impl PimOperand, ) { // Load input vector into GRF-A registers for chunk in input_vector.0.iter() { chunk.execute_read(); } // Execute the MAC instructions without memory barriers for sub_matrix in matrix.0.iter() { for column_block in sub_matrix.fixed_rows::<1>(0).iter() { column_block.execute_read_async(); } } // Verify all memory accesses have finished barrier::dsb(barrier::SY); // Copy the partial sums into the bank for chunk in output_partial_sum_vector .fixed_rows_with_step_mut::(0, 16) .iter_mut() { chunk.execute_write(); } // Execute the EXIT instruction dummy.execute_read(); }