Lay out matrix correctly for additional rows

2024-02-21 16:45:03 +01:00
parent cf83e27f50
commit f04ee8e603
6 changed files with 73 additions and 35 deletions
--- a/pim-os/src/bin/samsung_matrix_vector_multiply.rs
+++ b/pim-os/src/bin/samsung_matrix_vector_multiply.rs
@@ -18,8 +18,9 @@ use pim_os::{
    uart::Uart0,
 };

-const ROWS: usize = 16;
+const ROWS: usize = 32;
 const COLUMNS: usize = 128;
+const X16_ROWS: usize = ROWS / 16;
 const X16_COLUMNS: usize = COLUMNS / 16;

 #[no_mangle]
@@ -29,7 +30,7 @@ pub extern "C" fn main() {
    let mut matrix = SMatrix::<_, ROWS, COLUMNS>::zeros();
    matrix.fill_lower_triangle(F16x1::one(), 0);

-    let pim_matrix = Box::new(pim::continuous_array::Matrix::from(matrix));
+    let pim_matrix = Box::new(pim::continuous_array::Matrix::<X16_ROWS, X16_COLUMNS>::from(matrix));

    let input_vector = SVector::<_, X16_COLUMNS>::from_element(F16x16::one());
    let interleaved_input_vector = Box::new(interleaved_array::Vector::from(input_vector));
--- a/pim-os/src/pim/continuous_array.rs
+++ b/pim-os/src/pim/continuous_array.rs
@@ -4,25 +4,31 @@ use nalgebra::SMatrix;

 #[repr(C, align(65536))]
 #[derive(Debug)]
-pub struct Matrix<const R: usize, const C: usize>(pub SMatrix<F16x16, R, C>);
+pub struct Matrix<const X16R: usize, const X16C: usize>(pub [SMatrix<F16x16, 16, X16C>; X16R]);

-impl<const R: usize, const C: usize> Display for Matrix<R, C> {
+impl<const X16R: usize, const X16C: usize> Display for Matrix<X16R, X16C> {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
-        self.0.fmt(f)
+        for block in self.0.iter() {
+            block.fmt(f)?
+        }
+        Ok(())
    }
 }

-impl<const R: usize, const C: usize, const X16C: usize> From<SMatrix<F16x1, R, C>>
-    for Matrix<R, X16C>
+impl<const R: usize, const X16R: usize, const C: usize, const X16C: usize>
+    From<SMatrix<F16x1, R, C>> for Matrix<X16R, X16C>
 {
    fn from(matrix: SMatrix<F16x1, R, C>) -> Self {
-        Self(SMatrix::from_row_iterator(
-            matrix
-                .transpose()
-                .iter()
-                .map(|e| *e)
-                .array_chunks::<16>()
-                .map(|chunk| F16x16(chunk)),
-        ))
+        Self(core::array::from_fn(|i| {
+            SMatrix::from_row_iterator(
+                matrix
+                    .fixed_rows::<16>(i * 16)
+                    .transpose()
+                    .iter()
+                    .map(|e| *e)
+                    .array_chunks::<16>()
+                    .map(|chunk| F16x16(chunk)),
+            )
+        }))
    }
 }
--- a/pim-os/src/pim/interleaved_array.rs
+++ b/pim-os/src/pim/interleaved_array.rs
@@ -3,19 +3,19 @@ use nalgebra::SVector;

 #[repr(C, align(512))]
 #[derive(Debug)]
-pub struct Vector<const R: usize>(pub [[F16x16; NUMBER_OF_BANKS]; R]);
+pub struct Vector<const X16R: usize>(pub [[F16x16; NUMBER_OF_BANKS]; X16R]);

-impl<const R: usize> Default for Vector<R> {
+impl<const X16R: usize> Default for Vector<X16R> {
    fn default() -> Self {
-        Self([[F16x16::default(); NUMBER_OF_BANKS]; R])
+        Self([[F16x16::default(); NUMBER_OF_BANKS]; X16R])
    }
 }

-impl<const R: usize> From<SVector<F16x16, R>> for Vector<R> {
-    fn from(input_vector: SVector<F16x16, R>) -> Self {
+impl<const X16R: usize> From<SVector<F16x16, X16R>> for Vector<X16R> {
+    fn from(input_vector: SVector<F16x16, X16R>) -> Self {
        let mut interleaved_vector = Self::default();

-        for block_index in 0..R {
+        for block_index in 0..X16R {
            let element = input_vector[block_index];
            for k in 0..NUMBER_OF_BANKS {
                interleaved_vector.0[block_index][k] = element;
--- a/pim-os/src/pim/kernel/samsung_matrix_vector_mul.rs
+++ b/pim-os/src/pim/kernel/samsung_matrix_vector_mul.rs
@@ -46,12 +46,16 @@ pub const KERNEL: Kernel = Kernel([
    },
    Instruction::JUMP {
        offset: -1,
-        count: 7,
+        count: 15,
    },
    Instruction::FILL {
        src: File::GrfB { index: 0 },
        dst: File::Bank,
    },
+    Instruction::FILL {
+        src: File::GrfB { index: 1 },
+        dst: File::Bank,
+    },
    Instruction::EXIT,
    Instruction::NOP,
    Instruction::NOP,
@@ -72,12 +76,11 @@ pub const KERNEL: Kernel = Kernel([
    Instruction::NOP,
    Instruction::NOP,
    Instruction::NOP,
-    Instruction::NOP,
 ]);

-pub fn execute<const R: usize, const C: usize>(
-    matrix: &Matrix<R, C>,
-    input_vector: &interleaved_array::Vector<C>,
+pub fn execute<const X16R: usize, const R: usize, const X16C: usize>(
+    matrix: &Matrix<X16R, X16C>,
+    input_vector: &interleaved_array::Vector<X16C>,
    output_partial_sum_vector: &mut SVector<F16x16, R>,
    dummy: &impl PimOperand,
 ) {
@@ -85,11 +88,18 @@ pub fn execute<const R: usize, const C: usize>(
        block.execute_read();
    }

-    for column_block in matrix.0.fixed_rows::<1>(0).iter() {
-        column_block.execute_read();
+    for row_block in matrix.0.iter() {
+        for column_block in row_block.fixed_rows::<1>(0).iter() {
+            column_block.execute_read();
+        }
    }

-    output_partial_sum_vector.execute_write();
+    for chunk in output_partial_sum_vector
+        .fixed_rows_with_step_mut::<X16R>(0, 16)
+        .iter_mut()
+    {
+        chunk.execute_write();
+    }

    dummy.execute_read();
 }
--- a/pim-os/src/pim/operation.rs
+++ b/pim-os/src/pim/operation.rs
@@ -1,13 +1,17 @@
+use aarch64_cpu::asm::barrier;
+
 pub trait PimOperand {
    fn ptr(&self) -> *const u8;
    fn ptr_mut(&mut self) -> *mut u8;

    fn execute_read(&self) {
        unsafe { core::ptr::read_volatile(self.ptr()) };
+        barrier::dsb(barrier::SY);
    }

    fn execute_write(&mut self) {
        unsafe { core::ptr::write_volatile(self.ptr_mut(), Default::default()) };
+        barrier::dsb(barrier::SY);
    }
 }