Implement Samsung's memory layout

2024-01-20 17:24:33 +01:00
parent 283b9e74a3
commit 400774df6f
8 changed files with 244 additions and 6 deletions
--- a/pim-os/src/bin/samsung_matrix_vector_multiply.rs
+++ b/pim-os/src/bin/samsung_matrix_vector_multiply.rs
@@ -0,0 +1,61 @@
+#![no_std]
+#![no_main]
+
+extern crate alloc;
+
+use aarch64_cpu::asm::barrier;
+use alloc::boxed::Box;
+use core::fmt::Write;
+use half::f16;
+use nalgebra::{SMatrix, SVector};
+use pim_isa::BankMode;
+use pim_os::{
+    pim::{self, interleaved_array, kernel::samsung_matrix_vector_mul, vector::F16x1},
+    uart::Uart0,
+};
+
+/// Entry point: runs the Samsung-layout matrix-vector kernel and prints the result.
+///
+/// Builds a 32x128 matrix whose lower triangle (diagonal included) is one, a
+/// 128-element input vector of ones and a zeroed 32x16 partial-sum matrix, runs
+/// [`samsung_matrix_vector_mul::execute`] between switching the bank mode to
+/// `PimAllBank` and back to `SingleBank`, then writes the partial sums and their
+/// row-wise reduction to `Uart0`.
+///
+/// # Panics
+///
+/// Panics if writing to `Uart0` fails.
+#[no_mangle]
+pub extern "C" fn main() {
+    pim::state::set_kernel(&samsung_matrix_vector_mul::KERNEL);
+
+    let mut matrix = Box::new(pim::continuous_array::Matrix::<32, 128>(SMatrix::zeros()));
+    matrix.0.fill_lower_triangle(F16x1(f16::ONE), 0);
+
+    // Same `f16::ONE` constant as the matrix fill above, instead of an inferred `1 as _` cast.
+    let input_vector =
+        pim::continuous_array::Matrix::<128, 1>(SVector::from_fn(|_, _| F16x1(f16::ONE)));
+    let mut output_partial_sum_vector =
+        Box::new(pim::continuous_array::Matrix::<32, 16>(SMatrix::zeros()));
+
+    let interleaved_input_vector = Box::new(interleaved_array::Vector::from(&input_vector));
+
+    // Extra operand; `execute` reads it after all of its other accesses.
+    let dummy = Box::new(0);
+
+    // Full-system data synchronization barrier; presumably this makes sure every
+    // initialization write above has completed before the PIM operation starts.
+    barrier::dsb(barrier::SY);
+
+    // Execute kernel
+    {
+        pim::state::set_bank_mode(BankMode::PimAllBank);
+
+        samsung_matrix_vector_mul::execute(
+            matrix.as_ref(),
+            interleaved_input_vector.as_ref(),
+            output_partial_sum_vector.as_mut(),
+            dummy.as_ref(),
+        );
+
+        pim::state::set_bank_mode(BankMode::SingleBank);
+    }
+
+    writeln!(Uart0, "{output_partial_sum_vector}").unwrap();
+
+    // Reduce each row of 16 partial sums to a single output element.
+    let output_vector = SVector::<F16x1, 32>::from_fn(|r, _| {
+        output_partial_sum_vector
+            .0
+            .row(r)
+            .iter()
+            .fold(F16x1::default(), |acc, val| acc + *val)
+    });
+
+    writeln!(Uart0, "{output_vector}").unwrap();
+}
--- a/pim-os/src/pim.rs
+++ b/pim-os/src/pim.rs
@@ -1,5 +1,8 @@
 pub mod array;
 pub mod config;
+pub mod continuous_array;
+pub mod interleaved_array;
 pub mod kernel;
-pub mod vector;
+pub mod operation;
 pub mod state;
+pub mod vector;
--- a/pim-os/src/pim/continuous_array.rs
+++ b/pim-os/src/pim/continuous_array.rs
@@ -0,0 +1,15 @@
+use super::vector::F16x1;
+use core::fmt::Display;
+use nalgebra::SMatrix;
+
+/// Newtype around a statically sized nalgebra matrix of [`F16x1`] elements.
+///
+/// The wrapper is `repr(C)` and aligned to 65536 bytes.
+// NOTE(review): presumably the 64 KiB alignment matches a boundary of the PIM address
+// mapping — confirm against the memory configuration.
+#[repr(C, align(65536))]
+#[derive(Debug)]
+pub struct Matrix<const R: usize, const C: usize>(pub SMatrix<F16x1, R, C>);
+
+/// Column vector with `R` rows, expressed as a single-column [`Matrix`].
+pub type Vector<const R: usize> = Matrix<R, 1>;
+
+impl<const R: usize, const C: usize> Display for Matrix<R, C> {
+    /// Delegates to the `Display` implementation of the wrapped matrix.
+    fn fmt(&self, formatter: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        let Self(inner) = self;
+        Display::fmt(inner, formatter)
+    }
+}
--- a/pim-os/src/pim/interleaved_array.rs
+++ b/pim-os/src/pim/interleaved_array.rs
@@ -0,0 +1,37 @@
+use super::{array::NUMBER_OF_BANKS, continuous_array, vector::F16x16, vector::ELEMENT_COUNT};
+
+/// Vector storage made of `R / ELEMENT_COUNT` rows, each holding `NUMBER_OF_BANKS`
+/// [`F16x16`] entries.
+///
+/// Indexing is `self.0[block][bank]`; the wrapper is `repr(C)` and aligned to 65536 bytes.
+#[repr(C, align(65536))]
+#[derive(Debug)]
+pub struct Vector<const R: usize>(pub [[F16x16; NUMBER_OF_BANKS]; R / ELEMENT_COUNT])
+where
+    [(); R / ELEMENT_COUNT]:;
+
+impl<const R: usize> Default for Vector<R>
+where
+    [(); R / ELEMENT_COUNT]:,
+{
+    /// Creates a vector in which every entry is `F16x16::default()`.
+    fn default() -> Self {
+        let per_bank = [F16x16::default(); NUMBER_OF_BANKS];
+        Self([per_bank; R / ELEMENT_COUNT])
+    }
+}
+
+impl<const R: usize> From<&continuous_array::Vector<R>> for Vector<R>
+where
+    [(); R / ELEMENT_COUNT]:,
+{
+    /// Builds the interleaved layout from a contiguous vector.
+    ///
+    /// The input is split into consecutive blocks of `ELEMENT_COUNT` elements, and block `i`
+    /// is copied into every one of the `NUMBER_OF_BANKS` entries of row `i`. Elements after
+    /// the last complete block (when `R` is not a multiple of `ELEMENT_COUNT`) are ignored.
+    fn from(continuous_vector: &continuous_array::Vector<R>) -> Self {
+        let mut vector = Self::default();
+        // `chunks_exact` yields exactly `R / ELEMENT_COUNT` full blocks, one per row of
+        // `vector.0`, so the copy needs neither raw-pointer arithmetic nor `unsafe`.
+        let blocks = continuous_vector.0.as_slice().chunks_exact(ELEMENT_COUNT);
+        for (bank_copies, block) in vector.0.iter_mut().zip(blocks) {
+            let mut element = F16x16::default();
+            element.0.copy_from_slice(block);
+            bank_copies.fill(element);
+        }
+        vector
+    }
+}
--- a/pim-os/src/pim/kernel.rs
+++ b/pim-os/src/pim/kernel.rs
@@ -2,3 +2,4 @@ pub mod matrix_matrix_add;
 pub mod matrix_matrix_mul;
 pub mod matrix_scalar_mul;
 pub mod matrix_vector_mul;
+pub mod samsung_matrix_vector_mul;
--- a/pim-os/src/pim/kernel/samsung_matrix_vector_mul.rs
+++ b/pim-os/src/pim/kernel/samsung_matrix_vector_mul.rs
@@ -0,0 +1,99 @@
+use crate::pim::{continuous_array::Matrix, interleaved_array, operation::PimOperand, vector};
+use pim_isa::{File, Instruction, Kernel};
+
+pub const KERNEL: Kernel = Kernel([ // PIM program for the Samsung-layout matrix-vector multiply; 32 entries in total
+    Instruction::MOV { // entries 0-7: move from the bank into GRF_A[0] .. GRF_A[7]
+        src: File::Bank,
+        dst: File::GrfA { index: 0 },
+    },
+    Instruction::MOV {
+        src: File::Bank,
+        dst: File::GrfA { index: 1 },
+    },
+    Instruction::MOV {
+        src: File::Bank,
+        dst: File::GrfA { index: 2 },
+    },
+    Instruction::MOV {
+        src: File::Bank,
+        dst: File::GrfA { index: 3 },
+    },
+    Instruction::MOV {
+        src: File::Bank,
+        dst: File::GrfA { index: 4 },
+    },
+    Instruction::MOV {
+        src: File::Bank,
+        dst: File::GrfA { index: 5 },
+    },
+    Instruction::MOV {
+        src: File::Bank,
+        dst: File::GrfA { index: 6 },
+    },
+    Instruction::MOV {
+        src: File::Bank,
+        dst: File::GrfA { index: 7 },
+    },
+    Instruction::MAC { // operands: bank, GRF_A[0] and GRF_B[0]; result goes to GRF_B[0]
+        src0: File::Bank,
+        src1: File::GrfA { index: 0 },
+        src2: File::GrfB { index: 0 },
+        dst: File::GrfB { index: 0 },
+        aam: true, // NOTE(review): presumably lets the access address select the register index — confirm in pim_isa
+    },
+    Instruction::JUMP { // NOTE(review): presumably repeats the preceding MAC 7 more times — confirm in pim_isa
+        offset: -1,
+        count: 7,
+    },
+    Instruction::FILL { // GRF_B[0] is the source and the bank is the destination
+        src: File::GrfB { index: 0 },
+        dst: File::Bank,
+    },
+    Instruction::EXIT,
+    Instruction::NOP, // the remaining 20 entries are NOPs that pad the array to 32 instructions
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+]);
+
+/// Issues the memory accesses for the Samsung-layout matrix-vector kernel, in a fixed order.
+///
+/// 1. Reads every block row of `input_vector`.
+/// 2. Reads `C / ELEMENT_COUNT` elements of row 0 of `matrix`, selected by
+///    `fixed_columns_with_step`.
+/// 3. Writes to `output_partial_sum_vector`.
+/// 4. Reads `dummy`.
+///
+/// Every access is a single-byte volatile access at the operand's address, and the
+/// write stores a zero byte there (see `PimOperand`).
+// NOTE(review): nalgebra documents `step` as the number of columns skipped between the
+// selected ones, so a step of `ELEMENT_COUNT` visits columns 0, ELEMENT_COUNT + 1,
+// 2 * (ELEMENT_COUNT + 1), ... — confirm this stride is the intended one.
+pub fn execute<const R: usize, const C: usize>(
+    matrix: &Matrix<R, C>,
+    input_vector: &interleaved_array::Vector<C>,
+    output_partial_sum_vector: &mut Matrix<R, { vector::ELEMENT_COUNT }>,
+    dummy: &impl PimOperand,
+) where
+    [(); C / vector::ELEMENT_COUNT]:,
+{
+    // One read per block row of the interleaved input vector.
+    for bank_blocks in &input_vector.0 {
+        bank_blocks.execute_read();
+    }
+
+    // One read per selected element of the first matrix row. The view chain stays inside
+    // the loop header so the intermediate views live for the whole loop.
+    for matrix_element in matrix
+        .0
+        .fixed_rows::<1>(0)
+        .fixed_columns_with_step::<{ C / vector::ELEMENT_COUNT }>(0, vector::ELEMENT_COUNT)
+        .iter()
+    {
+        matrix_element.execute_read();
+    }
+
+    // Single write to the output operand, followed by the trailing read of `dummy`.
+    output_partial_sum_vector.execute_write();
+
+    dummy.execute_read();
+}
--- a/pim-os/src/pim/operation.rs
+++ b/pim-os/src/pim/operation.rs
@@ -0,0 +1,22 @@
+/// Value whose address can be accessed with a single-byte volatile read or write.
+pub trait PimOperand {
+    /// Address of the operand as a byte pointer.
+    fn ptr(&self) -> *const u8;
+    /// Mutable address of the operand as a byte pointer.
+    fn ptr_mut(&mut self) -> *mut u8;
+
+    /// Performs one volatile read of the byte at [`ptr`](Self::ptr) and discards the value.
+    fn execute_read(&self) {
+        let address = self.ptr();
+        // SAFETY: with the blanket impl below the pointer comes from a live reference;
+        // this assumes the operand is at least one byte long.
+        let _ = unsafe { address.read_volatile() };
+    }
+
+    /// Performs one volatile write of a zero byte to [`ptr_mut`](Self::ptr_mut).
+    ///
+    /// Note that this overwrites the first byte of the operand.
+    fn execute_write(&mut self) {
+        let address = self.ptr_mut();
+        // SAFETY: with the blanket impl below the pointer comes from a live exclusive
+        // reference; this assumes the operand is at least one byte long.
+        unsafe { address.write_volatile(0u8) };
+    }
+}
+
+/// Every sized type is an operand; its pointers are the address of the value itself.
+impl<T> PimOperand for T {
+    fn ptr(&self) -> *const u8 {
+        (self as *const T).cast()
+    }
+
+    fn ptr_mut(&mut self) -> *mut u8 {
+        (self as *mut T).cast()
+    }
+}
--- a/pim-os/src/pim/vector.rs
+++ b/pim-os/src/pim/vector.rs
@@ -1,6 +1,6 @@
 use half::f16;

-const FLOATING_POINT_UNITS: usize = 16;
+pub const ELEMENT_COUNT: usize = 16;

 #[repr(C)]
 #[derive(Default, Clone, Copy, PartialEq)]
@@ -64,21 +64,21 @@ impl core::ops::MulAssign<F16x1> for F16x1 {

 #[repr(C)]
 #[derive(Default, Debug, Clone, Copy, PartialEq)]
-pub struct F16x16(pub [F16x1; FLOATING_POINT_UNITS]);
+pub struct F16x16(pub [F16x1; ELEMENT_COUNT]);

 impl num_traits::identities::Zero for F16x16 {
    fn zero() -> Self {
-        Self([F16x1::zero(); FLOATING_POINT_UNITS])
+        Self([F16x1::zero(); ELEMENT_COUNT])
    }

    fn is_zero(&self) -> bool {
-        self.0 == [F16x1::zero(); FLOATING_POINT_UNITS]
+        self.0 == [F16x1::zero(); ELEMENT_COUNT]
    }
 }

 impl num_traits::identities::One for F16x16 {
    fn one() -> Self {
-        Self([F16x1::one(); FLOATING_POINT_UNITS])
+        Self([F16x1::one(); ELEMENT_COUNT])
    }
 }