diff --git a/pim-os/src/bin/samsung_matrix_vector_multiply.rs b/pim-os/src/bin/samsung_matrix_vector_multiply.rs
new file mode 100644
index 0000000..1983ac1
--- /dev/null
+++ b/pim-os/src/bin/samsung_matrix_vector_multiply.rs
@@ -0,0 +1,65 @@
+#![no_std]
+#![no_main]
+
+extern crate alloc;
+
+use aarch64_cpu::asm::barrier;
+use alloc::boxed::Box;
+use core::fmt::Write;
+use half::f16;
+use nalgebra::{SMatrix, SVector};
+use pim_isa::BankMode;
+use pim_os::{
+    pim::{self, interleaved_array, kernel::samsung_matrix_vector_mul, vector::F16x1},
+    uart::Uart0,
+};
+
+/// Runs the Samsung-style matrix-vector multiplication kernel on a 32x128
+/// lower-triangular matrix of ones and a 128-element vector of ones, then
+/// prints the partial sums and the reduced result vector over UART0.
+#[no_mangle]
+pub extern "C" fn main() {
+    pim::state::set_kernel(&samsung_matrix_vector_mul::KERNEL);
+
+    let mut matrix = Box::new(pim::continuous_array::Matrix::<32, 128>(SMatrix::zeros()));
+    matrix.0.fill_lower_triangle(F16x1(f16::ONE), 0);
+    let input_vector =
+        pim::continuous_array::Matrix::<128, 1>(SVector::from_fn(|_, _| F16x1(f16::ONE)));
+    let mut output_partial_sum_vector =
+        Box::new(pim::continuous_array::Matrix::<32, 16>(SMatrix::zeros()));
+
+    let interleaved_input_vector = Box::new(interleaved_array::Vector::from(&input_vector));
+
+    // Heap-allocated scratch word; `execute` issues one extra read on it.
+    let dummy = Box::new(0);
+
+    // Make sure all initialization writes have completed before PIM operation
+    barrier::dsb(barrier::SY);
+
+    // Execute kernel
+    {
+        pim::state::set_bank_mode(BankMode::PimAllBank);
+
+        samsung_matrix_vector_mul::execute(
+            matrix.as_ref(),
+            interleaved_input_vector.as_ref(),
+            output_partial_sum_vector.as_mut(),
+            dummy.as_ref(),
+        );
+
+        pim::state::set_bank_mode(BankMode::SingleBank);
+    }
+
+    writeln!(Uart0, "{output_partial_sum_vector}").unwrap();
+
+    // Reduce each row of partial sums to a single output element.
+    let output_vector = SVector::<F16x1, 32>::from_fn(|r, _| {
+        output_partial_sum_vector
+            .0
+            .row(r)
+            .iter()
+            .fold(F16x1::default(), |acc, val| acc + *val)
+    });
+
+    writeln!(Uart0, "{output_vector}").unwrap();
+}
diff --git a/pim-os/src/pim.rs b/pim-os/src/pim.rs
index 58ee855..2ccbcd4 100644
--- a/pim-os/src/pim.rs
+++ b/pim-os/src/pim.rs
@@ -1,5 +1,8 @@
 pub mod array;
 pub mod config;
+pub mod continuous_array;
+pub mod interleaved_array;
 pub mod kernel;
-pub mod vector;
+pub mod operation;
 pub mod state;
+pub mod vector;
diff --git a/pim-os/src/pim/continuous_array.rs b/pim-os/src/pim/continuous_array.rs
new file mode 100644
index 0000000..5975fdc
--- /dev/null
+++ b/pim-os/src/pim/continuous_array.rs
@@ -0,0 +1,18 @@
+use super::vector::F16x1;
+use core::fmt::Display;
+use nalgebra::SMatrix;
+
+/// Statically sized `R`x`C` matrix of [`F16x1`] elements, aligned to 64 KiB.
+#[repr(C, align(65536))]
+#[derive(Debug)]
+pub struct Matrix<const R: usize, const C: usize>(pub SMatrix<F16x1, R, C>);
+
+/// Column vector with `R` rows, stored as a single-column [`Matrix`].
+pub type Vector<const R: usize> = Matrix<R, 1>;
+
+impl<const R: usize, const C: usize> Display for Matrix<R, C> {
+    /// Formats the wrapped matrix using its own `Display` implementation.
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        Display::fmt(&self.0, f)
+    }
+}
diff --git a/pim-os/src/pim/interleaved_array.rs b/pim-os/src/pim/interleaved_array.rs
new file mode 100644
index 0000000..49da9f1
--- /dev/null
+++ b/pim-os/src/pim/interleaved_array.rs
@@ -0,0 +1,38 @@
+use super::{array::NUMBER_OF_BANKS, continuous_array, vector::F16x16, vector::ELEMENT_COUNT};
+
+/// Vector of `R` elements stored as `R / ELEMENT_COUNT` blocks, with every
+/// block repeated `NUMBER_OF_BANKS` times, aligned to 64 KiB.
+#[repr(C, align(65536))]
+#[derive(Debug)]
+pub struct Vector<const R: usize>(pub [[F16x16; NUMBER_OF_BANKS]; R / ELEMENT_COUNT])
+where
+    [(); R / ELEMENT_COUNT]:;
+
+impl<const R: usize> Default for Vector<R>
+where
+    [(); R / ELEMENT_COUNT]:,
+{
+    /// Creates a vector whose blocks are all `F16x16::default()`.
+    fn default() -> Self {
+        Self([[F16x16::default(); NUMBER_OF_BANKS]; R / ELEMENT_COUNT])
+    }
+}
+
+impl<const R: usize> From<&continuous_array::Vector<R>> for Vector<R>
+where
+    [(); R / ELEMENT_COUNT]:,
+{
+    /// Splits `continuous_vector` into blocks of `ELEMENT_COUNT` elements and
+    /// stores one copy of each block per bank. Trailing elements that do not
+    /// fill a whole block are dropped.
+    fn from(continuous_vector: &continuous_array::Vector<R>) -> Self {
+        let mut vector = Self::default();
+        let chunks = continuous_vector.0.as_slice().chunks_exact(ELEMENT_COUNT);
+        for (banks, chunk) in vector.0.iter_mut().zip(chunks) {
+            let mut block = F16x16::default();
+            block.0.copy_from_slice(chunk);
+            *banks = [block; NUMBER_OF_BANKS];
+        }
+        vector
+    }
+}
diff --git a/pim-os/src/pim/kernel.rs b/pim-os/src/pim/kernel.rs
index eb48da6..b7f3750 100644
--- a/pim-os/src/pim/kernel.rs
+++ b/pim-os/src/pim/kernel.rs
@@ -2,3 +2,4 @@ pub mod matrix_matrix_add;
 pub mod matrix_matrix_mul;
 pub mod matrix_scalar_mul;
 pub mod matrix_vector_mul;
+pub mod samsung_matrix_vector_mul;
diff --git a/pim-os/src/pim/kernel/samsung_matrix_vector_mul.rs b/pim-os/src/pim/kernel/samsung_matrix_vector_mul.rs
new file mode 100644
index 0000000..56eaf7f
--- /dev/null
+++ b/pim-os/src/pim/kernel/samsung_matrix_vector_mul.rs
@@ -0,0 +1,109 @@
+use crate::pim::{continuous_array::Matrix, interleaved_array, operation::PimOperand, vector};
+use pim_isa::{File, Instruction, Kernel};
+
+/// Instruction sequence for the matrix-vector kernel: eight `MOV`s from the
+/// bank into GRF-A 0 to 7, a `MAC` into GRF-B 0 followed by a `JUMP` with
+/// offset -1 and count 7, a `FILL` from GRF-B 0 to the bank, `EXIT`, and
+/// `NOP` padding up to 32 entries.
+pub const KERNEL: Kernel = Kernel([
+    Instruction::MOV {
+        src: File::Bank,
+        dst: File::GrfA { index: 0 },
+    },
+    Instruction::MOV {
+        src: File::Bank,
+        dst: File::GrfA { index: 1 },
+    },
+    Instruction::MOV {
+        src: File::Bank,
+        dst: File::GrfA { index: 2 },
+    },
+    Instruction::MOV {
+        src: File::Bank,
+        dst: File::GrfA { index: 3 },
+    },
+    Instruction::MOV {
+        src: File::Bank,
+        dst: File::GrfA { index: 4 },
+    },
+    Instruction::MOV {
+        src: File::Bank,
+        dst: File::GrfA { index: 5 },
+    },
+    Instruction::MOV {
+        src: File::Bank,
+        dst: File::GrfA { index: 6 },
+    },
+    Instruction::MOV {
+        src: File::Bank,
+        dst: File::GrfA { index: 7 },
+    },
+    Instruction::MAC {
+        src0: File::Bank,
+        src1: File::GrfA { index: 0 },
+        src2: File::GrfB { index: 0 },
+        dst: File::GrfB { index: 0 },
+        aam: true,
+    },
+    Instruction::JUMP {
+        offset: -1,
+        count: 7,
+    },
+    Instruction::FILL {
+        src: File::GrfB { index: 0 },
+        dst: File::Bank,
+    },
+    Instruction::EXIT,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+    Instruction::NOP,
+]);
+
+/// Issues the volatile memory accesses for one run of the kernel: one read per
+/// block of `input_vector`, one read per selected column of the first row of
+/// `matrix`, one write to `output_partial_sum_vector`, and a read of `dummy`.
+pub fn execute<const R: usize, const C: usize>(
+    matrix: &Matrix<R, C>,
+    input_vector: &interleaved_array::Vector<C>,
+    output_partial_sum_vector: &mut Matrix<R, { vector::ELEMENT_COUNT }>,
+    dummy: &impl PimOperand,
+) where
+    [(); C / vector::ELEMENT_COUNT]:,
+{
+    for block in &input_vector.0 {
+        block.execute_read();
+    }
+
+    // NOTE(review): nalgebra documents the step as the number of columns skipped
+    // between picks, which would make this visit columns 0, 17, 34, ... rather
+    // than every `ELEMENT_COUNT`-th column. Confirm which one is intended.
+    for matrix_column in matrix
+        .0
+        .fixed_rows::<1>(0)
+        .fixed_columns_with_step::<{ C / vector::ELEMENT_COUNT }>(0, vector::ELEMENT_COUNT)
+        .iter()
+    {
+        matrix_column.execute_read();
+    }
+
+    output_partial_sum_vector.execute_write();
+
+    dummy.execute_read();
+}
diff --git a/pim-os/src/pim/operation.rs b/pim-os/src/pim/operation.rs
new file mode 100644
index 0000000..728fb2e
--- /dev/null
+++ b/pim-os/src/pim/operation.rs
@@ -0,0 +1,29 @@
+/// A value whose address can be the target of a single-byte volatile access.
+pub trait PimOperand {
+    /// Address of the first byte of the operand.
+    fn ptr(&self) -> *const u8;
+    /// Mutable address of the first byte of the operand.
+    fn ptr_mut(&mut self) -> *mut u8;
+
+    /// Performs a volatile read of the operand's first byte; the value is discarded.
+    fn execute_read(&self) {
+        // SAFETY: `ptr()` comes from a live reference (assumes `Self` is not zero-sized).
+        unsafe { core::ptr::read_volatile(self.ptr()) };
+    }
+
+    /// Performs a volatile write of a zero byte to the operand's first byte.
+    fn execute_write(&mut self) {
+        // SAFETY: `ptr_mut()` comes from a live `&mut` (assumes `Self` is not zero-sized).
+        unsafe { core::ptr::write_volatile(self.ptr_mut(), Default::default()) };
+    }
+}
+
+impl<T> PimOperand for T {
+    fn ptr(&self) -> *const u8 {
+        core::ptr::addr_of!(*self) as *const _
+    }
+
+    fn ptr_mut(&mut self) -> *mut u8 {
+        core::ptr::addr_of_mut!(*self) as *mut _
+    }
+}
diff --git a/pim-os/src/pim/vector.rs b/pim-os/src/pim/vector.rs
index cc55721..3f6e764 100644
--- a/pim-os/src/pim/vector.rs
+++ b/pim-os/src/pim/vector.rs
@@ -1,6 +1,6 @@
 use half::f16;
 
-const FLOATING_POINT_UNITS: usize = 16;
+pub const ELEMENT_COUNT: usize = 16;
 
 #[repr(C)]
 #[derive(Default, Clone, Copy, PartialEq)]
@@ -64,21 +64,21 @@ impl core::ops::MulAssign for F16x1 {
 
 #[repr(C)]
 #[derive(Default, Debug, Clone, Copy, PartialEq)]
-pub struct F16x16(pub [F16x1; FLOATING_POINT_UNITS]);
+pub struct F16x16(pub [F16x1; ELEMENT_COUNT]);
 
 impl num_traits::identities::Zero for F16x16 {
     fn zero() -> Self {
-        Self([F16x1::zero(); FLOATING_POINT_UNITS])
+        Self([F16x1::zero(); ELEMENT_COUNT])
     }
 
     fn is_zero(&self) -> bool {
-        self.0 == [F16x1::zero(); FLOATING_POINT_UNITS]
+        self.0 == [F16x1::zero(); ELEMENT_COUNT]
     }
 }
 
 impl num_traits::identities::One for F16x16 {
     fn one() -> Self {
-        Self([F16x1::one(); FLOATING_POINT_UNITS])
+        Self([F16x1::one(); ELEMENT_COUNT])
     }
 }
 