diff --git a/pim-os/src/bin/samsung_matrix_vector_multiply.rs b/pim-os/src/bin/samsung_matrix_vector_multiply.rs index 707c838..bb7f240 100644 --- a/pim-os/src/bin/samsung_matrix_vector_multiply.rs +++ b/pim-os/src/bin/samsung_matrix_vector_multiply.rs @@ -6,38 +6,42 @@ extern crate alloc; use aarch64_cpu::asm::barrier; use alloc::boxed::Box; use core::fmt::Write; -use half::f16; use nalgebra::{SMatrix, SVector}; +use num_traits::{One, Zero}; use pim_isa::BankMode; use pim_os::{ pim::{ self, interleaved_array, kernel::samsung_matrix_vector_mul, - vector::{self, F16x1}, + vector::{F16x1, F16x16}, }, uart::Uart0, }; -const ROWS: usize = 32; +const ROWS: usize = 16; const COLUMNS: usize = 128; +const X16_COLUMNS: usize = COLUMNS / 16; #[no_mangle] pub extern "C" fn main() { pim::state::set_kernel(&samsung_matrix_vector_mul::KERNEL); - let mut matrix = Box::new(pim::continuous_array::Matrix::( - SMatrix::zeros(), - )); - matrix.0.fill_lower_triangle(F16x1(f16::ONE), 0); - let input_vector = pim::continuous_array::Matrix::(SVector::from_fn(|_, _| { - F16x1(f16::from_f32(1 as _)) - })); - let mut output_partial_sum_vector = Box::new(pim::continuous_array::Matrix::< - ROWS, - { vector::ELEMENT_COUNT }, - >(SMatrix::zeros())); + let matrix = Box::new(pim::continuous_array::Matrix::( + SMatrix::from_fn(|r, c| { + if c > 0 { + return F16x16::zero(); + } + + let mut entry = F16x16::zero(); + entry.0.iter_mut().take(r).for_each(|val| *val = F16x1::one()); + + entry + }))); - let interleaved_input_vector = Box::new(interleaved_array::Vector::from(&input_vector)); + let input_vector = SVector::<_, X16_COLUMNS>::from_element(F16x16::one()); + let interleaved_input_vector = Box::new(interleaved_array::Vector::from(input_vector)); + + let mut output_partial_sum_vector = Box::new(SVector::::zeros()); let dummy = Box::new(0); @@ -61,11 +65,10 @@ pub extern "C" fn main() { writeln!(Uart0, "{output_partial_sum_vector}").unwrap(); let output_vector = SVector::::from_fn(|r, _| { - output_partial_sum_vector + output_partial_sum_vector[r] .0 - .row(r) .iter() - .fold(F16x1::default(), |acc, val| acc + *val) + .fold(F16x1::zero(), |acc, val| acc + *val) }); writeln!(Uart0, "{output_vector}").unwrap(); diff --git a/pim-os/src/lib.rs b/pim-os/src/lib.rs index f58c6ea..f0537aa 100644 --- a/pim-os/src/lib.rs +++ b/pim-os/src/lib.rs @@ -1,4 +1,3 @@ -#![feature(generic_const_exprs)] #![no_std] use core::sync::atomic::{compiler_fence, Ordering}; diff --git a/pim-os/src/pim/continuous_array.rs b/pim-os/src/pim/continuous_array.rs index 5975fdc..3451acf 100644 --- a/pim-os/src/pim/continuous_array.rs +++ b/pim-os/src/pim/continuous_array.rs @@ -1,12 +1,10 @@ -use super::vector::F16x1; +use super::vector::F16x16; use core::fmt::Display; use nalgebra::SMatrix; #[repr(C, align(65536))] #[derive(Debug)] -pub struct Matrix(pub SMatrix); - -pub type Vector = Matrix; +pub struct Matrix(pub SMatrix); impl Display for Matrix { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { diff --git a/pim-os/src/pim/interleaved_array.rs b/pim-os/src/pim/interleaved_array.rs index 49da9f1..aea4b25 100644 --- a/pim-os/src/pim/interleaved_array.rs +++ b/pim-os/src/pim/interleaved_array.rs @@ -1,37 +1,27 @@ -use super::{array::NUMBER_OF_BANKS, continuous_array, vector::F16x16, vector::ELEMENT_COUNT}; +use super::{array::NUMBER_OF_BANKS, vector::F16x16}; +use nalgebra::SVector; -#[repr(C, align(65536))] +#[repr(C, align(512))] #[derive(Debug)] -pub struct Vector(pub [[F16x16; NUMBER_OF_BANKS]; R / ELEMENT_COUNT]) -where - [(); R / ELEMENT_COUNT]:; +pub struct Vector(pub [[F16x16; NUMBER_OF_BANKS]; R]); -impl Default for Vector -where - [(); R / ELEMENT_COUNT]:, -{ +impl Default for Vector { fn default() -> Self { - Self([[F16x16::default(); NUMBER_OF_BANKS]; R / ELEMENT_COUNT]) + Self([[F16x16::default(); NUMBER_OF_BANKS]; R]) } } -impl From<&continuous_array::Vector> for Vector -where - [(); R / ELEMENT_COUNT]:, -{ - fn from(continuous_vector: &continuous_array::Vector) -> Self { - let mut vector = Self::default(); - let blocks: usize = R / ELEMENT_COUNT; - for block_index in 0..blocks { - let element = - unsafe { *(continuous_vector.0.as_ptr() as *const F16x16).add(block_index) }; +impl From> for Vector { + fn from(input_vector: SVector) -> Self { + let mut interleaved_vector = Self::default(); + + for block_index in 0..R { + let element = input_vector[block_index]; for k in 0..NUMBER_OF_BANKS { - let interleaved_block_index = block_index * NUMBER_OF_BANKS + k; - unsafe { - *(vector.0.as_mut_ptr() as *mut F16x16).add(interleaved_block_index) = element; - } + interleaved_vector.0[block_index][k] = element; } } - vector + + interleaved_vector } } diff --git a/pim-os/src/pim/kernel/samsung_matrix_vector_mul.rs b/pim-os/src/pim/kernel/samsung_matrix_vector_mul.rs index ee6df2d..1a7a0e6 100644 --- a/pim-os/src/pim/kernel/samsung_matrix_vector_mul.rs +++ b/pim-os/src/pim/kernel/samsung_matrix_vector_mul.rs @@ -1,4 +1,7 @@ -use crate::pim::{continuous_array::Matrix, interleaved_array, operation::PimOperand, vector}; +use crate::pim::{ + continuous_array::Matrix, interleaved_array, operation::PimOperand, vector::F16x16, +}; +use nalgebra::SVector; use pim_isa::{File, Instruction, Kernel}; pub const KERNEL: Kernel = Kernel([ @@ -45,17 +48,6 @@ pub const KERNEL: Kernel = Kernel([ offset: -1, count: 7, }, - Instruction::MAC { - src0: File::Bank, - src1: File::GrfA { index: 0 }, - src2: File::GrfB { index: 0 }, - dst: File::GrfB { index: 0 }, - aam: true, - }, - Instruction::JUMP { - offset: -1, - count: 7, - }, Instruction::FILL { src: File::GrfB { index: 0 }, dst: File::Bank, @@ -79,42 +71,22 @@ pub const KERNEL: Kernel = Kernel([ Instruction::NOP, Instruction::NOP, Instruction::NOP, + Instruction::NOP, + Instruction::NOP, ]); pub fn execute( matrix: &Matrix, input_vector: &interleaved_array::Vector, - output_partial_sum_vector: &mut Matrix, + output_partial_sum_vector: &mut SVector, dummy: &impl PimOperand, -) where - [(); C / vector::ELEMENT_COUNT]:, -{ +) { for block in input_vector.0.as_slice().iter() { block.execute_read(); } - for matrix_column in matrix - .0 - .fixed_rows::<1>(0) - .fixed_columns_with_step::<{ C / vector::ELEMENT_COUNT }>(0, vector::ELEMENT_COUNT - 1) - .iter() - { - use core::fmt::Write; - writeln!( - crate::uart::Uart0, - "{:?}", - core::ptr::addr_of!(*matrix_column) - ); - matrix_column.execute_read(); - } - - for matrix_column in matrix - .0 - .fixed_rows::<1>(0) - .fixed_columns_with_step::<{ C / vector::ELEMENT_COUNT }>(4, vector::ELEMENT_COUNT - 1) - .iter() - { - matrix_column.execute_read(); + for column_block in matrix.0.fixed_rows::<1>(0).iter() { + column_block.execute_read(); } output_partial_sum_vector.execute_write(); diff --git a/pim-os/src/pim/vector.rs b/pim-os/src/pim/vector.rs index 3f6e764..d74ce3d 100644 --- a/pim-os/src/pim/vector.rs +++ b/pim-os/src/pim/vector.rs @@ -1,3 +1,5 @@ +use core::fmt::{Debug, Display}; + use half::f16; pub const ELEMENT_COUNT: usize = 16; @@ -8,13 +10,13 @@ pub struct F16x1(pub f16); impl core::fmt::Debug for F16x1 { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - self.0.fmt(f) + Debug::fmt(&self.0, f) } } impl core::fmt::Display for F16x1 { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - self.0.fmt(f) + Display::fmt(&self.0, f) } } @@ -66,6 +68,12 @@ impl core::ops::MulAssign for F16x1 { #[derive(Default, Debug, Clone, Copy, PartialEq)] pub struct F16x16(pub [F16x1; ELEMENT_COUNT]); +impl Display for F16x16 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "{:?}", self.0) + } +} + impl num_traits::identities::Zero for F16x16 { fn zero() -> Self { Self([F16x1::zero(); ELEMENT_COUNT])