Implement Samsung's memory layout
This commit is contained in:
61
pim-os/src/bin/samsung_matrix_vector_multiply.rs
Normal file
61
pim-os/src/bin/samsung_matrix_vector_multiply.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
#![no_std]
|
||||
#![no_main]
|
||||
|
||||
extern crate alloc;
|
||||
|
||||
use aarch64_cpu::asm::barrier;
|
||||
use alloc::boxed::Box;
|
||||
use core::fmt::Write;
|
||||
use half::f16;
|
||||
use nalgebra::{SMatrix, SVector};
|
||||
use pim_isa::BankMode;
|
||||
use pim_os::{
|
||||
pim::{self, interleaved_array, kernel::samsung_matrix_vector_mul, vector::F16x1},
|
||||
uart::Uart0,
|
||||
};
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn main() {
|
||||
pim::state::set_kernel(&samsung_matrix_vector_mul::KERNEL);
|
||||
|
||||
let mut matrix = Box::new(pim::continuous_array::Matrix::<32, 128>(SMatrix::zeros()));
|
||||
matrix.0.fill_lower_triangle(F16x1(f16::ONE), 0);
|
||||
let input_vector = pim::continuous_array::Matrix::<128, 1>(SVector::from_fn(|_, _| {
|
||||
F16x1(f16::from_f32(1 as _))
|
||||
}));
|
||||
let mut output_partial_sum_vector =
|
||||
Box::new(pim::continuous_array::Matrix::<32, 16>(SMatrix::zeros()));
|
||||
|
||||
let interleaved_input_vector = Box::new(interleaved_array::Vector::from(&input_vector));
|
||||
|
||||
let dummy = Box::new(0);
|
||||
|
||||
// Verify everything is correctly initialized before PIM operation
|
||||
barrier::dsb(barrier::SY);
|
||||
|
||||
// Execute kernel
|
||||
{
|
||||
pim::state::set_bank_mode(BankMode::PimAllBank);
|
||||
|
||||
samsung_matrix_vector_mul::execute(
|
||||
matrix.as_ref(),
|
||||
interleaved_input_vector.as_ref(),
|
||||
output_partial_sum_vector.as_mut(),
|
||||
dummy.as_ref(),
|
||||
);
|
||||
|
||||
pim::state::set_bank_mode(BankMode::SingleBank);
|
||||
}
|
||||
|
||||
writeln!(Uart0, "{output_partial_sum_vector}").unwrap();
|
||||
|
||||
let output_vector = SVector::<F16x1, 32>::from_fn(|r, _| {
|
||||
output_partial_sum_vector
|
||||
.0
|
||||
.row(r)
|
||||
.iter()
|
||||
.fold(F16x1::default(), |acc, val| acc + *val)
|
||||
});
|
||||
|
||||
writeln!(Uart0, "{output_vector}").unwrap();
|
||||
}
|
||||
@@ -1,5 +1,8 @@
|
||||
pub mod array;
|
||||
pub mod config;
|
||||
pub mod continuous_array;
|
||||
pub mod interleaved_array;
|
||||
pub mod kernel;
|
||||
pub mod vector;
|
||||
pub mod operation;
|
||||
pub mod state;
|
||||
pub mod vector;
|
||||
|
||||
15
pim-os/src/pim/continuous_array.rs
Normal file
15
pim-os/src/pim/continuous_array.rs
Normal file
@@ -0,0 +1,15 @@
|
||||
use super::vector::F16x1;
|
||||
use core::fmt::Display;
|
||||
use nalgebra::SMatrix;
|
||||
|
||||
#[repr(C, align(65536))]
|
||||
#[derive(Debug)]
|
||||
pub struct Matrix<const R: usize, const C: usize>(pub SMatrix<F16x1, R, C>);
|
||||
|
||||
pub type Vector<const R: usize> = Matrix<R, 1>;
|
||||
|
||||
impl<const R: usize, const C: usize> Display for Matrix<R, C> {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
37
pim-os/src/pim/interleaved_array.rs
Normal file
37
pim-os/src/pim/interleaved_array.rs
Normal file
@@ -0,0 +1,37 @@
|
||||
use super::{array::NUMBER_OF_BANKS, continuous_array, vector::F16x16, vector::ELEMENT_COUNT};
|
||||
|
||||
#[repr(C, align(65536))]
|
||||
#[derive(Debug)]
|
||||
pub struct Vector<const R: usize>(pub [[F16x16; NUMBER_OF_BANKS]; R / ELEMENT_COUNT])
|
||||
where
|
||||
[(); R / ELEMENT_COUNT]:;
|
||||
|
||||
impl<const R: usize> Default for Vector<R>
|
||||
where
|
||||
[(); R / ELEMENT_COUNT]:,
|
||||
{
|
||||
fn default() -> Self {
|
||||
Self([[F16x16::default(); NUMBER_OF_BANKS]; R / ELEMENT_COUNT])
|
||||
}
|
||||
}
|
||||
|
||||
impl<const R: usize> From<&continuous_array::Vector<R>> for Vector<R>
|
||||
where
|
||||
[(); R / ELEMENT_COUNT]:,
|
||||
{
|
||||
fn from(continuous_vector: &continuous_array::Vector<R>) -> Self {
|
||||
let mut vector = Self::default();
|
||||
let blocks: usize = R / ELEMENT_COUNT;
|
||||
for block_index in 0..blocks {
|
||||
let element =
|
||||
unsafe { *(continuous_vector.0.as_ptr() as *const F16x16).add(block_index) };
|
||||
for k in 0..NUMBER_OF_BANKS {
|
||||
let interleaved_block_index = block_index * NUMBER_OF_BANKS + k;
|
||||
unsafe {
|
||||
*(vector.0.as_mut_ptr() as *mut F16x16).add(interleaved_block_index) = element;
|
||||
}
|
||||
}
|
||||
}
|
||||
vector
|
||||
}
|
||||
}
|
||||
@@ -2,3 +2,4 @@ pub mod matrix_matrix_add;
|
||||
pub mod matrix_matrix_mul;
|
||||
pub mod matrix_scalar_mul;
|
||||
pub mod matrix_vector_mul;
|
||||
pub mod samsung_matrix_vector_mul;
|
||||
|
||||
99
pim-os/src/pim/kernel/samsung_matrix_vector_mul.rs
Normal file
99
pim-os/src/pim/kernel/samsung_matrix_vector_mul.rs
Normal file
@@ -0,0 +1,99 @@
|
||||
use crate::pim::{continuous_array::Matrix, interleaved_array, operation::PimOperand, vector};
|
||||
use pim_isa::{File, Instruction, Kernel};
|
||||
|
||||
pub const KERNEL: Kernel = Kernel([
|
||||
Instruction::MOV {
|
||||
src: File::Bank,
|
||||
dst: File::GrfA { index: 0 },
|
||||
},
|
||||
Instruction::MOV {
|
||||
src: File::Bank,
|
||||
dst: File::GrfA { index: 1 },
|
||||
},
|
||||
Instruction::MOV {
|
||||
src: File::Bank,
|
||||
dst: File::GrfA { index: 2 },
|
||||
},
|
||||
Instruction::MOV {
|
||||
src: File::Bank,
|
||||
dst: File::GrfA { index: 3 },
|
||||
},
|
||||
Instruction::MOV {
|
||||
src: File::Bank,
|
||||
dst: File::GrfA { index: 4 },
|
||||
},
|
||||
Instruction::MOV {
|
||||
src: File::Bank,
|
||||
dst: File::GrfA { index: 5 },
|
||||
},
|
||||
Instruction::MOV {
|
||||
src: File::Bank,
|
||||
dst: File::GrfA { index: 6 },
|
||||
},
|
||||
Instruction::MOV {
|
||||
src: File::Bank,
|
||||
dst: File::GrfA { index: 7 },
|
||||
},
|
||||
Instruction::MAC {
|
||||
src0: File::Bank,
|
||||
src1: File::GrfA { index: 0 },
|
||||
src2: File::GrfB { index: 0 },
|
||||
dst: File::GrfB { index: 0 },
|
||||
aam: true,
|
||||
},
|
||||
Instruction::JUMP {
|
||||
offset: -1,
|
||||
count: 7,
|
||||
},
|
||||
Instruction::FILL {
|
||||
src: File::GrfB { index: 0 },
|
||||
dst: File::Bank,
|
||||
},
|
||||
Instruction::EXIT,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
Instruction::NOP,
|
||||
]);
|
||||
|
||||
pub fn execute<const R: usize, const C: usize>(
|
||||
matrix: &Matrix<R, C>,
|
||||
input_vector: &interleaved_array::Vector<C>,
|
||||
output_partial_sum_vector: &mut Matrix<R, { vector::ELEMENT_COUNT }>,
|
||||
dummy: &impl PimOperand,
|
||||
) where
|
||||
[(); C / vector::ELEMENT_COUNT]:,
|
||||
{
|
||||
for block in input_vector.0.as_slice().iter() {
|
||||
block.execute_read();
|
||||
}
|
||||
|
||||
for matrix_column in matrix
|
||||
.0
|
||||
.fixed_rows::<1>(0)
|
||||
.fixed_columns_with_step::<{ C / vector::ELEMENT_COUNT }>(0, vector::ELEMENT_COUNT)
|
||||
.iter()
|
||||
{
|
||||
matrix_column.execute_read();
|
||||
}
|
||||
|
||||
output_partial_sum_vector.execute_write();
|
||||
|
||||
dummy.execute_read();
|
||||
}
|
||||
22
pim-os/src/pim/operation.rs
Normal file
22
pim-os/src/pim/operation.rs
Normal file
@@ -0,0 +1,22 @@
|
||||
/// A value whose memory location can be used as the target of a single volatile
/// byte access.
///
/// The accesses are volatile, so the compiler neither removes nor merges them.
/// Only the first byte of the operand is touched.
pub trait PimOperand {
    /// Returns the address of the operand as a byte pointer.
    fn ptr(&self) -> *const u8;

    /// Returns the address of the operand as a mutable byte pointer.
    fn ptr_mut(&mut self) -> *mut u8;

    /// Performs one volatile read of the operand's first byte and discards the
    /// value.
    fn execute_read(&self) {
        // SAFETY: implementors return a pointer derived from a live `&self`; the
        // caller guarantees the operand occupies at least one readable byte.
        let _ = unsafe { core::ptr::read_volatile(self.ptr()) };
    }

    /// Performs one volatile write of a zero byte to the operand's first byte.
    fn execute_write(&mut self) {
        // SAFETY: implementors return a pointer derived from a live `&mut self`;
        // the caller guarantees the operand occupies at least one writable byte.
        unsafe { core::ptr::write_volatile(self.ptr_mut(), 0u8) };
    }
}

/// Every sized type can act as an operand; its own address is used.
impl<T> PimOperand for T {
    fn ptr(&self) -> *const u8 {
        (self as *const T).cast::<u8>()
    }

    fn ptr_mut(&mut self) -> *mut u8 {
        (self as *mut T).cast::<u8>()
    }
}
|
||||
@@ -1,6 +1,6 @@
|
||||
use half::f16;
|
||||
|
||||
const FLOATING_POINT_UNITS: usize = 16;
|
||||
pub const ELEMENT_COUNT: usize = 16;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Default, Clone, Copy, PartialEq)]
|
||||
@@ -64,21 +64,21 @@ impl core::ops::MulAssign<F16x1> for F16x1 {
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Default, Debug, Clone, Copy, PartialEq)]
|
||||
pub struct F16x16(pub [F16x1; FLOATING_POINT_UNITS]);
|
||||
pub struct F16x16(pub [F16x1; ELEMENT_COUNT]);
|
||||
|
||||
impl num_traits::identities::Zero for F16x16 {
|
||||
fn zero() -> Self {
|
||||
Self([F16x1::zero(); FLOATING_POINT_UNITS])
|
||||
Self([F16x1::zero(); ELEMENT_COUNT])
|
||||
}
|
||||
|
||||
fn is_zero(&self) -> bool {
|
||||
self.0 == [F16x1::zero(); FLOATING_POINT_UNITS]
|
||||
self.0 == [F16x1::zero(); ELEMENT_COUNT]
|
||||
}
|
||||
}
|
||||
|
||||
impl num_traits::identities::One for F16x16 {
|
||||
fn one() -> Self {
|
||||
Self([F16x1::one(); FLOATING_POINT_UNITS])
|
||||
Self([F16x1::one(); ELEMENT_COUNT])
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user