Add rest of slides

This commit is contained in:
2024-09-17 14:20:12 +02:00
parent 8b5923fdef
commit d4f3d7982a
4 changed files with 219 additions and 33 deletions

File diff suppressed because one or more lines are too long

Before

Width:  |  Height:  |  Size: 54 KiB

After

Width:  |  Height:  |  Size: 69 KiB

31
kernel.rs Normal file
View File

@@ -0,0 +1,31 @@
pub fn execute<const X16R: usize, const X16C: usize, const R: usize>( // Runs one GEMV pass: load vector, issue MACs, barrier, write back partial sums, EXIT
matrix: &Matrix<X16R, X16C>, // only the `fixed_rows::<1>(0)` slice of each sub-matrix in `matrix.0` is accessed (async reads)
input_vector: &Vector<X16C>, // every chunk in `input_vector.0` is accessed once via `execute_read`
output_partial_sum_vector: &mut SVector<F16x16, R>, // `X16R` rows selected with step argument 16 are written; NOTE(review): if this is nalgebra, `step` counts skipped rows (stride = step + 1) - confirm 16 is intended
dummy: &impl PimOperand, // operand that is read last; per the comment below, this read executes EXIT
) {
// Load the input vector into the GRF-A registers, one read per chunk
for chunk in input_vector.0.iter() {
chunk.execute_read();
}
// Execute the MAC instructions (async reads, no memory barriers)
for sub_matrix in matrix.0.iter() {
for column_block in sub_matrix.fixed_rows::<1>(0).iter() {
column_block.execute_read_async();
}
}
// Barrier: verify all memory accesses above have finished
barrier::dsb(barrier::SY);
// Copy the partial sums into the bank, one write per selected row
for chunk in output_partial_sum_vector
.fixed_rows_with_step_mut::<X16R>(0, 16)
.iter_mut() {
chunk.execute_write();
}
// Execute the EXIT instruction by reading the dummy operand
dummy.execute_read();
}

12
kernel.s Normal file
View File

@@ -0,0 +1,12 @@
MOV GRF_A #0, BANK
MOV GRF_A #1, BANK
MOV GRF_A #2, BANK
MOV GRF_A #3, BANK
MOV GRF_A #4, BANK
MOV GRF_A #5, BANK
MOV GRF_A #6, BANK
MOV GRF_A #7, BANK
MAC(AAM) GRF_B, BANK, GRF_A
JUMP -1, 7
FILL BANK, GRF_B #0
EXIT

155
main.tex
View File

@@ -3,6 +3,7 @@
\usepackage[style=verbose-ibid]{biblatex} \usepackage[style=verbose-ibid]{biblatex}
\usepackage{datetime} \usepackage{datetime}
\usepackage{tabularray}
\usepackage[inkscapeversion=1]{svg} \usepackage[inkscapeversion=1]{svg}
\addbibresource{references.bib} \addbibresource{references.bib}
@@ -225,8 +226,162 @@
\end{frame} \end{frame}
\begin{frame}[fragile]{Example: GEMV Kernel} \begin{frame}[fragile]{Example: GEMV Kernel}
\begin{center}
\only<1>{
\inputminted[firstline=1,lastline=8, fontsize=\footnotesize]{gas}{kernel.s}
\hrule
\inputminted[firstline=7,lastline=10,fontsize=\footnotesize]{rust}{kernel.rs}
}
\only<2>{
\inputminted[firstline=9,lastline=10, fontsize=\footnotesize]{gas}{kernel.s}
\hrule
\inputminted[firstline=12,lastline=20,fontsize=\footnotesize]{rust}{kernel.rs}
}
\only<3>{
\inputminted[firstline=11,lastline=12,fontsize=\footnotesize]{gas}{kernel.s}
\hrule
\inputminted[firstline=22,lastline=30,fontsize=\footnotesize]{rust}{kernel.rs}
}
\end{center}
\end{frame} \end{frame}
\begin{frame}{Virtual Prototype Platform}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item ARM processor model
\item Bare-metal kernel
\item Custom page table configuration
\begin{itemize}
\item Non-PIM DRAM region mapped as cacheable memory
\item PIM DRAM region mapped as non-cacheable memory
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/bare_metal.svg}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\section{Simulations}
\begin{frame}{Microbenchmarks}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item Vector benchmarks (BLAS level 1)
\begin{itemize}
\item VADD: $z = x + y$
\item VMUL: $z = x \cdot y$
\item HAXPY: $z = a \cdot x + y$
\end{itemize}
\item Matrix-vector benchmarks (BLAS level 2)
\begin{itemize}
\item GEMV: $z = A \cdot x$
\item Simple DNN:
\begin{itemize}
\item $f(x) = z = \mathrm{ReLU}(A \cdot x)$
\item $z_{n+1} = f(z_n)$
\item 5 layers in total
\end{itemize}
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
\begin{tblr}{
hlines,
column{1} = {c},
column{2} = {r},
column{3} = {r},
column{4} = {r},
row{1} = {l},
hline{2} = {2}{-}{solid,black},
}
Level & Vector & GEMV & DNN \\
X1 & 2M & (1k $\times$ 4k) & (256 $\times$ 256) \\
X2 & 4M & (2k $\times$ 4k) & (512 $\times$ 512) \\
X3 & 8M & (4k $\times$ 8k) & (1k $\times$ 1k) \\
X4 & 16M & (8k $\times$ 8k) & (2k $\times$ 2k)
\end{tblr}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{System Configuration}
\begin{columns}[t]
\begin{column}{0.5\textwidth}
Two simulated systems:
\begin{itemize}
\item Generic ARM system
\item Infinite compute system
\begin{itemize}
\item Unrealistically high frequency of 100 GHz
\item Completely memory bound
\item Lower bound of possible speedup
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
Two real GPUs using HBM2:
\begin{itemize}
\item AMD RX Vega 56
\item NVIDIA Tesla V100
\end{itemize}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Speedups / Generic ARM System}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/speedup_normal.svg}
\end{figure}
\end{frame}
\begin{frame}{Speedups / Infinite Compute System}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/speedup_inf.svg}
\end{figure}
\end{frame}
\begin{frame}{Speedups / Samsung}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/samsung.svg}
\end{figure}
\end{frame}
\begin{frame}{Runtimes / Vector Benchmarks}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
\end{figure}
\end{frame}
\begin{frame}{Runtimes / Matrix Benchmarks}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
\end{figure}
\end{frame}
\begin{frame}{Conclusion and Future Work}
\begin{itemize}
\item PIM can accelerate memory-bound workloads
\item Special PIM-friendly memory layouts are required
\end{itemize}
Future work:
\begin{itemize}
\item Implementation of a Linux driver
\item Comparison with real neural networks
\item Consider replacing the library approach with a compiler approach
\item Implement a power model to analyze the power efficiency gains
\end{itemize}
\end{frame}
\section{Thank you for your attention!}
\begin{frame} \begin{frame}
\frametitle{Outline} \frametitle{Outline}
\tableofcontents \tableofcontents