From d4f3d7982a64edbe2cd522e79b5af0d5b74d5fee Mon Sep 17 00:00:00 2001 From: Derek Christ Date: Tue, 17 Sep 2024 14:20:12 +0200 Subject: [PATCH] Add rest of slides --- images/bare_metal.svg | 54 ++++++--------- kernel.rs | 31 +++++++++ kernel.s | 12 ++++ main.tex | 155 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 219 insertions(+), 33 deletions(-) create mode 100644 kernel.rs create mode 100644 kernel.s diff --git a/images/bare_metal.svg b/images/bare_metal.svg index 23b03be..742724d 100644 --- a/images/bare_metal.svg +++ b/images/bare_metal.svg @@ -8,7 +8,7 @@ version="1.1" id="svg1" xml:space="preserve" - inkscape:version="1.3.2 (091e20ef0f, 2023-11-25, custom)" + inkscape:version="1.3.2 (091e20ef0f, 2023-11-25)" sodipodi:docname="bare_metal.svg" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" @@ -25,14 +25,14 @@ inkscape:deskcolor="#d1d1d1" inkscape:document-units="mm" inkscape:zoom="1.6997475" - inkscape:cx="169.14277" - inkscape:cy="38.829297" - inkscape:window-width="2194" - inkscape:window-height="1158" + inkscape:cx="168.84861" + inkscape:cy="37.946813" + inkscape:window-width="1536" + inkscape:window-height="932" inkscape:window-x="0" inkscape:window-y="0" inkscape:window-maximized="1" - inkscape:current-layer="layer1" />Bare-MetalKernelARM Processor Model + clip-path="url(#clipPath4052)" /> diff --git a/kernel.rs b/kernel.rs new file mode 100644 index 0000000..80a52ba --- /dev/null +++ b/kernel.rs @@ -0,0 +1,31 @@ +pub fn execute( + matrix: &Matrix, + input_vector: &Vector, + output_partial_sum_vector: &mut SVector, + dummy: &impl PimOperand, +) { + // Load input vector into GRF-A registers + for chunk in input_vector.0.iter() { + chunk.execute_read(); + } + + // Execute the MAC instructions without memory barriers + for sub_matrix in matrix.0.iter() { + for column_block in sub_matrix.fixed_rows::<1>(0).iter() { + column_block.execute_read_async(); + } + } + + 
// Verify all memory accesses have finished + barrier::dsb(barrier::SY); + + // Copy the partial sums into the bank + for chunk in output_partial_sum_vector + .fixed_rows_with_step_mut::(0, 16) + .iter_mut() { + chunk.execute_write(); + } + + // Execute the EXIT instruction + dummy.execute_read(); +} diff --git a/kernel.s b/kernel.s new file mode 100644 index 0000000..473805d --- /dev/null +++ b/kernel.s @@ -0,0 +1,12 @@ +MOV GRF_A #0, BANK +MOV GRF_A #1, BANK +MOV GRF_A #2, BANK +MOV GRF_A #3, BANK +MOV GRF_A #4, BANK +MOV GRF_A #5, BANK +MOV GRF_A #6, BANK +MOV GRF_A #7, BANK +MAC(AAM) GRF_B, BANK, GRF_A +JUMP -1, 7 +FILL BANK, GRF_B #0 +EXIT diff --git a/main.tex b/main.tex index 87c7008..9cf1ec7 100644 --- a/main.tex +++ b/main.tex @@ -3,6 +3,7 @@ \usepackage[style=verbose-ibid]{biblatex} \usepackage{datetime} +\usepackage{tabularray} \usepackage[inkscapeversion=1]{svg} \addbibresource{references.bib} @@ -225,8 +226,162 @@ \end{frame} \begin{frame}[fragile]{Example: GEMV Kernel} + \begin{center} + \only<1>{ + \inputminted[firstline=1,lastline=8, fontsize=\footnotesize]{gas}{kernel.s} + \hrule + \inputminted[firstline=7,lastline=10,fontsize=\footnotesize]{rust}{kernel.rs} + } + \only<2>{ + \inputminted[firstline=9,lastline=10, fontsize=\footnotesize]{gas}{kernel.s} + \hrule + \inputminted[firstline=12,lastline=20,fontsize=\footnotesize]{rust}{kernel.rs} + } + \only<3>{ + \inputminted[firstline=11,lastline=12,fontsize=\footnotesize]{gas}{kernel.s} + \hrule + \inputminted[firstline=22,lastline=30,fontsize=\footnotesize]{rust}{kernel.rs} + } + \end{center} \end{frame} +\begin{frame}{Virtual Prototype Platform} + \begin{columns} + \begin{column}{0.5\textwidth} + \begin{itemize} + \item ARM processor model + \item Bare-metal kernel + \item Custom page table configuration + \begin{itemize} + \item Non-PIM DRAM region mapped as cacheable memory + \item PIM DRAM region mapped as non-cacheable memory + \end{itemize} + \end{itemize} + \end{column} + 
\begin{column}{0.5\textwidth} + \begin{figure} + \includesvg[width=0.8\textwidth]{images/bare_metal.svg} + \end{figure} + \end{column} + \end{columns} +\end{frame} + +\section{Simulations} + +\begin{frame}{Microbenchmarks} + \begin{columns} + \begin{column}{0.5\textwidth} + \begin{itemize} + \item Vector benchmarks (BLAS level 1) + \item VADD: $z = x + y$ + \item VMUL: $z = x \cdot y$ + \item HAXPY: $z = a \cdot x + y$ + + \item Vector-Matrix benchmarks (BLAS level 2) + \begin{itemize} + \item GEMV: $z = A \cdot x$ + \item Simple DNN: + \begin{itemize} + \item $f(x) = z = ReLU(A \cdot x)$ + \item $z_{n+1} = f(z_n)$ + \item 5 layers in total + \end{itemize} + \end{itemize} + \end{itemize} + \end{column} + \begin{column}{0.5\textwidth} + \begin{figure} + \begin{tblr}{ + hlines, + column{1} = {c}, + column{2} = {r}, + column{3} = {r}, + column{4} = {r}, + row{1} = {l}, + hline{2} = {2}{-}{solid,black}, + } + Level & Vector & GEMV & DNN \\ + X1 & 2M & (1k $\times$ 4k) & (256 $\times$ 256) \\ + X2 & 4M & (2k $\times$ 4k) & (512 $\times$ 512) \\ + X3 & 8M & (4k $\times$ 8k) & (1k $\times$ 1k) \\ + X4 & 16M & (8k $\times$ 8k) & (2k $\times$ 2k) + \end{tblr} + \end{figure} + \end{column} + \end{columns} +\end{frame} + +\begin{frame}{System Configuration} + \begin{columns}[t] + \begin{column}{0.5\textwidth} + Two simulated systems: + \begin{itemize} + \item Generic ARM system + \item Infinite compute system + \begin{itemize} + \item Unrealistically high frequency of 100 GHz + \item Completely memory bound + \item Lower bound of possible speedup + \end{itemize} + \end{itemize} + \end{column} + \begin{column}{0.5\textwidth} + Two real GPUs using HBM2: + \begin{itemize} + \item AMD RX Vega 56 + \item NVIDIA Tesla V100 + \end{itemize} + \end{column} + \end{columns} +\end{frame} + +\begin{frame}{Speedups / Generic ARM System} + \begin{figure} + \includesvg[width=0.8\textwidth]{images/speedup_normal.svg} + \end{figure} +\end{frame} + +\begin{frame}{Speedups / Infinite Compute System} 
+ \begin{figure} + \includesvg[width=0.8\textwidth]{images/speedup_inf.svg} + \end{figure} +\end{frame} + +\begin{frame}{Speedups / Samsung} + \begin{figure} + \includesvg[width=0.8\textwidth]{images/samsung.svg} + \end{figure} +\end{frame} + +\begin{frame}{Runtimes / Vector Benchmarks} + \begin{figure} + \includesvg[width=0.8\textwidth]{images/runtimes_vector.svg} + \end{figure} +\end{frame} + +\begin{frame}{Runtimes / Matrix Benchmarks} + \begin{figure} + \includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg} + \end{figure} +\end{frame} + +\begin{frame}{Conclusion and Future Work} + \begin{itemize} + \item PIM can accelerate memory-bound workloads + \item Special PIM-friendly memory layouts are required + \end{itemize} + + Future work: + \begin{itemize} + \item Implementation of Linux driver + \item Comparison with real neural networks + \item Consider replacing library approach with compiler approach + \item Implement a power model to analyze the power efficiency gains + \end{itemize} +\end{frame} + +\section{Thank you for your attention!} + \begin{frame} \frametitle{Outline} \tableofcontents