Add rest of slides

2024-09-17 14:20:12 +02:00
parent 8b5923fdef
commit d4f3d7982a
4 changed files with 219 additions and 33 deletions
--- a/images/bare_metal.svg
+++ b/images/bare_metal.svg
--- a/kernel.rs
+++ b/kernel.rs
@@ -0,0 +1,31 @@
+pub fn execute<const X16R: usize, const X16C: usize, const R: usize>(
+    matrix: &Matrix<X16R, X16C>,
+    input_vector: &Vector<X16C>,
+    output_partial_sum_vector: &mut SVector<F16x16, R>,
+    dummy: &impl PimOperand,
+) {
+    // Load the input vector into the GRF-A registers, chunk by chunk
+    for chunk in input_vector.0.iter() {
+        chunk.execute_read();
+    }
+
+    // Issue the MAC instructions as async reads, without barriers
+    for sub_matrix in matrix.0.iter() {
+        for column_block in sub_matrix.fixed_rows::<1>(0).iter() {
+            column_block.execute_read_async();
+        }
+    }
+
+    // Wait until all outstanding memory accesses have completed
+    barrier::dsb(barrier::SY);
+
+    // Write the partial sums back into the bank
+    for chunk in output_partial_sum_vector
+        .fixed_rows_with_step_mut::<X16R>(0, 16)
+        .iter_mut() {
+            chunk.execute_write();
+    }
+
+    // Execute the EXIT instruction via a read of `dummy`
+    dummy.execute_read();
+}
--- a/kernel.s
+++ b/kernel.s
@@ -0,0 +1,12 @@
+MOV GRF_A #0, BANK
+MOV GRF_A #1, BANK
+MOV GRF_A #2, BANK
+MOV GRF_A #3, BANK
+MOV GRF_A #4, BANK
+MOV GRF_A #5, BANK
+MOV GRF_A #6, BANK
+MOV GRF_A #7, BANK
+MAC(AAM) GRF_B, BANK, GRF_A
+JUMP -1, 7
+FILL BANK, GRF_B #0
+EXIT
--- a/main.tex
+++ b/main.tex
@@ -3,6 +3,7 @@

 \usepackage[style=verbose-ibid]{biblatex}
 \usepackage{datetime}
+\usepackage{tabularray}
 \usepackage[inkscapeversion=1]{svg}

 \addbibresource{references.bib}
@@ -225,8 +226,162 @@
 \end{frame}

 \begin{frame}[fragile]{Example: GEMV Kernel}
+    \begin{center}
+        \only<1>{
+            \inputminted[firstline=1,lastline=8, fontsize=\footnotesize]{gas}{kernel.s}
+            \hrule
+            \inputminted[firstline=7,lastline=10,fontsize=\footnotesize]{rust}{kernel.rs}
+        }
+        \only<2>{
+            \inputminted[firstline=9,lastline=10, fontsize=\footnotesize]{gas}{kernel.s}
+            \hrule
+            \inputminted[firstline=12,lastline=20,fontsize=\footnotesize]{rust}{kernel.rs}
+        }
+        \only<3>{
+            \inputminted[firstline=11,lastline=12,fontsize=\footnotesize]{gas}{kernel.s}
+            \hrule
+            \inputminted[firstline=22,lastline=30,fontsize=\footnotesize]{rust}{kernel.rs}
+        }
+    \end{center}
 \end{frame}

+\begin{frame}{Virtual Prototype Platform}
+    \begin{columns}
+        \begin{column}{0.5\textwidth}
+            \begin{itemize}
+                \item ARM processor model
+                \item Bare-metal kernel
+                \item Custom page table configuration
+                      \begin{itemize}
+                          \item Non-PIM DRAM region mapped as cacheable memory
+                          \item PIM DRAM region mapped as non-cacheable memory
+                      \end{itemize}
+            \end{itemize}
+        \end{column}
+        \begin{column}{0.5\textwidth}
+            \begin{figure}
+                \includesvg[width=0.8\textwidth]{images/bare_metal.svg}
+            \end{figure}
+        \end{column}
+    \end{columns}
+\end{frame}
+
+\section{Simulations}
+
+\begin{frame}{Microbenchmarks}
+    \begin{columns}
+        \begin{column}{0.5\textwidth}
+            \begin{itemize}
+                \item Vector benchmarks (BLAS level 1)
+                \item VADD: $z = x + y$
+                \item VMUL: $z = x \cdot y$
+                \item HAXPY: $z = a \cdot x + y$
+
+                \item Matrix-Vector benchmarks (BLAS level 2)
+                      \begin{itemize}
+                          \item GEMV: $z = A \cdot x$
+                          \item Simple DNN:
+                                \begin{itemize}
+                                    \item $f(x) = z = \mathrm{ReLU}(A \cdot x)$
+                                    \item $z_{n+1} = f(z_n)$
+                                    \item 5 layers in total
+                                \end{itemize}
+                      \end{itemize}
+            \end{itemize}
+        \end{column}
+        \begin{column}{0.5\textwidth}
+            \begin{figure}
+                \begin{tblr}{
+                        hlines,
+                        column{1} = {c},
+                        column{2} = {r},
+                        column{3} = {r},
+                        column{4} = {r},
+                        row{1} = {l},
+                        hline{2} = {2}{-}{solid,black},
+                    }
+                    Level & Vector & GEMV             & DNN                \\
+                    X1    & 2M     & (1k $\times$ 4k) & (256 $\times$ 256) \\
+                    X2    & 4M     & (2k $\times$ 4k) & (512 $\times$ 512) \\
+                    X3    & 8M     & (4k $\times$ 8k) & (1k $\times$ 1k)   \\
+                    X4    & 16M    & (8k $\times$ 8k) & (2k $\times$ 2k)
+                \end{tblr}
+            \end{figure}
+        \end{column}
+    \end{columns}
+\end{frame}
+
+\begin{frame}{System Configuration}
+    \begin{columns}[t]
+        \begin{column}{0.5\textwidth}
+            Two simulated systems:
+            \begin{itemize}
+                \item Generic ARM system
+                \item Infinite compute system
+                      \begin{itemize}
+                          \item Unrealistically high frequency of 100 GHz
+                          \item Completely memory bound
+                          \item Lower bound of possible speedup
+                      \end{itemize}
+            \end{itemize}
+        \end{column}
+        \begin{column}{0.5\textwidth}
+            Two real GPUs using HBM2:
+            \begin{itemize}
+                \item AMD RX Vega 56
+                \item NVIDIA Tesla V100
+            \end{itemize}
+        \end{column}
+    \end{columns}
+\end{frame}
+
+\begin{frame}{Speedups / Generic ARM System}
+    \begin{figure}
+        \includesvg[width=0.8\textwidth]{images/speedup_normal.svg}
+    \end{figure}
+\end{frame}
+
+\begin{frame}{Speedups / Infinite Compute System}
+    \begin{figure}
+        \includesvg[width=0.8\textwidth]{images/speedup_inf.svg}
+    \end{figure}
+\end{frame}
+
+\begin{frame}{Speedups / Samsung}
+    \begin{figure}
+        \includesvg[width=0.8\textwidth]{images/samsung.svg}
+    \end{figure}
+\end{frame}
+
+\begin{frame}{Runtimes / Vector Benchmarks}
+    \begin{figure}
+        \includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
+    \end{figure}
+\end{frame}
+
+\begin{frame}{Runtimes / Matrix Benchmarks}
+    \begin{figure}
+        \includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
+    \end{figure}
+\end{frame}
+
+\begin{frame}{Conclusion and Future Work}
+    \begin{itemize}
+        \item PIM can accelerate memory-bound workloads
+        \item Special PIM-friendly memory layouts are required
+    \end{itemize}
+
+    Future work:
+    \begin{itemize}
+        \item Implementation of Linux driver
+        \item Comparison with real neural networks
+        \item Consider replacing library approach with compiler approach
+        \item Implement a power model to analyze the power efficiency gains
+    \end{itemize}
+\end{frame}
+
+\section{Thank you for your attention!}
+
 \begin{frame}
    \frametitle{Outline}
    \tableofcontents