Add rest of slides

2024-09-17 14:20:12 +02:00
parent 8b5923fdef
commit d4f3d7982a
4 changed files with 219 additions and 33 deletions
--- a/images/bare_metal.svg
+++ b/images/bare_metal.svg
--- a/kernel.rs
+++ b/kernel.rs
@@ -0,0 +1,31 @@
 pub fn execute<const X16R: usize, const X16C: usize, const R: usize>(
    matrix: &Matrix<X16R, X16C>, // row 0 of every sub-matrix is read to issue the MACs
    input_vector: &Vector<X16C>, // read chunk by chunk to fill GRF-A
    output_partial_sum_vector: &mut SVector<F16x16, R>, // written to store the partial sums
    dummy: &impl PimOperand, // any operand; reading it issues EXIT
 ) { // GEMV kernel driver: each access below steps the PIM microkernel (statement order matters)
    // Load the input vector into the GRF-A registers (the MOV instructions)
    for chunk in input_vector.0.iter() {
        chunk.execute_read();
    }
    // Issue the MAC instructions without memory barriers between the reads
    for sub_matrix in matrix.0.iter() {
        for column_block in sub_matrix.fixed_rows::<1>(0).iter() {
            column_block.execute_read_async();
        }
    }
    // Full-system data synchronization barrier: wait for all accesses to finish
    barrier::dsb(barrier::SY);
    // Write the partial sums back into the bank (the FILL instruction)
    for chunk in output_partial_sum_vector
        .fixed_rows_with_step_mut::<X16R>(0, 16)
        .iter_mut() {
            chunk.execute_write();
    }
    // Issue the EXIT instruction with a dummy read
    dummy.execute_read();
 }
--- a/kernel.s
+++ b/kernel.s
@@ -0,0 +1,12 @@
 MOV GRF_A #0, BANK
 MOV GRF_A #1, BANK
 MOV GRF_A #2, BANK
 MOV GRF_A #3, BANK
 MOV GRF_A #4, BANK
 MOV GRF_A #5, BANK
 MOV GRF_A #6, BANK
 MOV GRF_A #7, BANK
 MAC(AAM) GRF_B, BANK, GRF_A
 JUMP -1, 7
 FILL BANK, GRF_B #0
 EXIT
--- a/main.tex
+++ b/main.tex
@@ -3,6 +3,7 @@
 \usepackage[style=verbose-ibid]{biblatex}
 \usepackage{datetime}
 \usepackage{tabularray}
 \usepackage[inkscapeversion=1]{svg}
 \addbibresource{references.bib}
@@ -225,8 +226,162 @@
 \end{frame}
 \begin{frame}[fragile]{Example: GEMV Kernel}
    \begin{center}
        \only<1>{
            \inputminted[firstline=1,lastline=8, fontsize=\footnotesize]{gas}{kernel.s}
            \hrule
            \inputminted[firstline=7,lastline=10,fontsize=\footnotesize]{rust}{kernel.rs}
        }
        \only<2>{
            \inputminted[firstline=9,lastline=10, fontsize=\footnotesize]{gas}{kernel.s}
            \hrule
            \inputminted[firstline=12,lastline=20,fontsize=\footnotesize]{rust}{kernel.rs}
        }
        \only<3>{
            \inputminted[firstline=11,lastline=12,fontsize=\footnotesize]{gas}{kernel.s}
            \hrule
            \inputminted[firstline=22,lastline=30,fontsize=\footnotesize]{rust}{kernel.rs}
        }
    \end{center}
 \end{frame}
 \begin{frame}{Virtual Prototype Platform}
    \begin{columns}
        \begin{column}{0.5\textwidth}
            \begin{itemize}
                \item ARM processor model
                \item Bare-metal kernel
                \item Custom page table configuration
                      \begin{itemize}
                          \item Non-PIM DRAM region mapped as cacheable memory
                          \item PIM DRAM region mapped as non-cacheable memory
                      \end{itemize}
            \end{itemize}
        \end{column}
        \begin{column}{0.5\textwidth}
            \begin{figure}
                \includesvg[width=0.8\textwidth]{images/bare_metal.svg}
            \end{figure}
        \end{column}
    \end{columns}
 \end{frame}
 \section{Simulations}
 \begin{frame}{Microbenchmarks}
    \begin{columns}
        \begin{column}{0.5\textwidth}
            \begin{itemize}
                \item Vector benchmarks (BLAS level 1)
                      \begin{itemize}
                          \item VADD: $z = x + y$
                          \item VMUL: $z = x \cdot y$
                          \item HAXPY: $z = a \cdot x + y$
                      \end{itemize}
                \item Matrix-vector benchmarks (BLAS level 2)
                      \begin{itemize}
                          \item GEMV: $z = A \cdot x$
                          \item Simple DNN:
                                \begin{itemize}
                                    \item $f(x) = z = \mathrm{ReLU}(A \cdot x)$
                                    \item $z_{n+1} = f(z_n)$
                                    \item 5 layers in total
                                \end{itemize}
                      \end{itemize}
            \end{itemize}
        \end{column}
        \begin{column}{0.5\textwidth}
            \begin{figure}
                \begin{tblr}{
                        hlines,
                        column{1} = {c},
                        column{2} = {r},
                        column{3} = {r},
                        column{4} = {r},
                        row{1} = {l},
                        hline{2} = {2}{-}{solid,black},
                    }
                    Level & Vector & GEMV             & DNN                \\
                    X1    & 2M     & (1k $\times$ 4k) & (256 $\times$ 256) \\
                    X2    & 4M     & (2k $\times$ 4k) & (512 $\times$ 512) \\
                    X3    & 8M     & (4k $\times$ 8k) & (1k $\times$ 1k)   \\
                    X4    & 16M    & (8k $\times$ 8k) & (2k $\times$ 2k)
                \end{tblr}
            \end{figure}
        \end{column}
    \end{columns}
 \end{frame}
 \begin{frame}{System Configuration}
    \begin{columns}[t]
        \begin{column}{0.5\textwidth}
            Two simulated systems:
            \begin{itemize}
                \item Generic ARM system
                \item Infinite compute system
                      \begin{itemize}
                          \item Unrealistically high frequency of 100 GHz
                          \item Completely memory bound
                          \item Lower bound of possible speedup
                      \end{itemize}
            \end{itemize}
        \end{column}
        \begin{column}{0.5\textwidth}
            Two real GPUs using HBM2:
            \begin{itemize}
                \item AMD RX Vega 56
                \item NVIDIA Tesla V100
            \end{itemize}
        \end{column}
    \end{columns}
 \end{frame}
 \begin{frame}{Speedups / Generic ARM System}
    \begin{figure}
        \includesvg[width=0.8\textwidth]{images/speedup_normal.svg}
    \end{figure}
 \end{frame}
 \begin{frame}{Speedups / Infinite Compute System}
    \begin{figure}
        \includesvg[width=0.8\textwidth]{images/speedup_inf.svg}
    \end{figure}
 \end{frame}
 \begin{frame}{Speedups / Samsung}
    \begin{figure}
        \includesvg[width=0.8\textwidth]{images/samsung.svg}
    \end{figure}
 \end{frame}
 \begin{frame}{Runtimes / Vector Benchmarks}
    \begin{figure}
        \includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
    \end{figure}
 \end{frame}
 \begin{frame}{Runtimes / Matrix Benchmarks}
    \begin{figure}
        \includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
    \end{figure}
 \end{frame}
 \begin{frame}{Conclusion and Future Work}
    \begin{itemize}
        \item PIM can accelerate memory-bound workloads
        \item Special PIM-friendly memory layouts are required
    \end{itemize}
    Future work:
    \begin{itemize}
        \item Implementation of a Linux driver
        \item Comparison with real neural networks
        \item Consider replacing library approach with compiler approach
        \item Implement a power model to analyze the power efficiency gains
    \end{itemize}
 \end{frame}
 \section{Thank you for your attention!}
 \begin{frame}
    \frametitle{Outline}
    \tableofcontents