% Files
% memsys24-slides/main.tex
% 2024-09-17 16:39:30 +02:00
%
% 487 lines
% 15 KiB
% TeX
% Raw Blame History
%
% (Web-viewer header preserved as comments; it is not part of the LaTeX source.)
\documentclass[aspectratio=169]{beamer}
% \setbeameroption{show notes on second screen=right}
\usetheme{UniWue}
\usepackage[style=verbose-ibid]{biblatex}
\usepackage{datetime}
\usepackage{tabularray}
\usepackage{tikz}
\usepackage[inkscapeversion=1]{svg}
\usetikzlibrary{fit}
\usetikzlibrary{positioning}
\usetikzlibrary{matrix}
\addbibresource{references.bib}
\setbeamerfont{footnote}{size=\tiny}
\newdate{presentationday}{01}{10}{2024}
\title{PIMSys}
\subtitle{A Virtual Prototype for Processing in Memory}
\author{
Derek~Christ\inst{1}
\and
Lukas~Steiner\inst{2}
\and
Matthias~Jung\inst{3}
\and
Norbert~Wehn\inst{2}
}
\institute{
\inst{1}
Fraunhofer IESE
\quad
\inst{2}
RPTU Kaiserslautern-Landau
\quad
\inst{3}
University of Würzburg
}
\date{MEMSYS~2024}
\begin{document}
\frame{\titlepage}
\section{Introduction}
\begin{frame}{Energy Demand of Applications}
Total compute energy approaches the world's energy production\autocite{src2021}
\begin{figure}
\includesvg[width=0.6\textwidth]{images/world_energy}
\end{figure}
\end{frame}
\note[itemize]{
\item compute 2x every two years
\item energy production 2\% per year
\item to meet future compute demands, drastic improvements in energy efficiency
}
\begin{frame}{Memory Bound Workloads}
AI applications become increasingly memory-bound\autocite{ivobolsens2023}
\begin{figure}
\includesvg[width=0.5\textwidth]{images/gpt}
\end{figure}
\end{frame}
\note[itemize]{
\item Emerging AI applications become increasingly memory-bound
\item Roofline model
\item Not limited by compute power but by memory
\item researchers begin to consider PIM to circumvent memory bottleneck
\item (drastically more parameters in GPT-3, operational intensity goes down)
}
\section{Processing-in-Memory}
\begin{frame}{Workloads for PIM}
Fully connected neural network layers:
\begin{itemize}
\item Large weight matrix - \alert{does not fit onto cache}
\item No data reuse - \alert{cache is useless}
\end{itemize}
\begin{figure}
\includesvg[width=0.6\textwidth]{images/dnn}
\end{figure}
\end{frame}
\begin{frame}{Workloads for PIM}
Convolutional layers:
\begin{itemize}
\item Small filter matrix - \alert{does fit onto cache}
\item Excessive data reuse - \alert{cache is useful}
\end{itemize}
\begin{figure}
\input{images/cnn}
\end{figure}
\end{frame}
\begin{frame}{Workloads for PIM}
\begin{columns}[T]
\begin{column}{0.5\textwidth}
\begin{center} \includesvg[height=50px]{images/thumbs-up} \end{center}
\begin{itemize}
\item Fully connected layers in multilayer perceptrons (MLPs)
\item Layers in recurrent neural networks (RNNs)
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{center} \includesvg[height=50px]{images/thumbs-unsure} \end{center}
\begin{itemize}
\item Convolutional neural network (CNNs)
\end{itemize}
\end{column}
\end{columns}
\end{frame}
\note{
To summarize...
}
\begin{frame}{PIM Architectures}
\begin{columns}[T]
\begin{column}{0.4\textwidth}
\begin{itemize}
\item<2-> Inside the memory subarray
\item<3-> Near the subarray in the PSA output region
\item<4-> Near the bank in its peripheral region
\item<5-> In the I/O region of the memory
\end{itemize}
\end{column}
\begin{column}{0.6\textwidth}
\only<1>{\includesvg[height=115px]{images/pim_positions_0}}
\only<2>{\includesvg[height=115px]{images/pim_positions_1}}
\only<3>{\includesvg[height=115px]{images/pim_positions_2}}
\only<4>{\includesvg[height=115px]{images/pim_positions_3}}
\only<5->{\includesvg[height=115px]{images/pim_positions_4}}
\end{column}
\end{columns}
\visible<6>{\begin{block}{Remark}The nearer the computation is to the memory cells, the higher the achievable bandwidth!\end{block}}
\end{frame}
\note[itemize]{
\item Architecture space of PIM:
\item Inside the memory SA - simple bulk logic
\item Near SA in PSA output region - logic gates in the region
\item Near a bank in its peripheral region - computation units with control
\item I/O region of memory - limited by memory bus
}
\section{Samsung HBM-PIM/FIMDRAM}
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
\begin{itemize}
\item Real-world PIM implementation based on HBM2
\item PIM units embedded at the bank level
\end{itemize}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/hbm-pim}
\end{figure}
\end{frame}
\note[itemize]{
\item One PIM unit shared by two banks
\item SIMD FPUs are 16-wide
\item All-Bank mode: All PIM units operate in parallel
}
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
\begin{columns}
\begin{column}{0.4\textwidth}
Processing units:
\begin{itemize}
\item Two 16-wide 16-bit FPUs
\item Register files and control unit
\end{itemize}
Instructions:
\begin{itemize}
\item Control: NOP, JUMP, EXIT
\item Data: MOV (ReLU), FILL
\item Arithmetic: ADD, MUL, MAC, MAD
\end{itemize}
\end{column}
\begin{column}{0.6\textwidth}
\begin{figure}
\includesvg[width=\textwidth]{images/pu}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\note[itemize]{
\item Two SIMD FPUs (ADD,MUL)
\item CRF: 32 instructions, stores the program
\item GRF: 16 entries, one memory fetch
\item SRF: 16 entries
\item Control units executes one instruction when RD or WR command is issued
}
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation}
\begin{figure}
\only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}}
\only<2>{\includesvg[width=0.9\textwidth]{images/gemv_interleaved}}
\end{figure}
\end{frame}
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation\autocite{kang2022}}
\begin{figure}
\only<1>{\includesvg[width=0.55\textwidth]{images/gemv.svg}}
\only<2>{\includesvg[width=0.55\textwidth]{images/gemv_0.svg}}
\only<3>{\includesvg[width=0.55\textwidth]{images/gemv_1.svg}}
\only<4>{\includesvg[width=0.55\textwidth]{images/gemv_2.svg}}
\only<5>{\includesvg[width=0.55\textwidth]{images/gemv_3.svg}}
\only<6>{\includesvg[width=0.55\textwidth]{images/gemv_4.svg}}
\end{figure}
\end{frame}
\note[itemize]{
\item Procedure of GEMV operation
\item multiple cycles
\item each PIM unit operates on one matrix row
\item partial sum, reduced by host
}
\begin{frame}{HBM-PIM/FIMDRAM}
\begin{huge}
How fast is it?
\end{huge}\\
Research should ...
\begin{itemize}
\item ... conduct simulations to explore \alert{performance gains}
\item ... consider also the programmability to identify \alert{challenges}
\end{itemize}
\end{frame}
\section{Virtual Prototype}
\begin{frame}{Virtual Prototype}
\begin{itemize}
\item Coupling of gem5 and DRAMSys
\item Implementation of HBM-PIM in DRAM model
\end{itemize}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/dramsys.svg}
\end{figure}
\end{frame}
\note[itemize]{
\item VP interprets the programmed microkernel
\item not yet drop-in replacement
}
\begin{frame}{Software Library}
Software support library
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item Initialization
\begin{itemize}
\item Assembly and loading of microkernel
\end{itemize}
\item Execution
\begin{itemize}
\item Generation of RD and WR requests
\item Insertion of memory barriers
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/data_structures.svg}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\begin{frame}[fragile]{Example: GEMV Kernel}
\begin{center}
\only<1>{
\inputminted[firstline=1,lastline=8, fontsize=\footnotesize]{gas}{kernel.s}
\hrule
\inputminted[firstline=7,lastline=10,fontsize=\footnotesize]{rust}{kernel.rs}
}
\only<2>{
\inputminted[firstline=9,lastline=10, fontsize=\footnotesize]{gas}{kernel.s}
\hrule
\inputminted[firstline=12,lastline=20,fontsize=\footnotesize]{rust}{kernel.rs}
}
\only<3>{
\inputminted[firstline=11,lastline=12,fontsize=\footnotesize]{gas}{kernel.s}
\hrule
\inputminted[firstline=22,lastline=30,fontsize=\footnotesize]{rust}{kernel.rs}
}
\end{center}
\end{frame}
\begin{frame}{Virtual Prototype Platform}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item ARM processor model
\item Bare-metal kernel
\item Custom page table configuration
\begin{itemize}
\item Non-PIM DRAM region mapped as cacheable memory
\item PIM DRAM region mapped as non-cacheable memory
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/bare_metal.svg}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\note[itemize]{
\item bare metal offers most control
}
\section{Simulations}
\begin{frame}{Microbenchmarks}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item Vector benchmarks (BLAS level 1)
\item VADD: $z = x + y$
\item VMUL: $z = x \cdot y$
\item HAXPY: $z = a \cdot x + y$
\item Vector-Matrix benchmarks (BLAS level 2)
\begin{itemize}
\item GEMV: $z = A \cdot x$
\item Simple DNN:
\begin{itemize}
\item $f(x) = z = \mathrm{ReLU}(A \cdot x)$
\item $z_{n+1} = f(z_n)$
\item 5 layers in total
\end{itemize}
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
\begin{tblr}{
hlines,
column{1} = {c},
column{2} = {r},
column{3} = {r},
column{4} = {r},
row{1} = {l},
hline{2} = {2}{-}{solid,black},
}
Level & Vector & GEMV & DNN \\
X1 & 2M & (1k $\times$ 4k) & (256 $\times$ 256) \\
X2 & 4M & (2k $\times$ 4k) & (512 $\times$ 512) \\
X3 & 8M & (4k $\times$ 8k) & (1k $\times$ 1k) \\
X4 & 16M & (8k $\times$ 8k) & (2k $\times$ 2k)
\end{tblr}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\note[itemize]{
\item operand data significantly larger than on-chip cache
}
\begin{frame}{System Configuration}
\begin{columns}[t]
\begin{column}{0.5\textwidth}
Two simulated systems:
\begin{itemize}
\item Generic ARM system
\item Infinite compute system
\begin{itemize}
\item Unrealistically high frequency of 100 GHz
\item Completely memory bound
\item Lower bound of possible speedup
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
Two real GPUs using HBM2:
\begin{itemize}
\item AMD RX Vega 56
\item NVIDIA Tesla V100
\end{itemize}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Speedups / Generic ARM System}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/speedup_normal.svg}
\end{figure}
\end{frame}
\begin{frame}{Speedups / Infinite Compute System}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/speedup_inf.svg}
\end{figure}
\end{frame}
\note[itemize]{
\item VADD: 12.7x
\item GEMV: 9.0x
}
\begin{frame}{Speedups / Samsung}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/samsung.svg}
\end{figure}
\end{frame}
\note[itemize]{
\item GEMV matches well
\item ADD shows deviation
\item -> differences in hardware architecture
\item GPU has no speculative execution
}
\begin{frame}{Runtimes / Vector Benchmarks}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
\end{figure}
\end{frame}
\note[itemize]{
\item Real GPUs use multiple memory channels
\item Memory barriers
\item Also architectural differences
}
\begin{frame}{Runtimes / Matrix Benchmarks}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
\end{figure}
\end{frame}
\begin{frame}{Conclusion and Future Work}
\begin{itemize}
\item PIM can accelerate memory-bound workloads
\item Special PIM-friendly memory layouts are required
\end{itemize}
Future work:
\begin{itemize}
\item Implementation of Linux driver
\item Comparison with real neural networks
\item Consider replacing library approach with compiler approach
\item Implement a power model to analyze the power efficiency gains
\end{itemize}
\end{frame}
\section{Thank you for your attention!}
\begin{frame}
\frametitle{Outline}
\tableofcontents
\end{frame}
\begin{frame}{Memory Layout}
\begin{figure}
\includesvg[width=\textwidth]{images/complete_layout.svg}
\end{figure}
\end{frame}
\note[itemize]{
\item Data layout in program and address mapping must match
}
\end{document}