\documentclass[aspectratio=169]{beamer}
% Uncomment to render speaker notes on a second screen:
% \setbeameroption{show notes on second screen=right}
\usetheme{UniWue}
\usepackage{appendixnumberbeamer}
\usepackage[style=verbose-ibid]{biblatex}
\usepackage{datetime}
\usepackage{tabularray}
\usepackage{tikz}
\usepackage[inkscapeversion=1]{svg}
\usepackage{minted} % required for \inputminted (GEMV kernel frame); compile with -shell-escape
\usetikzlibrary{fit}
\usetikzlibrary{positioning}
\usetikzlibrary{matrix}
\addbibresource{references.bib}

\setbeamerfont{footnote}{size=\tiny}
\setbeamercolor{alerted text}{fg=uniwuered}
\newdate{presentationday}{01}{10}{2024}

\title{PIMSys}
\subtitle{A Virtual Prototype for Processing in Memory}
\author{
  Derek~Christ\inst{1,3} \and
  Lukas~Steiner\inst{2} \and
  Matthias~Jung\inst{3} \and
  Norbert~Wehn\inst{2}
}
\institute{
  \inst{1} Fraunhofer IESE \quad
  \inst{2} RPTU Kaiserslautern-Landau \quad
  \inst{3} University of Würzburg
}
\date{MEMSYS~2024}

\begin{document}

\frame{\titlepage}

% \section{Introduction}
% \section{Processing-in-Memory}

\begin{frame}{Workloads for PIM}
  Fully connected neural network layers:
  \begin{itemize}
    \item Large weight matrix -- \alert{does not fit onto cache}
    \item No data reuse -- \alert{cache is almost useless}
  \end{itemize}
  \begin{figure}
    \includesvg[width=0.6\textwidth]{images/dnn}
  \end{figure}
\end{frame}
\note[itemize]{
  \item Let's start by having a look at which workloads can be accelerated by PIM...
  \item memory-bound: each entry only used once
}

\begin{frame}{Workloads for PIM}
  Convolutional layers:
  \begin{itemize}
    \item Small filter matrix -- \alert{does fit onto cache}
    \item Excessive data reuse -- \alert{cache is useful}
  \end{itemize}
  % NOTE(fix): the figure environment was doubly nested here; one level suffices.
  \begin{figure}
    \input{images/cnn}
  \end{figure}
\end{frame}
\note[itemize]{
  \item compute-bound: elements of filter matrix used often
}

\begin{frame}{Workloads for PIM}
  \begin{columns}[T]
    \begin{column}{0.5\textwidth}
      \begin{center}
        \includesvg[height=50px]{images/thumbs-up}
      \end{center}
      \begin{center}\alert{(memory-bound)}\end{center}
      \begin{itemize}
        \item Fully connected layers in multilayer perceptrons (MLPs)
        \item Layers in recurrent neural networks (RNNs)
      \end{itemize}
    \end{column}
    \begin{column}{0.5\textwidth}
      \begin{center}
        \includesvg[height=50px]{images/thumbs-unsure}
      \end{center}
      \begin{center}\alert{(compute-bound)}\end{center}
      \begin{itemize}
        \item Convolutional neural networks (CNNs)
      \end{itemize}
    \end{column}
  \end{columns}
\end{frame}
\note{
  To summarize...
}

\begin{frame}{PIM Architectures}
  \begin{columns}[T]
    \begin{column}{0.4\textwidth}
      \begin{itemize}
        \item<2-> Inside the memory subarray
        \item<3-> Near the subarray in the PSA region
        \item<4-> Near the bank in its peripheral region
        \item<5-> In the I/O region of the memory
      \end{itemize}
    \end{column}
    \begin{column}{0.6\textwidth}
      \only<1>{\includesvg[height=115px]{images/pim_positions_0}}
      \only<2>{\includesvg[height=115px]{images/pim_positions_1}}
      \only<3>{\includesvg[height=115px]{images/pim_positions_2}}
      \only<4>{\includesvg[height=115px]{images/pim_positions_3}}
      \only<5->{\includesvg[height=115px]{images/pim_positions_4}}
    \end{column}
  \end{columns}
  \visible<6>{%
    \begin{block}{Remark}
      The nearer the computation is to the memory cells, the higher the achievable bandwidth!
    \end{block}%
  }
\end{frame}
\note[itemize]{
  \item Architecture space of PIM:
  \item Inside the memory SA - simple bulk logic
  \item Near SA in PSA region - logic gates in the region
  \item Near a bank in its peripheral region - computation units with control
  \item I/O region of memory - limited by memory bus
}

% \section{Samsung HBM-PIM/FIMDRAM}

\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
  \begin{itemize}
    \item Real-world PIM implementation based on HBM2
    \item PIM units embedded at the bank level -- in parallel
  \end{itemize}
  \begin{figure}
    \includesvg[width=0.8\textwidth]{images/hbm-pim}
  \end{figure}
\end{frame}
\note[itemize]{
  \item One PIM unit shared by two banks
  \item SIMD FPUs are 16-wide
  \item All-Bank mode: All PIM units operate in parallel
}

\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
  \begin{columns}
    \begin{column}{0.4\textwidth}
      Processing units:
      \begin{itemize}
        \item Two 16-wide 16-bit FPUs
        \item Register files and control unit
      \end{itemize}
      Instructions:
      \begin{itemize}
        \item Control: NOP, JUMP, EXIT
        \item Data: MOV (ReLU), FILL
        \item Arithmetic: ADD, MUL, MAC, MAD
      \end{itemize}
    \end{column}
    \begin{column}{0.6\textwidth}
      \begin{figure}
        \includesvg[width=\textwidth]{images/pu}
      \end{figure}
    \end{column}
  \end{columns}
\end{frame}
\note[itemize]{
  \item Two SIMD FPUs (ADD,MUL)
  \item CRF: 32 instructions, stores the program
  \item GRF: 16 entries, one memory fetch
  \item SRF: 16 entries
  \item Control unit executes one instruction when RD or WR command is issued
}

\begin{frame}{HBM-PIM/FIMDRAM}
  \begin{huge}
    How fast is it?
  \end{huge}\\
  Research should ...
  \begin{itemize}
    \item ... conduct simulations to explore \alert{performance gains}
    \item ... consider also the programmability to identify \alert{challenges}
  \end{itemize}
\end{frame}

% \section{Virtual Prototype}

\begin{frame}{Virtual Prototype}
  \begin{itemize}
    \item Coupling of gem5 and DRAMSys
    \item Implementation of HBM-PIM in DRAM model
  \end{itemize}
  \begin{figure}
    \includesvg[width=0.8\textwidth]{images/dramsys.svg}
  \end{figure}
\end{frame}
\note[itemize]{
  \item VP interprets the programmed microkernel
  \item not yet drop-in replacement
}

\begin{frame}{Software Library}
  \begin{columns}
    \begin{column}{0.5\textwidth}
      \includesvg[width=0.2\textwidth]{images/rust}\\
      \textbf{Rust-based support library:}
      \begin{itemize}
        \item Initialization
          \begin{itemize}
            \item Assembly and loading of microkernel
            \item Arrange input data in special memory layout
          \end{itemize}
        \item Execution
          \begin{itemize}
            \item Generation of RD and WR requests
          \end{itemize}
      \end{itemize}
    \end{column}
    \begin{column}{0.5\textwidth}
      \begin{figure}
        \includesvg[width=0.8\textwidth]{images/data_structures.svg}
      \end{figure}
    \end{column}
  \end{columns}
\end{frame}

% minted content => frame must be [fragile]
\begin{frame}[fragile]{Example: GEMV Kernel}
  \begin{center}
    \only<1>{%
      \inputminted[firstline=1,lastline=8,fontsize=\footnotesize]{gas}{kernel.s}
      \hrule
      \inputminted[firstline=7,lastline=10,fontsize=\footnotesize]{rust}{kernel.rs}
    }
    \only<2>{%
      \inputminted[firstline=9,lastline=10,fontsize=\footnotesize]{gas}{kernel.s}
      \hrule
      \inputminted[firstline=12,lastline=20,fontsize=\footnotesize]{rust}{kernel.rs}
    }
    \only<3>{%
      \inputminted[firstline=11,lastline=12,fontsize=\footnotesize]{gas}{kernel.s}
      \hrule
      \inputminted[firstline=22,lastline=30,fontsize=\footnotesize]{rust}{kernel.rs}
    }
  \end{center}
\end{frame}

\begin{frame}{Virtual Prototype Platform}
  \begin{columns}
    \begin{column}{0.4\textwidth}
      \begin{itemize}
        \item ARM processor model
        \item Bare-metal kernel
        \item Custom page table configuration
          \begin{itemize}
            \item Non-PIM DRAM region mapped as cacheable memory
            \item PIM DRAM region mapped as non-cacheable memory
          \end{itemize}
      \end{itemize}
    \end{column}
    \begin{column}{0.6\textwidth}
      \begin{figure}
        \includesvg[width=0.8\textwidth]{images/bare_metal.svg}
      \end{figure}
    \end{column}
  \end{columns}
\end{frame}
\note[itemize]{
  \item bare metal offers most control
}

% \section{Simulations}

\begin{frame}{Microbenchmarks}
  \begin{columns}
    \begin{column}{0.5\textwidth}
      \begin{itemize}
        \item Vector benchmarks (BLAS level 1)
          \begin{itemize}
            \item VADD: $z = x + y$
            \item VMUL: $z = x \cdot y$
            \item HAXPY: $z = a \cdot x + y$
          \end{itemize}
        \item Vector-Matrix benchmarks (BLAS level 2)
          \begin{itemize}
            \item GEMV: $z = A \cdot x$
            \item Simple DNN:
              \begin{itemize}
                % ReLU is a named operator, not a product of variables -> upright
                \item $f(x) = z = \mathrm{ReLU}(A \cdot x)$
                \item $z_{n+1} = f(z_n)$
                \item 5 layers in total
              \end{itemize}
          \end{itemize}
      \end{itemize}
    \end{column}
    \begin{column}{0.5\textwidth}
      \begin{figure}
        \begin{tblr}{
          hlines,
          column{1} = {c},
          column{2} = {r},
          column{3} = {r},
          column{4} = {r},
          row{1} = {l},
          hline{2} = {2}{-}{solid,black},
        }
          Level & Vector & GEMV & DNN \\
          X1 & 2M & (1k $\times$ 4k) & (256 $\times$ 256) \\
          X2 & 4M & (2k $\times$ 4k) & (512 $\times$ 512) \\
          X3 & 8M & (4k $\times$ 8k) & (1k $\times$ 1k) \\
          X4 & 16M & (8k $\times$ 8k) & (2k $\times$ 2k)
        \end{tblr}
      \end{figure}
    \end{column}
  \end{columns}
\end{frame}
\note[itemize]{
  \item operand data significantly larger than on-chip cache
}

\begin{frame}{System Configuration}
  % \begin{columns}[t]
  % \begin{column}{0.5\textwidth}
  Two simulated systems:
  \begin{itemize}
    \item Generic ARM system
    \item Infinite compute system
      \begin{itemize}
        \item Infinite CPU clock frequency
        \item Completely memory bound
        \item Lower bound of possible PIM speedup
      \end{itemize}
  \end{itemize}
  % \end{column}
  % \begin{column}{0.5\textwidth}
  % Two real GPUs using HBM2:
  % \begin{itemize}
  % \item AMD RX Vega 56
  % \item NVIDIA Tesla V100
  % \end{itemize}
  % \end{column}
  % \end{columns}
\end{frame}

\begin{frame}{Speedups / Generic ARM System}
  \begin{figure}
    \includesvg[width=0.8\textwidth]{images/speedup_normal.svg}
  \end{figure}
\end{frame}

\begin{frame}{Speedups / Infinite Compute System}
  \begin{figure}
    \includesvg[width=0.8\textwidth]{images/speedup_inf.svg}
  \end{figure}
\end{frame}
\note[itemize]{
  \item VADD: 12.7x
  \item GEMV: 9.0x
}

\begin{frame}{Speedups / Samsung\autocite{lee2021}}
  \begin{figure}
    \includesvg[width=0.8\textwidth]{images/samsung.svg}
  \end{figure}
\end{frame}
\note[itemize]{
  \item GEMV matches well
  \item ADD shows deviation
  \item -> differences in hardware architecture
  \item GPU has no speculative execution
}

\begin{frame}{Conclusion and Future Work}
  \textbf{Conclusion}
  \begin{itemize}
    \item PIM can accelerate memory-bound workloads
    \item Special PIM-friendly memory layouts are required
  \end{itemize}
  \textbf{Future work}
  \begin{itemize}
    \item Implementation of Linux driver
    \item Comparison with real neural networks
    \item Consider replacing library approach with compiler approach
    \item Implement a power model to analyze the power efficiency gains
  \end{itemize}
\end{frame}

\section{Thank you for your attention!}

% \begin{frame}
% \frametitle{Outline}
% \tableofcontents
% \end{frame}

\appendix

\begin{frame}{Energy Demand of Applications}
  Total compute energy approaches world’s energy production\autocite{src2021}
  \begin{figure}
    \includesvg[width=0.6\textwidth]{images/world_energy}
  \end{figure}
\end{frame}
\note[itemize]{
  \item compute 2x every two years
  \item energy production 2\% per year
  \item to meet future compute demands, drastic improvements in energy efficiency
}
\begin{frame}{Memory Bound Workloads}
  AI applications become increasingly memory-bound\autocite{ivobolsens2023}
  \begin{figure}
    \includesvg[width=0.5\textwidth]{images/gpt}
  \end{figure}
\end{frame}
\note[itemize]{
  \item Emerging AI applications become increasingly memory-bound
  \item Roofline model
  \item Not limited by compute power but by memory
  \item researchers begin to consider PIM to circumvent memory bottleneck
  \item (drastically more parameters in GPT-3, operational intensity goes down)
}

\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation}
  \begin{figure}
    \only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}}
    \only<2>{\includesvg[width=0.9\textwidth]{images/gemv_interleaved}}
  \end{figure}
\end{frame}

\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation\autocite{kang2022}}
  \begin{figure}
    \only<1>{\includesvg[width=0.55\textwidth]{images/gemv.svg}}
    \only<2>{\includesvg[width=0.55\textwidth]{images/gemv_0.svg}}
    \only<3>{\includesvg[width=0.55\textwidth]{images/gemv_1.svg}}
    \only<4>{\includesvg[width=0.55\textwidth]{images/gemv_2.svg}}
    \only<5>{\includesvg[width=0.55\textwidth]{images/gemv_3.svg}}
    \only<6>{\includesvg[width=0.55\textwidth]{images/gemv_4.svg}}
  \end{figure}
\end{frame}
\note[itemize]{
  \item Procedure of GEMV operation
  \item multiple cycles
  \item each PIM unit operates on one matrix row
  \item partial sum, reduced by host
}

\begin{frame}{Memory Layout}
  \begin{figure}
    \includesvg[width=\textwidth]{images/complete_layout.svg}
  \end{figure}
\end{frame}
\note[itemize]{
  \item Data layout in program and address mapping must match
}

\begin{frame}{Runtimes / Vector Benchmarks}
  \begin{figure}
    \includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
  \end{figure}
\end{frame}
\note[itemize]{
  \item Real GPUs use multiple memory channels
  \item Memory barriers
  \item Also architectural differences
}

\begin{frame}{Runtimes / Matrix Benchmarks}
  \begin{figure}
    \includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
  \end{figure}
\end{frame}

\end{document}