% --- Web-viewer metadata accidentally captured with the source; kept as
% --- comments so the file still compiles. ---
% Files
% memsys24-slides/main.tex
% 2024-09-17 14:20:12 +02:00
%
% 391 lines
% 12 KiB
% TeX
% Raw Blame History
%
% This file contains ambiguous Unicode characters
% This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
\documentclass[aspectratio=169]{beamer}
% University of Würzburg beamer theme (project-local).
\usetheme{UniWue}
% Footnote citations; verbose-ibid shortens repeated cites on the same page.
\usepackage[style=verbose-ibid]{biblatex}
\usepackage{datetime}
% Provides the tblr environment used for the benchmark-size table.
\usepackage{tabularray}
% Inline SVG figures via Inkscape; inkscapeversion=1 selects the 1.x CLI syntax.
\usepackage[inkscapeversion=1]{svg}
\addbibresource{references.bib}
\setbeamerfont{footnote}{size=\tiny}
% NOTE(review): presentationday is defined but never referenced below —
% confirm whether the theme or \date should use \displaydate{presentationday}.
\newdate{presentationday}{01}{10}{2024}
\title{PIMSys}
\subtitle{A Virtual Prototype for Processing in Memory}
\author{
Derek~Christ\inst{1}
\and
Lukas~Steiner\inst{2}
\and
Matthias~Jung\inst{3}
\and
Norbert~Wehn\inst{2}
}
\institute{
\inst{1}
Fraunhofer IESE
\quad
\inst{2}
RPTU Kaiserslautern-Landau
\quad
\inst{3}
University of Würzburg
}
\date{MEMSYS~2024}
\begin{document}
\frame{\titlepage}
\section{Introduction}
\begin{frame}{Energy Demand of Applications}
% Motivation slide: compute energy trend vs. global energy production.
Total compute energy approaches the world's energy production\autocite{src2021}
\begin{figure}
\includesvg[width=0.6\textwidth]{images/world_energy}
\end{figure}
\end{frame}
\begin{frame}{Memory Bound Workloads}
% GPT-style models illustrate the growing memory-bandwidth bottleneck.
AI applications become increasingly memory-bound\autocite{ivobolsens2023}
\begin{figure}
\includesvg[width=0.5\textwidth]{images/gpt}
\end{figure}
\end{frame}
\section{Processing-in-Memory}
\begin{frame}{Workloads for PIM}
% Why fully connected layers are PIM-friendly: large, non-reusable weights.
Fully connected neural network layers:
\begin{itemize}
% Em dashes (---) instead of bare hyphens for the parenthetical asides.
\item Large weight matrix---does not fit into the cache
\item No data reuse---cache is useless
\end{itemize}
\begin{figure}
\includesvg[width=0.6\textwidth]{images/dnn}
\end{figure}
\end{frame}
\begin{frame}{Workloads for PIM}
% Counter-example: convolutional layers benefit from caches instead.
Convolutional layers:
\begin{itemize}
\item Small filter matrix---does fit into the cache
\item Excessive data reuse---cache is useful
\end{itemize}
\begin{figure}
% TODO: replace this placeholder text with a TikZ illustration of a
% convolutional layer (the commented line below reuses the DNN figure).
TODO Tikz Image
% \includesvg[width=0.6\textwidth]{images/dnn}
\end{figure}
\end{frame}
\begin{frame}{Workloads for PIM}
% Summary: good vs. questionable workload classes for PIM, side by side.
\begin{columns}[T]
\begin{column}{0.5\textwidth}
\begin{center} \includesvg[height=50px]{images/thumbs-up} \end{center}
\begin{itemize}
\item Fully connected layers in multilayer perceptrons (MLPs)
\item Layers in recurrent neural networks (RNNs)
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{center} \includesvg[height=50px]{images/thumbs-unsure} \end{center}
\begin{itemize}
% Plural fixed to match the abbreviation "(CNNs)".
\item Convolutional neural networks (CNNs)
\end{itemize}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{PIM Architectures}
% Incremental reveal: overlays 2-5 each add one candidate PIM placement,
% each paired with the matching images/pim_positions_N figure on the right.
\begin{columns}[T]
\begin{column}{0.4\textwidth}
\begin{itemize}
\item<2-> Inside the memory subarray
\item<3-> Near the subarray in the PSA output region
\item<4-> Near the bank in its peripheral region
\item<5-> In the I/O region of the memory
\end{itemize}
\end{column}
\begin{column}{0.6\textwidth}
% Exactly one figure per overlay step; <5-> keeps the final figure visible
% while the closing remark appears on step 6.
\only<1>{\includesvg[height=115px]{images/pim_positions_0}}
\only<2>{\includesvg[height=115px]{images/pim_positions_1}}
\only<3>{\includesvg[height=115px]{images/pim_positions_2}}
\only<4>{\includesvg[height=115px]{images/pim_positions_3}}
\only<5->{\includesvg[height=115px]{images/pim_positions_4}}
\end{column}
\end{columns}
\visible<6>{\begin{block}{Remark}The nearer the computation is to the memory cells, the higher the achievable bandwidth!\end{block}}
\end{frame}
\section{Samsung HBM-PIM/FIMDRAM}
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
% Case study: Samsung's commercial bank-level PIM device (HBM2-based).
\begin{itemize}
\item Real-world PIM implementation based on HBM2
\item PIM units embedded at the bank level
\end{itemize}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/hbm-pim}
\end{figure}
\end{frame}
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
% Microarchitecture of one PIM processing unit and its tiny ISA.
\begin{columns}
\begin{column}{0.4\textwidth}
Processing units:
\begin{itemize}
\item Two 16-wide 16-bit FPUs
\item Register files and control unit
\end{itemize}
Instructions:
\begin{itemize}
\item Control: NOP, JUMP, EXIT
\item Data: MOV (ReLU), FILL
\item Arithmetic: ADD, MUL, MAC, MAD
\end{itemize}
\end{column}
\begin{column}{0.6\textwidth}
\begin{figure}
\includesvg[width=\textwidth]{images/pu}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation}
% Two overlays: conventional matrix layout vs. PIM-friendly interleaved layout.
\begin{figure}
\only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}}
\only<2>{\includesvg[width=0.9\textwidth]{images/gemv_interleaved}}
\end{figure}
\end{frame}
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation\autocite{kang2022}}
% Six-step animation of the GEMV execution on the PIM units.
% File extensions omitted: the svg package resolves them itself, matching
% the other \includesvg calls in this frame group.
\begin{figure}
\only<1>{\includesvg[width=0.55\textwidth]{images/gemv}}
\only<2>{\includesvg[width=0.55\textwidth]{images/gemv_0}}
\only<3>{\includesvg[width=0.55\textwidth]{images/gemv_1}}
\only<4>{\includesvg[width=0.55\textwidth]{images/gemv_2}}
\only<5>{\includesvg[width=0.55\textwidth]{images/gemv_3}}
\only<6>{\includesvg[width=0.55\textwidth]{images/gemv_4}}
\end{figure}
\end{frame}
\begin{frame}{HBM-PIM/FIMDRAM}
% Transition slide: motivates the simulation-based evaluation that follows.
\begin{huge}
How fast is it?
\end{huge}\\
Research should ...
\begin{itemize}
\item ... conduct simulations to explore \alert{performance gains}
\item ... consider also the programmability to identify \alert{challenges}
\end{itemize}
\end{frame}
\section{Virtual Prototype}
\begin{frame}{Virtual Prototype}
% PIMSys = gem5 (CPU) coupled with DRAMSys (memory) plus an HBM-PIM model.
\begin{itemize}
\item Coupling of gem5 and DRAMSys
\item Implementation of HBM-PIM in DRAM model
\end{itemize}
\begin{figure}
% Extension omitted; the svg package resolves it (consistent with the rest
% of the file).
\includesvg[width=0.8\textwidth]{images/dramsys}
\end{figure}
\end{frame}
\begin{frame}{Software Library}
% Host-side library: assembles/loads PIM microkernels and drives execution.
Software support library
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item Initialization
\begin{itemize}
\item Assembly and loading of microkernel
\end{itemize}
\item Execution
\begin{itemize}
\item Generation of RD and WR requests
% Typo fixed: "Insetion" -> "Insertion".
\item Insertion of memory barriers
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/data_structures}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\begin{frame}[fragile]{Example: GEMV Kernel}
% Three overlays, each pairing a slice of the PIM assembly (kernel.s, top)
% with the Rust library code that emits it (kernel.rs, bottom).
% NOTE(review): the firstline/lastline ranges are tied to the exact contents
% of kernel.s / kernel.rs — re-check them whenever those files change.
\begin{center}
\only<1>{
\inputminted[firstline=1,lastline=8, fontsize=\footnotesize]{gas}{kernel.s}
\hrule
\inputminted[firstline=7,lastline=10,fontsize=\footnotesize]{rust}{kernel.rs}
}
\only<2>{
\inputminted[firstline=9,lastline=10, fontsize=\footnotesize]{gas}{kernel.s}
\hrule
\inputminted[firstline=12,lastline=20,fontsize=\footnotesize]{rust}{kernel.rs}
}
\only<3>{
\inputminted[firstline=11,lastline=12,fontsize=\footnotesize]{gas}{kernel.s}
\hrule
\inputminted[firstline=22,lastline=30,fontsize=\footnotesize]{rust}{kernel.rs}
}
\end{center}
\end{frame}
\begin{frame}{Virtual Prototype Platform}
% Simulated platform: bare-metal ARM with a split cacheable/non-cacheable
% mapping so PIM accesses bypass the cache.
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item ARM processor model
\item Bare-metal kernel
\item Custom page table configuration
\begin{itemize}
\item Non-PIM DRAM region mapped as cacheable memory
\item PIM DRAM region mapped as non-cacheable memory
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
% Extension omitted; the svg package resolves it.
\includesvg[width=0.8\textwidth]{images/bare_metal}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\section{Simulations}
\begin{frame}{Microbenchmarks}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item Vector benchmarks (BLAS level 1)
% Nested so the list structure matches the BLAS level 2 entry below.
\begin{itemize}
\item VADD: $z = x + y$
\item VMUL: $z = x \cdot y$
\item HAXPY: $z = a \cdot x + y$
\end{itemize}
\item Vector-Matrix benchmarks (BLAS level 2)
\begin{itemize}
\item GEMV: $z = A \cdot x$
\item Simple DNN:
\begin{itemize}
% \mathrm{} keeps the multi-letter operator name upright in math mode.
\item $f(x) = z = \mathrm{ReLU}(A \cdot x)$
\item $z_{n+1} = f(z_n)$
\item 5 layers in total
\end{itemize}
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
% Problem sizes per scaling level X1-X4 for each benchmark class.
\begin{figure}
\begin{tblr}{
hlines,
column{1} = {c},
column{2} = {r},
column{3} = {r},
column{4} = {r},
row{1} = {l},
hline{2} = {2}{-}{solid,black},
}
Level & Vector & GEMV & DNN \\
X1 & 2M & (1k $\times$ 4k) & (256 $\times$ 256) \\
X2 & 4M & (2k $\times$ 4k) & (512 $\times$ 512) \\
X3 & 8M & (4k $\times$ 8k) & (1k $\times$ 1k) \\
X4 & 16M & (8k $\times$ 8k) & (2k $\times$ 2k)
\end{tblr}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{System Configuration}
% Evaluation setups: two simulated CPU systems plus two real GPUs as
% reference points.
\begin{columns}[t]
\begin{column}{0.5\textwidth}
Two simulated systems:
\begin{itemize}
\item Generic ARM system
\item Infinite compute system
\begin{itemize}
% Grammar fixed: adverb modifies "high".
\item Unrealistically high frequency of 100 GHz
\item Completely memory bound
\item Lower bound of possible speedup
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
Two real GPUs using HBM2:
\begin{itemize}
\item AMD RX Vega 56
\item NVIDIA Tesla V100
\end{itemize}
\end{column}
\end{columns}
\end{frame}
% Results frames: one full-width plot each. File extensions omitted; the
% svg package resolves them (consistent with the earlier figures).
\begin{frame}{Speedups / Generic ARM System}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/speedup_normal}
\end{figure}
\end{frame}
\begin{frame}{Speedups / Infinite Compute System}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/speedup_inf}
\end{figure}
\end{frame}
\begin{frame}{Speedups / Samsung}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/samsung}
\end{figure}
\end{frame}
\begin{frame}{Runtimes / Vector Benchmarks}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/runtimes_vector}
\end{figure}
\end{frame}
\begin{frame}{Runtimes / Matrix Benchmarks}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/runtimes_matrix}
\end{figure}
\end{frame}
\begin{frame}{Conclusion and Future Work}
% Key takeaways and the planned follow-up work items.
\begin{itemize}
\item PIM can accelerate memory-bound workloads
\item Special PIM-friendly memory layouts are required
\end{itemize}
Future work:
\begin{itemize}
\item Implementation of Linux driver
\item Comparison with real neural networks
\item Consider replacing library approach with compiler approach
\item Implement a power model to analyze the power efficiency gains
\end{itemize}
\end{frame}
% Closing section: the title shows up in the navigation/outline as the
% "thank you" entry.
\section{Thank you for your attention!}
\begin{frame}
% NOTE(review): an "Outline" frame at the end of the talk (after the
% thank-you section) is unusual — confirm this is intentional and not a
% leftover from the template's opening outline.
\frametitle{Outline}
\tableofcontents
\end{frame}
\end{document}