% Beamer presentation: PIMSys -- A Virtual Prototype for Processing in Memory.
% Preamble: theme, bibliography (verbose footnote citations), SVG support,
% and title-page metadata.
\documentclass[aspectratio=169]{beamer}

\usetheme{UniWue}

\usepackage[style=verbose-ibid]{biblatex}
\usepackage{datetime}
\usepackage[inkscapeversion=1]{svg}

\addbibresource{references.bib}

% verbose-ibid places full citations in footnotes; keep them unobtrusive.
\setbeamerfont{footnote}{size=\tiny}

\newdate{presentationday}{01}{10}{2024}

\title{PIMSys}
\subtitle{A Virtual Prototype for Processing in Memory}

\author{
  Derek~Christ\inst{1}
  \and
  Lukas~Steiner\inst{2}
  \and
  Matthias~Jung\inst{3}
  \and
  Norbert~Wehn\inst{2}
}

\institute{
  \inst{1}
  Fraunhofer IESE
  \quad
  \inst{2}
  RPTU Kaiserslautern-Landau
  \quad
  \inst{3}
  University of Würzburg
}

\date{MEMSYS~2024}
\begin{document}

% Title slide.
\frame{\titlepage}

\section{Introduction}
% Motivation: overall compute energy consumption trend.
\begin{frame}{Energy Demand of Applications}
  Total compute energy approaches world’s energy production\autocite{src2021}
  \begin{figure}
    \includesvg[width=0.6\textwidth]{images/world_energy}
  \end{figure}
\end{frame}
% Motivation: AI workloads are increasingly limited by memory bandwidth.
\begin{frame}{Memory Bound Workloads}
  AI applications become increasingly memory-bound\autocite{ivobolsens2023}
  \begin{figure}
    \includesvg[width=0.5\textwidth]{images/gpt}
  \end{figure}
\end{frame}

\section{Processing-in-Memory}
% PIM-friendly workload: fully connected layers (large weights, no reuse).
\begin{frame}{Workloads for PIM}
  Fully connected neural network layers:
  \begin{itemize}
    \item Large weight matrix -- does not fit onto cache
    \item No data reuse -- cache is useless
  \end{itemize}
  \begin{figure}
    \includesvg[width=0.6\textwidth]{images/dnn}
  \end{figure}
\end{frame}
% Cache-friendly counterexample: convolutional layers (small filters, reuse).
\begin{frame}{Workloads for PIM}
  Convolutional layers:
  \begin{itemize}
    \item Small filter matrix -- does fit onto cache
    \item Excessive data reuse -- cache is useful
  \end{itemize}
  \begin{figure}
    % TODO: replace placeholder text below with a TikZ figure.
    TODO Tikz Image
    % \includesvg[width=0.6\textwidth]{images/dnn}
  \end{figure}
\end{frame}
% Summary: which network types benefit from PIM (left) vs. are uncertain (right).
\begin{frame}{Workloads for PIM}
  \begin{columns}[T]
    \begin{column}{0.5\textwidth}
      \begin{center} \includesvg[height=50px]{images/thumbs-up} \end{center}
      \begin{itemize}
        \item Fully connected layers in multilayer perceptrons (MLPs)
        \item Layers in recurrent neural networks (RNNs)
      \end{itemize}
    \end{column}
    \begin{column}{0.5\textwidth}
      \begin{center} \includesvg[height=50px]{images/thumbs-unsure} \end{center}
      \begin{itemize}
        \item Convolutional neural networks (CNNs)
      \end{itemize}
    \end{column}
  \end{columns}
\end{frame}
% Taxonomy of PIM compute placement, revealed stepwise via overlays 2--5;
% the takeaway block appears on the final overlay.
\begin{frame}{PIM Architectures}
  \begin{columns}[T]
    \begin{column}{0.4\textwidth}
      \begin{itemize}
        \item<2-> Inside the memory subarray
        \item<3-> Near the subarray in the PSA output region
        \item<4-> Near the bank in its peripheral region
        \item<5-> In the I/O region of the memory
      \end{itemize}
    \end{column}
    \begin{column}{0.6\textwidth}
      % One image per overlay step; the last image persists from step 5 on.
      \only<1>{\includesvg[height=115px]{images/pim_positions_0}}
      \only<2>{\includesvg[height=115px]{images/pim_positions_1}}
      \only<3>{\includesvg[height=115px]{images/pim_positions_2}}
      \only<4>{\includesvg[height=115px]{images/pim_positions_3}}
      \only<5->{\includesvg[height=115px]{images/pim_positions_4}}
    \end{column}
  \end{columns}
  \visible<6>{\begin{block}{Remark}The nearer the computation is to the memory cells, the higher the achievable bandwidth!\end{block}}
\end{frame}
\section{Samsung HBM-PIM/FIMDRAM}

% Overview of Samsung's HBM2-based PIM implementation.
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
  \begin{itemize}
    \item Real-world PIM implementation based on HBM2
    \item PIM units embedded at the bank level
  \end{itemize}
  \begin{figure}
    \includesvg[width=0.8\textwidth]{images/hbm-pim}
  \end{figure}
\end{frame}
% Detail: the per-bank processing unit and its instruction set.
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
  \begin{columns}
    \begin{column}{0.4\textwidth}
      Processing units:
      \begin{itemize}
        \item Two 16-wide 16-bit FPUs
        \item Register files and control unit
      \end{itemize}
      Instructions:
      \begin{itemize}
        \item Control: NOP, JUMP, EXIT
        \item Data: MOV (ReLU), FILL
        \item Arithmetic: ADD, MUL, MAC, MAD
      \end{itemize}
    \end{column}
    \begin{column}{0.6\textwidth}
      \begin{figure}
        \includesvg[width=\textwidth]{images/pu}
      \end{figure}
    \end{column}
  \end{columns}
\end{frame}
% GEMV data layout: normal (overlay 1) vs. interleaved (overlay 2).
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation}
  \begin{figure}
    \only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}}
    \only<2>{\includesvg[width=0.9\textwidth]{images/gemv_interleaved}}
  \end{figure}
\end{frame}
% Step-by-step GEMV execution animation (one image per overlay).
% File extensions omitted, as elsewhere in this file and as recommended
% by the svg package.
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation\autocite{kang2022}}
  \begin{figure}
    \only<1>{\includesvg[width=0.55\textwidth]{images/gemv}}
    \only<2>{\includesvg[width=0.55\textwidth]{images/gemv_0}}
    \only<3>{\includesvg[width=0.55\textwidth]{images/gemv_1}}
    \only<4>{\includesvg[width=0.55\textwidth]{images/gemv_2}}
    \only<5>{\includesvg[width=0.55\textwidth]{images/gemv_3}}
    \only<6>{\includesvg[width=0.55\textwidth]{images/gemv_4}}
  \end{figure}
\end{frame}
% Transition slide: motivates the simulation-based evaluation that follows.
% The stray "\\" before a blank line (underfull hbox) is replaced by a
% proper paragraph break.
\begin{frame}{HBM-PIM/FIMDRAM}
  {\huge How fast is it?\par}

  Research should \dots
  \begin{itemize}
    \item \dots\ conduct simulations to explore \alert{performance gains}
    \item \dots\ consider also the programmability to identify \alert{challenges}
  \end{itemize}
\end{frame}
\section{Virtual Prototype}

% PIMSys: gem5 + DRAMSys co-simulation with an HBM-PIM DRAM model.
\begin{frame}{Virtual Prototype}
  \begin{itemize}
    \item Coupling of gem5 and DRAMSys
    \item Implementation of HBM-PIM in DRAM model
  \end{itemize}

  \begin{figure}
    \includesvg[width=0.8\textwidth]{images/dramsys}
  \end{figure}
\end{frame}
% Host-side support library: kernel setup and request generation.
\begin{frame}{Software Library}
  Software support library
  \begin{columns}
    \begin{column}{0.5\textwidth}
      \begin{itemize}
        \item Initialization
        \begin{itemize}
          \item Assembly and loading of microkernel
        \end{itemize}
        \item Execution
        \begin{itemize}
          \item Generation of RD and WR requests
          \item Insertion of memory barriers
        \end{itemize}
      \end{itemize}
    \end{column}

    \begin{column}{0.5\textwidth}
      \begin{figure}
        \includesvg[width=0.8\textwidth]{images/data_structures}
      \end{figure}
    \end{column}
  \end{columns}
\end{frame}
% TODO: add a code listing of the GEMV microkernel here.
% The frame is [fragile] so that verbatim/listings content can be used.
\begin{frame}[fragile]{Example: GEMV Kernel}
\end{frame}
% Outline/recap slide; \tableofcontents collects the \section marks above.
% NOTE(review): placed at the end of the talk -- confirm this is intentional.
\begin{frame}
  \frametitle{Outline}
  \tableofcontents
\end{frame}

\end{document}