% (removed duplicated file-metadata extraction artifact that preceded \documentclass)
\documentclass[aspectratio=169]{beamer}
|
||
% \setbeameroption{show notes on second screen=right}
|
||
\usetheme{UniWue}
|
||
|
||
\usepackage[style=verbose-ibid]{biblatex}
|
||
\usepackage{datetime}
|
||
\usepackage{tabularray}
|
||
\usepackage{tikz}
|
||
\usepackage[inkscapeversion=1]{svg}
|
||
|
||
\usetikzlibrary{fit}
|
||
\usetikzlibrary{positioning}
|
||
\usetikzlibrary{matrix}
|
||
|
||
\addbibresource{references.bib}
|
||
|
||
\setbeamerfont{footnote}{size=\tiny}
|
||
|
||
\newdate{presentationday}{01}{10}{2024}
|
||
|
||
\title{PIMSys}
|
||
\subtitle{A Virtual Prototype for Processing in Memory}
|
||
|
||
\author{
|
||
Derek~Christ\inst{1}
|
||
\and
|
||
Lukas~Steiner\inst{2}
|
||
\and
|
||
Matthias~Jung\inst{3}
|
||
\and
|
||
Norbert~Wehn\inst{2}
|
||
}
|
||
|
||
\institute{
|
||
\inst{1}
|
||
Fraunhofer IESE
|
||
\quad
|
||
\inst{2}
|
||
RPTU Kaiserslautern-Landau
|
||
\quad
|
||
\inst{3}
|
||
University of Würzburg
|
||
}
|
||
|
||
\date{MEMSYS~2024}
|
||
|
||
\begin{document}
|
||
|
||
\frame{\titlepage}
|
||
|
||
\section{Introduction}
|
||
|
||
\begin{frame}{Energy Demand of Applications}
|
||
Total compute energy approaches world’s energy production\autocite{src2021}
|
||
\begin{figure}
|
||
\includesvg[width=0.6\textwidth]{images/world_energy}
|
||
\end{figure}
|
||
\end{frame}
|
||
|
||
\note[itemize]{
|
||
\item compute 2x every two years
|
||
\item energy production 2\% per year
|
||
\item to meet future compute demands, drastic improvements in energy efficiency
|
||
}
|
||
|
||
\begin{frame}{Memory Bound Workloads}
|
||
AI applications become increasingly memory-bound\autocite{ivobolsens2023}
|
||
\begin{figure}
|
||
\includesvg[width=0.5\textwidth]{images/gpt}
|
||
\end{figure}
|
||
\end{frame}
|
||
|
||
\note[itemize]{
|
||
\item Emerging AI applications become increasingly memory-bound
|
||
\item Roofline model
|
||
\item Not limited by compute power but by memory
|
||
\item researchers begin to consider PIM to circumvent memory bottleneck
|
||
\item (drastically more parameters in GPT-3, operational intensity goes down)
|
||
}
|
||
|
||
\section{Processing-in-Memory}
|
||
|
||
\begin{frame}{Workloads for PIM}
|
||
Fully connected neural network layers:
|
||
\begin{itemize}
|
||
\item Large weight matrix - \alert{does not fit onto cache}
|
||
\item No data reuse - \alert{cache is useless}
|
||
\end{itemize}
|
||
\begin{figure}
|
||
\includesvg[width=0.6\textwidth]{images/dnn}
|
||
\end{figure}
|
||
\end{frame}
|
||
|
||
\begin{frame}{Workloads for PIM}
|
||
Convolutional layers:
|
||
\begin{itemize}
|
||
\item Small filter matrix - \alert{does fit onto cache}
|
||
\item Excessive data reuse - \alert{cache is useful}
|
||
\end{itemize}
|
||
\begin{figure}
\input{images/cnn}
\end{figure}
|
||
\end{frame}
|
||
|
||
\begin{frame}{Workloads for PIM}
|
||
\begin{columns}[T]
|
||
\begin{column}{0.5\textwidth}
|
||
\begin{center} \includesvg[height=50px]{images/thumbs-up} \end{center}
|
||
\begin{itemize}
|
||
\item Fully connected layers in multilayer perceptrons (MLPs)
|
||
\item Layers in recurrent neural networks (RNNs)
|
||
\end{itemize}
|
||
\end{column}
|
||
\begin{column}{0.5\textwidth}
|
||
\begin{center} \includesvg[height=50px]{images/thumbs-unsure} \end{center}
|
||
\begin{itemize}
|
||
\item Convolutional neural networks (CNNs)
|
||
\end{itemize}
|
||
\end{column}
|
||
\end{columns}
|
||
\end{frame}
|
||
|
||
\note{
|
||
To summarize...
|
||
}
|
||
|
||
\begin{frame}{PIM Architectures}
|
||
\begin{columns}[T]
|
||
\begin{column}{0.4\textwidth}
|
||
\begin{itemize}
|
||
\item<2-> Inside the memory subarray
|
||
\item<3-> Near the subarray in the PSA output region
|
||
\item<4-> Near the bank in its peripheral region
|
||
\item<5-> In the I/O region of the memory
|
||
\end{itemize}
|
||
\end{column}
|
||
\begin{column}{0.6\textwidth}
|
||
\only<1>{\includesvg[height=115px]{images/pim_positions_0}}
|
||
\only<2>{\includesvg[height=115px]{images/pim_positions_1}}
|
||
\only<3>{\includesvg[height=115px]{images/pim_positions_2}}
|
||
\only<4>{\includesvg[height=115px]{images/pim_positions_3}}
|
||
\only<5->{\includesvg[height=115px]{images/pim_positions_4}}
|
||
\end{column}
|
||
\end{columns}
|
||
\visible<6>{\begin{block}{Remark}The nearer the computation is to the memory cells, the higher the achievable bandwidth!\end{block}}
|
||
\end{frame}
|
||
|
||
\note[itemize]{
|
||
\item Architecture space of PIM:
|
||
\item Inside the memory SA - simple bulk logic
|
||
\item Near SA in PSA output region - logic gates in the region
|
||
\item Near a bank in its peripheral region - computation units with control
|
||
\item I/O region of memory - limited by memory bus
|
||
}
|
||
|
||
\section{Samsung HBM-PIM/FIMDRAM}
|
||
|
||
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
|
||
\begin{itemize}
|
||
\item Real-world PIM implementation based on HBM2
|
||
\item PIM units embedded at the bank level
|
||
\end{itemize}
|
||
\begin{figure}
|
||
\includesvg[width=0.8\textwidth]{images/hbm-pim}
|
||
\end{figure}
|
||
\end{frame}
|
||
|
||
\note[itemize]{
|
||
\item One PIM unit shared by two banks
|
||
\item SIMD FPUs are 16-wide
|
||
\item All-Bank mode: All PIM units operate in parallel
|
||
}
|
||
|
||
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
|
||
\begin{columns}
|
||
\begin{column}{0.4\textwidth}
|
||
Processing units:
|
||
\begin{itemize}
|
||
\item Two 16-wide 16-bit FPUs
|
||
\item Register files and control unit
|
||
\end{itemize}
|
||
Instructions:
|
||
\begin{itemize}
|
||
\item Control: NOP, JUMP, EXIT
|
||
\item Data: MOV (ReLU), FILL
|
||
\item Arithmetic: ADD, MUL, MAC, MAD
|
||
\end{itemize}
|
||
\end{column}
|
||
\begin{column}{0.6\textwidth}
|
||
\begin{figure}
|
||
\includesvg[width=\textwidth]{images/pu}
|
||
\end{figure}
|
||
\end{column}
|
||
\end{columns}
|
||
\end{frame}
|
||
|
||
\note[itemize]{
|
||
\item Two SIMD FPUs (ADD,MUL)
|
||
\item CRF: 32 instructions, stores the program
|
||
\item GRF: 16 entries, one memory fetch
|
||
\item SRF: 16 entries
|
||
\item Control unit executes one instruction when RD or WR command is issued
|
||
}
|
||
|
||
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation}
|
||
\begin{figure}
|
||
\only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}}
|
||
\only<2>{\includesvg[width=0.9\textwidth]{images/gemv_interleaved}}
|
||
\end{figure}
|
||
\end{frame}
|
||
|
||
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation\autocite{kang2022}}
|
||
\begin{figure}
|
||
\only<1>{\includesvg[width=0.55\textwidth]{images/gemv.svg}}
|
||
\only<2>{\includesvg[width=0.55\textwidth]{images/gemv_0.svg}}
|
||
\only<3>{\includesvg[width=0.55\textwidth]{images/gemv_1.svg}}
|
||
\only<4>{\includesvg[width=0.55\textwidth]{images/gemv_2.svg}}
|
||
\only<5>{\includesvg[width=0.55\textwidth]{images/gemv_3.svg}}
|
||
\only<6>{\includesvg[width=0.55\textwidth]{images/gemv_4.svg}}
|
||
\end{figure}
|
||
\end{frame}
|
||
|
||
\note[itemize]{
|
||
\item Procedure of GEMV operation
|
||
\item multiple cycles
|
||
\item each PIM unit operates on one matrix row
|
||
\item partial sum, reduced by host
|
||
}
|
||
|
||
\begin{frame}{HBM-PIM/FIMDRAM}
|
||
\begin{huge}
How fast is it?
\end{huge}

Research should ...
|
||
\begin{itemize}
|
||
\item ... conduct simulations to explore \alert{performance gains}
|
||
\item ... consider also the programmability to identify \alert{challenges}
|
||
\end{itemize}
|
||
\end{frame}
|
||
|
||
\section{Virtual Prototype}
|
||
|
||
\begin{frame}{Virtual Prototype}
|
||
\begin{itemize}
|
||
\item Coupling of gem5 and DRAMSys
|
||
\item Implementation of HBM-PIM in DRAM model
|
||
\end{itemize}
|
||
|
||
\begin{figure}
|
||
\includesvg[width=0.8\textwidth]{images/dramsys.svg}
|
||
\end{figure}
|
||
\end{frame}
|
||
|
||
\note[itemize]{
|
||
\item VP interprets the programmed microkernel
|
||
\item not yet drop-in replacement
|
||
}
|
||
|
||
\begin{frame}{Software Library}
|
||
Software support library
|
||
\begin{columns}
|
||
\begin{column}{0.5\textwidth}
|
||
\begin{itemize}
|
||
\item Initialization
|
||
\begin{itemize}
|
||
\item Assembly and loading of microkernel
|
||
\end{itemize}
|
||
\item Execution
|
||
\begin{itemize}
|
||
\item Generation of RD and WR requests
|
||
\item Insertion of memory barriers
|
||
\end{itemize}
|
||
\end{itemize}
|
||
\end{column}
|
||
|
||
\begin{column}{0.5\textwidth}
|
||
\begin{figure}
|
||
\includesvg[width=0.8\textwidth]{images/data_structures.svg}
|
||
\end{figure}
|
||
\end{column}
|
||
\end{columns}
|
||
\end{frame}
|
||
|
||
\begin{frame}[fragile]{Example: GEMV Kernel}
|
||
\begin{center}
|
||
\only<1>{
|
||
\inputminted[firstline=1,lastline=8, fontsize=\footnotesize]{gas}{kernel.s}
|
||
\hrule
|
||
\inputminted[firstline=7,lastline=10,fontsize=\footnotesize]{rust}{kernel.rs}
|
||
}
|
||
\only<2>{
|
||
\inputminted[firstline=9,lastline=10, fontsize=\footnotesize]{gas}{kernel.s}
|
||
\hrule
|
||
\inputminted[firstline=12,lastline=20,fontsize=\footnotesize]{rust}{kernel.rs}
|
||
}
|
||
\only<3>{
|
||
\inputminted[firstline=11,lastline=12,fontsize=\footnotesize]{gas}{kernel.s}
|
||
\hrule
|
||
\inputminted[firstline=22,lastline=30,fontsize=\footnotesize]{rust}{kernel.rs}
|
||
}
|
||
\end{center}
|
||
\end{frame}
|
||
|
||
\begin{frame}{Virtual Prototype Platform}
|
||
\begin{columns}
|
||
\begin{column}{0.5\textwidth}
|
||
\begin{itemize}
|
||
\item ARM processor model
|
||
\item Bare-metal kernel
|
||
\item Custom page table configuration
|
||
\begin{itemize}
|
||
\item Non-PIM DRAM region mapped as cacheable memory
|
||
\item PIM DRAM region mapped as non-cacheable memory
|
||
\end{itemize}
|
||
\end{itemize}
|
||
\end{column}
|
||
\begin{column}{0.5\textwidth}
|
||
\begin{figure}
|
||
\includesvg[width=0.8\textwidth]{images/bare_metal.svg}
|
||
\end{figure}
|
||
\end{column}
|
||
\end{columns}
|
||
\end{frame}
|
||
|
||
\note[itemize]{
|
||
\item bare metal offers most control
|
||
}
|
||
|
||
\section{Simulations}
|
||
|
||
\begin{frame}{Microbenchmarks}
|
||
\begin{columns}
|
||
\begin{column}{0.5\textwidth}
|
||
\begin{itemize}
|
||
\item Vector benchmarks (BLAS level 1)
|
||
\item VADD: $z = x + y$
|
||
\item VMUL: $z = x \cdot y$
|
||
\item HAXPY: $z = a \cdot x + y$
|
||
|
||
\item Vector-Matrix benchmarks (BLAS level 2)
|
||
\begin{itemize}
|
||
\item GEMV: $z = A \cdot x$
|
||
\item Simple DNN:
|
||
\begin{itemize}
|
||
\item $f(x) = z = \operatorname{ReLU}(A \cdot x)$
|
||
\item $z_{n+1} = f(z_n)$
|
||
\item 5 layers in total
|
||
\end{itemize}
|
||
\end{itemize}
|
||
\end{itemize}
|
||
\end{column}
|
||
\begin{column}{0.5\textwidth}
|
||
\begin{figure}
|
||
\begin{tblr}{
|
||
hlines,
|
||
column{1} = {c},
|
||
column{2} = {r},
|
||
column{3} = {r},
|
||
column{4} = {r},
|
||
row{1} = {l},
|
||
hline{2} = {2}{-}{solid,black},
|
||
}
|
||
Level & Vector & GEMV & DNN \\
|
||
X1 & 2M & (1k $\times$ 4k) & (256 $\times$ 256) \\
|
||
X2 & 4M & (2k $\times$ 4k) & (512 $\times$ 512) \\
|
||
X3 & 8M & (4k $\times$ 8k) & (1k $\times$ 1k) \\
|
||
X4 & 16M & (8k $\times$ 8k) & (2k $\times$ 2k)
|
||
\end{tblr}
|
||
\end{figure}
|
||
\end{column}
|
||
\end{columns}
|
||
\end{frame}
|
||
|
||
\note[itemize]{
|
||
\item operand data significantly larger than on-chip cache
|
||
}
|
||
|
||
\begin{frame}{System Configuration}
|
||
\begin{columns}[t]
|
||
\begin{column}{0.5\textwidth}
|
||
Two simulated systems:
|
||
\begin{itemize}
|
||
\item Generic ARM system
|
||
\item Infinite compute system
|
||
\begin{itemize}
|
||
\item Unrealistically high frequency of 100~GHz
|
||
\item Completely memory bound
|
||
\item Lower bound of possible speedup
|
||
\end{itemize}
|
||
\end{itemize}
|
||
\end{column}
|
||
\begin{column}{0.5\textwidth}
|
||
Two real GPUs using HBM2:
|
||
\begin{itemize}
|
||
\item AMD RX Vega 56
|
||
\item NVIDIA Tesla V100
|
||
\end{itemize}
|
||
\end{column}
|
||
\end{columns}
|
||
\end{frame}
|
||
|
||
\begin{frame}{Speedups / Generic ARM System}
|
||
\begin{figure}
|
||
\includesvg[width=0.8\textwidth]{images/speedup_normal.svg}
|
||
\end{figure}
|
||
\end{frame}
|
||
|
||
\begin{frame}{Speedups / Infinite Compute System}
|
||
\begin{figure}
|
||
\includesvg[width=0.8\textwidth]{images/speedup_inf.svg}
|
||
\end{figure}
|
||
\end{frame}
|
||
|
||
\note[itemize]{
|
||
\item VADD: 12.7x
|
||
\item GEMV: 9.0x
|
||
}
|
||
|
||
\begin{frame}{Speedups / Samsung}
|
||
\begin{figure}
|
||
\includesvg[width=0.8\textwidth]{images/samsung.svg}
|
||
\end{figure}
|
||
\end{frame}
|
||
|
||
\note[itemize]{
|
||
\item GEMV matches well
|
||
\item ADD shows deviation
|
||
\item $\rightarrow$ differences in hardware architecture
|
||
\item GPU has no speculative execution
|
||
}
|
||
|
||
\begin{frame}{Runtimes / Vector Benchmarks}
|
||
\begin{figure}
|
||
\includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
|
||
\end{figure}
|
||
\end{frame}
|
||
|
||
\note[itemize]{
|
||
\item Real GPUs use multiple memory channels
|
||
\item Memory barriers
|
||
\item Also architectural differences
|
||
}
|
||
|
||
\begin{frame}{Runtimes / Matrix Benchmarks}
|
||
\begin{figure}
|
||
\includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
|
||
\end{figure}
|
||
\end{frame}
|
||
|
||
\begin{frame}{Conclusion and Future Work}
|
||
\begin{itemize}
|
||
\item PIM can accelerate memory-bound workloads
|
||
\item Special PIM-friendly memory layouts are required
|
||
\end{itemize}
|
||
|
||
Future work:
|
||
\begin{itemize}
|
||
\item Implementation of Linux driver
|
||
\item Comparison with real neural networks
|
||
\item Consider replacing library approach with compiler approach
|
||
\item Implement a power model to analyze the power efficiency gains
|
||
\end{itemize}
|
||
\end{frame}
|
||
|
||
\section{Thank you for your attention!}
|
||
|
||
\begin{frame}
|
||
\frametitle{Outline}
|
||
\tableofcontents
|
||
\end{frame}
|
||
|
||
\begin{frame}{Memory Layout}
|
||
\begin{figure}
|
||
\includesvg[width=\textwidth]{images/complete_layout.svg}
|
||
\end{figure}
|
||
\end{frame}
|
||
|
||
|
||
\note[itemize]{
|
||
\item Data layout in program and address mapping must match
|
||
}
|
||
|
||
\end{document}
|