% Files
% memsys24-slides/main.tex
% 2024-09-17 16:39:30 +02:00
%
% 487 lines
% 15 KiB
% TeX
% Raw Blame History
%
% (Web-viewer header preserved as comments; it is not part of the LaTeX source.)
\documentclass[aspectratio=169]{beamer}
% \setbeameroption{show notes on second screen=right}
\usetheme{UniWue}
\usepackage[style=verbose-ibid]{biblatex}
\usepackage{datetime}
\usepackage{tabularray}
\usepackage{tikz}
\usepackage[inkscapeversion=1]{svg}
\usetikzlibrary{fit}
\usetikzlibrary{positioning}
\usetikzlibrary{matrix}
\addbibresource{references.bib}
\setbeamerfont{footnote}{size=\tiny}
\newdate{presentationday}{01}{10}{2024}
\title{PIMSys}
\subtitle{A Virtual Prototype for Processing in Memory}
\author{
Derek~Christ\inst{1}
\and
Lukas~Steiner\inst{2}
\and
Matthias~Jung\inst{3}
\and
Norbert~Wehn\inst{2}
}
\institute{
\inst{1}
Fraunhofer IESE
\quad
\inst{2}
RPTU Kaiserslautern-Landau
\quad
\inst{3}
University of Würzburg
}
\date{MEMSYS~2024}
\begin{document}
\frame{\titlepage}
\section{Introduction}
\begin{frame}{Energy Demand of Applications}
Total compute energy approaches the world's energy production\autocite{src2021}
\begin{figure}
\includesvg[width=0.6\textwidth]{images/world_energy}
\end{figure}
\end{frame}
\note[itemize]{
\item compute 2x every two years
\item energy production 2\% per year
\item to meet future compute demands, drastic improvements in energy efficiency
}
\begin{frame}{Memory Bound Workloads}
AI applications become increasingly memory-bound\autocite{ivobolsens2023}
\begin{figure}
\includesvg[width=0.5\textwidth]{images/gpt}
\end{figure}
\end{frame}
\note[itemize]{
\item Emerging AI applications become increasingly memory-bound
\item Roofline model
\item Not limited by compute power but by memory
\item researchers begin to consider PIM to circumvent memory bottleneck
\item (drastically more parameters in GPT-3, operational intensity goes down)
}
\section{Processing-in-Memory}
\begin{frame}{Workloads for PIM}
Fully connected neural network layers:
\begin{itemize}
\item Large weight matrix - \alert{does not fit onto cache}
\item No data reuse - \alert{cache is useless}
\end{itemize}
\begin{figure}
\includesvg[width=0.6\textwidth]{images/dnn}
\end{figure}
\end{frame}
\begin{frame}{Workloads for PIM}
Convolutional layers:
\begin{itemize}
\item Small filter matrix - \alert{does fit onto cache}
\item Excessive data reuse - \alert{cache is useful}
\end{itemize}
\begin{figure}
\input{images/cnn}
\end{figure}
\end{frame}
\begin{frame}{Workloads for PIM}
\begin{columns}[T]
\begin{column}{0.5\textwidth}
\begin{center} \includesvg[height=50px]{images/thumbs-up} \end{center}
\begin{itemize}
\item Fully connected layers in multilayer perceptrons (MLPs)
\item Layers in recurrent neural networks (RNNs)
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{center} \includesvg[height=50px]{images/thumbs-unsure} \end{center}
\begin{itemize}
\item Convolutional neural network (CNNs)
\end{itemize}
\end{column}
\end{columns}
\end{frame}
\note{
To summarize...
}
\begin{frame}{PIM Architectures}
\begin{columns}[T]
\begin{column}{0.4\textwidth}
\begin{itemize}
\item<2-> Inside the memory subarray
\item<3-> Near the subarray in the PSA output region
\item<4-> Near the bank in its peripheral region
\item<5-> In the I/O region of the memory
\end{itemize}
\end{column}
\begin{column}{0.6\textwidth}
\only<1>{\includesvg[height=115px]{images/pim_positions_0}}
\only<2>{\includesvg[height=115px]{images/pim_positions_1}}
\only<3>{\includesvg[height=115px]{images/pim_positions_2}}
\only<4>{\includesvg[height=115px]{images/pim_positions_3}}
\only<5->{\includesvg[height=115px]{images/pim_positions_4}}
\end{column}
\end{columns}
\visible<6>{\begin{block}{Remark}The nearer the computation is to the memory cells, the higher the achievable bandwidth!\end{block}}
\end{frame}
\note[itemize]{
\item Architecture space of PIM:
\item Inside the memory SA - simple bulk logic
\item Near SA in PSA output region - logic gates in the region
\item Near a bank in its peripheral region - computation units with control
\item I/O region of memory - limited by memory bus
}
\section{Samsung HBM-PIM/FIMDRAM}
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
\begin{itemize}
\item Real-world PIM implementation based on HBM2
\item PIM units embedded at the bank level
\end{itemize}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/hbm-pim}
\end{figure}
\end{frame}
\note[itemize]{
\item One PIM unit shared by two banks
\item SIMD FPUs are 16-wide
\item All-Bank mode: All PIM units operate in parallel
}
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
\begin{columns}
\begin{column}{0.4\textwidth}
Processing units:
\begin{itemize}
\item Two 16-wide 16-bit FPUs
\item Register files and control unit
\end{itemize}
Instructions:
\begin{itemize}
\item Control: NOP, JUMP, EXIT
\item Data: MOV (ReLU), FILL
\item Arithmetic: ADD, MUL, MAC, MAD
\end{itemize}
\end{column}
\begin{column}{0.6\textwidth}
\begin{figure}
\includesvg[width=\textwidth]{images/pu}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\note[itemize]{
\item Two SIMD FPUs (ADD,MUL)
\item CRF: 32 instructions, stores the program
\item GRF: 16 entries, one memory fetch
\item SRF: 16 entries
\item Control units executes one instruction when RD or WR command is issued
}
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation}
\begin{figure}
\only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}}
\only<2>{\includesvg[width=0.9\textwidth]{images/gemv_interleaved}}
\end{figure}
\end{frame}
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation\autocite{kang2022}}
\begin{figure}
\only<1>{\includesvg[width=0.55\textwidth]{images/gemv.svg}}
\only<2>{\includesvg[width=0.55\textwidth]{images/gemv_0.svg}}
\only<3>{\includesvg[width=0.55\textwidth]{images/gemv_1.svg}}
\only<4>{\includesvg[width=0.55\textwidth]{images/gemv_2.svg}}
\only<5>{\includesvg[width=0.55\textwidth]{images/gemv_3.svg}}
\only<6>{\includesvg[width=0.55\textwidth]{images/gemv_4.svg}}
\end{figure}
\end{frame}
\note[itemize]{
\item Procedure of GEMV operation
\item multiple cycles
\item each PIM unit operates on one matrix row
\item partial sum, reduced by host
}
\begin{frame}{HBM-PIM/FIMDRAM}
\begin{huge}
How fast is it?
\end{huge}\\
Research should ...
\begin{itemize}
\item ... conduct simulations to explore \alert{performance gains}
\item ... consider also the programmability to identify \alert{challenges}
\end{itemize}
\end{frame}
\section{Virtual Prototype}
\begin{frame}{Virtual Prototype}
\begin{itemize}
\item Coupling of gem5 and DRAMSys
\item Implementation of HBM-PIM in DRAM model
\end{itemize}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/dramsys.svg}
\end{figure}
\end{frame}
\note[itemize]{
\item VP interprets the programmed microkernel
\item not yet drop-in replacement
}
\begin{frame}{Software Library}
Software support library
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item Initialization
\begin{itemize}
\item Assembly and loading of microkernel
\end{itemize}
\item Execution
\begin{itemize}
\item Generation of RD and WR requests
\item Insertion of memory barriers
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/data_structures.svg}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\begin{frame}[fragile]{Example: GEMV Kernel}
\begin{center}
\only<1>{
\inputminted[firstline=1,lastline=8, fontsize=\footnotesize]{gas}{kernel.s}
\hrule
\inputminted[firstline=7,lastline=10,fontsize=\footnotesize]{rust}{kernel.rs}
}
\only<2>{
\inputminted[firstline=9,lastline=10, fontsize=\footnotesize]{gas}{kernel.s}
\hrule
\inputminted[firstline=12,lastline=20,fontsize=\footnotesize]{rust}{kernel.rs}
}
\only<3>{
\inputminted[firstline=11,lastline=12,fontsize=\footnotesize]{gas}{kernel.s}
\hrule
\inputminted[firstline=22,lastline=30,fontsize=\footnotesize]{rust}{kernel.rs}
}
\end{center}
\end{frame}
\begin{frame}{Virtual Prototype Platform}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item ARM processor model
\item Bare-metal kernel
\item Custom page table configuration
\begin{itemize}
\item Non-PIM DRAM region mapped as cacheable memory
\item PIM DRAM region mapped as non-cacheable memory
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/bare_metal.svg}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\note[itemize]{
\item bare metal offers most control
}
\section{Simulations}
\begin{frame}{Microbenchmarks}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item Vector benchmarks (BLAS level 1)
\item VADD: $z = x + y$
\item VMUL: $z = x \cdot y$
\item HAXPY: $z = a \cdot x + y$
\item Vector-Matrix benchmarks (BLAS level 2)
\begin{itemize}
\item GEMV: $z = A \cdot x$
\item Simple DNN:
\begin{itemize}
\item $f(x) = z = \mathrm{ReLU}(A \cdot x)$
\item $z_{n+1} = f(z_n)$
\item 5 layers in total
\end{itemize}
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
\begin{tblr}{
hlines,
column{1} = {c},
column{2} = {r},
column{3} = {r},
column{4} = {r},
row{1} = {l},
hline{2} = {2}{-}{solid,black},
}
Level & Vector & GEMV & DNN \\
X1 & 2M & (1k $\times$ 4k) & (256 $\times$ 256) \\
X2 & 4M & (2k $\times$ 4k) & (512 $\times$ 512) \\
X3 & 8M & (4k $\times$ 8k) & (1k $\times$ 1k) \\
X4 & 16M & (8k $\times$ 8k) & (2k $\times$ 2k)
\end{tblr}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\note[itemize]{
\item operand data significantly larger than on-chip cache
}
\begin{frame}{System Configuration}
\begin{columns}[t]
\begin{column}{0.5\textwidth}
Two simulated systems:
\begin{itemize}
\item Generic ARM system
\item Infinite compute system
\begin{itemize}
\item Unrealistically high frequency of 100 GHz
\item Completely memory bound
\item Lower bound of possible speedup
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
Two real GPUs using HBM2:
\begin{itemize}
\item AMD RX Vega 56
\item NVIDIA Tesla V100
\end{itemize}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Speedups / Generic ARM System}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/speedup_normal.svg}
\end{figure}
\end{frame}
\begin{frame}{Speedups / Infinite Compute System}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/speedup_inf.svg}
\end{figure}
\end{frame}
\note[itemize]{
\item VADD: 12.7x
\item GEMV: 9.0x
}
\begin{frame}{Speedups / Samsung}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/samsung.svg}
\end{figure}
\end{frame}
\note[itemize]{
\item GEMV matches well
\item ADD shows deviation
\item -> differences in hardware architecture
\item GPU has no speculative execution
}
\begin{frame}{Runtimes / Vector Benchmarks}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
\end{figure}
\end{frame}
\note[itemize]{
\item Real GPUs use multiple memory channels
\item Memory barriers
\item Also architectural differences
}
\begin{frame}{Runtimes / Matrix Benchmarks}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
\end{figure}
\end{frame}
\begin{frame}{Conclusion and Future Work}
\begin{itemize}
\item PIM can accelerate memory-bound workloads
\item Special PIM-friendly memory layouts are required
\end{itemize}
Future work:
\begin{itemize}
\item Implementation of Linux driver
\item Comparison with real neural networks
\item Consider replacing library approach with compiler approach
\item Implement a power model to analyze the power efficiency gains
\end{itemize}
\end{frame}
\section{Thank you for your attention!}
\begin{frame}
\frametitle{Outline}
\tableofcontents
\end{frame}
\begin{frame}{Memory Layout}
\begin{figure}
\includesvg[width=\textwidth]{images/complete_layout.svg}
\end{figure}
\end{frame}
\note[itemize]{
\item Data layout in program and address mapping must match
}
\end{document}