Do some fine tuning

This commit is contained in:
2024-09-28 19:57:05 +02:00
parent 57c923c671
commit 6910c185d7
2 changed files with 56 additions and 39 deletions

BIN
main.pdf

Binary file not shown.

View File

@@ -17,6 +17,7 @@
\addbibresource{references.bib}
\setbeamerfont{footnote}{size=\tiny}
\setbeamercolor{alerted text}{fg=uniwuered}
\newdate{presentationday}{01}{10}{2024}
@@ -52,47 +53,24 @@
% \section{Introduction}
\begin{frame}{Energy Demand of Applications}
Total compute energy approaches the world's energy production\autocite{src2021}
\begin{figure}
\includesvg[width=0.6\textwidth]{images/world_energy}
\end{figure}
\end{frame}
\note[itemize]{
\item compute 2x every two years
\item energy production 2\% per year
\item to meet future compute demands, drastic improvements in energy efficiency
}
\begin{frame}{Memory-Bound Workloads}
AI applications become increasingly memory-bound\autocite{ivobolsens2023}
\begin{figure}
\includesvg[width=0.5\textwidth]{images/gpt}
\end{figure}
\end{frame}
\note[itemize]{
\item Emerging AI applications become increasingly memory-bound
\item Roofline model
\item Not limited by compute power but by memory
\item researchers begin to consider PIM to circumvent memory bottleneck
\item (drastically more parameters in GPT-3, operational intensity goes down)
}
% \section{Processing-in-Memory}
\begin{frame}{Workloads for PIM}
Fully connected neural network layers:
\begin{itemize}
\item Large weight matrix - \alert{does not fit into the cache}
\item No data reuse - \alert{cache is useless}
\item No data reuse - \alert{cache is almost useless}
\end{itemize}
\begin{figure}
\includesvg[width=0.6\textwidth]{images/dnn}
\end{figure}
\end{frame}
\note[itemize]{
\item Let's start by having a look at which workloads can be accelerated by PIM...
\item memory-bound: each entry only used once
}
\begin{frame}{Workloads for PIM}
Convolutional layers:
\begin{itemize}
@@ -106,10 +84,17 @@
\end{figure}
\end{frame}
\note[itemize]{
\item compute-bound: elements of filter matrix used often
}
\begin{frame}{Workloads for PIM}
\begin{columns}[T]
\begin{column}{0.5\textwidth}
\begin{center} \includesvg[height=50px]{images/thumbs-up} \end{center}
\begin{center}
\includesvg[height=50px]{images/thumbs-up}
\end{center}
\begin{center}\alert{(memory-bound)}\end{center}
\begin{itemize}
\item Fully connected layers in multilayer perceptrons (MLPs)
\item Layers in recurrent neural networks (RNNs)
@@ -117,6 +102,7 @@
\end{column}
\begin{column}{0.5\textwidth}
\begin{center} \includesvg[height=50px]{images/thumbs-unsure} \end{center}
\begin{center}\alert{(compute-bound)}\end{center}
\begin{itemize}
\item Convolutional neural networks (CNNs)
\end{itemize}
@@ -133,7 +119,7 @@
\begin{column}{0.4\textwidth}
\begin{itemize}
\item<2-> Inside the memory subarray
\item<3-> Near the subarray in the PSA output region
\item<3-> Near the subarray in the PSA region
\item<4-> Near the bank in its peripheral region
\item<5-> In the I/O region of the memory
\end{itemize}
@@ -152,7 +138,7 @@
\note[itemize]{
\item Architecture space of PIM:
\item Inside the memory SA - simple bulk logic
\item Near SA in PSA output region - logic gates in the region
\item Near SA in PSA region - logic gates in the region
\item Near a bank in its peripheral region - computation units with control
\item I/O region of memory - limited by memory bus
}
@@ -162,7 +148,7 @@
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
\begin{itemize}
\item Real-world PIM implementation based on HBM2
\item PIM units embedded at the bank level
\item PIM units embedded at the bank level - in parallel
\end{itemize}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/hbm-pim}
@@ -314,9 +300,11 @@
\begin{column}{0.5\textwidth}
\begin{itemize}
\item Vector benchmarks (BLAS level 1)
\begin{itemize}
\item VADD: $z = x + y$
\item VMUL: $z = x \cdot y$
\item HAXPY: $z = a \cdot x + y$
\end{itemize}
\item Vector-Matrix benchmarks (BLAS level 2)
\begin{itemize}
@@ -397,7 +385,7 @@
\item GEMV: 9.0x
}
\begin{frame}{Speedups / Samsung}
\begin{frame}{Speedups / Samsung\autocite{lee2021}}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/samsung.svg}
\end{figure}
@@ -429,12 +417,13 @@
\end{frame}
\begin{frame}{Conclusion and Future Work}
\textbf{Conclusion}
\begin{itemize}
\item PIM can accelerate memory-bound workloads
\item Special PIM-friendly memory layouts are required
\end{itemize}
Future work:
\textbf{Future work}
\begin{itemize}
\item Implementation of Linux driver
\item Comparison with real neural networks
@@ -452,6 +441,34 @@
\appendix
\begin{frame}{Energy Demand of Applications}
Total compute energy approaches the world's energy production\autocite{src2021}
\begin{figure}
\includesvg[width=0.6\textwidth]{images/world_energy}
\end{figure}
\end{frame}
\note[itemize]{
\item compute 2x every two years
\item energy production 2\% per year
\item to meet future compute demands, drastic improvements in energy efficiency
}
\begin{frame}{Memory-Bound Workloads}
AI applications become increasingly memory-bound\autocite{ivobolsens2023}
\begin{figure}
\includesvg[width=0.5\textwidth]{images/gpt}
\end{figure}
\end{frame}
\note[itemize]{
\item Emerging AI applications become increasingly memory-bound
\item Roofline model
\item Not limited by compute power but by memory
\item researchers begin to consider PIM to circumvent memory bottleneck
\item (drastically more parameters in GPT-3, operational intensity goes down)
}
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation}
\begin{figure}
\only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}}