Do some fine tuning

This commit is contained in:
2024-09-28 19:57:05 +02:00
parent 57c923c671
commit 6910c185d7
2 changed files with 56 additions and 39 deletions

BIN
main.pdf

Binary file not shown.

View File

@@ -17,6 +17,7 @@
\addbibresource{references.bib}
\setbeamerfont{footnote}{size=\tiny}
\setbeamercolor{alerted text}{fg=uniwuered}
\newdate{presentationday}{01}{10}{2024}
@@ -52,47 +53,24 @@
% \section{Introduction}
\begin{frame}{Energy Demand of Applications}
Total compute energy approaches the world's energy production\autocite{src2021}
\begin{figure}
\includesvg[width=0.6\textwidth]{images/world_energy}
\end{figure}
\end{frame}
\note[itemize]{
\item compute 2x every two years
\item energy production 2\% per year
\item to meet future compute demands, drastic improvements in energy efficiency
}
\begin{frame}{Memory-Bound Workloads}
AI applications become increasingly memory-bound\autocite{ivobolsens2023}
\begin{figure}
\includesvg[width=0.5\textwidth]{images/gpt}
\end{figure}
\end{frame}
\note[itemize]{
\item Emerging AI applications become increasingly memory-bound
\item Roofline model
\item Not limited by compute power but by memory
\item researchers begin to consider PIM to circumvent memory bottleneck
\item (drastically more parameters in GPT-3, operational intensity goes down)
}
% \section{Processing-in-Memory}
\begin{frame}{Workloads for PIM}
Fully connected neural network layers:
\begin{itemize}
\item Large weight matrix - \alert{does not fit into the cache}
\item No data reuse - \alert{cache is useless}
\item No data reuse - \alert{cache is almost useless}
\end{itemize}
\begin{figure}
\includesvg[width=0.6\textwidth]{images/dnn}
\end{figure}
\end{frame}
\note[itemize]{
\item Let's start by having a look at which workloads can be accelerated by PIM...
\item memory-bound: each entry only used once
}
\begin{frame}{Workloads for PIM}
Convolutional layers:
\begin{itemize}
@@ -106,10 +84,17 @@
\end{figure}
\end{frame}
\note[itemize]{
\item compute-bound: elements of filter matrix used often
}
\begin{frame}{Workloads for PIM}
\begin{columns}[T]
\begin{column}{0.5\textwidth}
\begin{center} \includesvg[height=50px]{images/thumbs-up} \end{center}
\begin{center}
\includesvg[height=50px]{images/thumbs-up}
\end{center}
\begin{center}\alert{(memory-bound)}\end{center}
\begin{itemize}
\item Fully connected layers in multilayer perceptrons (MLPs)
\item Layers in recurrent neural networks (RNNs)
@@ -117,6 +102,7 @@
\end{column}
\begin{column}{0.5\textwidth}
\begin{center} \includesvg[height=50px]{images/thumbs-unsure} \end{center}
\begin{center}\alert{(compute-bound)}\end{center}
\begin{itemize}
\item Convolutional neural networks (CNNs)
\end{itemize}
@@ -133,7 +119,7 @@
\begin{column}{0.4\textwidth}
\begin{itemize}
\item<2-> Inside the memory subarray
\item<3-> Near the subarray in the PSA output region
\item<3-> Near the subarray in the PSA region
\item<4-> Near the bank in its peripheral region
\item<5-> In the I/O region of the memory
\end{itemize}
@@ -152,7 +138,7 @@
\note[itemize]{
\item Architecture space of PIM:
\item Inside the memory SA - simple bulk logic
\item Near SA in PSA output region - logic gates in the region
\item Near SA in PSA region - logic gates in the region
\item Near a bank in its peripheral region - computation units with control
\item I/O region of memory - limited by memory bus
}
@@ -162,7 +148,7 @@
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
\begin{itemize}
\item Real-world PIM implementation based on HBM2
\item PIM units embedded at the bank level
\item PIM units embedded at the bank level - in parallel
\end{itemize}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/hbm-pim}
@@ -314,9 +300,11 @@
\begin{column}{0.5\textwidth}
\begin{itemize}
\item Vector benchmarks (BLAS level 1)
\begin{itemize}
\item VADD: $z = x + y$
\item VMUL: $z = x \cdot y$
\item HAXPY: $z = a \cdot x + y$
\end{itemize}
\item Vector-Matrix benchmarks (BLAS level 2)
\begin{itemize}
@@ -397,7 +385,7 @@
\item GEMV: 9.0x
}
\begin{frame}{Speedups / Samsung}
\begin{frame}{Speedups / Samsung\autocite{lee2021}}
\begin{figure}
\includesvg[width=0.8\textwidth]{images/samsung.svg}
\end{figure}
@@ -429,12 +417,13 @@
\end{frame}
\begin{frame}{Conclusion and Future Work}
\textbf{Conclusion}
\begin{itemize}
\item PIM can accelerate memory-bound workloads
\item Special PIM-friendly memory layouts are required
\end{itemize}
Future work:
\textbf{Future work}
\begin{itemize}
\item Implementation of Linux driver
\item Comparison with real neural networks
@@ -452,6 +441,34 @@
\appendix
\begin{frame}{Energy Demand of Applications}
Total compute energy approaches the world's energy production\autocite{src2021}
\begin{figure}
\includesvg[width=0.6\textwidth]{images/world_energy}
\end{figure}
\end{frame}
\note[itemize]{
\item compute 2x every two years
\item energy production 2\% per year
\item to meet future compute demands, drastic improvements in energy efficiency
}
\begin{frame}{Memory-Bound Workloads}
AI applications become increasingly memory-bound\autocite{ivobolsens2023}
\begin{figure}
\includesvg[width=0.5\textwidth]{images/gpt}
\end{figure}
\end{frame}
\note[itemize]{
\item Emerging AI applications become increasingly memory-bound
\item Roofline model
\item Not limited by compute power but by memory
\item researchers begin to consider PIM to circumvent memory bottleneck
\item (drastically more parameters in GPT-3, operational intensity goes down)
}
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation}
\begin{figure}
\only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}}