From 3f548705db132cbe6e4f2d8b26a56c988506a19a Mon Sep 17 00:00:00 2001
From: Derek Christ <derek.christ@uni-wuerzburg.de>
Date: Tue, 17 Sep 2024 16:39:30 +0200
Subject: [PATCH] Add old notes

---
 main.pdf | Bin 2861981 -> 2861981 bytes
 main.tex |  84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)

diff --git a/main.pdf b/main.pdf
index 2d8c9f6134e1a70b4b402b04d3f4aa3520cec508..38b44bb0d6b6241706efeef8b7baeab00056e29f 100644
GIT binary patch
delta 199
zcmajVxe3Bx06<~9@B6-^@lK;j{K*d@XcA`-L|jGXZD3*L3RZ%(Q`kF+;sO>AzT#b9
zH_!Fk0S{h$_z4gsM3@LsV#G<1Bt@DGS#snlP^3he3RP;<Y0#vFLYodMU3&BxFl5B|
zu5DGnVHx7GSGCqpkCUqHgKOKeCo4<3_RN%GCYDnvg>D+c)Ys0$cH}~t|6g|J=ezlE
I$J*xj0ddVZ)c^nh

delta 199
zcmajVNe;na06^iGYM$q*hMK!+i}sg@MCEq|iHNHp?*<lDu3#mxb_#nZAueF?l5g>@
zubb!k?SKa_KKukw2ofSpgeWoMBuJ7XO@=Hv@)RgiqD+M<HR`A|XwssMMu#pv`V1H{
zy6aobZ&*`t*=u^^r^iXt_sUv0mOYy~W9f?3(iPIQCbsRE(zT>8)<Q0t^Z(24{Cukq
IInp=B4{xzJ>i_@%

diff --git a/main.tex b/main.tex
index 8734b08..df7180c 100644
--- a/main.tex
+++ b/main.tex
@@ -1,4 +1,5 @@
 \documentclass[aspectratio=169]{beamer}
+% \setbeameroption{show notes on second screen=right}
 \usetheme{UniWue}
 
 \usepackage[style=verbose-ibid]{biblatex}
@@ -56,6 +57,12 @@
     \end{figure}
 \end{frame}
 
+\note[itemize]{
+    \item compute 2x every two years
+    \item energy production 2\% per year
+    \item to meet future compute demands, drastic improvements in energy efficiency
+}
+
 \begin{frame}{Memory Bound Workloads}
     AI applications become increasingly memory-bound\autocite{ivobolsens2023}
     \begin{figure}
@@ -63,6 +70,14 @@
     \end{figure}
 \end{frame}
 
+\note[itemize]{
+    \item Emerging AI applications become increasingly memory-bound
+    \item Roofline model
+    \item Not limited by compute power but by memory
+    \item researchers begin to consider PIM to circumvent memory bottleneck
+    \item (drastically more parameters in GPT-3, operational intensity goes down)
+}
+
 \section{Processing-in-Memory}
 
 \begin{frame}{Workloads for PIM}
@@ -107,6 +122,10 @@
     \end{columns}
 \end{frame}
 
+\note{
+    To summarize...
+}
+
 \begin{frame}{PIM Architectures}
     \begin{columns}[T]
         \begin{column}{0.4\textwidth}
@@ -128,6 +147,14 @@
     \visible<6>{\begin{block}{Remark}The nearer the computation is to the memory cells, the higher the achievable bandwidth!\end{block}}
 \end{frame}
 
+\note[itemize]{
+    \item Architecture space of PIM:
+    \item Inside the memory SA - simple bulk logic
+    \item Near SA in PSA output region - logic gates in the region
+    \item Near a bank in its peripheral region - computation units with control
+    \item I/O region of memory - limited by memory bus
+}
+
 \section{Samsung HBM-PIM/FIMDRAM}
 
 \begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
@@ -140,6 +167,12 @@
     \end{figure}
 \end{frame}
 
+\note[itemize]{
+    \item One PIM unit shared by two banks
+    \item SIMD FPUs are 16-wide
+    \item All-Bank mode: All PIM units operate in parallel
+}
+
 \begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
     \begin{columns}
         \begin{column}{0.4\textwidth}
@@ -163,6 +196,14 @@
     \end{columns}
 \end{frame}
 
+\note[itemize]{
+    \item Two SIMD FPUs (ADD,MUL)
+    \item CRF: 32 instructions, stores the program
+    \item GRF: 16 entries, one memory fetch
+    \item SRF: 16 entries
+    \item Control unit executes one instruction when an RD or WR command is issued
+}
+
 \begin{frame}{HBM-PIM/FIMDRAM GEMV Operation}
     \begin{figure}
         \only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}}
@@ -181,6 +222,13 @@
     \end{figure}
 \end{frame}
 
+\note[itemize]{
+    \item Procedure of GEMV operation
+    \item multiple cycles
+    \item each PIM unit operates on one matrix row
+    \item partial sum, reduced by host
+}
+
 \begin{frame}{HBM-PIM/FIMDRAM}
     \begin{huge}
         How fast is it?
@@ -206,6 +254,11 @@
     \end{figure}
 \end{frame}
 
+\note[itemize]{
+    \item VP interprets the programmed microkernel
+    \item not yet drop-in replacement
+}
+
 \begin{frame}{Software Library}
     Software support library
     \begin{columns}
@@ -272,6 +325,10 @@
     \end{columns}
 \end{frame}
 
+\note[itemize]{
+    \item bare metal offers most control
+}
+
 \section{Simulations}
 
 \begin{frame}{Microbenchmarks}
@@ -317,6 +374,10 @@
     \end{columns}
 \end{frame}
 
+\note[itemize]{
+    \item operand data significantly larger than on-chip cache
+}
+
 \begin{frame}{System Configuration}
     \begin{columns}[t]
         \begin{column}{0.5\textwidth}
@@ -353,18 +414,36 @@
     \end{figure}
 \end{frame}
 
+\note[itemize]{
+    \item VADD: 12.7x
+    \item GEMV: 9.0x
+}
+
 \begin{frame}{Speedups / Samsung}
     \begin{figure}
         \includesvg[width=0.8\textwidth]{images/samsung.svg}
     \end{figure}
 \end{frame}
 
+\note[itemize]{
+    \item GEMV matches well
+    \item ADD shows deviation
+    \item $\rightarrow$ differences in hardware architecture
+    \item GPU has no speculative execution
+}
+
 \begin{frame}{Runtimes / Vector Benchmarks}
     \begin{figure}
         \includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
     \end{figure}
 \end{frame}
 
+\note[itemize]{
+    \item Real GPUs use multiple memory channels
+    \item Memory barriers
+    \item Also architectural differences
+}
+
 \begin{frame}{Runtimes / Matrix Benchmarks}
     \begin{figure}
         \includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
@@ -399,4 +478,9 @@
     \end{figure}
 \end{frame}
 
+
+\note[itemize]{
+    \item Data layout in program and address mapping must match
+}
+
 \end{document}