Add old notes
This commit is contained in:
84
main.tex
84
main.tex
@@ -1,4 +1,5 @@
|
||||
\documentclass[aspectratio=169]{beamer}
|
||||
% \setbeameroption{show notes on second screen=right}
|
||||
\usetheme{UniWue}
|
||||
|
||||
\usepackage[style=verbose-ibid]{biblatex}
|
||||
@@ -56,6 +57,12 @@
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item compute 2x every two years
|
||||
\item energy production 2\% per year
|
||||
\item to meet future compute demands, drastic improvements in energy efficiency
|
||||
}
|
||||
|
||||
\begin{frame}{Memory Bound Workloads}
|
||||
AI applications become increasingly memory-bound\autocite{ivobolsens2023}
|
||||
\begin{figure}
|
||||
@@ -63,6 +70,14 @@
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item Emerging AI applications become increasingly memory-bound
|
||||
\item Roofline model
|
||||
\item Not limited by compute power but by memory
|
||||
\item researchers begin to consider PIM to circumvent memory bottleneck
|
||||
\item (drastically more parameters in GPT-3, operational intensity goes down)
|
||||
}
|
||||
|
||||
\section{Processing-in-Memory}
|
||||
|
||||
\begin{frame}{Workloads for PIM}
|
||||
@@ -107,6 +122,10 @@
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\note{
|
||||
To summarize...
|
||||
}
|
||||
|
||||
\begin{frame}{PIM Architectures}
|
||||
\begin{columns}[T]
|
||||
\begin{column}{0.4\textwidth}
|
||||
@@ -128,6 +147,14 @@
|
||||
\visible<6>{\begin{block}{Remark}The nearer the computation is to the memory cells, the higher the achievable bandwidth!\end{block}}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item Architecture space of PIM:
|
||||
\item Inside the memory SA - simple bulk logic
|
||||
\item Near SA in PSA output region - logic gates in the region
|
||||
\item Near a bank in its peripheral region - computation units with control
|
||||
\item I/O region of memory - limited by memory bus
|
||||
}
|
||||
|
||||
\section{Samsung HBM-PIM/FIMDRAM}
|
||||
|
||||
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
|
||||
@@ -140,6 +167,12 @@
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item One PIM unit shared by two banks
|
||||
\item SIMD FPUs are 16-wide
|
||||
\item All-Bank mode: All PIM units operate in parallel
|
||||
}
|
||||
|
||||
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
|
||||
\begin{columns}
|
||||
\begin{column}{0.4\textwidth}
|
||||
@@ -163,6 +196,14 @@
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item Two SIMD FPUs (ADD,MUL)
|
||||
\item CRF: 32 instructions, stores the program
|
||||
\item GRF: 16 entries, one memory fetch
|
||||
\item SRF: 16 entries
|
||||
\item Control unit executes one instruction when an RD or WR command is issued
|
||||
}
|
||||
|
||||
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation}
|
||||
\begin{figure}
|
||||
\only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}}
|
||||
@@ -181,6 +222,13 @@
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item Procedure of GEMV operation
|
||||
\item multiple cycles
|
||||
\item each PIM unit operates on one matrix row
|
||||
\item partial sum, reduced by host
|
||||
}
|
||||
|
||||
\begin{frame}{HBM-PIM/FIMDRAM}
|
||||
\begin{huge}
|
||||
How fast is it?
|
||||
@@ -206,6 +254,11 @@
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item VP interprets the programmed microkernel
|
||||
\item not yet a drop-in replacement
|
||||
}
|
||||
|
||||
\begin{frame}{Software Library}
|
||||
Software support library
|
||||
\begin{columns}
|
||||
@@ -272,6 +325,10 @@
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item bare metal offers most control
|
||||
}
|
||||
|
||||
\section{Simulations}
|
||||
|
||||
\begin{frame}{Microbenchmarks}
|
||||
@@ -317,6 +374,10 @@
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item operand data significantly larger than on-chip cache
|
||||
}
|
||||
|
||||
\begin{frame}{System Configuration}
|
||||
\begin{columns}[t]
|
||||
\begin{column}{0.5\textwidth}
|
||||
@@ -353,18 +414,36 @@
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item VADD: 12.7x
|
||||
\item GEMV: 9.0x
|
||||
}
|
||||
|
||||
\begin{frame}{Speedups / Samsung}
|
||||
\begin{figure}
|
||||
\includesvg[width=0.8\textwidth]{images/samsung.svg}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item GEMV matches well
|
||||
\item ADD shows deviation
|
||||
\item $\rightarrow$ differences in hardware architecture
|
||||
\item GPU has no speculative execution
|
||||
}
|
||||
|
||||
\begin{frame}{Runtimes / Vector Benchmarks}
|
||||
\begin{figure}
|
||||
\includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item Real GPUs use multiple memory channels
|
||||
\item Memory barriers
|
||||
\item Also architectural differences
|
||||
}
|
||||
|
||||
\begin{frame}{Runtimes / Matrix Benchmarks}
|
||||
\begin{figure}
|
||||
\includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
|
||||
@@ -399,4 +478,9 @@
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
|
||||
\note[itemize]{
|
||||
\item Data layout in program and address mapping must match
|
||||
}
|
||||
|
||||
\end{document}
|
||||
|
||||
Reference in New Issue
Block a user