Add old notes

This commit is contained in:
2024-09-17 16:39:30 +02:00
parent a4e4b3255b
commit 3f548705db
2 changed files with 84 additions and 0 deletions

BIN
main.pdf

Binary file not shown.

View File

@@ -1,4 +1,5 @@
\documentclass[aspectratio=169]{beamer} \documentclass[aspectratio=169]{beamer}
% \setbeameroption{show notes on second screen=right}
\usetheme{UniWue} \usetheme{UniWue}
\usepackage[style=verbose-ibid]{biblatex} \usepackage[style=verbose-ibid]{biblatex}
@@ -56,6 +57,12 @@
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item compute 2x every two years
\item energy production grows only 2\% per year
\item to meet future compute demands, drastic improvements in energy efficiency are needed
}
\begin{frame}{Memory Bound Workloads} \begin{frame}{Memory Bound Workloads}
AI applications become increasingly memory-bound\autocite{ivobolsens2023} AI applications become increasingly memory-bound\autocite{ivobolsens2023}
\begin{figure} \begin{figure}
@@ -63,6 +70,14 @@
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item Emerging AI applications become increasingly memory-bound
\item Roofline model
\item Not limited by compute power but by memory
\item researchers begin to consider PIM to circumvent memory bottleneck
\item (drastically more parameters in GPT-3, operational intensity goes down)
}
\section{Processing-in-Memory} \section{Processing-in-Memory}
\begin{frame}{Workloads for PIM} \begin{frame}{Workloads for PIM}
@@ -107,6 +122,10 @@
\end{columns} \end{columns}
\end{frame} \end{frame}
\note{
To summarize...
}
\begin{frame}{PIM Architectures} \begin{frame}{PIM Architectures}
\begin{columns}[T] \begin{columns}[T]
\begin{column}{0.4\textwidth} \begin{column}{0.4\textwidth}
@@ -128,6 +147,14 @@
\visible<6>{\begin{block}{Remark}The nearer the computation is to the memory cells, the higher the achievable bandwidth!\end{block}} \visible<6>{\begin{block}{Remark}The nearer the computation is to the memory cells, the higher the achievable bandwidth!\end{block}}
\end{frame} \end{frame}
\note[itemize]{
\item Architecture space of PIM:
\item Inside the memory SA - simple bulk logic
\item Near SA in PSA output region - logic gates in the region
\item Near a bank in its peripheral region - computation units with control
\item I/O region of memory - limited by memory bus
}
\section{Samsung HBM-PIM/FIMDRAM} \section{Samsung HBM-PIM/FIMDRAM}
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}} \begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
@@ -140,6 +167,12 @@
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item One PIM unit shared by two banks
\item SIMD FPUs are 16-wide
\item All-Bank mode: All PIM units operate in parallel
}
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}} \begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
\begin{columns} \begin{columns}
\begin{column}{0.4\textwidth} \begin{column}{0.4\textwidth}
@@ -163,6 +196,14 @@
\end{columns} \end{columns}
\end{frame} \end{frame}
\note[itemize]{
\item Two SIMD FPUs (ADD,MUL)
\item CRF: 32 instructions, stores the program
\item GRF: 16 entries, one memory fetch
\item SRF: 16 entries
\item Control unit executes one instruction when RD or WR command is issued
}
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation} \begin{frame}{HBM-PIM/FIMDRAM GEMV Operation}
\begin{figure} \begin{figure}
\only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}} \only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}}
@@ -181,6 +222,13 @@
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item Procedure of GEMV operation
\item multiple cycles
\item each PIM unit operates on one matrix row
\item partial sum, reduced by host
}
\begin{frame}{HBM-PIM/FIMDRAM} \begin{frame}{HBM-PIM/FIMDRAM}
\begin{huge} \begin{huge}
How fast is it? How fast is it?
@@ -206,6 +254,11 @@
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item VP interprets the programmed microkernel
\item not yet a drop-in replacement
}
\begin{frame}{Software Library} \begin{frame}{Software Library}
Software support library Software support library
\begin{columns} \begin{columns}
@@ -272,6 +325,10 @@
\end{columns} \end{columns}
\end{frame} \end{frame}
\note[itemize]{
\item bare metal offers the most control
}
\section{Simulations} \section{Simulations}
\begin{frame}{Microbenchmarks} \begin{frame}{Microbenchmarks}
@@ -317,6 +374,10 @@
\end{columns} \end{columns}
\end{frame} \end{frame}
\note[itemize]{
\item operand data significantly larger than on-chip cache
}
\begin{frame}{System Configuration} \begin{frame}{System Configuration}
\begin{columns}[t] \begin{columns}[t]
\begin{column}{0.5\textwidth} \begin{column}{0.5\textwidth}
@@ -353,18 +414,36 @@
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item VADD: 12.7x
\item GEMV: 9.0x
}
\begin{frame}{Speedups / Samsung} \begin{frame}{Speedups / Samsung}
\begin{figure} \begin{figure}
\includesvg[width=0.8\textwidth]{images/samsung.svg} \includesvg[width=0.8\textwidth]{images/samsung.svg}
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item GEMV matches well
\item ADD shows deviation
\item -> due to differences in hardware architecture
\item GPU has no speculative execution
}
\begin{frame}{Runtimes / Vector Benchmarks} \begin{frame}{Runtimes / Vector Benchmarks}
\begin{figure} \begin{figure}
\includesvg[width=0.8\textwidth]{images/runtimes_vector.svg} \includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item Real GPUs use multiple memory channels
\item Memory barriers
\item Also architectural differences
}
\begin{frame}{Runtimes / Matrix Benchmarks} \begin{frame}{Runtimes / Matrix Benchmarks}
\begin{figure} \begin{figure}
\includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg} \includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
@@ -399,4 +478,9 @@
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item Data layout in program and address mapping must match
}
\end{document} \end{document}