From 3f548705db132cbe6e4f2d8b26a56c988506a19a Mon Sep 17 00:00:00 2001 From: Derek Christ Date: Tue, 17 Sep 2024 16:39:30 +0200 Subject: [PATCH] Add old notes --- main.pdf | Bin 2861981 -> 2861981 bytes main.tex | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/main.pdf b/main.pdf index 2d8c9f6134e1a70b4b402b04d3f4aa3520cec508..38b44bb0d6b6241706efeef8b7baeab00056e29f 100644 GIT binary patch delta 199 zcmajVxe3Bx06<~9@B6-^@lK;j{K*d@XcA`-L|jGXZD3*L3RZ%(Q`kF+;sO>AzT#b9 zH_!Fk0S{h$_z4gsM3@LsV#G<1Bt@DGS#snlP^3he3RP;D+c)Ys0$cH}~t|6g|J=ezlE I$J*xj0ddVZ)c^nh delta 199 zcmajVNe;na06^iGYM$q*hMK!+i}sg@MCEq|iHNHp?*@ zubb!k?SKa_KKukw2ofSpgeWoMBuJ7XO@=Hv@)RgiqD+M8)i_@% diff --git a/main.tex b/main.tex index 8734b08..df7180c 100644 --- a/main.tex +++ b/main.tex @@ -1,4 +1,5 @@ \documentclass[aspectratio=169]{beamer} +% \setbeameroption{show notes on second screen=right} \usetheme{UniWue} \usepackage[style=verbose-ibid]{biblatex} @@ -56,6 +57,12 @@ \end{figure} \end{frame} +\note[itemize]{ + \item compute 2x every two years + \item energy production 2\% per year + \item to meet future compute demands, drastic improvements in energy efficiency +} + \begin{frame}{Memory Bound Workloads} AI applications become increasingly memory-bound\autocite{ivobolsens2023} \begin{figure} @@ -63,6 +70,14 @@ \end{figure} \end{frame} +\note[itemize]{ + \item Emerging AI applications become increasingly memory-bound + \item Roofline model + \item Not limited by compute power but by memory + \item researchers begin to consider PIM to circumvent memory bottleneck + \item (drastically more parameters in GPT-3, operational intensity goes down) +} + \section{Processing-in-Memory} \begin{frame}{Workloads for PIM} @@ -107,6 +122,10 @@ \end{columns} \end{frame} +\note{ + To summarize... 
+} + \begin{frame}{PIM Architectures} \begin{columns}[T] \begin{column}{0.4\textwidth} @@ -128,6 +147,14 @@ \visible<6>{\begin{block}{Remark}The nearer the computation is to the memory cells, the higher the achievable bandwidth!\end{block}} \end{frame} +\note[itemize]{ + \item Architecture space of PIM: + \item Inside the memory SA - simple bulk logic + \item Near SA in PSA output region - logic gates in the region + \item Near a bank in its peripheral region - computation units with control + \item I/O region of memory - limited by memory bus +} + \section{Samsung HBM-PIM/FIMDRAM} \begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}} @@ -140,6 +167,12 @@ \end{figure} \end{frame} +\note[itemize]{ + \item One PIM unit shared by two banks + \item SIMD FPUs are 16-wide + \item All-Bank mode: All PIM units operate in parallel +} + \begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}} \begin{columns} \begin{column}{0.4\textwidth} @@ -163,6 +196,14 @@ \end{columns} \end{frame} +\note[itemize]{ + \item Two SIMD FPUs (ADD,MUL) + \item CRF: 32 instructions, stores the program + \item GRF: 16 entries, one memory fetch + \item SRF: 16 entries + \item Control unit executes one instruction when RD or WR command is issued +} + \begin{frame}{HBM-PIM/FIMDRAM GEMV Operation} \begin{figure} \only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}} @@ -181,6 +222,13 @@ \end{figure} \end{frame} +\note[itemize]{ + \item Procedure of GEMV operation + \item multiple cycles + \item each PIM unit operates on one matrix row + \item partial sum, reduced by host +} + \begin{frame}{HBM-PIM/FIMDRAM} \begin{huge} How fast is it? 
@@ -206,6 +254,11 @@ \end{figure} \end{frame} +\note[itemize]{ + \item VP interprets the programmed microkernel + \item not yet drop-in replacement +} + \begin{frame}{Software Library} Software support library \begin{columns} @@ -272,6 +325,10 @@ \end{columns} \end{frame} +\note[itemize]{ + \item bare metal offers most control +} + \section{Simulations} \begin{frame}{Microbenchmarks} @@ -317,6 +374,10 @@ \end{columns} \end{frame} +\note[itemize]{ + \item operand data significantly larger than on-chip cache +} + \begin{frame}{System Configuration} \begin{columns}[t] \begin{column}{0.5\textwidth} @@ -353,18 +414,36 @@ \end{figure} \end{frame} +\note[itemize]{ + \item VADD: 12.7x + \item GEMV: 9.0x +} + \begin{frame}{Speedups / Samsung} \begin{figure} \includesvg[width=0.8\textwidth]{images/samsung.svg} \end{figure} \end{frame} +\note[itemize]{ + \item GEMV matches well + \item ADD shows deviation + \item -> differences in hardware architecture + \item GPU has no speculative execution +} + \begin{frame}{Runtimes / Vector Benchmarks} \begin{figure} \includesvg[width=0.8\textwidth]{images/runtimes_vector.svg} \end{figure} \end{frame} +\note[itemize]{ + \item Real GPUs use multiple memory channels + \item Memory barriers + \item Also architectural differences +} + \begin{frame}{Runtimes / Matrix Benchmarks} \begin{figure} \includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg} @@ -399,4 +478,9 @@ \end{figure} \end{frame} + +\note[itemize]{ + \item Data layout in program and address mapping must match +} + \end{document}