Add old notes

This commit is contained in:
2024-09-17 16:39:30 +02:00
parent a4e4b3255b
commit 3f548705db
2 changed files with 84 additions and 0 deletions

BIN
main.pdf

Binary file not shown.

View File

@@ -1,4 +1,5 @@
\documentclass[aspectratio=169]{beamer} \documentclass[aspectratio=169]{beamer}
% \setbeameroption{show notes on second screen=right}
\usetheme{UniWue} \usetheme{UniWue}
\usepackage[style=verbose-ibid]{biblatex} \usepackage[style=verbose-ibid]{biblatex}
@@ -56,6 +57,12 @@
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item compute 2x every two years
\item energy production grows only 2\% per year
\item to meet future compute demands, drastic improvements in energy efficiency are needed
}
\begin{frame}{Memory Bound Workloads} \begin{frame}{Memory Bound Workloads}
AI applications become increasingly memory-bound\autocite{ivobolsens2023} AI applications become increasingly memory-bound\autocite{ivobolsens2023}
\begin{figure} \begin{figure}
@@ -63,6 +70,14 @@
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item Emerging AI applications become increasingly memory-bound
\item Roofline model
\item Not limited by compute power but by memory
\item researchers begin to consider PIM to circumvent memory bottleneck
\item (drastically more parameters in GPT-3, operational intensity goes down)
}
\section{Processing-in-Memory} \section{Processing-in-Memory}
\begin{frame}{Workloads for PIM} \begin{frame}{Workloads for PIM}
@@ -107,6 +122,10 @@
\end{columns} \end{columns}
\end{frame} \end{frame}
\note{
To summarize...
}
\begin{frame}{PIM Architectures} \begin{frame}{PIM Architectures}
\begin{columns}[T] \begin{columns}[T]
\begin{column}{0.4\textwidth} \begin{column}{0.4\textwidth}
@@ -128,6 +147,14 @@
\visible<6>{\begin{block}{Remark}The nearer the computation is to the memory cells, the higher the achievable bandwidth!\end{block}} \visible<6>{\begin{block}{Remark}The nearer the computation is to the memory cells, the higher the achievable bandwidth!\end{block}}
\end{frame} \end{frame}
\note[itemize]{
\item Architecture space of PIM:
\item Inside the memory SA - simple bulk logic
\item Near SA in PSA output region - logic gates in the region
\item Near a bank in its peripheral region - computation units with control
\item I/O region of memory - limited by memory bus
}
\section{Samsung HBM-PIM/FIMDRAM} \section{Samsung HBM-PIM/FIMDRAM}
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}} \begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
@@ -140,6 +167,12 @@
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item One PIM unit shared by two banks
\item SIMD FPUs are 16-wide
\item All-Bank mode: All PIM units operate in parallel
}
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}} \begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
\begin{columns} \begin{columns}
\begin{column}{0.4\textwidth} \begin{column}{0.4\textwidth}
@@ -163,6 +196,14 @@
\end{columns} \end{columns}
\end{frame} \end{frame}
\note[itemize]{
\item Two SIMD FPUs (ADD,MUL)
\item CRF: 32 instructions, stores the program
\item GRF: 16 entries, one memory fetch
\item SRF: 16 entries
\item Control unit executes one instruction when RD or WR command is issued
}
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation} \begin{frame}{HBM-PIM/FIMDRAM GEMV Operation}
\begin{figure} \begin{figure}
\only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}} \only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}}
@@ -181,6 +222,13 @@
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item Procedure of GEMV operation
\item multiple cycles
\item each PIM unit operates on one matrix row
\item partial sum, reduced by host
}
\begin{frame}{HBM-PIM/FIMDRAM} \begin{frame}{HBM-PIM/FIMDRAM}
\begin{huge} \begin{huge}
How fast is it? How fast is it?
@@ -206,6 +254,11 @@
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item VP interprets the programmed microkernel
\item not yet a drop-in replacement
}
\begin{frame}{Software Library} \begin{frame}{Software Library}
Software support library Software support library
\begin{columns} \begin{columns}
@@ -272,6 +325,10 @@
\end{columns} \end{columns}
\end{frame} \end{frame}
\note[itemize]{
\item bare metal offers the most control
}
\section{Simulations} \section{Simulations}
\begin{frame}{Microbenchmarks} \begin{frame}{Microbenchmarks}
@@ -317,6 +374,10 @@
\end{columns} \end{columns}
\end{frame} \end{frame}
\note[itemize]{
\item operand data significantly larger than on-chip cache
}
\begin{frame}{System Configuration} \begin{frame}{System Configuration}
\begin{columns}[t] \begin{columns}[t]
\begin{column}{0.5\textwidth} \begin{column}{0.5\textwidth}
@@ -353,18 +414,36 @@
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item VADD: 12.7x
\item GEMV: 9.0x
}
\begin{frame}{Speedups / Samsung} \begin{frame}{Speedups / Samsung}
\begin{figure} \begin{figure}
\includesvg[width=0.8\textwidth]{images/samsung.svg} \includesvg[width=0.8\textwidth]{images/samsung.svg}
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item GEMV matches well
\item ADD shows deviation
\item -> due to differences in hardware architecture
\item GPU has no speculative execution
}
\begin{frame}{Runtimes / Vector Benchmarks} \begin{frame}{Runtimes / Vector Benchmarks}
\begin{figure} \begin{figure}
\includesvg[width=0.8\textwidth]{images/runtimes_vector.svg} \includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item Real GPUs use multiple memory channels
\item Memory barriers
\item Also architectural differences
}
\begin{frame}{Runtimes / Matrix Benchmarks} \begin{frame}{Runtimes / Matrix Benchmarks}
\begin{figure} \begin{figure}
\includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg} \includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
@@ -399,4 +478,9 @@
\end{figure} \end{figure}
\end{frame} \end{frame}
\note[itemize]{
\item Data layout in program and address mapping must match
}
\end{document} \end{document}