Add old notes
This commit is contained in:
84
main.tex
84
main.tex
@@ -1,4 +1,5 @@
|
||||
\documentclass[aspectratio=169]{beamer}
|
||||
% \setbeameroption{show notes on second screen=right}
|
||||
\usetheme{UniWue}
|
||||
|
||||
\usepackage[style=verbose-ibid]{biblatex}
|
||||
@@ -56,6 +57,12 @@
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item compute 2x every two years
|
||||
\item energy production 2\% per year
|
||||
\item to meet future compute demands, drastic improvements in energy efficiency
|
||||
}
|
||||
|
||||
\begin{frame}{Memory Bound Workloads}
|
||||
AI applications become increasingly memory-bound\autocite{ivobolsens2023}
|
||||
\begin{figure}
|
||||
@@ -63,6 +70,14 @@
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item Emerging AI applications become increasingly memory-bound
|
||||
\item Roofline model
|
||||
\item Not limited by compute power but by memory
|
||||
\item researchers begin to consider PIM to circumvent memory bottleneck
|
||||
\item (drastically more parameters in GPT-3, operational intensity goes down)
|
||||
}
|
||||
|
||||
\section{Processing-in-Memory}
|
||||
|
||||
\begin{frame}{Workloads for PIM}
|
||||
@@ -107,6 +122,10 @@
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\note{
|
||||
To summarize...
|
||||
}
|
||||
|
||||
\begin{frame}{PIM Architectures}
|
||||
\begin{columns}[T]
|
||||
\begin{column}{0.4\textwidth}
|
||||
@@ -128,6 +147,14 @@
|
||||
\visible<6>{\begin{block}{Remark}The nearer the computation is to the memory cells, the higher the achievable bandwidth!\end{block}}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item Architecture space of PIM:
|
||||
\item Inside the memory SA - simple bulk logic
|
||||
\item Near SA in PSA output region - logic gates in the region
|
||||
\item Near a bank in its peripheral region - computation units with control
|
||||
\item I/O region of memory - limited by memory bus
|
||||
}
|
||||
|
||||
\section{Samsung HBM-PIM/FIMDRAM}
|
||||
|
||||
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
|
||||
@@ -140,6 +167,12 @@
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item One PIM unit shared by two banks
|
||||
\item SIMD FPUs are 16-wide
|
||||
\item All-Bank mode: All PIM units operate in parallel
|
||||
}
|
||||
|
||||
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
|
||||
\begin{columns}
|
||||
\begin{column}{0.4\textwidth}
|
||||
@@ -163,6 +196,14 @@
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item Two SIMD FPUs (ADD,MUL)
|
||||
\item CRF: 32 instructions, stores the program
|
||||
\item GRF: 16 entries, one memory fetch
|
||||
\item SRF: 16 entries
|
||||
\item Control unit executes one instruction when an RD or WR command is issued
|
||||
}
|
||||
|
||||
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation}
|
||||
\begin{figure}
|
||||
\only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}}
|
||||
@@ -181,6 +222,13 @@
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item Procedure of GEMV operation
|
||||
\item multiple cycles
|
||||
\item each PIM unit operates on one matrix row
|
||||
\item partial sum, reduced by host
|
||||
}
|
||||
|
||||
\begin{frame}{HBM-PIM/FIMDRAM}
|
||||
\begin{huge}
|
||||
How fast is it?
|
||||
@@ -206,6 +254,11 @@
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item VP interprets the programmed microkernel
|
||||
\item not yet a drop-in replacement
|
||||
}
|
||||
|
||||
\begin{frame}{Software Library}
|
||||
Software support library
|
||||
\begin{columns}
|
||||
@@ -272,6 +325,10 @@
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item bare metal offers most control
|
||||
}
|
||||
|
||||
\section{Simulations}
|
||||
|
||||
\begin{frame}{Microbenchmarks}
|
||||
@@ -317,6 +374,10 @@
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item operand data significantly larger than on-chip cache
|
||||
}
|
||||
|
||||
\begin{frame}{System Configuration}
|
||||
\begin{columns}[t]
|
||||
\begin{column}{0.5\textwidth}
|
||||
@@ -353,18 +414,36 @@
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item VADD: 12.7x
|
||||
\item GEMV: 9.0x
|
||||
}
|
||||
|
||||
\begin{frame}{Speedups / Samsung}
|
||||
\begin{figure}
|
||||
\includesvg[width=0.8\textwidth]{images/samsung.svg}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item GEMV matches well
|
||||
\item ADD shows deviation
|
||||
\item $\rightarrow$ differences in hardware architecture
|
||||
\item GPU has no speculative execution
|
||||
}
|
||||
|
||||
\begin{frame}{Runtimes / Vector Benchmarks}
|
||||
\begin{figure}
|
||||
\includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\note[itemize]{
|
||||
\item Real GPUs use multiple memory channels
|
||||
\item Memory barriers
|
||||
\item Also architectural differences
|
||||
}
|
||||
|
||||
\begin{frame}{Runtimes / Matrix Benchmarks}
|
||||
\begin{figure}
|
||||
\includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
|
||||
@@ -399,4 +478,9 @@
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
|
||||
\note[itemize]{
|
||||
\item Data layout in program and address mapping must match
|
||||
}
|
||||
|
||||
\end{document}
|
||||
|
||||
Reference in New Issue
Block a user