Add old notes
This commit is contained in:
84
main.tex
84
main.tex
@@ -1,4 +1,5 @@
|
|||||||
\documentclass[aspectratio=169]{beamer}
|
\documentclass[aspectratio=169]{beamer}
|
||||||
|
% \setbeameroption{show notes on second screen=right}
|
||||||
\usetheme{UniWue}
|
\usetheme{UniWue}
|
||||||
|
|
||||||
\usepackage[style=verbose-ibid]{biblatex}
|
\usepackage[style=verbose-ibid]{biblatex}
|
||||||
@@ -56,6 +57,12 @@
|
|||||||
\end{figure}
|
\end{figure}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
\note[itemize]{
|
||||||
|
\item compute 2x every two years
|
||||||
|
\item energy production 2\% per year
|
||||||
|
\item to meet future compute demands, drastic improvements in energy efficiency
|
||||||
|
}
|
||||||
|
|
||||||
\begin{frame}{Memory Bound Workloads}
|
\begin{frame}{Memory Bound Workloads}
|
||||||
AI applications become increasingly memory-bound\autocite{ivobolsens2023}
|
AI applications become increasingly memory-bound\autocite{ivobolsens2023}
|
||||||
\begin{figure}
|
\begin{figure}
|
||||||
@@ -63,6 +70,14 @@
|
|||||||
\end{figure}
|
\end{figure}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
\note[itemize]{
|
||||||
|
\item Emerging AI applications become increasingly memory-bound
|
||||||
|
\item Roofline model
|
||||||
|
\item Not limited by compute power but by memory
|
||||||
|
\item researchers begin to consider PIM to circumvent memory bottleneck
|
||||||
|
\item (drastically more parameters in GPT-3, operational intensity goes down)
|
||||||
|
}
|
||||||
|
|
||||||
\section{Processing-in-Memory}
|
\section{Processing-in-Memory}
|
||||||
|
|
||||||
\begin{frame}{Workloads for PIM}
|
\begin{frame}{Workloads for PIM}
|
||||||
@@ -107,6 +122,10 @@
|
|||||||
\end{columns}
|
\end{columns}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
\note{
|
||||||
|
To summarize...
|
||||||
|
}
|
||||||
|
|
||||||
\begin{frame}{PIM Architectures}
|
\begin{frame}{PIM Architectures}
|
||||||
\begin{columns}[T]
|
\begin{columns}[T]
|
||||||
\begin{column}{0.4\textwidth}
|
\begin{column}{0.4\textwidth}
|
||||||
@@ -128,6 +147,14 @@
|
|||||||
\visible<6>{\begin{block}{Remark}The nearer the computation is to the memory cells, the higher the achievable bandwidth!\end{block}}
|
\visible<6>{\begin{block}{Remark}The nearer the computation is to the memory cells, the higher the achievable bandwidth!\end{block}}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
\note[itemize]{
|
||||||
|
\item Architecture space of PIM:
|
||||||
|
\item Inside the memory SA - simple bulk logic
|
||||||
|
\item Near SA in PSA output region - logic gates in the region
|
||||||
|
\item Near a bank in its peripheral region - computation units with control
|
||||||
|
\item I/O region of memory - limited by memory bus
|
||||||
|
}
|
||||||
|
|
||||||
\section{Samsung HBM-PIM/FIMDRAM}
|
\section{Samsung HBM-PIM/FIMDRAM}
|
||||||
|
|
||||||
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
|
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
|
||||||
@@ -140,6 +167,12 @@
|
|||||||
\end{figure}
|
\end{figure}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
\note[itemize]{
|
||||||
|
\item One PIM unit shared by two banks
|
||||||
|
\item SIMD FPUs are 16-wide
|
||||||
|
\item All-Bank mode: All PIM units operate in parallel
|
||||||
|
}
|
||||||
|
|
||||||
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
|
\begin{frame}{Samsung HBM-PIM/FIMDRAM\autocite{lee2021}}
|
||||||
\begin{columns}
|
\begin{columns}
|
||||||
\begin{column}{0.4\textwidth}
|
\begin{column}{0.4\textwidth}
|
||||||
@@ -163,6 +196,14 @@
|
|||||||
\end{columns}
|
\end{columns}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
\note[itemize]{
|
||||||
|
\item Two SIMD FPUs (ADD,MUL)
|
||||||
|
\item CRF: 32 instructions, stores the program
|
||||||
|
\item GRF: 16 entries, one memory fetch
|
||||||
|
\item SRF: 16 entries
|
||||||
|
\item Control unit executes one instruction when RD or WR command is issued
|
||||||
|
}
|
||||||
|
|
||||||
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation}
|
\begin{frame}{HBM-PIM/FIMDRAM GEMV Operation}
|
||||||
\begin{figure}
|
\begin{figure}
|
||||||
\only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}}
|
\only<1>{\includesvg[width=0.8\textwidth]{images/gemv_normal}}
|
||||||
@@ -181,6 +222,13 @@
|
|||||||
\end{figure}
|
\end{figure}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
\note[itemize]{
|
||||||
|
\item Procedure of GEMV operation
|
||||||
|
\item multiple cycles
|
||||||
|
\item each PIM unit operates on one matrix row
|
||||||
|
\item partial sum, reduced by host
|
||||||
|
}
|
||||||
|
|
||||||
\begin{frame}{HBM-PIM/FIMDRAM}
|
\begin{frame}{HBM-PIM/FIMDRAM}
|
||||||
\begin{huge}
|
\begin{huge}
|
||||||
How fast is it?
|
How fast is it?
|
||||||
@@ -206,6 +254,11 @@
|
|||||||
\end{figure}
|
\end{figure}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
\note[itemize]{
|
||||||
|
\item VP interprets the programmed microkernel
|
||||||
|
\item not yet drop-in replacement
|
||||||
|
}
|
||||||
|
|
||||||
\begin{frame}{Software Library}
|
\begin{frame}{Software Library}
|
||||||
Software support library
|
Software support library
|
||||||
\begin{columns}
|
\begin{columns}
|
||||||
@@ -272,6 +325,10 @@
|
|||||||
\end{columns}
|
\end{columns}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
\note[itemize]{
|
||||||
|
\item bare metal offers most control
|
||||||
|
}
|
||||||
|
|
||||||
\section{Simulations}
|
\section{Simulations}
|
||||||
|
|
||||||
\begin{frame}{Microbenchmarks}
|
\begin{frame}{Microbenchmarks}
|
||||||
@@ -317,6 +374,10 @@
|
|||||||
\end{columns}
|
\end{columns}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
\note[itemize]{
|
||||||
|
\item operand data significantly larger than on-chip cache
|
||||||
|
}
|
||||||
|
|
||||||
\begin{frame}{System Configuration}
|
\begin{frame}{System Configuration}
|
||||||
\begin{columns}[t]
|
\begin{columns}[t]
|
||||||
\begin{column}{0.5\textwidth}
|
\begin{column}{0.5\textwidth}
|
||||||
@@ -353,18 +414,36 @@
|
|||||||
\end{figure}
|
\end{figure}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
\note[itemize]{
|
||||||
|
\item VADD: 12.7x
|
||||||
|
\item GEMV: 9.0x
|
||||||
|
}
|
||||||
|
|
||||||
\begin{frame}{Speedups / Samsung}
|
\begin{frame}{Speedups / Samsung}
|
||||||
\begin{figure}
|
\begin{figure}
|
||||||
\includesvg[width=0.8\textwidth]{images/samsung.svg}
|
\includesvg[width=0.8\textwidth]{images/samsung.svg}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
\note[itemize]{
|
||||||
|
\item GEMV matches well
|
||||||
|
\item ADD shows deviation
|
||||||
|
\item -> differences in hardware architecture
|
||||||
|
\item GPU has no speculative execution
|
||||||
|
}
|
||||||
|
|
||||||
\begin{frame}{Runtimes / Vector Benchmarks}
|
\begin{frame}{Runtimes / Vector Benchmarks}
|
||||||
\begin{figure}
|
\begin{figure}
|
||||||
\includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
|
\includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
\note[itemize]{
|
||||||
|
\item Real GPUs use multiple memory channels
|
||||||
|
\item Memory barriers
|
||||||
|
\item Also architectural differences
|
||||||
|
}
|
||||||
|
|
||||||
\begin{frame}{Runtimes / Matrix Benchmarks}
|
\begin{frame}{Runtimes / Matrix Benchmarks}
|
||||||
\begin{figure}
|
\begin{figure}
|
||||||
\includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
|
\includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
|
||||||
@@ -399,4 +478,9 @@
|
|||||||
\end{figure}
|
\end{figure}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
|
||||||
|
\note[itemize]{
|
||||||
|
\item Data layout in program and address mapping must match
|
||||||
|
}
|
||||||
|
|
||||||
\end{document}
|
\end{document}
|
||||||
|
|||||||
Reference in New Issue
Block a user