Add rest of slides
This commit is contained in:
File diff suppressed because one or more lines are too long
|
Before Width: | Height: | Size: 54 KiB After Width: | Height: | Size: 69 KiB |
31
kernel.rs
Normal file
31
kernel.rs
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
/// Drives one run of the GEMV kernel by issuing a fixed sequence of memory accesses.
///
/// In order, the function:
/// 1. reads every chunk of `input_vector`,
/// 2. issues an asynchronous read for each column block in the first row of
///    every sub-matrix of `matrix`,
/// 3. waits on a `dsb(SY)` barrier,
/// 4. writes the selected rows of `output_partial_sum_vector`,
/// 5. reads `dummy` once.
///
/// NOTE(review): presumably each access triggers one instruction of the PIM
/// microkernel (MOV / MAC / FILL / EXIT), so the statement order must not be
/// changed — confirm against the `PimOperand` implementation.
///
/// # Parameters
/// - `matrix`: sub-matrices whose first-row column blocks are read asynchronously.
/// - `input_vector`: chunks that are read first, one `execute_read` per chunk.
/// - `output_partial_sum_vector`: destination whose selected rows receive one
///   `execute_write` each.
/// - `dummy`: operand read last; per the inline comment this executes EXIT.
pub fn execute<const X16R: usize, const X16C: usize, const R: usize>(
|
||||||
|
matrix: &Matrix<X16R, X16C>,
|
||||||
|
input_vector: &Vector<X16C>,
|
||||||
|
output_partial_sum_vector: &mut SVector<F16x16, R>,
|
||||||
|
dummy: &impl PimOperand,
|
||||||
|
) {
|
||||||
|
// Load input vector into GRF-A registers (one read per chunk of the input vector)
|
||||||
|
for chunk in input_vector.0.iter() {
|
||||||
|
chunk.execute_read();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Execute the MAC instructions without memory barriers:
// only the first row of each sub-matrix is touched, one asynchronous read per column block
|
||||||
|
for sub_matrix in matrix.0.iter() {
|
||||||
|
for column_block in sub_matrix.fixed_rows::<1>(0).iter() {
|
||||||
|
column_block.execute_read_async();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait until all outstanding memory accesses have finished before writing results
// NOTE(review): `dsb(SY)` is presumably the ARM full-system data synchronization barrier — confirm
|
||||||
|
barrier::dsb(barrier::SY);
|
||||||
|
|
||||||
|
// Copy the partial sums into the bank: one write per selected row
// (X16R rows starting at row 0, step argument 16)
// NOTE(review): assumes nalgebra semantics for `fixed_rows_with_step_mut` — confirm what the step counts
|
||||||
|
for chunk in output_partial_sum_vector
|
||||||
|
.fixed_rows_with_step_mut::<X16R>(0, 16)
|
||||||
|
.iter_mut() {
|
||||||
|
chunk.execute_write();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Execute the EXIT instruction by reading the dummy operand
|
||||||
|
dummy.execute_read();
|
||||||
|
}
|
||||||
12
kernel.s
Normal file
12
kernel.s
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
MOV GRF_A #0, BANK
|
||||||
|
MOV GRF_A #1, BANK
|
||||||
|
MOV GRF_A #2, BANK
|
||||||
|
MOV GRF_A #3, BANK
|
||||||
|
MOV GRF_A #4, BANK
|
||||||
|
MOV GRF_A #5, BANK
|
||||||
|
MOV GRF_A #6, BANK
|
||||||
|
MOV GRF_A #7, BANK
|
||||||
|
MAC(AAM) GRF_B, BANK, GRF_A
|
||||||
|
JUMP -1, 7
|
||||||
|
FILL BANK, GRF_B #0
|
||||||
|
EXIT
|
||||||
155
main.tex
155
main.tex
@@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
\usepackage[style=verbose-ibid]{biblatex}
|
\usepackage[style=verbose-ibid]{biblatex}
|
||||||
\usepackage{datetime}
|
\usepackage{datetime}
|
||||||
|
\usepackage{tabularray}
|
||||||
\usepackage[inkscapeversion=1]{svg}
|
\usepackage[inkscapeversion=1]{svg}
|
||||||
|
|
||||||
\addbibresource{references.bib}
|
\addbibresource{references.bib}
|
||||||
@@ -225,8 +226,162 @@
|
|||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
\begin{frame}[fragile]{Example: GEMV Kernel}
|
\begin{frame}[fragile]{Example: GEMV Kernel}
|
||||||
|
\begin{center}
|
||||||
|
\only<1>{
|
||||||
|
\inputminted[firstline=1,lastline=8, fontsize=\footnotesize]{gas}{kernel.s}
|
||||||
|
\hrule
|
||||||
|
\inputminted[firstline=7,lastline=10,fontsize=\footnotesize]{rust}{kernel.rs}
|
||||||
|
}
|
||||||
|
\only<2>{
|
||||||
|
\inputminted[firstline=9,lastline=10, fontsize=\footnotesize]{gas}{kernel.s}
|
||||||
|
\hrule
|
||||||
|
\inputminted[firstline=12,lastline=20,fontsize=\footnotesize]{rust}{kernel.rs}
|
||||||
|
}
|
||||||
|
\only<3>{
|
||||||
|
\inputminted[firstline=11,lastline=12,fontsize=\footnotesize]{gas}{kernel.s}
|
||||||
|
\hrule
|
||||||
|
\inputminted[firstline=22,lastline=30,fontsize=\footnotesize]{rust}{kernel.rs}
|
||||||
|
}
|
||||||
|
\end{center}
|
||||||
\end{frame}
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}{Virtual Prototype Platform}
|
||||||
|
\begin{columns}
|
||||||
|
\begin{column}{0.5\textwidth}
|
||||||
|
\begin{itemize}
|
||||||
|
\item ARM processor model
|
||||||
|
\item Bare-metal kernel
|
||||||
|
\item Custom page table configuration
|
||||||
|
\begin{itemize}
|
||||||
|
\item Non-PIM DRAM region mapped as cacheable memory
|
||||||
|
\item PIM DRAM region mapped as non-cacheable memory
|
||||||
|
\end{itemize}
|
||||||
|
\end{itemize}
|
||||||
|
\end{column}
|
||||||
|
\begin{column}{0.5\textwidth}
|
||||||
|
\begin{figure}
|
||||||
|
\includesvg[width=0.8\textwidth]{images/bare_metal.svg}
|
||||||
|
\end{figure}
|
||||||
|
\end{column}
|
||||||
|
\end{columns}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\section{Simulations}
|
||||||
|
|
||||||
|
\begin{frame}{Microbenchmarks}
|
||||||
|
\begin{columns}
|
||||||
|
\begin{column}{0.5\textwidth}
|
||||||
|
\begin{itemize}
|
||||||
|
\item Vector benchmarks (BLAS level 1)
|
||||||
|
\item VADD: $z = x + y$
|
||||||
|
\item VMUL: $z = x \cdot y$
|
||||||
|
\item HAXPY: $z = a \cdot x + y$
|
||||||
|
|
||||||
|
\item Matrix-vector benchmarks (BLAS level 2)
|
||||||
|
\begin{itemize}
|
||||||
|
\item GEMV: $z = A \cdot x$
|
||||||
|
\item Simple DNN:
|
||||||
|
\begin{itemize}
|
||||||
|
\item $f(x) = z = \mathrm{ReLU}(A \cdot x)$
|
||||||
|
\item $z_{n+1} = f(z_n)$
|
||||||
|
\item 5 layers in total
|
||||||
|
\end{itemize}
|
||||||
|
\end{itemize}
|
||||||
|
\end{itemize}
|
||||||
|
\end{column}
|
||||||
|
\begin{column}{0.5\textwidth}
|
||||||
|
\begin{figure}
|
||||||
|
\begin{tblr}{
|
||||||
|
hlines,
|
||||||
|
column{1} = {c},
|
||||||
|
column{2} = {r},
|
||||||
|
column{3} = {r},
|
||||||
|
column{4} = {r},
|
||||||
|
row{1} = {l},
|
||||||
|
hline{2} = {2}{-}{solid,black},
|
||||||
|
}
|
||||||
|
Level & Vector & GEMV & DNN \\
|
||||||
|
X1 & 2M & (1k $\times$ 4k) & (256 $\times$ 256) \\
|
||||||
|
X2 & 4M & (2k $\times$ 4k) & (512 $\times$ 512) \\
|
||||||
|
X3 & 8M & (4k $\times$ 8k) & (1k $\times$ 1k) \\
|
||||||
|
X4 & 16M & (8k $\times$ 8k) & (2k $\times$ 2k)
|
||||||
|
\end{tblr}
|
||||||
|
\end{figure}
|
||||||
|
\end{column}
|
||||||
|
\end{columns}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}{System Configuration}
|
||||||
|
\begin{columns}[t]
|
||||||
|
\begin{column}{0.5\textwidth}
|
||||||
|
Two simulated systems:
|
||||||
|
\begin{itemize}
|
||||||
|
\item Generic ARM system
|
||||||
|
\item Infinite compute system
|
||||||
|
\begin{itemize}
|
||||||
|
\item Unrealistically high frequency of 100 GHz
|
||||||
|
\item Completely memory bound
|
||||||
|
\item Lower bound on the achievable speedup
|
||||||
|
\end{itemize}
|
||||||
|
\end{itemize}
|
||||||
|
\end{column}
|
||||||
|
\begin{column}{0.5\textwidth}
|
||||||
|
Two real GPUs using HBM2:
|
||||||
|
\begin{itemize}
|
||||||
|
\item AMD RX Vega 56
|
||||||
|
\item NVIDIA Tesla V100
|
||||||
|
\end{itemize}
|
||||||
|
\end{column}
|
||||||
|
\end{columns}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}{Speedups / Generic ARM System}
|
||||||
|
\begin{figure}
|
||||||
|
\includesvg[width=0.8\textwidth]{images/speedup_normal.svg}
|
||||||
|
\end{figure}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}{Speedups / Infinite Compute System}
|
||||||
|
\begin{figure}
|
||||||
|
\includesvg[width=0.8\textwidth]{images/speedup_inf.svg}
|
||||||
|
\end{figure}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}{Speedups / Samsung}
|
||||||
|
\begin{figure}
|
||||||
|
\includesvg[width=0.8\textwidth]{images/samsung.svg}
|
||||||
|
\end{figure}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}{Runtimes / Vector Benchmarks}
|
||||||
|
\begin{figure}
|
||||||
|
\includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
|
||||||
|
\end{figure}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}{Runtimes / Matrix Benchmarks}
|
||||||
|
\begin{figure}
|
||||||
|
\includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
|
||||||
|
\end{figure}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}{Conclusion and Future Work}
|
||||||
|
\begin{itemize}
|
||||||
|
\item PIM can accelerate memory-bound workloads
|
||||||
|
\item Special PIM-friendly memory layouts are required
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
Future work:
|
||||||
|
\begin{itemize}
|
||||||
|
\item Implementation of Linux driver
|
||||||
|
\item Comparison with real neural networks
|
||||||
|
\item Consider replacing library approach with compiler approach
|
||||||
|
\item Implement a power model to analyze the power efficiency gains
|
||||||
|
\end{itemize}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\section{Thank you for your attention!}
|
||||||
|
|
||||||
\begin{frame}
|
\begin{frame}
|
||||||
\frametitle{Outline}
|
\frametitle{Outline}
|
||||||
\tableofcontents
|
\tableofcontents
|
||||||
|
|||||||
Reference in New Issue
Block a user