Add rest of slides
This commit is contained in:
File diff suppressed because one or more lines are too long
|
Before Width: | Height: | Size: 54 KiB After Width: | Height: | Size: 69 KiB |
31
kernel.rs
Normal file
31
kernel.rs
Normal file
@@ -0,0 +1,31 @@
|
||||
pub fn execute<const X16R: usize, const X16C: usize, const R: usize>( // issues the kernel's memory accesses in order: load, MAC, barrier, write-back, exit
|
||||
matrix: &Matrix<X16R, X16C>, // row 0 of every element of `matrix.0` is read asynchronously (MAC phase)
|
||||
input_vector: &Vector<X16C>, // every chunk of `input_vector.0` is read first (GRF-A load phase)
|
||||
output_partial_sum_vector: &mut SVector<F16x16, R>, // X16R rows are written; NOTE(review): if this is nalgebra, `step` = rows skipped between picks, so 16 gives a stride of 17 rows - confirm intended
|
||||
dummy: &impl PimOperand, // read once at the very end; that read is what executes EXIT
|
||||
) {
|
||||
// Load input vector into GRF-A registers
|
||||
for chunk in input_vector.0.iter() {
|
||||
chunk.execute_read();
|
||||
}
|
||||
|
||||
// Execute the MAC instructions without memory barriers
|
||||
for sub_matrix in matrix.0.iter() {
|
||||
for column_block in sub_matrix.fixed_rows::<1>(0).iter() {
|
||||
column_block.execute_read_async();
|
||||
}
|
||||
}
|
||||
|
||||
// Wait until all memory accesses have finished
|
||||
barrier::dsb(barrier::SY);
|
||||
|
||||
// Copy the partial sums into the bank
|
||||
for chunk in output_partial_sum_vector
|
||||
.fixed_rows_with_step_mut::<X16R>(0, 16)
|
||||
.iter_mut() {
|
||||
chunk.execute_write();
|
||||
}
|
||||
|
||||
// Execute the EXIT instruction
|
||||
dummy.execute_read();
|
||||
}
|
||||
12
kernel.s
Normal file
12
kernel.s
Normal file
@@ -0,0 +1,12 @@
|
||||
MOV GRF_A #0, BANK
|
||||
MOV GRF_A #1, BANK
|
||||
MOV GRF_A #2, BANK
|
||||
MOV GRF_A #3, BANK
|
||||
MOV GRF_A #4, BANK
|
||||
MOV GRF_A #5, BANK
|
||||
MOV GRF_A #6, BANK
|
||||
MOV GRF_A #7, BANK
|
||||
MAC(AAM) GRF_B, BANK, GRF_A
|
||||
JUMP -1, 7
|
||||
FILL BANK, GRF_B #0
|
||||
EXIT
|
||||
155
main.tex
155
main.tex
@@ -3,6 +3,7 @@
|
||||
|
||||
\usepackage[style=verbose-ibid]{biblatex}
|
||||
\usepackage{datetime}
|
||||
\usepackage{tabularray}
|
||||
\usepackage[inkscapeversion=1]{svg}
|
||||
|
||||
\addbibresource{references.bib}
|
||||
@@ -225,8 +226,162 @@
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}[fragile]{Example: GEMV Kernel}
|
||||
\begin{center}
|
||||
\only<1>{
|
||||
\inputminted[firstline=1,lastline=8, fontsize=\footnotesize]{gas}{kernel.s}
|
||||
\hrule
|
||||
\inputminted[firstline=7,lastline=10,fontsize=\footnotesize]{rust}{kernel.rs}
|
||||
}
|
||||
\only<2>{
|
||||
\inputminted[firstline=9,lastline=10, fontsize=\footnotesize]{gas}{kernel.s}
|
||||
\hrule
|
||||
\inputminted[firstline=12,lastline=20,fontsize=\footnotesize]{rust}{kernel.rs}
|
||||
}
|
||||
\only<3>{
|
||||
\inputminted[firstline=11,lastline=12,fontsize=\footnotesize]{gas}{kernel.s}
|
||||
\hrule
|
||||
\inputminted[firstline=22,lastline=30,fontsize=\footnotesize]{rust}{kernel.rs}
|
||||
}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Virtual Prototype Platform}
|
||||
\begin{columns}
|
||||
\begin{column}{0.5\textwidth}
|
||||
\begin{itemize}
|
||||
\item ARM processor model
|
||||
\item Bare-metal kernel
|
||||
\item Custom page table configuration
|
||||
\begin{itemize}
|
||||
\item Non-PIM DRAM region mapped as cacheable memory
|
||||
\item PIM DRAM region mapped as non-cacheable memory
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
\end{column}
|
||||
\begin{column}{0.5\textwidth}
|
||||
\begin{figure}
|
||||
\includesvg[width=0.8\textwidth]{images/bare_metal.svg}
|
||||
\end{figure}
|
||||
\end{column}
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\section{Simulations}
|
||||
|
||||
\begin{frame}{Microbenchmarks}
|
||||
\begin{columns}
|
||||
\begin{column}{0.5\textwidth}
|
||||
\begin{itemize}
|
||||
\item Vector benchmarks (BLAS level 1)
|
||||
\item VADD: $z = x + y$
|
||||
\item VMUL: $z = x \cdot y$
|
||||
\item HAXPY: $z = a \cdot x + y$
|
||||
|
||||
\item Matrix-vector benchmarks (BLAS level 2)
|
||||
\begin{itemize}
|
||||
\item GEMV: $z = A \cdot x$
|
||||
\item Simple DNN:
|
||||
\begin{itemize}
|
||||
\item $f(x) = z = \mathrm{ReLU}(A \cdot x)$
|
||||
\item $z_{n+1} = f(z_n)$
|
||||
\item 5 layers in total
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
\end{column}
|
||||
\begin{column}{0.5\textwidth}
|
||||
\begin{figure}
|
||||
\begin{tblr}{
|
||||
hlines,
|
||||
column{1} = {c},
|
||||
column{2} = {r},
|
||||
column{3} = {r},
|
||||
column{4} = {r},
|
||||
row{1} = {l},
|
||||
hline{2} = {2}{-}{solid,black},
|
||||
}
|
||||
Level & Vector & GEMV & DNN \\
|
||||
X1 & 2M & (1k $\times$ 4k) & (256 $\times$ 256) \\
|
||||
X2 & 4M & (2k $\times$ 4k) & (512 $\times$ 512) \\
|
||||
X3 & 8M & (4k $\times$ 8k) & (1k $\times$ 1k) \\
|
||||
X4 & 16M & (8k $\times$ 8k) & (2k $\times$ 2k)
|
||||
\end{tblr}
|
||||
\end{figure}
|
||||
\end{column}
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{System Configuration}
|
||||
\begin{columns}[t]
|
||||
\begin{column}{0.5\textwidth}
|
||||
Two simulated systems:
|
||||
\begin{itemize}
|
||||
\item Generic ARM system
|
||||
\item Infinite compute system
|
||||
\begin{itemize}
|
||||
\item Unrealistically high frequency of 100 GHz
|
||||
\item Completely memory bound
|
||||
\item Lower bound of possible speedup
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
\end{column}
|
||||
\begin{column}{0.5\textwidth}
|
||||
Two real GPUs using HBM2:
|
||||
\begin{itemize}
|
||||
\item AMD RX Vega 56
|
||||
\item NVIDIA Tesla V100
|
||||
\end{itemize}
|
||||
\end{column}
|
||||
\end{columns}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Speedups / Generic ARM System}
|
||||
\begin{figure}
|
||||
\includesvg[width=0.8\textwidth]{images/speedup_normal.svg}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Speedups / Infinite Compute System}
|
||||
\begin{figure}
|
||||
\includesvg[width=0.8\textwidth]{images/speedup_inf.svg}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Speedups / Samsung}
|
||||
\begin{figure}
|
||||
\includesvg[width=0.8\textwidth]{images/samsung.svg}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Runtimes / Vector Benchmarks}
|
||||
\begin{figure}
|
||||
\includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Runtimes / Matrix Benchmarks}
|
||||
\begin{figure}
|
||||
\includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}{Conclusion and Future Work}
|
||||
\begin{itemize}
|
||||
\item PIM can accelerate memory-bound workloads
|
||||
\item Special PIM-friendly memory layouts are required
|
||||
\end{itemize}
|
||||
|
||||
Future work:
|
||||
\begin{itemize}
|
||||
\item Implementation of a Linux driver
|
||||
\item Comparison with real neural networks
|
||||
\item Consider replacing the library approach with a compiler approach
|
||||
\item Implement a power model to analyze the power efficiency gains
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\section{Thank you for your attention!}
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{Outline}
|
||||
\tableofcontents
|
||||
|
||||
Reference in New Issue
Block a user