diff --git a/images/bare_metal.svg b/images/bare_metal.svg
index 23b03be..742724d 100644
--- a/images/bare_metal.svg
+++ b/images/bare_metal.svg
@@ -8,7 +8,7 @@
version="1.1"
id="svg1"
xml:space="preserve"
- inkscape:version="1.3.2 (091e20ef0f, 2023-11-25, custom)"
+ inkscape:version="1.3.2 (091e20ef0f, 2023-11-25)"
sodipodi:docname="bare_metal.svg"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
@@ -25,14 +25,14 @@
inkscape:deskcolor="#d1d1d1"
inkscape:document-units="mm"
inkscape:zoom="1.6997475"
- inkscape:cx="169.14277"
- inkscape:cy="38.829297"
- inkscape:window-width="2194"
- inkscape:window-height="1158"
+ inkscape:cx="168.84861"
+ inkscape:cy="37.946813"
+ inkscape:window-width="1536"
+ inkscape:window-height="932"
inkscape:window-x="0"
inkscape:window-y="0"
inkscape:window-maximized="1"
- inkscape:current-layer="layer1" />Bare-MetalKernelARM Processor Model
+ clip-path="url(#clipPath4052)" />
diff --git a/kernel.rs b/kernel.rs
new file mode 100644
index 0000000..80a52ba
--- /dev/null
+++ b/kernel.rs
@@ -0,0 +1,31 @@
+pub fn execute(
+ matrix: &Matrix,
+ input_vector: &Vector,
+ output_partial_sum_vector: &mut SVector,
+ dummy: &impl PimOperand,
+) {
+ // Load the input vector into the GRF-A registers (one read per chunk)
+ for chunk in input_vector.0.iter() {
+ chunk.execute_read();
+ }
+
+ // Issue the MAC instructions asynchronously, without memory barriers
+ for sub_matrix in matrix.0.iter() {
+ for column_block in sub_matrix.fixed_rows::<1>(0).iter() {
+ column_block.execute_read_async();
+ }
+ }
+
+ // Wait until all outstanding memory accesses have completed
+ barrier::dsb(barrier::SY);
+
+ // Write the partial sums back into the bank
+ for chunk in output_partial_sum_vector
+ .fixed_rows_with_step_mut::(0, 16) // NOTE(review): turbofish has no generic argument — confirm
+ .iter_mut() {
+ chunk.execute_write();
+ }
+
+ // Execute the EXIT instruction by reading the dummy operand
+ dummy.execute_read();
+}
diff --git a/kernel.s b/kernel.s
new file mode 100644
index 0000000..473805d
--- /dev/null
+++ b/kernel.s
@@ -0,0 +1,12 @@
+MOV GRF_A #0, BANK
+MOV GRF_A #1, BANK
+MOV GRF_A #2, BANK
+MOV GRF_A #3, BANK
+MOV GRF_A #4, BANK
+MOV GRF_A #5, BANK
+MOV GRF_A #6, BANK
+MOV GRF_A #7, BANK
+MAC(AAM) GRF_B, BANK, GRF_A
+JUMP -1, 7
+FILL BANK, GRF_B #0
+EXIT
diff --git a/main.tex b/main.tex
index 87c7008..9cf1ec7 100644
--- a/main.tex
+++ b/main.tex
@@ -3,6 +3,7 @@
\usepackage[style=verbose-ibid]{biblatex}
\usepackage{datetime}
+\usepackage{tabularray}
\usepackage[inkscapeversion=1]{svg}
\addbibresource{references.bib}
@@ -225,8 +226,162 @@
\end{frame}
\begin{frame}[fragile]{Example: GEMV Kernel}
+ \begin{center}
+ \only<1>{
+ \inputminted[firstline=1,lastline=8, fontsize=\footnotesize]{gas}{kernel.s}
+ \hrule
+ \inputminted[firstline=7,lastline=10,fontsize=\footnotesize]{rust}{kernel.rs}
+ }
+ \only<2>{
+ \inputminted[firstline=9,lastline=10, fontsize=\footnotesize]{gas}{kernel.s}
+ \hrule
+ \inputminted[firstline=12,lastline=20,fontsize=\footnotesize]{rust}{kernel.rs}
+ }
+ \only<3>{
+ \inputminted[firstline=11,lastline=12,fontsize=\footnotesize]{gas}{kernel.s}
+ \hrule
+ \inputminted[firstline=22,lastline=30,fontsize=\footnotesize]{rust}{kernel.rs}
+ }
+ \end{center}
\end{frame}
+\begin{frame}{Virtual Prototype Platform}
+ \begin{columns}
+ \begin{column}{0.5\textwidth}
+ \begin{itemize}
+ \item ARM processor model
+ \item Bare-metal kernel
+ \item Custom page table configuration
+ \begin{itemize}
+ \item Non-PIM DRAM region mapped as cacheable memory
+ \item PIM DRAM region mapped as non-cacheable memory
+ \end{itemize}
+ \end{itemize}
+ \end{column}
+ \begin{column}{0.5\textwidth}
+ \begin{figure}
+ \includesvg[width=0.8\textwidth]{images/bare_metal.svg}
+ \end{figure}
+ \end{column}
+ \end{columns}
+\end{frame}
+
+\section{Simulations}
+
+\begin{frame}{Microbenchmarks}
+ \begin{columns}
+ \begin{column}{0.5\textwidth}
+ \begin{itemize}
+ \item Vector benchmarks (BLAS level 1)
+ \begin{itemize}
+ \item VADD: $z = x + y$
+ \item VMUL: $z = x \cdot y$
+ \item HAXPY: $z = a \cdot x + y$
+ \end{itemize}
+
+ \item Vector-Matrix benchmarks (BLAS level 2)
+ \begin{itemize}
+ \item GEMV: $z = A \cdot x$
+ \item Simple DNN:
+ \begin{itemize}
+ \item $f(x) = z = \mathrm{ReLU}(A \cdot x)$
+ \item $z_{n+1} = f(z_n)$
+ \item 5 layers in total
+ \end{itemize}
+ \end{itemize}
+ \end{itemize}
+ \end{column}
+ \begin{column}{0.5\textwidth}
+ \begin{figure}
+ \begin{tblr}{
+ hlines,
+ column{1} = {c},
+ column{2} = {r},
+ column{3} = {r},
+ column{4} = {r},
+ row{1} = {l},
+ hline{2} = {2}{-}{solid,black},
+ }
+ Level & Vector & GEMV & DNN \\
+ X1 & 2M & (1k $\times$ 4k) & (256 $\times$ 256) \\
+ X2 & 4M & (2k $\times$ 4k) & (512 $\times$ 512) \\
+ X3 & 8M & (4k $\times$ 8k) & (1k $\times$ 1k) \\
+ X4 & 16M & (8k $\times$ 8k) & (2k $\times$ 2k)
+ \end{tblr}
+ \end{figure}
+ \end{column}
+ \end{columns}
+\end{frame}
+
+\begin{frame}{System Configuration}
+ \begin{columns}[t]
+ \begin{column}{0.5\textwidth}
+ Two simulated systems:
+ \begin{itemize}
+ \item Generic ARM system
+ \item Infinite compute system
+ \begin{itemize}
+ \item Unrealistically high frequency of 100 GHz
+ \item Completely memory bound
+ \item Lower bound of possible speedup
+ \end{itemize}
+ \end{itemize}
+ \end{column}
+ \begin{column}{0.5\textwidth}
+ Two real GPUs using HBM2:
+ \begin{itemize}
+ \item AMD RX Vega 56
+ \item NVIDIA Tesla V100
+ \end{itemize}
+ \end{column}
+ \end{columns}
+\end{frame}
+
+\begin{frame}{Speedups / Generic ARM System}
+ \begin{figure}
+ \includesvg[width=0.8\textwidth]{images/speedup_normal.svg}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{Speedups / Infinite Compute System}
+ \begin{figure}
+ \includesvg[width=0.8\textwidth]{images/speedup_inf.svg}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{Speedups / Samsung}
+ \begin{figure}
+ \includesvg[width=0.8\textwidth]{images/samsung.svg}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{Runtimes / Vector Benchmarks}
+ \begin{figure}
+ \includesvg[width=0.8\textwidth]{images/runtimes_vector.svg}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{Runtimes / Matrix Benchmarks}
+ \begin{figure}
+ \includesvg[width=0.8\textwidth]{images/runtimes_matrix.svg}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{Conclusion and Future Work}
+ \begin{itemize}
+ \item PIM can accelerate memory-bound workloads
+ \item Special PIM-friendly memory layouts are required
+ \end{itemize}
+
+ Future work:
+ \begin{itemize}
+ \item Implementation of a Linux driver
+ \item Comparison with real neural networks
+ \item Consider replacing library approach with compiler approach
+ \item Implement a power model to analyze the power efficiency gains
+ \end{itemize}
+\end{frame}
+
+\section{Thank you for your attention!}
+
\begin{frame}
\frametitle{Outline}
\tableofcontents