diff --git a/src/appendix.tex b/src/appendix.tex index 45a3a8e..2226f82 100644 --- a/src/appendix.tex +++ b/src/appendix.tex @@ -2,8 +2,226 @@ \label{sec:appendix} \subsection{Simulation Results} + +\begin{table}[!ht] +\centering +\input{tables/vadd_3GHz} +\caption{Runtime of the VADD benchmark in $\unit{\pico\second}$ on the generic ARM system.} +\end{table} + +\begin{table}[!ht] +\centering +\input{tables/vmul_3GHz} +\caption{Runtime of the VMUL benchmark in $\unit{\pico\second}$ on the generic ARM system.} +\end{table} + +\begin{table}[!ht] +\centering +\input{tables/haxpy_3GHz} +\caption{Runtime of the \ac{haxpy} benchmark in $\unit{\pico\second}$ on the generic ARM system.} +\end{table} + +\begin{table}[!ht] +\centering +\input{tables/gemv_3GHz} +\caption{Runtime of the \ac{gemv} benchmark in $\unit{\pico\second}$ on the generic ARM system.} +\end{table} + +\begin{table}[!ht] +\centering +\input{tables/gemv_layers_3GHz} +\caption{Runtime of the \ac{dnn} benchmark in $\unit{\pico\second}$ on the generic ARM system.} +\end{table} + +\begin{table}[!ht] +\centering +\input{tables/vadd_100GHz} +\caption{Runtime of the VADD benchmark in $\unit{\pico\second}$ on the infinite compute system.} +\end{table} + +\begin{table}[!ht] +\centering +\input{tables/vmul_100GHz} +\caption{Runtime of the VMUL benchmark in $\unit{\pico\second}$ on the infinite compute system.} +\end{table} + +\begin{table}[!ht] +\centering +\input{tables/haxpy_100GHz} +\caption{Runtime of the \ac{haxpy} benchmark in $\unit{\pico\second}$ on the infinite compute system.} +\end{table} + +\begin{table}[!ht] +\centering +\input{tables/gemv_100GHz} +\caption{Runtime of the \ac{gemv} benchmark in $\unit{\pico\second}$ on the infinite compute system.} +\end{table} + +\begin{table}[!ht] +\centering +\input{tables/gemv_layers_100GHz} +\caption{Runtime of the \ac{dnn} benchmark in $\unit{\pico\second}$ on the infinite compute system.} +\end{table} + \subsection{Microkernels} + +\begin{listing}[!ht] 
+\begin{verbatim} +MOV GRF_A #0, BANK +MOV GRF_A #1, BANK +MOV GRF_A #2, BANK +MOV GRF_A #3, BANK +MOV GRF_A #4, BANK +MOV GRF_A #5, BANK +MOV GRF_A #6, BANK +MOV GRF_A #7, BANK +ADD GRF_B #0, BANK, GRF_A #0 +ADD GRF_B #1, BANK, GRF_A #1 +ADD GRF_B #2, BANK, GRF_A #2 +ADD GRF_B #3, BANK, GRF_A #3 +ADD GRF_B #4, BANK, GRF_A #4 +ADD GRF_B #5, BANK, GRF_A #5 +ADD GRF_B #6, BANK, GRF_A #6 +ADD GRF_B #7, BANK, GRF_A #7 +FILL BANK, GRF_B #0 +FILL BANK, GRF_B #1 +FILL BANK, GRF_B #2 +FILL BANK, GRF_B #3 +FILL BANK, GRF_B #4 +FILL BANK, GRF_B #5 +FILL BANK, GRF_B #6 +FILL BANK, GRF_B #7 +EXIT +\end{verbatim} + \caption{The microkernel used in the VADD benchmark.} + \label{lst:vadd_bench} +\end{listing} + +\begin{listing}[!ht] +\begin{verbatim} +MOV GRF_A #0, BANK +MOV GRF_A #1, BANK +MOV GRF_A #2, BANK +MOV GRF_A #3, BANK +MOV GRF_A #4, BANK +MOV GRF_A #5, BANK +MOV GRF_A #6, BANK +MOV GRF_A #7, BANK +MUL GRF_B #0, BANK, GRF_A #0 +MUL GRF_B #1, BANK, GRF_A #1 +MUL GRF_B #2, BANK, GRF_A #2 +MUL GRF_B #3, BANK, GRF_A #3 +MUL GRF_B #4, BANK, GRF_A #4 +MUL GRF_B #5, BANK, GRF_A #5 +MUL GRF_B #6, BANK, GRF_A #6 +MUL GRF_B #7, BANK, GRF_A #7 +FILL BANK, GRF_B #0 +FILL BANK, GRF_B #1 +FILL BANK, GRF_B #2 +FILL BANK, GRF_B #3 +FILL BANK, GRF_B #4 +FILL BANK, GRF_B #5 +FILL BANK, GRF_B #6 +FILL BANK, GRF_B #7 +EXIT +\end{verbatim} + \caption{The microkernel used in the VMUL benchmark.} + \label{lst:vmul_bench} +\end{listing} + +\begin{listing}[!ht] +\begin{verbatim} +MOV SRF_M #0, BANK +MOV GRF_A #0, BANK +MOV GRF_A #1, BANK +MOV GRF_A #2, BANK +MOV GRF_A #3, BANK +MOV GRF_A #4, BANK +MOV GRF_A #5, BANK +MOV GRF_A #6, BANK +MOV GRF_A #7, BANK +MAD GRF_B #0, BANK, SRF_M #0 GRF_A #0 +MAD GRF_B #1, BANK, SRF_M #0 GRF_A #1 +MAD GRF_B #2, BANK, SRF_M #0 GRF_A #2 +MAD GRF_B #3, BANK, SRF_M #0 GRF_A #3 +MAD GRF_B #4, BANK, SRF_M #0 GRF_A #4 +MAD GRF_B #5, BANK, SRF_M #0 GRF_A #5 +MAD GRF_B #6, BANK, SRF_M #0 GRF_A #6 +MAD GRF_B #7, BANK, SRF_M #0 GRF_A #7 +FILL BANK, GRF_B #0 +FILL BANK, 
GRF_B #1 +FILL BANK, GRF_B #2 +FILL BANK, GRF_B #3 +FILL BANK, GRF_B #4 +FILL BANK, GRF_B #5 +FILL BANK, GRF_B #6 +FILL BANK, GRF_B #7 +EXIT +\end{verbatim} + \caption{The microkernel used in the \ac{haxpy} benchmark.} + \label{lst:haxpy_bench} +\end{listing} + +\begin{listing}[!ht] +\begin{verbatim} +MOV GRF_A #0, BANK +MOV GRF_A #1, BANK +MOV GRF_A #2, BANK +MOV GRF_A #3, BANK +MOV GRF_A #4, BANK +MOV GRF_A #5, BANK +MOV GRF_A #6, BANK +MOV GRF_A #7, BANK +MAC(AAM) GRF_B, BANK, GRF_A +JUMP -1, 63 +FILL BANK, GRF_B #0 +FILL BANK, GRF_B #1 +FILL BANK, GRF_B #2 +FILL BANK, GRF_B #3 +FILL BANK, GRF_B #4 +FILL BANK, GRF_B #5 +FILL BANK, GRF_B #6 +FILL BANK, GRF_B #7 +EXIT +\end{verbatim} + \caption{The microkernel used in the \ac{gemv} and \ac{dnn} benchmarks.} + \label{lst:gemv_bench} +\end{listing} + \subsection{Source Code} + +\begin{listing}[!ht] +\begin{minted}{rust} +pub fn execute( + matrix: &Matrix, + input_vector: &Vector, + output_partial_sum_vector: &mut SVector, + dummy: &impl PimOperand, +) { + for chunk in input_vector.0.iter() { + chunk.execute_read(); + } + + for sub_matrix in matrix.0.iter() { + for column_block in sub_matrix.fixed_rows::<1>(0).iter() { + column_block.execute_read_async(); + } + } + + barrier::dsb(barrier::SY); + + for chunk in output_partial_sum_vector + .fixed_rows_with_step_mut::(0, 16) + .iter_mut() + { + chunk.execute_write(); + } + + dummy.execute_read(); +} +\end{minted} + \caption{The \ac{gemv} kernel execution code that runs on the host processor.} +\end{listing} % etwas source code, % von der vm % einige microkernels diff --git a/src/chapters/results.tex b/src/chapters/results.tex index ba05b75..63b2fe4 100644 --- a/src/chapters/results.tex +++ b/src/chapters/results.tex @@ -259,7 +259,7 @@ Since the Samsung \ac{fpga} platform can be assumed to be a highly optimized acc \begin{figure} \centering \includegraphics[width=0.8\linewidth]{plots/samsung} - \caption{Relative performance of the \ac{gemv} and ADD microbenchmark for 
different batch sizes \cite{lee2021}.} + \caption{Relative performance of the \ac{gemv} and ADD microbenchmarks for different batch sizes on Samsung's hardware implementation \cite{lee2021}.} \label{fig:samsung_speedup} \end{figure} diff --git a/src/tables/gemv_100GHz.tex b/src/tables/gemv_100GHz.tex new file mode 100644 index 0000000..1a2d751 --- /dev/null +++ b/src/tables/gemv_100GHz.tex @@ -0,0 +1,22 @@ + +\begin{tblr}{ + hlines, + vlines, + cell{2}{2} = {r}, + cell{2}{3} = {r}, + cell{3}{2} = {r}, + cell{3}{3} = {r}, + cell{4}{2} = {r}, + cell{4}{3} = {r}, + cell{5}{2} = {r}, + cell{5}{3} = {r}, + hline{2} = {-}{solid,black}, + hline{2} = {2}{-}{solid,black}, +} +Level & Non-\ac{pim} & \ac{pim} \\ +X1&96465760&457604440 \\ +X2&192178090&474407440 \\ +X3&430015980&508115440 \\ +X4&2720535980&575476440 \\ + +\end{tblr} diff --git a/src/tables/gemv_3GHz.tex b/src/tables/gemv_3GHz.tex new file mode 100644 index 0000000..5a0302e --- /dev/null +++ b/src/tables/gemv_3GHz.tex @@ -0,0 +1,22 @@ + +\begin{tblr}{ + hlines, + vlines, + cell{2}{2} = {r}, + cell{2}{3} = {r}, + cell{3}{2} = {r}, + cell{3}{3} = {r}, + cell{4}{2} = {r}, + cell{4}{3} = {r}, + cell{5}{2} = {r}, + cell{5}{3} = {r}, + hline{2} = {-}{solid,black}, + hline{2} = {2}{-}{solid,black}, +} +Level & Non-\ac{pim} & \ac{pim} \\ +X1&2734886043&788428116 \\ +X2&5462015184&812330856 \\ +X3&10958710653&859903902 \\ +X4&22594486896&955550160 \\ + +\end{tblr} diff --git a/src/tables/gemv_layers_100GHz.tex b/src/tables/gemv_layers_100GHz.tex new file mode 100644 index 0000000..de0fd78 --- /dev/null +++ b/src/tables/gemv_layers_100GHz.tex @@ -0,0 +1,22 @@ + +\begin{tblr}{ + hlines, + vlines, + cell{2}{2} = {r}, + cell{2}{3} = {r}, + cell{3}{2} = {r}, + cell{3}{3} = {r}, + cell{4}{2} = {r}, + cell{4}{3} = {r}, + cell{5}{2} = {r}, + cell{5}{3} = {r}, + hline{2} = {-}{solid,black}, + hline{2} = {2}{-}{solid,black}, +} +Level & Non-\ac{pim} & \ac{pim} \\ +X1&8957090&50066880 \\ +X2&33647200&55178880 \\ 
+X3&299035090&75442880 \\ +X4&951182090&156218880 \\ + +\end{tblr} diff --git a/src/tables/gemv_layers_3GHz.tex b/src/tables/gemv_layers_3GHz.tex new file mode 100644 index 0000000..ddbf54c --- /dev/null +++ b/src/tables/gemv_layers_3GHz.tex @@ -0,0 +1,22 @@ + +\begin{tblr}{ + hlines, + vlines, + cell{2}{2} = {r}, + cell{2}{3} = {r}, + cell{3}{2} = {r}, + cell{3}{3} = {r}, + cell{4}{2} = {r}, + cell{4}{3} = {r}, + cell{5}{2} = {r}, + cell{5}{3} = {r}, + hline{2} = {-}{solid,black}, + hline{2} = {2}{-}{solid,black}, +} +Level & Non-\ac{pim} & \ac{pim} \\ +X1&262765971&87800778 \\ +X2&1070441487&95181057 \\ +X3&4332005991&123962913 \\ +X4&17236314765&238281147 \\ + +\end{tblr} diff --git a/src/tables/haxpy_100GHz.tex b/src/tables/haxpy_100GHz.tex new file mode 100644 index 0000000..9181da2 --- /dev/null +++ b/src/tables/haxpy_100GHz.tex @@ -0,0 +1,22 @@ + +\begin{tblr}{ + hlines, + vlines, + cell{2}{2} = {r}, + cell{2}{3} = {r}, + cell{3}{2} = {r}, + cell{3}{3} = {r}, + cell{4}{2} = {r}, + cell{4}{3} = {r}, + cell{5}{2} = {r}, + cell{5}{3} = {r}, + hline{2} = {-}{solid,black}, + hline{2} = {2}{-}{solid,black}, +} +Level & Non-\ac{pim} & \ac{pim} \\ +X1&2374720&1088170 \\ +X2&3677220&1604170 \\ +X3&5875080&2708170 \\ +X4&11640050&4703250 \\ + +\end{tblr} diff --git a/src/tables/haxpy_3GHz.tex b/src/tables/haxpy_3GHz.tex new file mode 100644 index 0000000..099ea6b --- /dev/null +++ b/src/tables/haxpy_3GHz.tex @@ -0,0 +1,22 @@ + +\begin{tblr}{ + hlines, + vlines, + cell{2}{2} = {r}, + cell{2}{3} = {r}, + cell{3}{2} = {r}, + cell{3}{3} = {r}, + cell{4}{2} = {r}, + cell{4}{3} = {r}, + cell{5}{2} = {r}, + cell{5}{3} = {r}, + hline{2} = {-}{solid,black}, + hline{2} = {2}{-}{solid,black}, +} +Level & Non-\ac{pim} & \ac{pim} \\ +X1&30413223&1555443 \\ +X2&59499108&2417913 \\ +X3&117506376&4207788 \\ +X4&234943821&7578414 \\ + +\end{tblr} diff --git a/src/tables/vadd_100GHz.tex b/src/tables/vadd_100GHz.tex new file mode 100644 index 0000000..4754e87 --- /dev/null +++ 
b/src/tables/vadd_100GHz.tex @@ -0,0 +1,22 @@ + +\begin{tblr}{ + hlines, + vlines, + cell{2}{2} = {r}, + cell{2}{3} = {r}, + cell{3}{2} = {r}, + cell{3}{3} = {r}, + cell{4}{2} = {r}, + cell{4}{3} = {r}, + cell{5}{2} = {r}, + cell{5}{3} = {r}, + hline{2} = {-}{solid,black}, + hline{2} = {2}{-}{solid,black}, +} +Level & Non-\ac{pim} & \ac{pim} \\ +X1&2024360&844170 \\ +X2&2574730&1412170 \\ +X3&3899060&2496170 \\ +X4&8037930&4493240 \\ + +\end{tblr} diff --git a/src/tables/vadd_3GHz.tex b/src/tables/vadd_3GHz.tex new file mode 100644 index 0000000..a183fda --- /dev/null +++ b/src/tables/vadd_3GHz.tex @@ -0,0 +1,22 @@ + +\begin{tblr}{ + hlines, + vlines, + cell{2}{2} = {r}, + cell{2}{3} = {r}, + cell{3}{2} = {r}, + cell{3}{3} = {r}, + cell{4}{2} = {r}, + cell{4}{3} = {r}, + cell{5}{2} = {r}, + cell{5}{3} = {r}, + hline{2} = {-}{solid,black}, + hline{2} = {2}{-}{solid,black}, +} +Level & Non-\ac{pim} & \ac{pim} \\ +X1&15585399&1220778 \\ +X2&29988981&2112885 \\ +X3&59177430&3864465 \\ +X4&118902645&7236756 \\ + +\end{tblr} diff --git a/src/tables/vmul_100GHz.tex b/src/tables/vmul_100GHz.tex new file mode 100644 index 0000000..4e9589c --- /dev/null +++ b/src/tables/vmul_100GHz.tex @@ -0,0 +1,22 @@ + +\begin{tblr}{ + hlines, + vlines, + cell{2}{2} = {r}, + cell{2}{3} = {r}, + cell{3}{2} = {r}, + cell{3}{3} = {r}, + cell{4}{2} = {r}, + cell{4}{3} = {r}, + cell{5}{2} = {r}, + cell{5}{3} = {r}, + hline{2} = {-}{solid,black}, + hline{2} = {2}{-}{solid,black}, +} +Level & Non-\ac{pim} & \ac{pim} \\ +X1&2030090&845170 \\ +X2&3109090&1401170 \\ +X3&4654880&2493170 \\ +X4&7829930&4478240 \\ + +\end{tblr} diff --git a/src/tables/vmul_3GHz.tex b/src/tables/vmul_3GHz.tex new file mode 100644 index 0000000..10d76f8 --- /dev/null +++ b/src/tables/vmul_3GHz.tex @@ -0,0 +1,22 @@ + +\begin{tblr}{ + hlines, + vlines, + cell{2}{2} = {r}, + cell{2}{3} = {r}, + cell{3}{2} = {r}, + cell{3}{3} = {r}, + cell{4}{2} = {r}, + cell{4}{3} = {r}, + cell{5}{2} = {r}, + cell{5}{3} = {r}, + hline{2} = 
{-}{solid,black}, + hline{2} = {2}{-}{solid,black}, +} +Level & Non-\ac{pim} & \ac{pim} \\ +X1&17269047&1219779 \\ +X2&33834132&2125872 \\ +X3&66308292&3909753 \\ +X4&131863338&7239087 \\ + +\end{tblr}