Appendix
This commit is contained in:
218
src/appendix.tex
218
src/appendix.tex
@@ -2,8 +2,226 @@
|
||||
\label{sec:appendix}
|
||||
|
||||
\subsection{Simulation Results}
|
||||
|
||||
\begin{table}[!ht]
|
||||
\centering
|
||||
\input{tables/vadd_3GHz}
|
||||
\caption{Runtime of the VADD benchmark in $\unit{\pico\second}$ on the generic ARM system.}
|
||||
\end{table}
|
||||
|
||||
\begin{table}[!ht]
|
||||
\centering
|
||||
\input{tables/vmul_3GHz}
|
||||
\caption{Runtime of the VMUL benchmark in $\unit{\pico\second}$ on the generic ARM system.}
|
||||
\end{table}
|
||||
|
||||
\begin{table}[!ht]
|
||||
\centering
|
||||
\input{tables/haxpy_3GHz}
|
||||
\caption{Runtime of the \ac{haxpy} benchmark in $\unit{\pico\second}$ on the generic ARM system.}
|
||||
\end{table}
|
||||
|
||||
\begin{table}[!ht]
|
||||
\centering
|
||||
\input{tables/gemv_3GHz}
|
||||
\caption{Runtime of the \ac{gemv} benchmark in $\unit{\pico\second}$ on the generic ARM system.}
|
||||
\end{table}
|
||||
|
||||
\begin{table}[!ht]
|
||||
\centering
|
||||
\input{tables/gemv_layers_3GHz}
|
||||
\caption{Runtime of the \ac{dnn} benchmark in $\unit{\pico\second}$ on the generic ARM system.}
|
||||
\end{table}
|
||||
|
||||
\begin{table}[!ht]
|
||||
\centering
|
||||
\input{tables/vadd_100GHz}
|
||||
\caption{Runtime of the VADD benchmark in $\unit{\pico\second}$ on the infinite compute system.}
|
||||
\end{table}
|
||||
|
||||
\begin{table}[!ht]
|
||||
\centering
|
||||
\input{tables/vmul_100GHz}
|
||||
\caption{Runtime of the VMUL benchmark in $\unit{\pico\second}$ on the infinite compute system.}
|
||||
\end{table}
|
||||
|
||||
\begin{table}[!ht]
|
||||
\centering
|
||||
\input{tables/haxpy_100GHz}
|
||||
\caption{Runtime of the \ac{haxpy} benchmark in $\unit{\pico\second}$ on the infinite compute system.}
|
||||
\end{table}
|
||||
|
||||
\begin{table}[!ht]
|
||||
\centering
|
||||
\input{tables/gemv_100GHz}
|
||||
\caption{Runtime of the \ac{gemv} benchmark in $\unit{\pico\second}$ on the infinite compute system.}
|
||||
\end{table}
|
||||
|
||||
\begin{table}[!ht]
|
||||
\centering
|
||||
\input{tables/gemv_layers_100GHz}
|
||||
\caption{Runtime of the \ac{dnn} benchmark in $\unit{\pico\second}$ on the infinite compute system.}
|
||||
\end{table}
|
||||
|
||||
\subsection{Microkernels}
|
||||
|
||||
\begin{listing}[!ht]
|
||||
\begin{verbatim}
|
||||
MOV GRF_A #0, BANK
|
||||
MOV GRF_A #1, BANK
|
||||
MOV GRF_A #2, BANK
|
||||
MOV GRF_A #3, BANK
|
||||
MOV GRF_A #4, BANK
|
||||
MOV GRF_A #5, BANK
|
||||
MOV GRF_A #6, BANK
|
||||
MOV GRF_A #7, BANK
|
||||
ADD GRF_B #0, BANK, GRF_A #0
|
||||
ADD GRF_B #1, BANK, GRF_A #1
|
||||
ADD GRF_B #2, BANK, GRF_A #2
|
||||
ADD GRF_B #3, BANK, GRF_A #3
|
||||
ADD GRF_B #4, BANK, GRF_A #4
|
||||
ADD GRF_B #5, BANK, GRF_A #5
|
||||
ADD GRF_B #6, BANK, GRF_A #6
|
||||
ADD GRF_B #7, BANK, GRF_A #7
|
||||
FILL BANK, GRF_B #0
|
||||
FILL BANK, GRF_B #1
|
||||
FILL BANK, GRF_B #2
|
||||
FILL BANK, GRF_B #3
|
||||
FILL BANK, GRF_B #4
|
||||
FILL BANK, GRF_B #5
|
||||
FILL BANK, GRF_B #6
|
||||
FILL BANK, GRF_B #7
|
||||
EXIT
|
||||
\end{verbatim}
|
||||
\caption{The microkernel used in the VADD benchmark.}
|
||||
\label{lst:vadd_bench}
|
||||
\end{listing}
|
||||
|
||||
\begin{listing}[!ht]
|
||||
\begin{verbatim}
|
||||
MOV GRF_A #0, BANK
|
||||
MOV GRF_A #1, BANK
|
||||
MOV GRF_A #2, BANK
|
||||
MOV GRF_A #3, BANK
|
||||
MOV GRF_A #4, BANK
|
||||
MOV GRF_A #5, BANK
|
||||
MOV GRF_A #6, BANK
|
||||
MOV GRF_A #7, BANK
|
||||
MUL GRF_B #0, BANK, GRF_A #0
|
||||
MUL GRF_B #1, BANK, GRF_A #1
|
||||
MUL GRF_B #2, BANK, GRF_A #2
|
||||
MUL GRF_B #3, BANK, GRF_A #3
|
||||
MUL GRF_B #4, BANK, GRF_A #4
|
||||
MUL GRF_B #5, BANK, GRF_A #5
|
||||
MUL GRF_B #6, BANK, GRF_A #6
|
||||
MUL GRF_B #7, BANK, GRF_A #7
|
||||
FILL BANK, GRF_B #0
|
||||
FILL BANK, GRF_B #1
|
||||
FILL BANK, GRF_B #2
|
||||
FILL BANK, GRF_B #3
|
||||
FILL BANK, GRF_B #4
|
||||
FILL BANK, GRF_B #5
|
||||
FILL BANK, GRF_B #6
|
||||
FILL BANK, GRF_B #7
|
||||
EXIT
|
||||
\end{verbatim}
|
||||
\caption{The microkernel used in the VMUL benchmark.}
|
||||
\label{lst:vmul_bench}
|
||||
\end{listing}
|
||||
|
||||
\begin{listing}[!ht]
|
||||
\begin{verbatim}
|
||||
MOV SRF_M #0, BANK
|
||||
MOV GRF_A #0, BANK
|
||||
MOV GRF_A #1, BANK
|
||||
MOV GRF_A #2, BANK
|
||||
MOV GRF_A #3, BANK
|
||||
MOV GRF_A #4, BANK
|
||||
MOV GRF_A #5, BANK
|
||||
MOV GRF_A #6, BANK
|
||||
MOV GRF_A #7, BANK
|
||||
MAD GRF_B #0, BANK, SRF_M #0, GRF_A #0
|
||||
MAD GRF_B #1, BANK, SRF_M #0, GRF_A #1
|
||||
MAD GRF_B #2, BANK, SRF_M #0, GRF_A #2
|
||||
MAD GRF_B #3, BANK, SRF_M #0, GRF_A #3
|
||||
MAD GRF_B #4, BANK, SRF_M #0, GRF_A #4
|
||||
MAD GRF_B #5, BANK, SRF_M #0, GRF_A #5
|
||||
MAD GRF_B #6, BANK, SRF_M #0, GRF_A #6
|
||||
MAD GRF_B #7, BANK, SRF_M #0, GRF_A #7
|
||||
FILL BANK, GRF_B #0
|
||||
FILL BANK, GRF_B #1
|
||||
FILL BANK, GRF_B #2
|
||||
FILL BANK, GRF_B #3
|
||||
FILL BANK, GRF_B #4
|
||||
FILL BANK, GRF_B #5
|
||||
FILL BANK, GRF_B #6
|
||||
FILL BANK, GRF_B #7
|
||||
EXIT
|
||||
\end{verbatim}
|
||||
\caption{The microkernel used in the \ac{haxpy} benchmark.}
|
||||
\label{lst:haxpy_bench}
|
||||
\end{listing}
|
||||
|
||||
\begin{listing}[!ht]
|
||||
\begin{verbatim}
|
||||
MOV GRF_A #0, BANK
|
||||
MOV GRF_A #1, BANK
|
||||
MOV GRF_A #2, BANK
|
||||
MOV GRF_A #3, BANK
|
||||
MOV GRF_A #4, BANK
|
||||
MOV GRF_A #5, BANK
|
||||
MOV GRF_A #6, BANK
|
||||
MOV GRF_A #7, BANK
|
||||
MAC(AAM) GRF_B, BANK, GRF_A
|
||||
JUMP -1, 63
|
||||
FILL BANK, GRF_B #0
|
||||
FILL BANK, GRF_B #1
|
||||
FILL BANK, GRF_B #2
|
||||
FILL BANK, GRF_B #3
|
||||
FILL BANK, GRF_B #4
|
||||
FILL BANK, GRF_B #5
|
||||
FILL BANK, GRF_B #6
|
||||
FILL BANK, GRF_B #7
|
||||
EXIT
|
||||
\end{verbatim}
|
||||
\caption{The microkernel used in the \ac{gemv} and \ac{dnn} benchmarks.}
|
||||
\label{lst:gemv_bench}
|
||||
\end{listing}
|
||||
|
||||
\subsection{Source Code}
|
||||
|
||||
\begin{listing}[!ht]
|
||||
\begin{minted}{rust}
|
||||
pub fn execute<const X16R: usize, const R: usize, const X16C: usize>(
|
||||
matrix: &Matrix<X16R, X16C>,
|
||||
input_vector: &Vector<X16C>,
|
||||
output_partial_sum_vector: &mut SVector<F16x16, R>,
|
||||
dummy: &impl PimOperand,
|
||||
) {
|
||||
for chunk in input_vector.0.iter() {
|
||||
chunk.execute_read();
|
||||
}
|
||||
|
||||
for sub_matrix in matrix.0.iter() {
|
||||
for column_block in sub_matrix.fixed_rows::<1>(0).iter() {
|
||||
column_block.execute_read_async();
|
||||
}
|
||||
}
|
||||
|
||||
barrier::dsb(barrier::SY);
|
||||
|
||||
for chunk in output_partial_sum_vector
|
||||
.fixed_rows_with_step_mut::<X16R>(0, 16)
|
||||
.iter_mut()
|
||||
{
|
||||
chunk.execute_write();
|
||||
}
|
||||
|
||||
dummy.execute_read();
|
||||
}
|
||||
\end{minted}
|
||||
\caption{The \ac{gemv} kernel execution code that runs on the host processor.}
|
||||
\end{listing}
|
||||
% some source code,
|
||||
% from the VM
|
||||
% some microkernels
|
||||
|
||||
@@ -259,7 +259,7 @@ Since the Samsung \ac{fpga} platform can be assumed to be a highly optimized acc
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=0.8\linewidth]{plots/samsung}
|
||||
\caption{Relative performance of the \ac{gemv} and ADD microbenchmark for different batch sizes \cite{lee2021}.}
|
||||
\caption{Relative performance of the \ac{gemv} and ADD microbenchmark for different batch sizes on the hardware implementation of Samsung \cite{lee2021}.}
|
||||
\label{fig:samsung_speedup}
|
||||
\end{figure}
|
||||
|
||||
|
||||
22
src/tables/gemv_100GHz.tex
Normal file
22
src/tables/gemv_100GHz.tex
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
\begin{tblr}{
|
||||
hlines,
|
||||
vlines,
|
||||
cell{2}{2} = {r},
|
||||
cell{2}{3} = {r},
|
||||
cell{3}{2} = {r},
|
||||
cell{3}{3} = {r},
|
||||
cell{4}{2} = {r},
|
||||
cell{4}{3} = {r},
|
||||
cell{5}{2} = {r},
|
||||
cell{5}{3} = {r},
|
||||
hline{2} = {-}{solid,black},
|
||||
hline{2} = {2}{-}{solid,black},
|
||||
}
|
||||
Level & Non-\ac{pim} & \ac{pim} \\
|
||||
X1&96465760&457604440 \\
|
||||
X2&192178090&474407440 \\
|
||||
X3&430015980&508115440 \\
|
||||
X4&2720535980&575476440 \\
|
||||
|
||||
\end{tblr}
|
||||
22
src/tables/gemv_3GHz.tex
Normal file
22
src/tables/gemv_3GHz.tex
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
\begin{tblr}{
|
||||
hlines,
|
||||
vlines,
|
||||
cell{2}{2} = {r},
|
||||
cell{2}{3} = {r},
|
||||
cell{3}{2} = {r},
|
||||
cell{3}{3} = {r},
|
||||
cell{4}{2} = {r},
|
||||
cell{4}{3} = {r},
|
||||
cell{5}{2} = {r},
|
||||
cell{5}{3} = {r},
|
||||
hline{2} = {-}{solid,black},
|
||||
hline{2} = {2}{-}{solid,black},
|
||||
}
|
||||
Level & Non-\ac{pim} & \ac{pim} \\
|
||||
X1&2734886043&788428116 \\
|
||||
X2&5462015184&812330856 \\
|
||||
X3&10958710653&859903902 \\
|
||||
X4&22594486896&955550160 \\
|
||||
|
||||
\end{tblr}
|
||||
22
src/tables/gemv_layers_100GHz.tex
Normal file
22
src/tables/gemv_layers_100GHz.tex
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
\begin{tblr}{
|
||||
hlines,
|
||||
vlines,
|
||||
cell{2}{2} = {r},
|
||||
cell{2}{3} = {r},
|
||||
cell{3}{2} = {r},
|
||||
cell{3}{3} = {r},
|
||||
cell{4}{2} = {r},
|
||||
cell{4}{3} = {r},
|
||||
cell{5}{2} = {r},
|
||||
cell{5}{3} = {r},
|
||||
hline{2} = {-}{solid,black},
|
||||
hline{2} = {2}{-}{solid,black},
|
||||
}
|
||||
Level & Non-\ac{pim} & \ac{pim} \\
|
||||
X1&8957090&50066880 \\
|
||||
X2&33647200&55178880 \\
|
||||
X3&299035090&75442880 \\
|
||||
X4&951182090&156218880 \\
|
||||
|
||||
\end{tblr}
|
||||
22
src/tables/gemv_layers_3GHz.tex
Normal file
22
src/tables/gemv_layers_3GHz.tex
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
\begin{tblr}{
|
||||
hlines,
|
||||
vlines,
|
||||
cell{2}{2} = {r},
|
||||
cell{2}{3} = {r},
|
||||
cell{3}{2} = {r},
|
||||
cell{3}{3} = {r},
|
||||
cell{4}{2} = {r},
|
||||
cell{4}{3} = {r},
|
||||
cell{5}{2} = {r},
|
||||
cell{5}{3} = {r},
|
||||
hline{2} = {-}{solid,black},
|
||||
hline{2} = {2}{-}{solid,black},
|
||||
}
|
||||
Level & Non-\ac{pim} & \ac{pim} \\
|
||||
X1&262765971&87800778 \\
|
||||
X2&1070441487&95181057 \\
|
||||
X3&4332005991&123962913 \\
|
||||
X4&17236314765&238281147 \\
|
||||
|
||||
\end{tblr}
|
||||
22
src/tables/haxpy_100GHz.tex
Normal file
22
src/tables/haxpy_100GHz.tex
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
\begin{tblr}{
|
||||
hlines,
|
||||
vlines,
|
||||
cell{2}{2} = {r},
|
||||
cell{2}{3} = {r},
|
||||
cell{3}{2} = {r},
|
||||
cell{3}{3} = {r},
|
||||
cell{4}{2} = {r},
|
||||
cell{4}{3} = {r},
|
||||
cell{5}{2} = {r},
|
||||
cell{5}{3} = {r},
|
||||
hline{2} = {-}{solid,black},
|
||||
hline{2} = {2}{-}{solid,black},
|
||||
}
|
||||
Level & Non-\ac{pim} & \ac{pim} \\
|
||||
X1&2374720&1088170 \\
|
||||
X2&3677220&1604170 \\
|
||||
X3&5875080&2708170 \\
|
||||
X4&11640050&4703250 \\
|
||||
|
||||
\end{tblr}
|
||||
22
src/tables/haxpy_3GHz.tex
Normal file
22
src/tables/haxpy_3GHz.tex
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
\begin{tblr}{
|
||||
hlines,
|
||||
vlines,
|
||||
cell{2}{2} = {r},
|
||||
cell{2}{3} = {r},
|
||||
cell{3}{2} = {r},
|
||||
cell{3}{3} = {r},
|
||||
cell{4}{2} = {r},
|
||||
cell{4}{3} = {r},
|
||||
cell{5}{2} = {r},
|
||||
cell{5}{3} = {r},
|
||||
hline{2} = {-}{solid,black},
|
||||
hline{2} = {2}{-}{solid,black},
|
||||
}
|
||||
Level & Non-\ac{pim} & \ac{pim} \\
|
||||
X1&30413223&1555443 \\
|
||||
X2&59499108&2417913 \\
|
||||
X3&117506376&4207788 \\
|
||||
X4&234943821&7578414 \\
|
||||
|
||||
\end{tblr}
|
||||
22
src/tables/vadd_100GHz.tex
Normal file
22
src/tables/vadd_100GHz.tex
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
\begin{tblr}{
|
||||
hlines,
|
||||
vlines,
|
||||
cell{2}{2} = {r},
|
||||
cell{2}{3} = {r},
|
||||
cell{3}{2} = {r},
|
||||
cell{3}{3} = {r},
|
||||
cell{4}{2} = {r},
|
||||
cell{4}{3} = {r},
|
||||
cell{5}{2} = {r},
|
||||
cell{5}{3} = {r},
|
||||
hline{2} = {-}{solid,black},
|
||||
hline{2} = {2}{-}{solid,black},
|
||||
}
|
||||
Level & Non-\ac{pim} & \ac{pim} \\
|
||||
X1&2024360&844170 \\
|
||||
X2&2574730&1412170 \\
|
||||
X3&3899060&2496170 \\
|
||||
X4&8037930&4493240 \\
|
||||
|
||||
\end{tblr}
|
||||
22
src/tables/vadd_3GHz.tex
Normal file
22
src/tables/vadd_3GHz.tex
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
\begin{tblr}{
|
||||
hlines,
|
||||
vlines,
|
||||
cell{2}{2} = {r},
|
||||
cell{2}{3} = {r},
|
||||
cell{3}{2} = {r},
|
||||
cell{3}{3} = {r},
|
||||
cell{4}{2} = {r},
|
||||
cell{4}{3} = {r},
|
||||
cell{5}{2} = {r},
|
||||
cell{5}{3} = {r},
|
||||
hline{2} = {-}{solid,black},
|
||||
hline{2} = {2}{-}{solid,black},
|
||||
}
|
||||
Level & Non-\ac{pim} & \ac{pim} \\
|
||||
X1&15585399&1220778 \\
|
||||
X2&29988981&2112885 \\
|
||||
X3&59177430&3864465 \\
|
||||
X4&118902645&7236756 \\
|
||||
|
||||
\end{tblr}
|
||||
22
src/tables/vmul_100GHz.tex
Normal file
22
src/tables/vmul_100GHz.tex
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
\begin{tblr}{
|
||||
hlines,
|
||||
vlines,
|
||||
cell{2}{2} = {r},
|
||||
cell{2}{3} = {r},
|
||||
cell{3}{2} = {r},
|
||||
cell{3}{3} = {r},
|
||||
cell{4}{2} = {r},
|
||||
cell{4}{3} = {r},
|
||||
cell{5}{2} = {r},
|
||||
cell{5}{3} = {r},
|
||||
hline{2} = {-}{solid,black},
|
||||
hline{2} = {2}{-}{solid,black},
|
||||
}
|
||||
Level & Non-\ac{pim} & \ac{pim} \\
|
||||
X1&2030090&845170 \\
|
||||
X2&3109090&1401170 \\
|
||||
X3&4654880&2493170 \\
|
||||
X4&7829930&4478240 \\
|
||||
|
||||
\end{tblr}
|
||||
22
src/tables/vmul_3GHz.tex
Normal file
22
src/tables/vmul_3GHz.tex
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
\begin{tblr}{
|
||||
hlines,
|
||||
vlines,
|
||||
cell{2}{2} = {r},
|
||||
cell{2}{3} = {r},
|
||||
cell{3}{2} = {r},
|
||||
cell{3}{3} = {r},
|
||||
cell{4}{2} = {r},
|
||||
cell{4}{3} = {r},
|
||||
cell{5}{2} = {r},
|
||||
cell{5}{3} = {r},
|
||||
hline{2} = {-}{solid,black},
|
||||
hline{2} = {2}{-}{solid,black},
|
||||
}
|
||||
Level & Non-\ac{pim} & \ac{pim} \\
|
||||
X1&17269047&1219779 \\
|
||||
X2&33834132&2125872 \\
|
||||
X3&66308292&3909753 \\
|
||||
X4&131863338&7239087 \\
|
||||
|
||||
\end{tblr}
|
||||
Reference in New Issue
Block a user