First simulation plots
This commit is contained in:
@@ -1,6 +1,9 @@
|
||||
\section{Appendix}
|
||||
\label{sec:appendix}
|
||||
|
||||
\subsection{Simulation Results}
|
||||
\subsection{Microkernels}
|
||||
\subsection{Source Code}
|
||||
% etwas source code,
|
||||
% von der vm
|
||||
% einige microkernels
|
||||
|
||||
@@ -22,7 +22,7 @@ Because the charge stored in each cell is very small, so-called \acp{psa} are ne
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{images/psa}
|
||||
\caption[\ac{psa} of an open bitline architecture]{\ac{psa} of an open bitline architecture \cite{jacob2008} \cite{jung2017a}.}
|
||||
\caption[\ac{psa} of an open bitline architecture.]{\ac{psa} of an open bitline architecture \cite{jacob2008} \cite{jung2017a}.}
|
||||
\label{img:psa}
|
||||
\end{figure}
|
||||
|
||||
@@ -39,7 +39,7 @@ The \cref{img:bank} summarizes the basic architecture of a single storage device
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{images/bank}
|
||||
\caption[Architecture of a single DRAM device]{Architecture of a single DRAM device \cite{jung2017a}.}
|
||||
\caption[Architecture of a single DRAM device.]{Architecture of a single DRAM device \cite{jung2017a}.}
|
||||
\label{img:bank}
|
||||
\end{figure}
|
||||
|
||||
@@ -83,7 +83,7 @@ Because banks can be controlled independently, one bank can be outputting the ne
|
||||
\bitbox{3}[bgcolor=verylightgray]{}
|
||||
\end{bytefield}
|
||||
|
||||
\caption[Exemplary address mapping scheme]{Exemplary address mapping scheme for an input address of size 32.}
|
||||
\caption{Exemplary address mapping scheme for an input address of size 32.}
|
||||
\label{img:bank_interleaving}
|
||||
\end{figure}
|
||||
|
||||
@@ -102,7 +102,7 @@ Several \ac{dram} dies are stacked on top of each other and connected with \acp{
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=0.8\linewidth]{images/sip}
|
||||
\caption[Cross-section view of an \ac{hbm} \ac{sip}]{Cross-section view of a \ac{hbm} \ac{sip} \cite{lee2021}.}
|
||||
\caption[Cross-section view of an \ac{hbm} \ac{sip}.]{Cross-section view of a \ac{hbm} \ac{sip} \cite{lee2021}.}
|
||||
\label{img:sip}
|
||||
\end{figure}
|
||||
Such a cube is then placed onto a common silicon interposer that connects the \ac{dram} to its host processor.
|
||||
@@ -123,7 +123,7 @@ In the center of the die, the \acp{tsv} connect the die to the next die above it
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=0.8\linewidth]{images/hbm}
|
||||
\caption[\aca{hbm} memory die architecture]{\aca{hbm} memory die architecture \cite{lee2021}.}
|
||||
\caption[\aca{hbm} memory die architecture.]{\aca{hbm} memory die architecture \cite{lee2021}.}
|
||||
\label{img:hbm}
|
||||
\end{figure}
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ Each granule size has a different maximum amount of page table nesting, with up
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{images/pagetable_granule}
|
||||
\caption[The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule]{The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule \cite{arm2015}.}
|
||||
\caption[The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule.]{The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule \cite{arm2015}.}
|
||||
\label{img:pagetable_granule}
|
||||
\end{figure}
|
||||
|
||||
@@ -117,7 +117,7 @@ The simplified \ac{am} following this scheme is shown in \cref{img:hbm2_am}.
|
||||
\bitbox{3}[bgcolor=verylightgray]{}
|
||||
\end{bytefield}
|
||||
|
||||
\caption[Simplified \aca{hbm} address mapping with a split column mapping]{Simplified \aca{hbm} address mapping with a split column mapping.}
|
||||
\caption{Simplified \aca{hbm} address mapping with a split column mapping.}
|
||||
\label{img:hbm2_am}
|
||||
\end{figure}
|
||||
|
||||
@@ -150,7 +150,7 @@ The concrete values for these parameters are listed in \cref{tab:memspec}.
|
||||
Width & Width of the Data Bus & 64
|
||||
\end{tblr}
|
||||
% }
|
||||
\caption[A list of the used configuration parameters of \aca{hbm}]{A list of the used configuration parameters of \aca{hbm}.}
|
||||
\caption{A list of the used configuration parameters of \aca{hbm}.}
|
||||
\label{tab:memspec}
|
||||
\end{table}
|
||||
|
||||
@@ -179,7 +179,7 @@ JUMP -1, 7
|
||||
FILL BANK, GRF_B #0
|
||||
EXIT
|
||||
\end{verbatim}
|
||||
\caption[A complete \ac{gemv} microkernel]{A complete \ac{gemv} microkernel.}
|
||||
\caption{A complete \ac{gemv} microkernel.}
|
||||
\label{lst:gemv_microkernel}
|
||||
\end{listing}
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ enum File {
|
||||
}
|
||||
\end{minted}
|
||||
\end{minipage}
|
||||
\caption[The \texttt{enum} definitions of the instructions and register files]{The \texttt{enum} definitions of the instructions and register files.}
|
||||
\caption{The \texttt{enum} definitions of the instructions and register files.}
|
||||
\label{lst:instruction_enums}
|
||||
\end{listing}
|
||||
A microkernel is then simply an array consisting of instructions of size 32.
|
||||
@@ -81,7 +81,7 @@ This \texttt{ComputeArray} and \texttt{BankArray} layout is illustrated in \cref
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{images/compute_array}
|
||||
\caption[Memory layout of a flat \ac{fp16} array spanning over four banks]{Memory layout of a flat \ac{fp16} array spanning over four banks.}
|
||||
\caption{Memory layout of a flat \ac{fp16} array spanning over four banks.}
|
||||
\label{img:compute_array}
|
||||
\end{figure}
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ In addition, Moore's Law is slowing down as further device scaling approaches ph
|
||||
\begin{figure}[!ht]
|
||||
\centering
|
||||
\input{plots/energy_chart}
|
||||
\caption[Total energy of computing]{Total energy of computing \cite{src2021}.}
|
||||
\caption[Total energy of computing.]{Total energy of computing \cite{src2021}.}
|
||||
\label{plt:enery_chart}
|
||||
\end{figure}
|
||||
|
||||
@@ -34,7 +34,7 @@ In contrast, compute-intensive workloads, such as visual processing, are referre
|
||||
\begin{figure}[!ht]
|
||||
\centering
|
||||
\input{plots/roofline}
|
||||
\caption[Roofline model of GPT revisions]{Roofline model of GPT revisions \cite{ivobolsens2023}.}
|
||||
\caption[Roofline model of GPT revisions.]{Roofline model of GPT revisions \cite{ivobolsens2023}.}
|
||||
\label{plt:roofline}
|
||||
\end{figure}
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ This process is illustrated in \cref{img:dnn} where one \ac{dnn} layer is proces
|
||||
\begin{figure}
|
||||
\centering
|
||||
\input{images/dnn}
|
||||
\caption[A fully connected \ac{dnn} layer]{A fully connected \ac{dnn} layer \cite{he2020}.}
|
||||
\caption[A fully connected \ac{dnn} layer.]{A fully connected \ac{dnn} layer \cite{he2020}.}
|
||||
\label{img:dnn}
|
||||
\end{figure}
|
||||
|
||||
@@ -108,7 +108,7 @@ To make full use of the output buffering, the matrix rows are interleaved in an
|
||||
\begin{figure}
|
||||
\centering
|
||||
\input{images/hynix}
|
||||
\caption[Newton memory layout for a \ac{gemv} operation]{Newton memory layout for a \ac{gemv} operation \cite{he2020}.}
|
||||
\caption[Newton memory layout for a \ac{gemv} operation.]{Newton memory layout for a \ac{gemv} operation \cite{he2020}.}
|
||||
\label{img:hynix}
|
||||
\end{figure}
|
||||
|
||||
@@ -142,7 +142,7 @@ This general architecture is shown in detail in \cref{img:fimdram}, with (a) the
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{images/fimdram}
|
||||
\caption[Architecture of \aca{fimdram}]{Architecture of \aca{fimdram} \cite{lee2021}.}
|
||||
\caption[Architecture of \aca{fimdram}.]{Architecture of \aca{fimdram} \cite{lee2021}.}
|
||||
\label{img:fimdram}
|
||||
\end{figure}
|
||||
|
||||
@@ -185,7 +185,7 @@ This processing unit architecture is illustrated in \cref{img:pcu}, along with t
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=0.8\linewidth]{images/pcu}
|
||||
\caption[Architecture of a \ac{pim} processing unit]{Architecture of a \ac{pim} processing unit \cite{lee2021}.}
|
||||
\caption[Architecture of a \ac{pim} processing unit.]{Architecture of a \ac{pim} processing unit \cite{lee2021}.}
|
||||
\label{img:pcu}
|
||||
\end{figure}
|
||||
|
||||
@@ -200,7 +200,7 @@ The data layout of these three instruction groups is shown in \cref{tab:isa}.
|
||||
\begin{table}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{images/isa}
|
||||
\caption[The instruction format of the processing units]{The instruction format of the processing units \cite{lee2021}.}
|
||||
\caption[The instruction format of the processing units.]{The instruction format of the processing units \cite{lee2021}.}
|
||||
\label{tab:isa}
|
||||
\end{table}
|
||||
|
||||
@@ -235,7 +235,7 @@ Another special field \textit{A} enables the \ac{aam}, which will be explained i
|
||||
Arithmetic & MAC & multiply-accumulate & GRF-B & GRF, BANK & GRF, BANK, SRF & GRF, BANK, SRF \\
|
||||
Arithmetic & MAD & multiply-and-add & GRF & GRF, BANK & GRF, BANK, SRF & GRF, BANK, SRF
|
||||
\end{tblr}}
|
||||
\caption[A list of all supported \ac{pim} instructions and their possible sources and destinations]{A list of all supported \ac{pim} instructions and their possible sources and destinations \cite{shin-haengkang2023}.}
|
||||
\caption[A list of all supported \ac{pim} instructions and their possible sources and destinations.]{A list of all supported \ac{pim} instructions and their possible sources and destinations \cite{shin-haengkang2023}.}
|
||||
\label{tab:instruction_set}
|
||||
\end{table}
|
||||
|
||||
@@ -259,7 +259,7 @@ For example, as shown in \cref{lst:reorder}, two consecutive \ac{mac} instructio
|
||||
MAC GRF_B #0, BANK, GRF_A #0
|
||||
MAC GRF_B #1, BANK, GRF_A #1
|
||||
\end{verbatim}
|
||||
\caption[Exemplary sequence of \ac{mac} instructions in a microkernel]{Exemplary sequence of \ac{mac} instructions in a microkernel.}
|
||||
\caption[Exemplary sequence of \ac{mac} instructions in a microkernel.]{Exemplary sequence of \ac{mac} instructions in a microkernel.}
|
||||
\label{lst:reorder}
|
||||
\end{listing}
|
||||
|
||||
@@ -276,7 +276,7 @@ With this method, the register indices and the bank address cannot get out of sy
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=0.5\linewidth]{images/aam}
|
||||
\caption[Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address]{Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address \cite{lee2021}.}
|
||||
\caption[Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address.]{Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address \cite{lee2021}.}
|
||||
\label{img:aam}
|
||||
\end{figure}
|
||||
|
||||
@@ -288,7 +288,7 @@ At the core of a \ac{gemv} microkernel is an iterative \ac{mac} instruction, fol
|
||||
MAC(AAM) GRF_B, BANK, GRF_A
|
||||
JUMP -1, 7
|
||||
\end{verbatim}
|
||||
\caption[The core of a \ac{gemv} microkernel]{The core of a \ac{gemv} microkernel.}
|
||||
\caption[The core of a \ac{gemv} microkernel.]{The core of a \ac{gemv} microkernel.}
|
||||
\label{lst:gemv}
|
||||
\end{listing}
|
||||
|
||||
@@ -342,7 +342,7 @@ This interleaving is illustrated in \cref{img:input_vector}.
|
||||
\begin{figure}
|
||||
\centering
|
||||
\input{images/input_vector}
|
||||
\caption[Input vector in linear address space, where one chunk is mapped to all banks]{Input vector in linear address space, where one chunk is mapped to all banks.}
|
||||
\caption{Input vector in linear address space, where one chunk is mapped to all banks.}
|
||||
\label{img:input_vector}
|
||||
\end{figure}
|
||||
|
||||
@@ -366,7 +366,7 @@ The operation of this concrete \ac{gemv} microkernel is illustrated in \cref{img
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=0.8\linewidth]{images/memory_layout}
|
||||
\caption[Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation]{Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation. One cell represents 16 \ac{fp16} elements, forming a $\qty{32}{\byte}$ block \cite{kang2022}.}
|
||||
\caption[Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation.]{Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation. One cell represents 16 \ac{fp16} elements, forming a $\qty{32}{\byte}$ block \cite{kang2022}.}
|
||||
\label{img:memory_layout}
|
||||
\end{figure}
|
||||
|
||||
@@ -380,7 +380,7 @@ As a side effect of the incremented matrix row address, this also results in an
|
||||
MAC(AAM) GRF_B, BANK, GRF_A
|
||||
JUMP -1, 63
|
||||
\end{verbatim}
|
||||
\caption[The core of a \ac{mac} microkernel that utilizes the maximum number of register entries]{The core of a \ac{mac} microkernel that utilizes the maximum number of register entries.}
|
||||
\caption[The core of a \ac{mac} microkernel that utilizes the maximum number of register entries.]{The core of a \ac{mac} microkernel that utilizes the maximum number of register entries.}
|
||||
\label{lst:gemv64}
|
||||
\end{listing}
|
||||
A further increase in the total number of rows can be achieved by distributing the weight matrix over multiple \acp{pch} and running the microkernel multiple times, concatenating the output vectors on the host at the end.
|
||||
@@ -405,7 +405,7 @@ The following \cref{sec:vp} introduces the concept of virtual prototyping, which
|
||||
\begin{landscape}
|
||||
\begin{figure}
|
||||
\input{images/matrix_layout}
|
||||
\caption[Mapping of the weight matrix onto the memory banks and its layout in the linear address space]{Mapping of the weight matrix onto the memory banks and its layout in the linear address space.}
|
||||
\caption[Mapping of the weight matrix onto the memory banks and its layout in the linear address space.]{Mapping of the weight matrix onto the memory banks and its layout in the linear address space.}
|
||||
\label{img:matrix_layout}
|
||||
\end{figure}
|
||||
\end{landscape}
|
||||
|
||||
@@ -72,16 +72,94 @@ This allows an exaggerated evaluation of the performance gains of \ac{pim} in an
|
||||
% comparison with normal clock and infinite compute (immer 4 simulationen, bzw. 5 mit echter hardware)
|
||||
|
||||
\subsection{Simulation Results}
|
||||
\subsubsection{Workload Kernels}
|
||||
\subsubsection{Vector Operations}
|
||||
% Vector ADD und Vector MUL
|
||||
% Vector Skalar ADD und Vector Skalar MUL (HCAL)
|
||||
% Vector Skalar ADD und Vector Skalar MUL (HCAL) (wird wohl übersprungen)
|
||||
% Vector HAXPY x*a+y
|
||||
|
||||
% Plots zB VADD VMUL nebeneinander für versch. Dimensionen und einer Frequenz
|
||||
% andere Frequenz nächster Plot
|
||||
% dann HAXPY
|
||||
|
||||
The first set of benchmarks analyzes the speedup of \aca{fimdram} for various vector operations, namely an element-wise vector add operation (VADD), an element-wise vector multiply operation (VMUL), and a \ac{haxpy} operation.
|
||||
Such vector operations have a low operational density and are particularly memory-bound because there is no data reuse at all and two input operands must be loaded for each operation.
|
||||
As a result, the on-chip cache does not accelerate such workloads because all operand data must be fetched from memory anyway.
|
||||
The workloads adhere to the following calculation patterns:
|
||||
|
||||
\begin{itemize}
|
||||
\item VADD: $z = x + y$
|
||||
\item VMUL: $z = x \cdot y$
|
||||
\item \ac{haxpy}: $z = a \cdot x + y$
|
||||
\end{itemize}
|
||||
|
||||
Each workload is run with different input vector dimensions to examine the effect of setup overhead and potentially identify a break-even point at which \ac{pim} becomes viable.
|
||||
\Cref{tab:dimensions_vector} lists the specific vector dimensions for the following benchmarks.
|
||||
The levels X1--X4 denote increasing dimensions, with each level doubling the size of the previous one.
|
||||
|
||||
\begin{table}
|
||||
\centering
|
||||
\begin{tblr}{
|
||||
cell{2}{2} = {r},
|
||||
cell{3}{2} = {r},
|
||||
cell{4}{2} = {r},
|
||||
cell{5}{2} = {r},
|
||||
hlines,
|
||||
vlines,
|
||||
hline{2} = {-}{solid,black},
|
||||
hline{2} = {2}{-}{solid,black},
|
||||
}
|
||||
Level & Dimensions \\
|
||||
X1 & (256 $\times$ 1) \\
|
||||
X2 & (512 $\times$ 1) \\
|
||||
X3 & (1024 $\times$ 1) \\
|
||||
X4 & (2048 $\times$ 1)
|
||||
\end{tblr}
|
||||
\caption{List of the input vector dimensions for the vector benchmarks.}
|
||||
\label{tab:dimensions_vector}
|
||||
\end{table}
|
||||
|
||||
The benchmarks analyze the relative number of processor ticks, where the speedup $S$ is calculated as follows:
|
||||
\begin{equation}
|
||||
S = \frac{\textrm{\# of ticks in non-\ac{pim} mode}}{\textrm{\# of ticks in \ac{pim} mode}}
|
||||
\end{equation}
|
||||
|
||||
\begin{figure}
|
||||
\centering
|
||||
\input{plots/vector_normal}
|
||||
\caption{Comparison between non-\ac{pim} and \ac{pim} for the vector benchmarks running at a \ac{cpu} frequency of $\qty{3}{\giga\hertz}$.}
|
||||
\label{fig:vector_normal}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}
|
||||
\centering
|
||||
\input{plots/vector_infinite}
|
||||
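\caption{Comparison between non-\ac{pim} and \ac{pim} for the vector benchmarks running at a \ac{cpu} frequency of $\qty{100}{\giga\hertz}$.}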
|
||||
\label{fig:vector_infinite}
|
||||
\end{figure}
|
||||
|
||||
\subsubsection{Neural Network Layers}
|
||||
% GEMV
|
||||
% Samsung 7.4x-8.9x
|
||||
% "inference" mit mehreren layern
|
||||
% ReLU vergleich
|
||||
|
||||
% GEMM mit stark interleavten matrizen
|
||||
% GEMM mit stark interleavten matrizen (eher nicht)
|
||||
|
||||
\begin{figure}
|
||||
\centering
|
||||
\input{plots/matrix_normal}
|
||||
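\caption{Comparison between non-\ac{pim} and \ac{pim} for the \ac{gemv} and \ac{dnn} layer benchmarks running at a \ac{cpu} frequency of $\qty{3}{\giga\hertz}$.}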
|
||||
\label{fig:matrix_normal}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}
|
||||
\centering
|
||||
\input{plots/matrix_infinite}
|
||||
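\caption{Comparison between non-\ac{pim} and \ac{pim} for the \ac{gemv} and \ac{dnn} layer benchmarks running at a \ac{cpu} frequency of $\qty{100}{\giga\hertz}$.}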
|
||||
\label{fig:matrix_infinite}
|
||||
\end{figure}
|
||||
|
||||
\subsubsection{Comparison to Real Hardware}
|
||||
|
||||
% \subsubsection{Initialization Overhead}
|
||||
% conversion der operanden im verhältnis zur laufzeit abschätzen
|
||||
|
||||
@@ -45,7 +45,7 @@ The framework is optimized for high simulation speed and uses the \ac{at} coding
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=0.8\linewidth]{images/dramsys}
|
||||
\caption[The internal architecture of DRAMSys]{The internal architecture of DRAMSys \cite{jung2017a}.}
|
||||
\caption[The internal architecture of DRAMSys.]{The internal architecture of DRAMSys \cite{jung2017a}.}
|
||||
\label{img:dramsys}
|
||||
\end{figure}
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
\usepackage{url}
|
||||
\usepackage[urldate=long,sorting=none,maxbibnames=5]{biblatex}
|
||||
\usepackage{pgfplots}
|
||||
\usepackage{pgfplotstable}
|
||||
\usepackage{bytefield}
|
||||
\usepackage{mathdots}
|
||||
\usepackage{tabularray}
|
||||
@@ -46,6 +47,11 @@
|
||||
|
||||
% Custom colors
|
||||
\definecolor{verylightgray}{gray}{0.85}
|
||||
\definecolor{_darkblue}{RGB}{68, 114, 196}
|
||||
\definecolor{_blue}{RGB}{91, 155, 213}
|
||||
\definecolor{_green}{RGB}{112, 173, 71}
|
||||
\definecolor{_orange}{RGB}{237, 125, 49}
|
||||
\definecolor{_yellow}{RGB}{255, 192, 0}
|
||||
|
||||
% Penalties
|
||||
\clubpenalty = 10000
|
||||
|
||||
31
src/plots/matrix_infinite.tex
Normal file
31
src/plots/matrix_infinite.tex
Normal file
@@ -0,0 +1,31 @@
|
||||
% Grouped bar chart of the speedup column for the GEMV and DNN-layer
% benchmarks, using the 100 GHz result tables.
\begin{tikzpicture}
    % Comma-separated result tables (columns: workload, level, frequency, speedup).
    \pgfplotstableread[col sep=comma]{plots/tables/gemv_100GHz.csv}\gemv
    \pgfplotstableread[col sep=comma]{plots/tables/gemv_layers_100GHz.csv}\gemvlayers

    \begin{axis}[
        % Plot size and grouped vertical bars (1pt gap between bars of a group).
        width=0.9\textwidth,
        ybar=1pt,
        bar width=15pt,
        % y axis: fixed range with a labelled grid line at every integer.
        ymin=0, ymax=5,
        ytick distance=1,
        ymajorgrids,
        ylabel={Speedup},
        % x axis: one tick per table row, labelled with the "level" column.
        tick pos=left,
        xtick=data,
        xticklabels from table={\gemv}{level},
        enlarge x limits=0.25,
        % Frameless single-row legend, centred horizontally on the axis and
        % attached to the bottom of the current bounding box.
        legend style={
            at={(current bounding box.south-|current axis.south)},
            anchor=north,
            legend columns=-1,
            draw=none,
            /tikz/every even column/.append style={column sep=0.5cm}
        },
    ]
        % The row index serves as the x coordinate; bar height is the speedup.
        \addplot [fill=_blue!90]
            table [x expr=\coordindex, y=speedup] {\gemv};
        \addlegendentry{GEMV}

        \addplot [fill=_orange!90]
            table [x expr=\coordindex, y=speedup] {\gemvlayers};
        \addlegendentry{DNN Layers}
    \end{axis}
\end{tikzpicture}
|
||||
31
src/plots/matrix_normal.tex
Normal file
31
src/plots/matrix_normal.tex
Normal file
@@ -0,0 +1,31 @@
|
||||
% Grouped bar chart of the speedup column for the GEMV and DNN-layer
% benchmarks, using the 3 GHz result tables.
\begin{tikzpicture}
    % Comma-separated result tables (columns: workload, level, frequency, speedup).
    \pgfplotstableread[col sep=comma]{plots/tables/gemv_3GHz.csv}\gemv
    \pgfplotstableread[col sep=comma]{plots/tables/gemv_layers_3GHz.csv}\gemvlayers

    \begin{axis}[
        % Plot size and grouped vertical bars (1pt gap between bars of a group).
        width=0.9\textwidth,
        ybar=1pt,
        bar width=15pt,
        % y axis: fixed range, minor ticks between the major ones, major grid lines.
        ymin=0, ymax=35,
        minor y tick num=5,
        ymajorgrids,
        ylabel={Speedup},
        % x axis: one tick per table row, labelled with the "level" column.
        tick pos=left,
        xtick=data,
        xticklabels from table={\gemv}{level},
        enlarge x limits=0.25,
        % Frameless single-row legend, centred horizontally on the axis and
        % attached to the bottom of the current bounding box.
        legend style={
            at={(current bounding box.south-|current axis.south)},
            anchor=north,
            legend columns=-1,
            draw=none,
            /tikz/every even column/.append style={column sep=0.5cm}
        },
    ]
        % The row index serves as the x coordinate; bar height is the speedup.
        \addplot [fill=_blue!90]
            table [x expr=\coordindex, y=speedup] {\gemv};
        \addlegendentry{GEMV}

        \addplot [fill=_orange!90]
            table [x expr=\coordindex, y=speedup] {\gemvlayers};
        \addlegendentry{DNN Layers}
    \end{axis}
\end{tikzpicture}
|
||||
5
src/plots/tables/gemv_100GHz.csv
Normal file
5
src/plots/tables/gemv_100GHz.csv
Normal file
@@ -0,0 +1,5 @@
|
||||
workload,level,frequency,speedup
|
||||
gemv,X1,100GHz,0.2108059965502083
|
||||
gemv,X2,100GHz,0.40509080127411157
|
||||
gemv,X3,100GHz,0.8462958338758609
|
||||
gemv,X4,100GHz,4.7274497979448125
|
||||
|
5
src/plots/tables/gemv_3GHz.csv
Normal file
5
src/plots/tables/gemv_3GHz.csv
Normal file
@@ -0,0 +1,5 @@
|
||||
workload,level,frequency,speedup
|
||||
gemv,X1,3GHz,3.468782996825547
|
||||
gemv,X2,3GHz,6.723879985176877
|
||||
gemv,X3,3GHz,12.744110856471028
|
||||
gemv,X4,3GHz,23.645526777997713
|
||||
|
5
src/plots/tables/gemv_layers_100GHz.csv
Normal file
5
src/plots/tables/gemv_layers_100GHz.csv
Normal file
@@ -0,0 +1,5 @@
|
||||
workload,level,frequency,speedup
|
||||
gemv_layers,X1,100GHz,0.18703680911951287
|
||||
gemv_layers,X2,100GHz,0.35722454947444127
|
||||
gemv_layers,X3,100GHz,0.6338568319278073
|
||||
gemv_layers,X4,100GHz,1.638629460755059
|
||||
|
5
src/plots/tables/gemv_layers_3GHz.csv
Normal file
5
src/plots/tables/gemv_layers_3GHz.csv
Normal file
@@ -0,0 +1,5 @@
|
||||
workload,level,frequency,speedup
|
||||
gemv_layers,X1,3GHz,3.194018430394461
|
||||
gemv_layers,X2,3GHz,6.206580081241512
|
||||
gemv_layers,X3,3GHz,11.305511591995977
|
||||
gemv_layers,X4,3GHz,20.27760945615218
|
||||
|
5
src/plots/tables/haxpy_100GHz.csv
Normal file
5
src/plots/tables/haxpy_100GHz.csv
Normal file
@@ -0,0 +1,5 @@
|
||||
workload,level,frequency,speedup
|
||||
haxpy,X1,100GHz,2.0481358611246403
|
||||
haxpy,X2,100GHz,2.3234133539462776
|
||||
haxpy,X3,100GHz,2.272582592673281
|
||||
haxpy,X4,100GHz,2.3895030032424387
|
||||
|
5
src/plots/tables/haxpy_3GHz.csv
Normal file
5
src/plots/tables/haxpy_3GHz.csv
Normal file
@@ -0,0 +1,5 @@
|
||||
workload,level,frequency,speedup
|
||||
haxpy,X1,3GHz,19.816741597088416
|
||||
haxpy,X2,3GHz,25.395400082633245
|
||||
haxpy,X3,3GHz,28.676005064893953
|
||||
haxpy,X4,3GHz,31.783592582828017
|
||||
|
5
src/plots/tables/vadd_100GHz.csv
Normal file
5
src/plots/tables/vadd_100GHz.csv
Normal file
@@ -0,0 +1,5 @@
|
||||
workload,level,frequency,speedup
|
||||
vadd,X1,100GHz,2.398047786583271
|
||||
vadd,X2,100GHz,1.823243660465808
|
||||
vadd,X3,100GHz,1.562017010059411
|
||||
vadd,X4,100GHz,1.7888939829610704
|
||||
|
5
src/plots/tables/vadd_3GHz.csv
Normal file
5
src/plots/tables/vadd_3GHz.csv
Normal file
@@ -0,0 +1,5 @@
|
||||
workload,level,frequency,speedup
|
||||
vadd,X1,3GHz,12.766775777414075
|
||||
vadd,X2,3GHz,14.19338061465721
|
||||
vadd,X3,3GHz,15.313227057302887
|
||||
vadd,X4,3GHz,16.430379164365913
|
||||
|
5
src/plots/tables/vmul_100GHz.csv
Normal file
5
src/plots/tables/vmul_100GHz.csv
Normal file
@@ -0,0 +1,5 @@
|
||||
workload,level,frequency,speedup
|
||||
vmul,X1,100GHz,2.4019901321627604
|
||||
vmul,X2,100GHz,2.2189241847884267
|
||||
vmul,X3,100GHz,1.86705278821741
|
||||
vmul,X4,100GHz,1.7484391189395834
|
||||
|
5
src/plots/tables/vmul_3GHz.csv
Normal file
5
src/plots/tables/vmul_3GHz.csv
Normal file
@@ -0,0 +1,5 @@
|
||||
workload,level,frequency,speedup
|
||||
vmul,X1,3GHz,14.157521157521158
|
||||
vmul,X2,3GHz,15.915413533834586
|
||||
vmul,X3,3GHz,16.959713823354058
|
||||
vmul,X4,3GHz,18.215465292791755
|
||||
|
35
src/plots/vector_infinite.tex
Normal file
35
src/plots/vector_infinite.tex
Normal file
@@ -0,0 +1,35 @@
|
||||
% Grouped bar chart of the speedup column for the VADD, VMUL and HAXPY
% benchmarks, using the 100 GHz result tables.
\begin{tikzpicture}
    % Comma-separated result tables (columns: workload, level, frequency, speedup).
    \pgfplotstableread[col sep=comma]{plots/tables/vadd_100GHz.csv}\vadd
    \pgfplotstableread[col sep=comma]{plots/tables/vmul_100GHz.csv}\vmul
    \pgfplotstableread[col sep=comma]{plots/tables/haxpy_100GHz.csv}\haxpy

    \begin{axis}[
        % Plot size and grouped vertical bars (1pt gap between bars of a group).
        width=0.9\textwidth,
        ybar=1pt,
        bar width=15pt,
        % y axis: fixed range with a labelled grid line at every integer.
        ymin=0, ymax=5,
        ytick distance=1,
        ymajorgrids,
        ylabel={Speedup},
        % x axis: one tick per table row, labelled with the "level" column.
        tick pos=left,
        xtick=data,
        xticklabels from table={\vadd}{level},
        enlarge x limits=0.25,
        % Frameless single-row legend, centred horizontally on the axis and
        % attached to the bottom of the current bounding box.
        legend style={
            at={(current bounding box.south-|current axis.south)},
            anchor=north,
            legend columns=-1,
            draw=none,
            /tikz/every even column/.append style={column sep=0.5cm}
        },
    ]
        % The row index serves as the x coordinate; bar height is the speedup.
        \addplot [fill=_blue!90]
            table [x expr=\coordindex, y=speedup] {\vadd};
        \addlegendentry{VADD}

        \addplot [fill=_orange!90]
            table [x expr=\coordindex, y=speedup] {\vmul};
        \addlegendentry{VMUL}

        \addplot [fill=_yellow!90]
            table [x expr=\coordindex, y=speedup] {\haxpy};
        \addlegendentry{HAXPY}
    \end{axis}
\end{tikzpicture}
|
||||
35
src/plots/vector_normal.tex
Normal file
35
src/plots/vector_normal.tex
Normal file
@@ -0,0 +1,35 @@
|
||||
% Grouped bar chart of the speedup column for the VADD, VMUL and HAXPY
% benchmarks, using the 3 GHz result tables.
\begin{tikzpicture}
    % Comma-separated result tables (columns: workload, level, frequency, speedup).
    \pgfplotstableread[col sep=comma]{plots/tables/vadd_3GHz.csv}\vadd
    \pgfplotstableread[col sep=comma]{plots/tables/vmul_3GHz.csv}\vmul
    \pgfplotstableread[col sep=comma]{plots/tables/haxpy_3GHz.csv}\haxpy

    \begin{axis}[
        % Plot size and grouped vertical bars (1pt gap between bars of a group).
        width=0.9\textwidth,
        ybar=1pt,
        bar width=15pt,
        % y axis: fixed range, minor ticks between the major ones, major grid lines.
        ymin=0, ymax=35,
        minor y tick num=5,
        ymajorgrids,
        ylabel={Speedup},
        % x axis: one tick per table row, labelled with the "level" column.
        tick pos=left,
        xtick=data,
        xticklabels from table={\vadd}{level},
        enlarge x limits=0.25,
        % Frameless single-row legend, centred horizontally on the axis and
        % attached to the bottom of the current bounding box.
        legend style={
            at={(current bounding box.south-|current axis.south)},
            anchor=north,
            legend columns=-1,
            draw=none,
            /tikz/every even column/.append style={column sep=0.5cm}
        },
    ]
        % The row index serves as the x coordinate; bar height is the speedup.
        \addplot [fill=_blue!90]
            table [x expr=\coordindex, y=speedup] {\vadd};
        \addlegendentry{VADD}

        \addplot [fill=_orange!90]
            table [x expr=\coordindex, y=speedup] {\vmul};
        \addlegendentry{VMUL}

        \addplot [fill=_yellow!90]
            table [x expr=\coordindex, y=speedup] {\haxpy};
        \addlegendentry{HAXPY}
    \end{axis}
\end{tikzpicture}
|
||||
Reference in New Issue
Block a user