First simulation plots

This commit is contained in:
2024-02-28 20:33:12 +01:00
parent b197caa00b
commit ee2405aaa9
23 changed files with 299 additions and 30 deletions

View File

@@ -1,6 +1,9 @@
\section{Appendix}
\label{sec:appendix}
\subsection{Simulation Results}
\subsection{Microkernels}
\subsection{Source Code}
% etwas source code,
% von der vm
% einige microkernels

View File

@@ -22,7 +22,7 @@ Because the charge stored in each cell is very small, so-called \acp{psa} are ne
\begin{figure}
\centering
\includegraphics[width=\linewidth]{images/psa}
\caption[\ac{psa} of an open bitline architecture]{\ac{psa} of an open bitline architecture \cite{jacob2008} \cite{jung2017a}.}
\caption[\ac{psa} of an open bitline architecture.]{\ac{psa} of an open bitline architecture \cite{jacob2008} \cite{jung2017a}.}
\label{img:psa}
\end{figure}
@@ -39,7 +39,7 @@ The \cref{img:bank} summarizes the basic architecture of a single storage device
\begin{figure}
\centering
\includegraphics[width=\linewidth]{images/bank}
\caption[Architecture of a single DRAM device]{Architecture of a single DRAM device \cite{jung2017a}.}
\caption[Architecture of a single DRAM device.]{Architecture of a single DRAM device \cite{jung2017a}.}
\label{img:bank}
\end{figure}
@@ -83,7 +83,7 @@ Because banks can be controlled independently, one bank can be outputting the ne
\bitbox{3}[bgcolor=verylightgray]{}
\end{bytefield}
\caption[Exemplary address mapping scheme]{Exemplary address mapping scheme for an input address of size 32.}
\caption{Exemplary address mapping scheme for an input address of size 32.}
\label{img:bank_interleaving}
\end{figure}
@@ -102,7 +102,7 @@ Several \ac{dram} dies are stacked on top of each other and connected with \acp{
\begin{figure}
\centering
\includegraphics[width=0.8\linewidth]{images/sip}
\caption[Cross-section view of an \ac{hbm} \ac{sip}]{Cross-section view of a \ac{hbm} \ac{sip} \cite{lee2021}.}
\caption[Cross-section view of an \ac{hbm} \ac{sip}.]{Cross-section view of an \ac{hbm} \ac{sip} \cite{lee2021}.}
\label{img:sip}
\end{figure}
Such a cube is then placed onto a common silicon interposer that connects the \ac{dram} to its host processor.
@@ -123,7 +123,7 @@ In the center of the die, the \acp{tsv} connect the die to the next die above it
\begin{figure}
\centering
\includegraphics[width=0.8\linewidth]{images/hbm}
\caption[\aca{hbm} memory die architecture]{\aca{hbm} memory die architecture \cite{lee2021}.}
\caption[\aca{hbm} memory die architecture.]{\aca{hbm} memory die architecture \cite{lee2021}.}
\label{img:hbm}
\end{figure}

View File

@@ -59,7 +59,7 @@ Each granule size has a different maximum amount of page table nesting, with up
\begin{figure}
\centering
\includegraphics[width=\linewidth]{images/pagetable_granule}
\caption[The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule]{The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule \cite{arm2015}.}
\caption[The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule.]{The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule \cite{arm2015}.}
\label{img:pagetable_granule}
\end{figure}
@@ -117,7 +117,7 @@ The simplified \ac{am} following this scheme is shown in \cref{img:hbm2_am}.
\bitbox{3}[bgcolor=verylightgray]{}
\end{bytefield}
\caption[Simplified \aca{hbm} address mapping with a split column mapping]{Simplified \aca{hbm} address mapping with a split column mapping.}
\caption{Simplified \aca{hbm} address mapping with a split column mapping.}
\label{img:hbm2_am}
\end{figure}
@@ -150,7 +150,7 @@ The concrete values for these parameters are listed in \cref{tab:memspec}.
Width & Width of the Data Bus & 64
\end{tblr}
% }
\caption[A list of the used configuration parameters of \aca{hbm}]{A list of the used configuration parameters of \aca{hbm}.}
\caption{A list of the used configuration parameters of \aca{hbm}.}
\label{tab:memspec}
\end{table}
@@ -179,7 +179,7 @@ JUMP -1, 7
FILL BANK, GRF_B #0
EXIT
\end{verbatim}
\caption[A complete \ac{gemv} microkernel]{A complete \ac{gemv} microkernel.}
\caption{A complete \ac{gemv} microkernel.}
\label{lst:gemv_microkernel}
\end{listing}

View File

@@ -51,7 +51,7 @@ enum File {
}
\end{minted}
\end{minipage}
\caption[The \texttt{enum} definitions of the instructions and register files]{The \texttt{enum} definitions of the instructions and register files.}
\caption{The \texttt{enum} definitions of the instructions and register files.}
\label{lst:instruction_enums}
\end{listing}
A microkernel is then simply an array consisting of instructions of size 32.
@@ -81,7 +81,7 @@ This \texttt{ComputeArray} and \texttt{BankArray} layout is illustrated in \cref
\begin{figure}
\centering
\includegraphics[width=\linewidth]{images/compute_array}
\caption[Memory layout of a flat \ac{fp16} array spanning over four banks]{Memory layout of a flat \ac{fp16} array spanning over four banks.}
\caption{Memory layout of a flat \ac{fp16} array spanning over four banks.}
\label{img:compute_array}
\end{figure}

View File

@@ -17,7 +17,7 @@ In addition, Moore's Law is slowing down as further device scaling approaches ph
\begin{figure}[!ht]
\centering
\input{plots/energy_chart}
\caption[Total energy of computing]{Total energy of computing \cite{src2021}.}
\caption[Total energy of computing.]{Total energy of computing \cite{src2021}.}
\label{plt:enery_chart}
\end{figure}
@@ -34,7 +34,7 @@ In contrast, compute-intensive workloads, such as visual processing, are referre
\begin{figure}[!ht]
\centering
\input{plots/roofline}
\caption[Roofline model of GPT revisions]{Roofline model of GPT revisions \cite{ivobolsens2023}.}
\caption[Roofline model of GPT revisions.]{Roofline model of GPT revisions \cite{ivobolsens2023}.}
\label{plt:roofline}
\end{figure}

View File

@@ -26,7 +26,7 @@ This process is illustrated in \cref{img:dnn} where one \ac{dnn} layer is proces
\begin{figure}
\centering
\input{images/dnn}
\caption[A fully connected \ac{dnn} layer]{A fully connected \ac{dnn} layer \cite{he2020}.}
\caption[A fully connected \ac{dnn} layer.]{A fully connected \ac{dnn} layer \cite{he2020}.}
\label{img:dnn}
\end{figure}
@@ -108,7 +108,7 @@ To make full use of the output buffering, the matrix rows are interleaved in an
\begin{figure}
\centering
\input{images/hynix}
\caption[Newton memory layout for a \ac{gemv} operation]{Newton memory layout for a \ac{gemv} operation \cite{he2020}.}
\caption[Newton memory layout for a \ac{gemv} operation.]{Newton memory layout for a \ac{gemv} operation \cite{he2020}.}
\label{img:hynix}
\end{figure}
@@ -142,7 +142,7 @@ This general architecture is shown in detail in \cref{img:fimdram}, with (a) the
\begin{figure}
\centering
\includegraphics[width=\linewidth]{images/fimdram}
\caption[Architecture of \aca{fimdram}]{Architecture of \aca{fimdram} \cite{lee2021}.}
\caption[Architecture of \aca{fimdram}.]{Architecture of \aca{fimdram} \cite{lee2021}.}
\label{img:fimdram}
\end{figure}
@@ -185,7 +185,7 @@ This processing unit architecture is illustrated in \cref{img:pcu}, along with t
\begin{figure}
\centering
\includegraphics[width=0.8\linewidth]{images/pcu}
\caption[Architecture of a \ac{pim} processing unit]{Architecture of a \ac{pim} processing unit \cite{lee2021}.}
\caption[Architecture of a \ac{pim} processing unit.]{Architecture of a \ac{pim} processing unit \cite{lee2021}.}
\label{img:pcu}
\end{figure}
@@ -200,7 +200,7 @@ The data layout of these three instruction groups is shown in \cref{tab:isa}.
\begin{table}
\centering
\includegraphics[width=\linewidth]{images/isa}
\caption[The instruction format of the processing units]{The instruction format of the processing units \cite{lee2021}.}
\caption[The instruction format of the processing units.]{The instruction format of the processing units \cite{lee2021}.}
\label{tab:isa}
\end{table}
@@ -235,7 +235,7 @@ Another special field \textit{A} enables the \ac{aam}, which will be explained i
Arithmetic & MAC & multiply-accumulate & GRF-B & GRF, BANK & GRF, BANK, SRF & GRF, BANK, SRF \\
Arithmetic & MAD & multiply-and-add & GRF & GRF, BANK & GRF, BANK, SRF & GRF, BANK, SRF
\end{tblr}}
\caption[A list of all supported \ac{pim} instructions and their possible sources and destinations]{A list of all supported \ac{pim} instructions and their possible sources and destinations \cite{shin-haengkang2023}.}
\caption[A list of all supported \ac{pim} instructions and their possible sources and destinations.]{A list of all supported \ac{pim} instructions and their possible sources and destinations \cite{shin-haengkang2023}.}
\label{tab:instruction_set}
\end{table}
@@ -259,7 +259,7 @@ For example, as shown in \cref{lst:reorder}, two consecutive \ac{mac} instructio
MAC GRF_B #0, BANK, GRF_A #0
MAC GRF_B #1, BANK, GRF_A #1
\end{verbatim}
\caption[Exemplary sequence of \ac{mac} instructions in a microkernel]{Exemplary sequence of \ac{mac} instructions in a microkernel.}
\caption[Exemplary sequence of \ac{mac} instructions in a microkernel.]{Exemplary sequence of \ac{mac} instructions in a microkernel.}
\label{lst:reorder}
\end{listing}
@@ -276,7 +276,7 @@ With this method, the register indices and the bank address cannot get out of sy
\begin{figure}
\centering
\includegraphics[width=0.5\linewidth]{images/aam}
\caption[Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address]{Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address \cite{lee2021}.}
\caption[Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address.]{Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address \cite{lee2021}.}
\label{img:aam}
\end{figure}
@@ -288,7 +288,7 @@ At the core of a \ac{gemv} microkernel is an iterative \ac{mac} instruction, fol
MAC(AAM) GRF_B, BANK, GRF_A
JUMP -1, 7
\end{verbatim}
\caption[The core of a \ac{gemv} microkernel]{The core of a \ac{gemv} microkernel.}
\caption[The core of a \ac{gemv} microkernel.]{The core of a \ac{gemv} microkernel.}
\label{lst:gemv}
\end{listing}
@@ -342,7 +342,7 @@ This interleaving is illustrated in \cref{img:input_vector}.
\begin{figure}
\centering
\input{images/input_vector}
\caption[Input vector in linear address space, where one chunk is mapped to all banks]{Input vector in linear address space, where one chunk is mapped to all banks.}
\caption{Input vector in linear address space, where one chunk is mapped to all banks.}
\label{img:input_vector}
\end{figure}
@@ -366,7 +366,7 @@ The operation of this concrete \ac{gemv} microkernel is illustrated in \cref{img
\begin{figure}
\centering
\includegraphics[width=0.8\linewidth]{images/memory_layout}
\caption[Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation]{Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation. One cell represents 16 \ac{fp16} elements, forming a $\qty{32}{\byte}$ block \cite{kang2022}.}
\caption[Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation.]{Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation. One cell represents 16 \ac{fp16} elements, forming a $\qty{32}{\byte}$ block \cite{kang2022}.}
\label{img:memory_layout}
\end{figure}
@@ -380,7 +380,7 @@ As a side effect of the incremented matrix row address, this also results in an
MAC(AAM) GRF_B, BANK, GRF_A
JUMP -1, 63
\end{verbatim}
\caption[The core of a \ac{mac} microkernel that utilizes the maximum number of register entries]{The core of a \ac{mac} microkernel that utilizes the maximum number of register entries.}
\caption[The core of a \ac{mac} microkernel that utilizes the maximum number of register entries.]{The core of a \ac{mac} microkernel that utilizes the maximum number of register entries.}
\label{lst:gemv64}
\end{listing}
A further increase in the total number of rows can be achieved by distributing the weight matrix over multiple \acp{pch} and running the microkernel multiple times, concatenating the output vectors on the host at the end.
@@ -405,7 +405,7 @@ The following \cref{sec:vp} introduces the concept of virtual prototyping, which
\begin{landscape}
\begin{figure}
\input{images/matrix_layout}
\caption[Mapping of the weight matrix onto the memory banks and its layout in the linear address space]{Mapping of the weight matrix onto the memory banks and its layout in the linear address space.}
\caption[Mapping of the weight matrix onto the memory banks and its layout in the linear address space.]{Mapping of the weight matrix onto the memory banks and its layout in the linear address space.}
\label{img:matrix_layout}
\end{figure}
\end{landscape}

View File

@@ -72,16 +72,94 @@ This allows an exaggerated evaluation of the performance gains of \ac{pim} in an
% comparison with normal clock and infinite compute (immer 4 simulationen, bzw. 5 mit echter hardware)
\subsection{Simulation Results}
\subsubsection{Workload Kernels}
\subsubsection{Vector Operations}
% Vector ADD und Vector MUL
% Vector Skalar ADD und Vector Skalar MUL (HCAL)
% Vector Skalar ADD und Vector Skalar MUL (HCAL) (wird wohl übersprungen)
% Vector HAXPY x*a+y
% Plots zB VADD VMUL nebeneinander für versch. Dimensionen und einer Frequenz
% andere Frequenz nächster Plot
% dann HAXPY
The first set of benchmarks analyzes the speedup of \aca{fimdram} for various vector operations, namely an element-wise vector add operation (VADD), an element-wise vector multiply operation (VMUL), and a \ac{haxpy} operation.
Such vector operations have a low operational density and are particularly memory-bound because there is no data reuse at all and two input operands must be loaded for each operation.
As a result, the on-chip cache does not accelerate such workloads because all operand data must be fetched from memory anyway.
The workloads adhere to the following calculation patterns:
\begin{itemize}
\item VADD: $z = x + y$
\item VMUL: $z = x \cdot y$
\item \ac{haxpy}: $z = a \cdot x + y$
\end{itemize}
Each workload is run with different input vector dimensions to examine the effect of setup overhead and potentially identify a break-even point at which \ac{pim} becomes viable.
\Cref{tab:dimensions_vector} lists the specific vector dimensions for the following benchmarks.
The levels X1--X4 denote the increasing dimensions, with each step doubling in size.
\begin{table}
\centering
\begin{tblr}{
cell{2}{2} = {r},
cell{3}{2} = {r},
cell{4}{2} = {r},
cell{5}{2} = {r},
hlines,
vlines,
hline{2} = {-}{solid,black},
hline{2} = {2}{-}{solid,black},
}
Level & Dimensions \\
X1 & (256 $\times$ 1) \\
X2 & (512 $\times$ 1) \\
X3 & (1024 $\times$ 1) \\
X4 & (2048 $\times$ 1)
\end{tblr}
\caption{List of the input vector dimensions for the vector benchmarks.}
\label{tab:dimensions_vector}
\end{table}
The benchmarks analyze the relative number of processor ticks, where the speedup is calculated as follows:
\begin{equation}
S = \frac{\text{\# of ticks in non-\ac{pim} mode}}{\text{\# of ticks in \ac{pim} mode}}
\end{equation}
\begin{figure}
\centering
\input{plots/vector_normal}
\caption{Comparison between non-\ac{pim} and \ac{pim} for the vector benchmarks running at a \ac{cpu} frequency of $\qty{3}{\giga\hertz}$.}
\label{fig:vector_normal}
\end{figure}
\begin{figure}
\centering
\input{plots/vector_infinite}
\caption{Comparison between non-\ac{pim} and \ac{pim} for the vector benchmarks running at an exaggerated \ac{cpu} frequency of $\qty{100}{\giga\hertz}$.}
\label{fig:vector_infinite}
\end{figure}
\subsubsection{Neural Network Layers}
% GEMV
% Samsung 7.4x-8.9x
% "inference" mit mehreren layern
% ReLU vergleich
% GEMM mit stark interleavten matrizen
% GEMM mit stark interleavten matrizen (eher nicht)
\begin{figure}
\centering
\input{plots/matrix_normal}
\caption{Comparison between non-\ac{pim} and \ac{pim} for the \ac{gemv} benchmarks running at a \ac{cpu} frequency of $\qty{3}{\giga\hertz}$.}
\label{fig:matrix_normal}
\end{figure}
\begin{figure}
\centering
\input{plots/matrix_infinite}
\caption{Comparison between non-\ac{pim} and \ac{pim} for the \ac{gemv} benchmarks running at an exaggerated \ac{cpu} frequency of $\qty{100}{\giga\hertz}$.}
\label{fig:matrix_infinite}
\end{figure}
\subsubsection{Comparison to Real Hardware}
% \subsubsection{Initialization Overhead}
% conversion der operanden im verhältnis zur laufzeit abschätzen

View File

@@ -45,7 +45,7 @@ The framework is optimized for high simulation speed and uses the \ac{at} coding
\begin{figure}
\centering
\includegraphics[width=0.8\linewidth]{images/dramsys}
\caption[The internal architecture of DRAMSys]{The internal architecture of DRAMSys \cite{jung2017a}.}
\caption[The internal architecture of DRAMSys.]{The internal architecture of DRAMSys \cite{jung2017a}.}
\label{img:dramsys}
\end{figure}

View File

@@ -21,6 +21,7 @@
\usepackage{url}
\usepackage[urldate=long,sorting=none,maxbibnames=5]{biblatex}
\usepackage{pgfplots}
\usepackage{pgfplotstable}
\usepackage{bytefield}
\usepackage{mathdots}
\usepackage{tabularray}
@@ -46,6 +47,11 @@
% Custom colors
\definecolor{verylightgray}{gray}{0.85}
\definecolor{_darkblue}{RGB}{68, 114, 196}
\definecolor{_blue}{RGB}{91, 155, 213}
\definecolor{_green}{RGB}{112, 173, 71}
\definecolor{_orange}{RGB}{237, 125, 49}
\definecolor{_yellow}{RGB}{255, 192, 0}
% Penalties
\clubpenalty = 10000

View File

@@ -0,0 +1,31 @@
\begin{tikzpicture}
\pgfplotstableread[col sep=comma]{plots/tables/gemv_100GHz.csv}\gemv
\pgfplotstableread[col sep=comma]{plots/tables/gemv_layers_100GHz.csv}\gemvlayers
\begin{axis}[
width=0.9\textwidth,
ybar=1pt,
bar width = 15pt,
ymin=0,
ymax=5,
ytick distance=1,
ymajorgrids,
ylabel={Speedup},
tick pos=left,
xtick=data,
xticklabels from table={\gemv}{level},
enlarge x limits=0.25,
legend style={
at={(current bounding box.south-|current axis.south)},
anchor=north,
legend columns=-1,
draw=none,
/tikz/every even column/.append style={column sep=0.5cm}
},
]
\addplot[fill=_blue!90] table [x expr=\coordindex, y={speedup}]{\gemv};
\addlegendentry{GEMV}
\addplot[fill=_orange!90] table [x expr=\coordindex, y={speedup}]{\gemvlayers};
\addlegendentry{DNN Layers}
\end{axis}
\end{tikzpicture}

View File

@@ -0,0 +1,31 @@
% Grouped bar chart comparing the PIM speedup of the GEMV benchmark and the
% multi-layer DNN benchmark, one bar per input dimension level (X1-X4).
% Data stems from the 3 GHz CPU simulation runs (the realistic clock).
\begin{tikzpicture}
% Read the result tables (columns: workload, level, frequency, speedup).
\pgfplotstableread[col sep=comma]{plots/tables/gemv_3GHz.csv}\gemv
\pgfplotstableread[col sep=comma]{plots/tables/gemv_layers_3GHz.csv}\gemvlayers
\begin{axis}[
width=0.9\textwidth,
ybar=1pt,
bar width = 15pt,
ymin=0,
% Larger y range than the 100 GHz plot: speedups reach the low 20s here.
ymax=35,
minor y tick num = 5,
ymajorgrids,
ylabel={Speedup},
tick pos=left,
xtick=data,
% Use the dimension levels from the table's "level" column as x tick labels.
xticklabels from table={\gemv}{level},
enlarge x limits=0.25,
% Borderless one-row legend, centered horizontally below the plot.
legend style={
at={(current bounding box.south-|current axis.south)},
anchor=north,
legend columns=-1,
draw=none,
/tikz/every even column/.append style={column sep=0.5cm}
},
]
% One bar series per workload; x position is the table row index (\coordindex).
% The _blue/_orange colors are defined in the document preamble.
\addplot[fill=_blue!90] table [x expr=\coordindex, y={speedup}]{\gemv};
\addlegendentry{GEMV}
\addplot[fill=_orange!90] table [x expr=\coordindex, y={speedup}]{\gemvlayers};
\addlegendentry{DNN Layers}
\end{axis}
\end{tikzpicture}

View File

@@ -0,0 +1,5 @@
workload,level,frequency,speedup
gemv,X1,100GHz,0.2108059965502083
gemv,X2,100GHz,0.40509080127411157
gemv,X3,100GHz,0.8462958338758609
gemv,X4,100GHz,4.7274497979448125
1 workload level frequency speedup
2 gemv X1 100GHz 0.2108059965502083
3 gemv X2 100GHz 0.40509080127411157
4 gemv X3 100GHz 0.8462958338758609
5 gemv X4 100GHz 4.7274497979448125

View File

@@ -0,0 +1,5 @@
workload,level,frequency,speedup
gemv,X1,3GHz,3.468782996825547
gemv,X2,3GHz,6.723879985176877
gemv,X3,3GHz,12.744110856471028
gemv,X4,3GHz,23.645526777997713
1 workload level frequency speedup
2 gemv X1 3GHz 3.468782996825547
3 gemv X2 3GHz 6.723879985176877
4 gemv X3 3GHz 12.744110856471028
5 gemv X4 3GHz 23.645526777997713

View File

@@ -0,0 +1,5 @@
workload,level,frequency,speedup
gemv_layers,X1,100GHz,0.18703680911951287
gemv_layers,X2,100GHz,0.35722454947444127
gemv_layers,X3,100GHz,0.6338568319278073
gemv_layers,X4,100GHz,1.638629460755059
1 workload level frequency speedup
2 gemv_layers X1 100GHz 0.18703680911951287
3 gemv_layers X2 100GHz 0.35722454947444127
4 gemv_layers X3 100GHz 0.6338568319278073
5 gemv_layers X4 100GHz 1.638629460755059

View File

@@ -0,0 +1,5 @@
workload,level,frequency,speedup
gemv_layers,X1,3GHz,3.194018430394461
gemv_layers,X2,3GHz,6.206580081241512
gemv_layers,X3,3GHz,11.305511591995977
gemv_layers,X4,3GHz,20.27760945615218
1 workload level frequency speedup
2 gemv_layers X1 3GHz 3.194018430394461
3 gemv_layers X2 3GHz 6.206580081241512
4 gemv_layers X3 3GHz 11.305511591995977
5 gemv_layers X4 3GHz 20.27760945615218

View File

@@ -0,0 +1,5 @@
workload,level,frequency,speedup
haxpy,X1,100GHz,2.0481358611246403
haxpy,X2,100GHz,2.3234133539462776
haxpy,X3,100GHz,2.272582592673281
haxpy,X4,100GHz,2.3895030032424387
1 workload level frequency speedup
2 haxpy X1 100GHz 2.0481358611246403
3 haxpy X2 100GHz 2.3234133539462776
4 haxpy X3 100GHz 2.272582592673281
5 haxpy X4 100GHz 2.3895030032424387

View File

@@ -0,0 +1,5 @@
workload,level,frequency,speedup
haxpy,X1,3GHz,19.816741597088416
haxpy,X2,3GHz,25.395400082633245
haxpy,X3,3GHz,28.676005064893953
haxpy,X4,3GHz,31.783592582828017
1 workload level frequency speedup
2 haxpy X1 3GHz 19.816741597088416
3 haxpy X2 3GHz 25.395400082633245
4 haxpy X3 3GHz 28.676005064893953
5 haxpy X4 3GHz 31.783592582828017

View File

@@ -0,0 +1,5 @@
workload,level,frequency,speedup
vadd,X1,100GHz,2.398047786583271
vadd,X2,100GHz,1.823243660465808
vadd,X3,100GHz,1.562017010059411
vadd,X4,100GHz,1.7888939829610704
1 workload level frequency speedup
2 vadd X1 100GHz 2.398047786583271
3 vadd X2 100GHz 1.823243660465808
4 vadd X3 100GHz 1.562017010059411
5 vadd X4 100GHz 1.7888939829610704

View File

@@ -0,0 +1,5 @@
workload,level,frequency,speedup
vadd,X1,3GHz,12.766775777414075
vadd,X2,3GHz,14.19338061465721
vadd,X3,3GHz,15.313227057302887
vadd,X4,3GHz,16.430379164365913
1 workload level frequency speedup
2 vadd X1 3GHz 12.766775777414075
3 vadd X2 3GHz 14.19338061465721
4 vadd X3 3GHz 15.313227057302887
5 vadd X4 3GHz 16.430379164365913

View File

@@ -0,0 +1,5 @@
workload,level,frequency,speedup
vmul,X1,100GHz,2.4019901321627604
vmul,X2,100GHz,2.2189241847884267
vmul,X3,100GHz,1.86705278821741
vmul,X4,100GHz,1.7484391189395834
1 workload level frequency speedup
2 vmul X1 100GHz 2.4019901321627604
3 vmul X2 100GHz 2.2189241847884267
4 vmul X3 100GHz 1.86705278821741
5 vmul X4 100GHz 1.7484391189395834

View File

@@ -0,0 +1,5 @@
workload,level,frequency,speedup
vmul,X1,3GHz,14.157521157521158
vmul,X2,3GHz,15.915413533834586
vmul,X3,3GHz,16.959713823354058
vmul,X4,3GHz,18.215465292791755
1 workload level frequency speedup
2 vmul X1 3GHz 14.157521157521158
3 vmul X2 3GHz 15.915413533834586
4 vmul X3 3GHz 16.959713823354058
5 vmul X4 3GHz 18.215465292791755

View File

@@ -0,0 +1,35 @@
% Grouped bar chart comparing the PIM speedup of the three vector benchmarks
% (VADD, VMUL, HAXPY), one bar per input dimension level (X1-X4).
% Data stems from the 100 GHz CPU simulation runs (presumably the
% "infinite compute" configuration mentioned in the text -- confirm).
\begin{tikzpicture}
% Read the result tables (columns: workload, level, frequency, speedup).
\pgfplotstableread[col sep=comma]{plots/tables/vadd_100GHz.csv}\vadd
\pgfplotstableread[col sep=comma]{plots/tables/vmul_100GHz.csv}\vmul
\pgfplotstableread[col sep=comma]{plots/tables/haxpy_100GHz.csv}\haxpy
\begin{axis}[
width=0.9\textwidth,
ybar=1pt,
bar width = 15pt,
ymin=0,
ymax=5,
ytick distance=1,
ymajorgrids,
ylabel={Speedup},
tick pos=left,
xtick=data,
% Use the dimension levels from the table's "level" column as x tick labels.
xticklabels from table={\vadd}{level},
enlarge x limits=0.25,
% Borderless one-row legend, centered horizontally below the plot.
legend style={
at={(current bounding box.south-|current axis.south)},
anchor=north,
legend columns=-1,
draw=none,
/tikz/every even column/.append style={column sep=0.5cm}
},
]
% One bar series per workload; x position is the table row index (\coordindex).
% The _blue/_orange/_yellow colors are defined in the document preamble.
\addplot[fill=_blue!90] table [x expr=\coordindex, y={speedup}]{\vadd};
\addlegendentry{VADD}
\addplot[fill=_orange!90] table [x expr=\coordindex, y={speedup}]{\vmul};
\addlegendentry{VMUL}
\addplot[fill=_yellow!90] table [x expr=\coordindex, y={speedup}]{\haxpy};
\addlegendentry{HAXPY}
\end{axis}
\end{tikzpicture}

View File

@@ -0,0 +1,35 @@
% Grouped bar chart comparing the PIM speedup of the three vector benchmarks
% (VADD, VMUL, HAXPY), one bar per input dimension level (X1-X4).
% Data stems from the 3 GHz CPU simulation runs (the realistic clock).
\begin{tikzpicture}
% Read the result tables (columns: workload, level, frequency, speedup).
\pgfplotstableread[col sep=comma]{plots/tables/vadd_3GHz.csv}\vadd
\pgfplotstableread[col sep=comma]{plots/tables/vmul_3GHz.csv}\vmul
\pgfplotstableread[col sep=comma]{plots/tables/haxpy_3GHz.csv}\haxpy
\begin{axis}[
width=0.9\textwidth,
ybar=1pt,
bar width = 15pt,
ymin=0,
% Larger y range than the 100 GHz plot: speedups reach the low 30s here.
ymax=35,
minor y tick num = 5,
ymajorgrids,
ylabel={Speedup},
tick pos=left,
xtick=data,
% Use the dimension levels from the table's "level" column as x tick labels.
xticklabels from table={\vadd}{level},
enlarge x limits=0.25,
% Borderless one-row legend, centered horizontally below the plot.
legend style={
at={(current bounding box.south-|current axis.south)},
anchor=north,
legend columns=-1,
draw=none,
/tikz/every even column/.append style={column sep=0.5cm}
},
]
% One bar series per workload; x position is the table row index (\coordindex).
% The _blue/_orange/_yellow colors are defined in the document preamble.
\addplot[fill=_blue!90] table [x expr=\coordindex, y={speedup}]{\vadd};
\addlegendentry{VADD}
\addplot[fill=_orange!90] table [x expr=\coordindex, y={speedup}]{\vmul};
\addlegendentry{VMUL}
\addplot[fill=_yellow!90] table [x expr=\coordindex, y={speedup}]{\haxpy};
\addlegendentry{HAXPY}
\end{axis}
\end{tikzpicture}