From ee2405aaa96e1815e7ad4d131e41cea110ade69e Mon Sep 17 00:00:00 2001 From: Derek Christ Date: Wed, 28 Feb 2024 20:33:12 +0100 Subject: [PATCH] First simulation plots --- src/appendix.tex | 3 + src/chapters/dram.tex | 10 +-- src/chapters/implementation/kernel.tex | 8 +-- src/chapters/implementation/library.tex | 4 +- src/chapters/introduction.tex | 4 +- src/chapters/pim.tex | 26 ++++---- src/chapters/results.tex | 84 ++++++++++++++++++++++++- src/chapters/vp.tex | 2 +- src/index.tex | 6 ++ src/plots/matrix_infinite.tex | 31 +++++++++ src/plots/matrix_normal.tex | 31 +++++++++ src/plots/tables/gemv_100GHz.csv | 5 ++ src/plots/tables/gemv_3GHz.csv | 5 ++ src/plots/tables/gemv_layers_100GHz.csv | 5 ++ src/plots/tables/gemv_layers_3GHz.csv | 5 ++ src/plots/tables/haxpy_100GHz.csv | 5 ++ src/plots/tables/haxpy_3GHz.csv | 5 ++ src/plots/tables/vadd_100GHz.csv | 5 ++ src/plots/tables/vadd_3GHz.csv | 5 ++ src/plots/tables/vmul_100GHz.csv | 5 ++ src/plots/tables/vmul_3GHz.csv | 5 ++ src/plots/vector_infinite.tex | 35 +++++++++++ src/plots/vector_normal.tex | 35 +++++++++++ 23 files changed, 299 insertions(+), 30 deletions(-) create mode 100644 src/plots/matrix_infinite.tex create mode 100644 src/plots/matrix_normal.tex create mode 100644 src/plots/tables/gemv_100GHz.csv create mode 100644 src/plots/tables/gemv_3GHz.csv create mode 100644 src/plots/tables/gemv_layers_100GHz.csv create mode 100644 src/plots/tables/gemv_layers_3GHz.csv create mode 100644 src/plots/tables/haxpy_100GHz.csv create mode 100644 src/plots/tables/haxpy_3GHz.csv create mode 100644 src/plots/tables/vadd_100GHz.csv create mode 100644 src/plots/tables/vadd_3GHz.csv create mode 100644 src/plots/tables/vmul_100GHz.csv create mode 100644 src/plots/tables/vmul_3GHz.csv create mode 100644 src/plots/vector_infinite.tex create mode 100644 src/plots/vector_normal.tex diff --git a/src/appendix.tex b/src/appendix.tex index 2d858db..45a3a8e 100644 --- a/src/appendix.tex +++ b/src/appendix.tex @@ -1,6 +1,9 @@ 
\section{Appendix} \label{sec:appendix} +\subsection{Simulation Results} +\subsection{Microkernels} +\subsection{Source Code} % etwas source code, % von der vm % einige microkernels diff --git a/src/chapters/dram.tex b/src/chapters/dram.tex index 9855283..44ab5a6 100644 --- a/src/chapters/dram.tex +++ b/src/chapters/dram.tex @@ -22,7 +22,7 @@ Because the charge stored in each cell is very small, so-called \acp{psa} are ne \begin{figure} \centering \includegraphics[width=\linewidth]{images/psa} - \caption[\ac{psa} of an open bitline architecture]{\ac{psa} of an open bitline architecture \cite{jacob2008} \cite{jung2017a}.} + \caption[\ac{psa} of an open bitline architecture.]{\ac{psa} of an open bitline architecture \cite{jacob2008} \cite{jung2017a}.} \label{img:psa} \end{figure} @@ -39,7 +39,7 @@ The \cref{img:bank} summarizes the basic architecture of a single storage device \begin{figure} \centering \includegraphics[width=\linewidth]{images/bank} - \caption[Architecture of a single DRAM device]{Architecture of a single DRAM device \cite{jung2017a}.} + \caption[Architecture of a single DRAM device.]{Architecture of a single DRAM device \cite{jung2017a}.} \label{img:bank} \end{figure} @@ -83,7 +83,7 @@ Because banks can be controlled independently, one bank can be outputting the ne \bitbox{3}[bgcolor=verylightgray]{} \end{bytefield} - \caption[Exemplary address mapping scheme]{Exemplary address mapping scheme for an input address of size 32.} + \caption{Exemplary address mapping scheme for an input address of size 32.} \label{img:bank_interleaving} \end{figure} @@ -102,7 +102,7 @@ Several \ac{dram} dies are stacked on top of each other and connected with \acp{ \begin{figure} \centering \includegraphics[width=0.8\linewidth]{images/sip} - \caption[Cross-section view of an \ac{hbm} \ac{sip}]{Cross-section view of a \ac{hbm} \ac{sip} \cite{lee2021}.} + \caption[Cross-section view of an \ac{hbm} \ac{sip}.]{Cross-section view of a \ac{hbm} \ac{sip} \cite{lee2021}.} 
\label{img:sip} \end{figure} Such a cube is then placed onto a common silicon interposer that connects the \ac{dram} to its host processor. @@ -123,7 +123,7 @@ In the center of the die, the \acp{tsv} connect the die to the next die above it \begin{figure} \centering \includegraphics[width=0.8\linewidth]{images/hbm} - \caption[\aca{hbm} memory die architecture]{\aca{hbm} memory die architecture \cite{lee2021}.} + \caption[\aca{hbm} memory die architecture.]{\aca{hbm} memory die architecture \cite{lee2021}.} \label{img:hbm} \end{figure} diff --git a/src/chapters/implementation/kernel.tex b/src/chapters/implementation/kernel.tex index 456597e..5ac8a84 100644 --- a/src/chapters/implementation/kernel.tex +++ b/src/chapters/implementation/kernel.tex @@ -59,7 +59,7 @@ Each granule size has a different maximum amount of page table nesting, with up \begin{figure} \centering \includegraphics[width=\linewidth]{images/pagetable_granule} - \caption[The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule]{The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule \cite{arm2015}.} + \caption[The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule.]{The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule \cite{arm2015}.} \label{img:pagetable_granule} \end{figure} @@ -117,7 +117,7 @@ The simplified \ac{am} following this scheme is shown in \cref{img:hbm2_am}. \bitbox{3}[bgcolor=verylightgray]{} \end{bytefield} - \caption[Simplified \aca{hbm} address mapping with a split column mapping]{Simplified \aca{hbm} address mapping with a split column mapping.} + \caption{Simplified \aca{hbm} address mapping with a split column mapping.} \label{img:hbm2_am} \end{figure} @@ -150,7 +150,7 @@ The concrete values for these parameters are listed in \cref{tab:memspec}. 
Width & Width of the Data Bus & 64 \end{tblr} % } - \caption[A list of the used configuration parameters of \aca{hbm}]{A list of the used configuration parameters of \aca{hbm}.} + \caption{A list of the used configuration parameters of \aca{hbm}.} \label{tab:memspec} \end{table} @@ -179,7 +179,7 @@ JUMP -1, 7 FILL BANK, GRF_B #0 EXIT \end{verbatim} - \caption[A complete \ac{gemv} microkernel]{A complete \ac{gemv} microkernel.} + \caption{A complete \ac{gemv} microkernel.} \label{lst:gemv_microkernel} \end{listing} diff --git a/src/chapters/implementation/library.tex b/src/chapters/implementation/library.tex index d6609ed..ee9738d 100644 --- a/src/chapters/implementation/library.tex +++ b/src/chapters/implementation/library.tex @@ -51,7 +51,7 @@ enum File { } \end{minted} \end{minipage} - \caption[The \texttt{enum} definitions of the instructions and register files]{The \texttt{enum} definitions of the instructions and register files.} + \caption{The \texttt{enum} definitions of the instructions and register files.} \label{lst:instruction_enums} \end{listing} A microkernel is then simply an array consisting of instructions of size 32. 
@@ -81,7 +81,7 @@ This \texttt{ComputeArray} and \texttt{BankArray} layout is illustrated in \cref \begin{figure} \centering \includegraphics[width=\linewidth]{images/compute_array} - \caption[Memory layout of a flat \ac{fp16} array spanning over four banks]{Memory layout of a flat \ac{fp16} array spanning over four banks.} + \caption{Memory layout of a flat \ac{fp16} array spanning over four banks.} \label{img:compute_array} \end{figure} diff --git a/src/chapters/introduction.tex b/src/chapters/introduction.tex index dbe90ce..8692858 100644 --- a/src/chapters/introduction.tex +++ b/src/chapters/introduction.tex @@ -17,7 +17,7 @@ In addition, Moore's Law is slowing down as further device scaling approaches ph \begin{figure}[!ht] \centering \input{plots/energy_chart} - \caption[Total energy of computing]{Total energy of computing \cite{src2021}.} + \caption[Total energy of computing.]{Total energy of computing \cite{src2021}.} \label{plt:enery_chart} \end{figure} @@ -34,7 +34,7 @@ In contrast, compute-intensive workloads, such as visual processing, are referre \begin{figure}[!ht] \centering \input{plots/roofline} - \caption[Roofline model of GPT revisions]{Roofline model of GPT revisions \cite{ivobolsens2023}.} + \caption[Roofline model of GPT revisions.]{Roofline model of GPT revisions \cite{ivobolsens2023}.} \label{plt:roofline} \end{figure} diff --git a/src/chapters/pim.tex b/src/chapters/pim.tex index 0664578..ffb6e19 100644 --- a/src/chapters/pim.tex +++ b/src/chapters/pim.tex @@ -26,7 +26,7 @@ This process is illustrated in \cref{img:dnn} where one \ac{dnn} layer is proces \begin{figure} \centering \input{images/dnn} - \caption[A fully connected \ac{dnn} layer]{A fully connected \ac{dnn} layer \cite{he2020}.} + \caption[A fully connected \ac{dnn} layer.]{A fully connected \ac{dnn} layer \cite{he2020}.} \label{img:dnn} \end{figure} @@ -108,7 +108,7 @@ To make full use of the output buffering, the matrix rows are interleaved in an \begin{figure} \centering 
\input{images/hynix} - \caption[Newton memory layout for a \ac{gemv} operation]{Newton memory layout for a \ac{gemv} operation \cite{he2020}.} + \caption[Newton memory layout for a \ac{gemv} operation.]{Newton memory layout for a \ac{gemv} operation \cite{he2020}.} \label{img:hynix} \end{figure} @@ -142,7 +142,7 @@ This general architecture is shown in detail in \cref{img:fimdram}, with (a) the \begin{figure} \centering \includegraphics[width=\linewidth]{images/fimdram} - \caption[Architecture of \aca{fimdram}]{Architecture of \aca{fimdram} \cite{lee2021}.} + \caption[Architecture of \aca{fimdram}.]{Architecture of \aca{fimdram} \cite{lee2021}.} \label{img:fimdram} \end{figure} @@ -185,7 +185,7 @@ This processing unit architecture is illustrated in \cref{img:pcu}, along with t \begin{figure} \centering \includegraphics[width=0.8\linewidth]{images/pcu} - \caption[Architecture of a \ac{pim} processing unit]{Architecture of a \ac{pim} processing unit \cite{lee2021}.} + \caption[Architecture of a \ac{pim} processing unit.]{Architecture of a \ac{pim} processing unit \cite{lee2021}.} \label{img:pcu} \end{figure} @@ -200,7 +200,7 @@ The data layout of these three instruction groups is shown in \cref{tab:isa}. 
\begin{table} \centering \includegraphics[width=\linewidth]{images/isa} - \caption[The instruction format of the processing units]{The instruction format of the processing units \cite{lee2021}.} + \caption[The instruction format of the processing units.]{The instruction format of the processing units \cite{lee2021}.} \label{tab:isa} \end{table} @@ -235,7 +235,7 @@ Another special field \textit{A} enables the \ac{aam}, which will be explained i Arithmetic & MAC & multiply-accumulate & GRF-B & GRF, BANK & GRF, BANK, SRF & GRF, BANK, SRF \\ Arithmetic & MAD & multiply-and-add & GRF & GRF, BANK & GRF, BANK, SRF & GRF, BANK, SRF \end{tblr}} - \caption[A list of all supported \ac{pim} instructions and their possible sources and destinations]{A list of all supported \ac{pim} instructions and their possible sources and destinations \cite{shin-haengkang2023}.} + \caption[A list of all supported \ac{pim} instructions and their possible sources and destinations.]{A list of all supported \ac{pim} instructions and their possible sources and destinations \cite{shin-haengkang2023}.} \label{tab:instruction_set} \end{table} @@ -259,7 +259,7 @@ For example, as shown in \cref{lst:reorder}, two consecutive \ac{mac} instructio MAC GRF_B #0, BANK, GRF_A #0 MAC GRF_B #1, BANK, GRF_A #1 \end{verbatim} - \caption[Exemplary sequence of \ac{mac} instructions in a microkernel]{Exemplary sequence of \ac{mac} instructions in a microkernel.} + \caption[Exemplary sequence of \ac{mac} instructions in a microkernel.]{Exemplary sequence of \ac{mac} instructions in a microkernel.} \label{lst:reorder} \end{listing} @@ -276,7 +276,7 @@ With this method, the register indices and the bank address cannot get out of sy \begin{figure} \centering \includegraphics[width=0.5\linewidth]{images/aam} - \caption[Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address]{Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address \cite{lee2021}.} 
+ \caption[Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address.]{Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address \cite{lee2021}.} \label{img:aam} \end{figure} @@ -288,7 +288,7 @@ At the core of a \ac{gemv} microkernel is an iterative \ac{mac} instruction, fol MAC(AAM) GRF_B, BANK, GRF_A JUMP -1, 7 \end{verbatim} - \caption[The core of a \ac{gemv} microkernel]{The core of a \ac{gemv} microkernel.} + \caption[The core of a \ac{gemv} microkernel.]{The core of a \ac{gemv} microkernel.} \label{lst:gemv} \end{listing} @@ -342,7 +342,7 @@ This interleaving is illustrated in \cref{img:input_vector}. \begin{figure} \centering \input{images/input_vector} - \caption[Input vector in linear address space, where one chunk is mapped to all banks]{Input vector in linear address space, where one chunk is mapped to all banks.} + \caption{Input vector in linear address space, where one chunk is mapped to all banks.} \label{img:input_vector} \end{figure} @@ -366,7 +366,7 @@ The operation of this concrete \ac{gemv} microkernel is illustrated in \cref{img \begin{figure} \centering \includegraphics[width=0.8\linewidth]{images/memory_layout} - \caption[Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation]{Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation. One cell represents 16 \ac{fp16} elements, forming a $\qty{32}{\byte}$ block \cite{kang2022}.} + \caption[Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation.]{Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation. 
One cell represents 16 \ac{fp16} elements, forming a $\qty{32}{\byte}$ block \cite{kang2022}.} \label{img:memory_layout} \end{figure} @@ -380,7 +380,7 @@ As a side effect of the incremented matrix row address, this also results in an MAC(AAM) GRF_B, BANK, GRF_A JUMP -1, 63 \end{verbatim} - \caption[The core of a \ac{mac} microkernel that utilizes the maximum number of register entries]{The core of a \ac{mac} microkernel that utilizes the maximum number of register entries.} + \caption[The core of a \ac{mac} microkernel that utilizes the maximum number of register entries.]{The core of a \ac{mac} microkernel that utilizes the maximum number of register entries.} \label{lst:gemv64} \end{listing} A further increase in the total number of rows can be achieved by distributing the weight matrix over multiple \acp{pch} and running the microkernel multiple times, concatenating the output vectors on the host at the end. @@ -405,7 +405,7 @@ The following \cref{sec:vp} introduces the concept of virtual prototyping, which \begin{landscape} \begin{figure} \input{images/matrix_layout} -\caption[Mapping of the weight matrix onto the memory banks and its layout in the linear address space]{Mapping of the weight matrix onto the memory banks and its layout in the linear address space.} +\caption[Mapping of the weight matrix onto the memory banks and its layout in the linear address space.]{Mapping of the weight matrix onto the memory banks and its layout in the linear address space.} \label{img:matrix_layout} \end{figure} \end{landscape} diff --git a/src/chapters/results.tex b/src/chapters/results.tex index 6203539..65af49c 100644 --- a/src/chapters/results.tex +++ b/src/chapters/results.tex @@ -72,16 +72,94 @@ This allows an exaggerated evaluation of the performance gains of \ac{pim} in an % comparison with normal clock and infinite compute (immer 4 simulationen, bzw. 
5 mit echter hardware) \subsection{Simulation Results} -\subsubsection{Workload Kernels} +\subsubsection{Vector Operations} % Vector ADD und Vector MUL -% Vector Skalar ADD und Vector Skalar MUL (HCAL) +% Vector Skalar ADD und Vector Skalar MUL (HCAL) (wird wohl übersprungen) % Vector HAXPY x*a+y + +% Plots zB VADD VMUL nebeneinander für versch. Dimensionen und einer Frequenz +% andere Frequenz nächster Plot +% dann HAXPY + +The first set of benchmarks analyzes the speedup of \aca{fimdram} for various vector operations, namely an element-wise vector add operation (VADD), an element-wise vector multiply operation (VMUL), and a \ac{haxpy} operation. +Such vector operations have a low operational intensity and are particularly memory-bound because there is no data reuse at all and two input operands must be loaded for each operation. +As a result, the on-chip cache does not accelerate such workloads because all operand data must be fetched from memory anyway. +The workloads adhere to the following calculation patterns: + +\begin{itemize} +\item VADD: $z = x + y$ +\item VMUL: $z = x \cdot y$ +\item \ac{haxpy}: $z = a \cdot x + y$ +\end{itemize} + +Each workload is run with different input vector dimensions to examine the effect of setup overhead and potentially identify a break-even point at which \ac{pim} becomes viable. +\Cref{tab:dimensions_vector} lists the specific vector dimensions for the following benchmarks. +The levels X1--X4 denote the increasing dimensions, with each step doubling in size. 
+ +\begin{table} +\centering +\begin{tblr}{ + cell{2}{2} = {r}, + cell{3}{2} = {r}, + cell{4}{2} = {r}, + cell{5}{2} = {r}, + hlines, + vlines, + hline{2} = {-}{solid,black}, + hline{2} = {2}{-}{solid,black}, +} +Level & Dimensions \\ +X1 & (256 $\times$ 1) \\ +X2 & (512 $\times$ 1) \\ +X3 & (1024 $\times$ 1) \\ +X4 & (2048 $\times$ 1) +\end{tblr} +\caption{List of the input vector dimensions for the vector benchmarks.} +\label{tab:dimensions_vector} +\end{table} + +The benchmarks analyze the relative number of processor ticks, where the speedup is calculated as follows: +\begin{equation} +S = \frac{\textrm{\# of ticks in non-\ac{pim} mode}}{\textrm{\# of ticks in \ac{pim} mode}} +\end{equation} + +\begin{figure} + \centering + \input{plots/vector_normal} + \caption{Comparison between non-\ac{pim} and \ac{pim} for the vector benchmarks running at a \ac{cpu} frequency of $\qty{3}{\giga\hertz}$.} + \label{fig:vector_normal} +\end{figure} + +\begin{figure} + \centering + \input{plots/vector_infinite} + \caption{test} + \label{fig:vector_infinite} +\end{figure} + +\subsubsection{Neural Network Layers} % GEMV % Samsung 7.4x-8.9x % "inference" mit mehreren layern % ReLU vergleich -% GEMM mit stark interleavten matrizen +% GEMM mit stark interleavten matrizen (eher nicht) + +\begin{figure} + \centering + \input{plots/matrix_normal} + \caption{test} + \label{fig:matrix_normal} +\end{figure} + +\begin{figure} + \centering + \input{plots/matrix_infinite} + \caption{test} + \label{fig:matrix_infinite} +\end{figure} + +\subsubsection{Comparison to Real Hardware} % \subsubsection{Initialization Overhead} % conversion der operanden im verhältnis zur laufzeit abschätzen diff --git a/src/chapters/vp.tex b/src/chapters/vp.tex index 405a365..f86f56a 100644 --- a/src/chapters/vp.tex +++ b/src/chapters/vp.tex @@ -45,7 +45,7 @@ The framework is optimized for high simulation speed and uses the \ac{at} coding \begin{figure} \centering \includegraphics[width=0.8\linewidth]{images/dramsys} - 
\caption[The internal architecture of DRAMSys]{The internal architecture of DRAMSys \cite{jung2017a}.} + \caption[The internal architecture of DRAMSys.]{The internal architecture of DRAMSys \cite{jung2017a}.} \label{img:dramsys} \end{figure} diff --git a/src/index.tex b/src/index.tex index 7485723..f25fdd1 100644 --- a/src/index.tex +++ b/src/index.tex @@ -21,6 +21,7 @@ \usepackage{url} \usepackage[urldate=long,sorting=none,maxbibnames=5]{biblatex} \usepackage{pgfplots} +\usepackage{pgfplotstable} \usepackage{bytefield} \usepackage{mathdots} \usepackage{tabularray} @@ -46,6 +47,11 @@ % Custom colors \definecolor{verylightgray}{gray}{0.85} +\definecolor{_darkblue}{RGB}{68, 114, 196} +\definecolor{_blue}{RGB}{91, 155, 213} +\definecolor{_green}{RGB}{112, 173, 71} +\definecolor{_orange}{RGB}{237, 125, 49} +\definecolor{_yellow}{RGB}{255, 192, 0} % Penalties \clubpenalty = 10000 diff --git a/src/plots/matrix_infinite.tex b/src/plots/matrix_infinite.tex new file mode 100644 index 0000000..dbe5ffe --- /dev/null +++ b/src/plots/matrix_infinite.tex @@ -0,0 +1,31 @@ +\begin{tikzpicture} + \pgfplotstableread[col sep=comma]{plots/tables/gemv_100GHz.csv}\gemv + \pgfplotstableread[col sep=comma]{plots/tables/gemv_layers_100GHz.csv}\gemvlayers + \begin{axis}[ + width=0.9\textwidth, + ybar=1pt, + bar width = 15pt, + ymin=0, + ymax=5, + ytick distance=1, + ymajorgrids, + ylabel={Speedup}, + tick pos=left, + xtick=data, + xticklabels from table={\gemv}{level}, + enlarge x limits=0.25, + legend style={ + at={(current bounding box.south-|current axis.south)}, + anchor=north, + legend columns=-1, + draw=none, + /tikz/every even column/.append style={column sep=0.5cm} + }, + ] + \addplot[fill=_blue!90] table [x expr=\coordindex, y={speedup}]{\gemv}; + \addlegendentry{GEMV} + + \addplot[fill=_orange!90] table [x expr=\coordindex, y={speedup}]{\gemvlayers}; + \addlegendentry{DNN Layers} + \end{axis} +\end{tikzpicture} diff --git a/src/plots/matrix_normal.tex b/src/plots/matrix_normal.tex 
new file mode 100644 index 0000000..19d20cc --- /dev/null +++ b/src/plots/matrix_normal.tex @@ -0,0 +1,31 @@ +\begin{tikzpicture} + \pgfplotstableread[col sep=comma]{plots/tables/gemv_3GHz.csv}\gemv + \pgfplotstableread[col sep=comma]{plots/tables/gemv_layers_3GHz.csv}\gemvlayers + \begin{axis}[ + width=0.9\textwidth, + ybar=1pt, + bar width = 15pt, + ymin=0, + ymax=35, + minor y tick num = 5, + ymajorgrids, + ylabel={Speedup}, + tick pos=left, + xtick=data, + xticklabels from table={\gemv}{level}, + enlarge x limits=0.25, + legend style={ + at={(current bounding box.south-|current axis.south)}, + anchor=north, + legend columns=-1, + draw=none, + /tikz/every even column/.append style={column sep=0.5cm} + }, + ] + \addplot[fill=_blue!90] table [x expr=\coordindex, y={speedup}]{\gemv}; + \addlegendentry{GEMV} + + \addplot[fill=_orange!90] table [x expr=\coordindex, y={speedup}]{\gemvlayers}; + \addlegendentry{DNN Layers} + \end{axis} +\end{tikzpicture} diff --git a/src/plots/tables/gemv_100GHz.csv b/src/plots/tables/gemv_100GHz.csv new file mode 100644 index 0000000..566d659 --- /dev/null +++ b/src/plots/tables/gemv_100GHz.csv @@ -0,0 +1,5 @@ +workload,level,frequency,speedup +gemv,X1,100GHz,0.2108059965502083 +gemv,X2,100GHz,0.40509080127411157 +gemv,X3,100GHz,0.8462958338758609 +gemv,X4,100GHz,4.7274497979448125 diff --git a/src/plots/tables/gemv_3GHz.csv b/src/plots/tables/gemv_3GHz.csv new file mode 100644 index 0000000..d5e7712 --- /dev/null +++ b/src/plots/tables/gemv_3GHz.csv @@ -0,0 +1,5 @@ +workload,level,frequency,speedup +gemv,X1,3GHz,3.468782996825547 +gemv,X2,3GHz,6.723879985176877 +gemv,X3,3GHz,12.744110856471028 +gemv,X4,3GHz,23.645526777997713 diff --git a/src/plots/tables/gemv_layers_100GHz.csv b/src/plots/tables/gemv_layers_100GHz.csv new file mode 100644 index 0000000..787a2ff --- /dev/null +++ b/src/plots/tables/gemv_layers_100GHz.csv @@ -0,0 +1,5 @@ +workload,level,frequency,speedup +gemv_layers,X1,100GHz,0.18703680911951287 
+gemv_layers,X2,100GHz,0.35722454947444127 +gemv_layers,X3,100GHz,0.6338568319278073 +gemv_layers,X4,100GHz,1.638629460755059 diff --git a/src/plots/tables/gemv_layers_3GHz.csv b/src/plots/tables/gemv_layers_3GHz.csv new file mode 100644 index 0000000..6c5060e --- /dev/null +++ b/src/plots/tables/gemv_layers_3GHz.csv @@ -0,0 +1,5 @@ +workload,level,frequency,speedup +gemv_layers,X1,3GHz,3.194018430394461 +gemv_layers,X2,3GHz,6.206580081241512 +gemv_layers,X3,3GHz,11.305511591995977 +gemv_layers,X4,3GHz,20.27760945615218 diff --git a/src/plots/tables/haxpy_100GHz.csv b/src/plots/tables/haxpy_100GHz.csv new file mode 100644 index 0000000..33ab8fd --- /dev/null +++ b/src/plots/tables/haxpy_100GHz.csv @@ -0,0 +1,5 @@ +workload,level,frequency,speedup +haxpy,X1,100GHz,2.0481358611246403 +haxpy,X2,100GHz,2.3234133539462776 +haxpy,X3,100GHz,2.272582592673281 +haxpy,X4,100GHz,2.3895030032424387 diff --git a/src/plots/tables/haxpy_3GHz.csv b/src/plots/tables/haxpy_3GHz.csv new file mode 100644 index 0000000..4970b87 --- /dev/null +++ b/src/plots/tables/haxpy_3GHz.csv @@ -0,0 +1,5 @@ +workload,level,frequency,speedup +haxpy,X1,3GHz,19.816741597088416 +haxpy,X2,3GHz,25.395400082633245 +haxpy,X3,3GHz,28.676005064893953 +haxpy,X4,3GHz,31.783592582828017 diff --git a/src/plots/tables/vadd_100GHz.csv b/src/plots/tables/vadd_100GHz.csv new file mode 100644 index 0000000..4ad7277 --- /dev/null +++ b/src/plots/tables/vadd_100GHz.csv @@ -0,0 +1,5 @@ +workload,level,frequency,speedup +vadd,X1,100GHz,2.398047786583271 +vadd,X2,100GHz,1.823243660465808 +vadd,X3,100GHz,1.562017010059411 +vadd,X4,100GHz,1.7888939829610704 diff --git a/src/plots/tables/vadd_3GHz.csv b/src/plots/tables/vadd_3GHz.csv new file mode 100644 index 0000000..71f7e18 --- /dev/null +++ b/src/plots/tables/vadd_3GHz.csv @@ -0,0 +1,5 @@ +workload,level,frequency,speedup +vadd,X1,3GHz,12.766775777414075 +vadd,X2,3GHz,14.19338061465721 +vadd,X3,3GHz,15.313227057302887 +vadd,X4,3GHz,16.430379164365913 diff --git 
a/src/plots/tables/vmul_100GHz.csv b/src/plots/tables/vmul_100GHz.csv new file mode 100644 index 0000000..2f93e8e --- /dev/null +++ b/src/plots/tables/vmul_100GHz.csv @@ -0,0 +1,5 @@ +workload,level,frequency,speedup +vmul,X1,100GHz,2.4019901321627604 +vmul,X2,100GHz,2.2189241847884267 +vmul,X3,100GHz,1.86705278821741 +vmul,X4,100GHz,1.7484391189395834 diff --git a/src/plots/tables/vmul_3GHz.csv b/src/plots/tables/vmul_3GHz.csv new file mode 100644 index 0000000..d6898d6 --- /dev/null +++ b/src/plots/tables/vmul_3GHz.csv @@ -0,0 +1,5 @@ +workload,level,frequency,speedup +vmul,X1,3GHz,14.157521157521158 +vmul,X2,3GHz,15.915413533834586 +vmul,X3,3GHz,16.959713823354058 +vmul,X4,3GHz,18.215465292791755 diff --git a/src/plots/vector_infinite.tex b/src/plots/vector_infinite.tex new file mode 100644 index 0000000..0b7f593 --- /dev/null +++ b/src/plots/vector_infinite.tex @@ -0,0 +1,35 @@ +\begin{tikzpicture} + \pgfplotstableread[col sep=comma]{plots/tables/vadd_100GHz.csv}\vadd + \pgfplotstableread[col sep=comma]{plots/tables/vmul_100GHz.csv}\vmul + \pgfplotstableread[col sep=comma]{plots/tables/haxpy_100GHz.csv}\haxpy + \begin{axis}[ + width=0.9\textwidth, + ybar=1pt, + bar width = 15pt, + ymin=0, + ymax=5, + ytick distance=1, + ymajorgrids, + ylabel={Speedup}, + tick pos=left, + xtick=data, + xticklabels from table={\vadd}{level}, + enlarge x limits=0.25, + legend style={ + at={(current bounding box.south-|current axis.south)}, + anchor=north, + legend columns=-1, + draw=none, + /tikz/every even column/.append style={column sep=0.5cm} + }, + ] + \addplot[fill=_blue!90] table [x expr=\coordindex, y={speedup}]{\vadd}; + \addlegendentry{VADD} + + \addplot[fill=_orange!90] table [x expr=\coordindex, y={speedup}]{\vmul}; + \addlegendentry{VMUL} + + \addplot[fill=_yellow!90] table [x expr=\coordindex, y={speedup}]{\haxpy}; + \addlegendentry{HAXPY} + \end{axis} +\end{tikzpicture} diff --git a/src/plots/vector_normal.tex b/src/plots/vector_normal.tex new file mode 100644 index 
0000000..37631c6 --- /dev/null +++ b/src/plots/vector_normal.tex @@ -0,0 +1,35 @@ +\begin{tikzpicture} + \pgfplotstableread[col sep=comma]{plots/tables/vadd_3GHz.csv}\vadd + \pgfplotstableread[col sep=comma]{plots/tables/vmul_3GHz.csv}\vmul + \pgfplotstableread[col sep=comma]{plots/tables/haxpy_3GHz.csv}\haxpy + \begin{axis}[ + width=0.9\textwidth, + ybar=1pt, + bar width = 15pt, + ymin=0, + ymax=35, + minor y tick num = 5, + ymajorgrids, + ylabel={Speedup}, + tick pos=left, + xtick=data, + xticklabels from table={\vadd}{level}, + enlarge x limits=0.25, + legend style={ + at={(current bounding box.south-|current axis.south)}, + anchor=north, + legend columns=-1, + draw=none, + /tikz/every even column/.append style={column sep=0.5cm} + }, + ] + \addplot[fill=_blue!90] table [x expr=\coordindex, y={speedup}]{\vadd}; + \addlegendentry{VADD} + + \addplot[fill=_orange!90] table [x expr=\coordindex, y={speedup}]{\vmul}; + \addlegendentry{VMUL} + + \addplot[fill=_yellow!90] table [x expr=\coordindex, y={speedup}]{\haxpy}; + \addlegendentry{HAXPY} + \end{axis} +\end{tikzpicture}