From ee2405aaa96e1815e7ad4d131e41cea110ade69e Mon Sep 17 00:00:00 2001
From: Derek Christ <christ.derek@gmail.com>
Date: Wed, 28 Feb 2024 20:33:12 +0100
Subject: [PATCH] First simulation plots

---
 src/appendix.tex                        |  3 +
 src/chapters/dram.tex                   | 10 +--
 src/chapters/implementation/kernel.tex  |  8 +--
 src/chapters/implementation/library.tex |  4 +-
 src/chapters/introduction.tex           |  4 +-
 src/chapters/pim.tex                    | 26 ++++----
 src/chapters/results.tex                | 84 ++++++++++++++++++++++++-
 src/chapters/vp.tex                     |  2 +-
 src/index.tex                           |  6 ++
 src/plots/matrix_infinite.tex           | 31 +++++++++
 src/plots/matrix_normal.tex             | 31 +++++++++
 src/plots/tables/gemv_100GHz.csv        |  5 ++
 src/plots/tables/gemv_3GHz.csv          |  5 ++
 src/plots/tables/gemv_layers_100GHz.csv |  5 ++
 src/plots/tables/gemv_layers_3GHz.csv   |  5 ++
 src/plots/tables/haxpy_100GHz.csv       |  5 ++
 src/plots/tables/haxpy_3GHz.csv         |  5 ++
 src/plots/tables/vadd_100GHz.csv        |  5 ++
 src/plots/tables/vadd_3GHz.csv          |  5 ++
 src/plots/tables/vmul_100GHz.csv        |  5 ++
 src/plots/tables/vmul_3GHz.csv          |  5 ++
 src/plots/vector_infinite.tex           | 35 +++++++++++
 src/plots/vector_normal.tex             | 35 +++++++++++
 23 files changed, 299 insertions(+), 30 deletions(-)
 create mode 100644 src/plots/matrix_infinite.tex
 create mode 100644 src/plots/matrix_normal.tex
 create mode 100644 src/plots/tables/gemv_100GHz.csv
 create mode 100644 src/plots/tables/gemv_3GHz.csv
 create mode 100644 src/plots/tables/gemv_layers_100GHz.csv
 create mode 100644 src/plots/tables/gemv_layers_3GHz.csv
 create mode 100644 src/plots/tables/haxpy_100GHz.csv
 create mode 100644 src/plots/tables/haxpy_3GHz.csv
 create mode 100644 src/plots/tables/vadd_100GHz.csv
 create mode 100644 src/plots/tables/vadd_3GHz.csv
 create mode 100644 src/plots/tables/vmul_100GHz.csv
 create mode 100644 src/plots/tables/vmul_3GHz.csv
 create mode 100644 src/plots/vector_infinite.tex
 create mode 100644 src/plots/vector_normal.tex

diff --git a/src/appendix.tex b/src/appendix.tex
index 2d858db..45a3a8e 100644
--- a/src/appendix.tex
+++ b/src/appendix.tex
@@ -1,6 +1,9 @@
 \section{Appendix}
 \label{sec:appendix}
 
+\subsection{Simulation Results}
+\subsection{Microkernels}
+\subsection{Source Code}
 % etwas source code,
 % von der vm
 % einige microkernels
diff --git a/src/chapters/dram.tex b/src/chapters/dram.tex
index 9855283..44ab5a6 100644
--- a/src/chapters/dram.tex
+++ b/src/chapters/dram.tex
@@ -22,7 +22,7 @@ Because the charge stored in each cell is very small, so-called \acp{psa} are ne
 \begin{figure}
 	\centering
 	\includegraphics[width=\linewidth]{images/psa}
-	\caption[\ac{psa} of an open bitline architecture]{\ac{psa} of an open bitline architecture \cite{jacob2008} \cite{jung2017a}.}
+	\caption[\ac{psa} of an open bitline architecture.]{\ac{psa} of an open bitline architecture \cite{jacob2008} \cite{jung2017a}.}
 	\label{img:psa}
 \end{figure}
 
@@ -39,7 +39,7 @@ The \cref{img:bank} summarizes the basic architecture of a single storage device
 \begin{figure}
 	\centering
 	\includegraphics[width=\linewidth]{images/bank}
-	\caption[Architecture of a single DRAM device]{Architecture of a single DRAM device \cite{jung2017a}.}
+	\caption[Architecture of a single DRAM device.]{Architecture of a single DRAM device \cite{jung2017a}.}
 	\label{img:bank}
 \end{figure}
 
@@ -83,7 +83,7 @@ Because banks can be controlled independently, one bank can be outputting the ne
 		\bitbox{3}[bgcolor=verylightgray]{}
 	\end{bytefield}
 
-	\caption[Exemplary address mapping scheme]{Exemplary address mapping scheme for an input address of size 32.}
+	\caption{Exemplary address mapping scheme for an input address of size 32.}
 	\label{img:bank_interleaving}
 \end{figure}
 
@@ -102,7 +102,7 @@ Several \ac{dram} dies are stacked on top of each other and connected with \acp{
 \begin{figure}
 	\centering
 	\includegraphics[width=0.8\linewidth]{images/sip}
-	\caption[Cross-section view of an \ac{hbm} \ac{sip}]{Cross-section view of a \ac{hbm} \ac{sip} \cite{lee2021}.}
+	\caption[Cross-section view of an \ac{hbm} \ac{sip}.]{Cross-section view of a \ac{hbm} \ac{sip} \cite{lee2021}.}
 	\label{img:sip}
 \end{figure}
 Such a cube is then placed onto a common silicon interposer that connects the \ac{dram} to its host processor.
@@ -123,7 +123,7 @@ In the center of the die, the \acp{tsv} connect the die to the next die above it
 \begin{figure}
 	\centering
 	\includegraphics[width=0.8\linewidth]{images/hbm}
-	\caption[\aca{hbm} memory die architecture]{\aca{hbm} memory die architecture \cite{lee2021}.}
+	\caption[\aca{hbm} memory die architecture.]{\aca{hbm} memory die architecture \cite{lee2021}.}
 	\label{img:hbm}
 \end{figure}
 
diff --git a/src/chapters/implementation/kernel.tex b/src/chapters/implementation/kernel.tex
index 456597e..5ac8a84 100644
--- a/src/chapters/implementation/kernel.tex
+++ b/src/chapters/implementation/kernel.tex
@@ -59,7 +59,7 @@ Each granule size has a different maximum amount of page table nesting, with up
 \begin{figure}
 	\centering
 	\includegraphics[width=\linewidth]{images/pagetable_granule}
-	\caption[The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule]{The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule \cite{arm2015}.}
+	\caption[The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule.]{The distinct page table levels for the $\qty{4}{\kilo\byte}$ granule \cite{arm2015}.}
 	\label{img:pagetable_granule}
 \end{figure}
 
@@ -117,7 +117,7 @@ The simplified \ac{am} following this scheme is shown in \cref{img:hbm2_am}.
 		\bitbox{3}[bgcolor=verylightgray]{}
 	\end{bytefield}
 
-	\caption[Simplified \aca{hbm} address mapping with a split column mapping]{Simplified \aca{hbm} address mapping with a split column mapping.}
+	\caption{Simplified \aca{hbm} address mapping with a split column mapping.}
 	\label{img:hbm2_am}
 \end{figure}
 
@@ -150,7 +150,7 @@ The concrete values for these parameters are listed in \cref{tab:memspec}.
     Width                  & Width of the Data Bus     & 64    
     \end{tblr}
  %    }
-	\caption[A list of the used configuration parameters of \aca{hbm}]{A list of the used configuration parameters of \aca{hbm}.}
+	\caption{A list of the used configuration parameters of \aca{hbm}.}
 	\label{tab:memspec}
 \end{table}
 
@@ -179,7 +179,7 @@ JUMP -1, 7
 FILL BANK, GRF_B #0
 EXIT
 \end{verbatim}
-	\caption[A complete \ac{gemv} microkernel]{A complete \ac{gemv} microkernel.}
+	\caption{A complete \ac{gemv} microkernel.}
 	\label{lst:gemv_microkernel}
 \end{listing}
 
diff --git a/src/chapters/implementation/library.tex b/src/chapters/implementation/library.tex
index d6609ed..ee9738d 100644
--- a/src/chapters/implementation/library.tex
+++ b/src/chapters/implementation/library.tex
@@ -51,7 +51,7 @@ enum File {
 }
 \end{minted}
 \end{minipage}
-	\caption[The \texttt{enum} definitions of the instructions and register files]{The \texttt{enum} definitions of the instructions and register files.}
+	\caption{The \texttt{enum} definitions of the instructions and register files.}
 	\label{lst:instruction_enums}
 \end{listing}
 A microkernel is then simply an array consisting of instructions of size 32.
@@ -81,7 +81,7 @@ This \texttt{ComputeArray} and \texttt{BankArray} layout is illustrated in \cref
 \begin{figure}
 	\centering
 	\includegraphics[width=\linewidth]{images/compute_array}
-	\caption[Memory layout of a flat \ac{fp16} array spanning over four banks]{Memory layout of a flat \ac{fp16} array spanning over four banks.}
+	\caption{Memory layout of a flat \ac{fp16} array spanning over four banks.}
 	\label{img:compute_array}
 \end{figure}
 
diff --git a/src/chapters/introduction.tex b/src/chapters/introduction.tex
index dbe90ce..8692858 100644
--- a/src/chapters/introduction.tex
+++ b/src/chapters/introduction.tex
@@ -17,7 +17,7 @@ In addition, Moore's Law is slowing down as further device scaling approaches ph
 \begin{figure}[!ht]
 	\centering
 	\input{plots/energy_chart}
-	\caption[Total energy of computing]{Total energy of computing \cite{src2021}.}
+	\caption[Total energy of computing.]{Total energy of computing \cite{src2021}.}
 	\label{plt:enery_chart}
 \end{figure}
 
@@ -34,7 +34,7 @@ In contrast, compute-intensive workloads, such as visual processing, are referre
 \begin{figure}[!ht]
 	\centering
 	\input{plots/roofline}
-	\caption[Roofline model of GPT revisions]{Roofline model of GPT revisions \cite{ivobolsens2023}.}
+	\caption[Roofline model of GPT revisions.]{Roofline model of GPT revisions \cite{ivobolsens2023}.}
 	\label{plt:roofline}
 \end{figure}
 
diff --git a/src/chapters/pim.tex b/src/chapters/pim.tex
index 0664578..ffb6e19 100644
--- a/src/chapters/pim.tex
+++ b/src/chapters/pim.tex
@@ -26,7 +26,7 @@ This process is illustrated in \cref{img:dnn} where one \ac{dnn} layer is proces
 \begin{figure}
 	\centering
 	\input{images/dnn}
-	\caption[A fully connected \ac{dnn} layer]{A fully connected \ac{dnn} layer \cite{he2020}.}
+	\caption[A fully connected \ac{dnn} layer.]{A fully connected \ac{dnn} layer \cite{he2020}.}
 	\label{img:dnn}
 \end{figure}
 
@@ -108,7 +108,7 @@ To make full use of the output buffering, the matrix rows are interleaved in an
 \begin{figure}
 	\centering
 	\input{images/hynix}
-	\caption[Newton memory layout for a \ac{gemv} operation]{Newton memory layout for a \ac{gemv} operation \cite{he2020}.}
+	\caption[Newton memory layout for a \ac{gemv} operation.]{Newton memory layout for a \ac{gemv} operation \cite{he2020}.}
 	\label{img:hynix}
 \end{figure}
 
@@ -142,7 +142,7 @@ This general architecture is shown in detail in \cref{img:fimdram}, with (a) the
 \begin{figure}
 	\centering
 	\includegraphics[width=\linewidth]{images/fimdram}
-	\caption[Architecture of \aca{fimdram}]{Architecture of \aca{fimdram} \cite{lee2021}.}
+	\caption[Architecture of \aca{fimdram}.]{Architecture of \aca{fimdram} \cite{lee2021}.}
 	\label{img:fimdram}
 \end{figure}
 
@@ -185,7 +185,7 @@ This processing unit architecture is illustrated in \cref{img:pcu}, along with t
 \begin{figure}
 	\centering
 	\includegraphics[width=0.8\linewidth]{images/pcu}
-	\caption[Architecture of a \ac{pim} processing unit]{Architecture of a \ac{pim} processing unit \cite{lee2021}.}
+	\caption[Architecture of a \ac{pim} processing unit.]{Architecture of a \ac{pim} processing unit \cite{lee2021}.}
 	\label{img:pcu}
 \end{figure}
 
@@ -200,7 +200,7 @@ The data layout of these three instruction groups is shown in \cref{tab:isa}.
 \begin{table}
 	\centering
 	\includegraphics[width=\linewidth]{images/isa}
-	\caption[The instruction format of the processing units]{The instruction format of the processing units \cite{lee2021}.}
+	\caption[The instruction format of the processing units.]{The instruction format of the processing units \cite{lee2021}.}
 	\label{tab:isa}
 \end{table}
 
@@ -235,7 +235,7 @@ Another special field \textit{A} enables the \ac{aam}, which will be explained i
 	Arithmetic & MAC     & multiply-accumulate                               & GRF-B        & GRF, BANK      & GRF, BANK, SRF & GRF, BANK, SRF \\
 	Arithmetic & MAD     & multiply-and-add                                  & GRF          & GRF, BANK      & GRF, BANK, SRF & GRF, BANK, SRF  
 	\end{tblr}}
-	\caption[A list of all supported \ac{pim} instructions and their possible sources and destinations]{A list of all supported \ac{pim} instructions and their possible sources and destinations \cite{shin-haengkang2023}.}
+	\caption[A list of all supported \ac{pim} instructions and their possible sources and destinations.]{A list of all supported \ac{pim} instructions and their possible sources and destinations \cite{shin-haengkang2023}.}
 	\label{tab:instruction_set}
 \end{table}
 
@@ -259,7 +259,7 @@ For example, as shown in \cref{lst:reorder}, two consecutive \ac{mac} instructio
 MAC GRF_B #0, BANK, GRF_A #0
 MAC GRF_B #1, BANK, GRF_A #1
 \end{verbatim}
-	\caption[Exemplary sequence of \ac{mac} instructions in a microkernel]{Exemplary sequence of \ac{mac} instructions in a microkernel.}
+	\caption[Exemplary sequence of \ac{mac} instructions in a microkernel.]{Exemplary sequence of \ac{mac} instructions in a microkernel.}
 	\label{lst:reorder}
 \end{listing}
 
@@ -276,7 +276,7 @@ With this method, the register indices and the bank address cannot get out of sy
 \begin{figure}
 	\centering
 	\includegraphics[width=0.5\linewidth]{images/aam}
-	\caption[Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address]{Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address \cite{lee2021}.}
+	\caption[Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address.]{Exemplary calculation of the \ac{grf}-A and \ac{grf}-B index using the row and column address \cite{lee2021}.}
 	\label{img:aam}
 \end{figure}
 
@@ -288,7 +288,7 @@ At the core of a \ac{gemv} microkernel is an iterative \ac{mac} instruction, fol
 MAC(AAM) GRF_B, BANK, GRF_A
 JUMP -1, 7
 \end{verbatim}
-	\caption[The core of a \ac{gemv} microkernel]{The core of a \ac{gemv} microkernel.}
+	\caption[The core of a \ac{gemv} microkernel.]{The core of a \ac{gemv} microkernel.}
 	\label{lst:gemv}
 \end{listing}
 
@@ -342,7 +342,7 @@ This interleaving is illustrated in \cref{img:input_vector}.
 \begin{figure}
 	\centering
 	\input{images/input_vector}
-	\caption[Input vector in linear address space, where one chunk is mapped to all banks]{Input vector in linear address space, where one chunk is mapped to all banks.}
+	\caption{Input vector in linear address space, where one chunk is mapped to all banks.}
 	\label{img:input_vector}
 \end{figure}
 
@@ -366,7 +366,7 @@ The operation of this concrete \ac{gemv} microkernel is illustrated in \cref{img
 \begin{figure}
 	\centering
 	\includegraphics[width=0.8\linewidth]{images/memory_layout}
-	\caption[Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation]{Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation. One cell represents 16 \ac{fp16} elements, forming a $\qty{32}{\byte}$ block \cite{kang2022}.}
+	\caption[Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation.]{Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation. One cell represents 16 \ac{fp16} elements, forming a $\qty{32}{\byte}$ block \cite{kang2022}.}
 	\label{img:memory_layout}
 \end{figure}
 
@@ -380,7 +380,7 @@ As a side effect of the incremented matrix row address, this also results in an
 MAC(AAM) GRF_B, BANK, GRF_A
 JUMP -1, 63
 \end{verbatim}
-	\caption[The core of a \ac{mac} microkernel that utilizes the maximum number of register entries]{The core of a \ac{mac} microkernel that utilizes the maximum number of register entries.}
+	\caption[The core of a \ac{mac} microkernel that utilizes the maximum number of register entries.]{The core of a \ac{mac} microkernel that utilizes the maximum number of register entries.}
 	\label{lst:gemv64}
 \end{listing}
 A further increase in the total number of rows can be achieved by distributing the weight matrix over multiple \acp{pch} and running the microkernel multiple times, concatenating the output vectors on the host at the end.
@@ -405,7 +405,7 @@ The following \cref{sec:vp} introduces the concept of virtual prototyping, which
 \begin{landscape}
 \begin{figure}
 \input{images/matrix_layout}
-\caption[Mapping of the weight matrix onto the memory banks and its layout in the linear address space]{Mapping of the weight matrix onto the memory banks and its layout in the linear address space.}
+\caption[Mapping of the weight matrix onto the memory banks and its layout in the linear address space.]{Mapping of the weight matrix onto the memory banks and its layout in the linear address space.}
 \label{img:matrix_layout}
 \end{figure}
 \end{landscape}
diff --git a/src/chapters/results.tex b/src/chapters/results.tex
index 6203539..65af49c 100644
--- a/src/chapters/results.tex
+++ b/src/chapters/results.tex
@@ -72,16 +72,94 @@ This allows an exaggerated evaluation of the performance gains of \ac{pim} in an
 % comparison with normal clock and infinite compute (immer 4 simulationen, bzw. 5 mit echter hardware)
 
 \subsection{Simulation Results}
-\subsubsection{Workload Kernels}
+\subsubsection{Vector Operations}
 % Vector ADD und Vector MUL
-% Vector Skalar ADD und Vector Skalar MUL (HCAL)
+% Vector Skalar ADD und Vector Skalar MUL (HCAL) (wird wohl übersprungen)
 % Vector HAXPY x*a+y
+
+% Plots zB VADD VMUL nebeneinander für versch. Dimensionen und einer Frequenz
+% andere Frequenz nächster Plot
+% dann HAXPY
+
+The first set of benchmarks analyzes the speedup of \aca{fimdram} for various vector operations, namely an element-wise vector add operation (VADD), an element-wise vector multiply operation (VMUL), and a \ac{haxpy} operation.
+Such vector operations have a low operational density and are particularly memory-bound because there is no data reuse at all and two input operands must be loaded for each operation.
+As a result, the on-chip cache does not accelerate such workloads because all operand data must be fetched from memory anyway.
+The workloads adhere to the following calculation patterns:
+
+\begin{itemize}
+\item VADD: $z = x + y$
+\item VMUL: $z = x \cdot y$
+\item \ac{haxpy}: $z = a \cdot x + y$
+\end{itemize}
+
+Each workload is run with different input vector dimensions to examine the effect of setup overhead and potentially identify a break-even point at which \ac{pim} becomes viable.
+\Cref{tab:dimensions_vector} lists the specific vector dimensions for the following benchmarks.
+The levels X1--X4 denote the increasing dimensions, with each step doubling in size.
+
+\begin{table}
+\centering
+\begin{tblr}{
+  cell{2}{2} = {r},
+  cell{3}{2} = {r},
+  cell{4}{2} = {r},
+  cell{5}{2} = {r},
+  hlines,
+  vlines,
+  hline{2} = {-}{solid,black},
+  hline{2} = {2}{-}{solid,black},
+}
+Level & Dimensions        \\
+X1    & (256 $\times$ 1)  \\
+X2    & (512 $\times$ 1)  \\
+X3    & (1024 $\times$ 1) \\
+X4    & (2048 $\times$ 1) 
+\end{tblr}
+\caption{List of the input vector dimensions for the vector benchmarks.}
+\label{tab:dimensions_vector}
+\end{table}
+
+The benchmarks analyze the relative number of processor ticks, where the speedup $S$ is calculated as follows:
+\begin{equation}
+S = \frac{\textrm{\# of ticks in non-\ac{pim} mode}}{\textrm{\# of ticks in \ac{pim} mode}}
+\end{equation}
+
+\begin{figure}
+    \centering
+    \input{plots/vector_normal}
+    \caption{Comparison between non-\ac{pim} and \ac{pim} for the vector benchmarks running at a \ac{cpu} frequency of $\qty{3}{\giga\hertz}$.}
+    \label{fig:vector_normal}
+\end{figure}
+
+\begin{figure}
+    \centering
+    \input{plots/vector_infinite}
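+    \caption{Comparison between non-\ac{pim} and \ac{pim} for the vector benchmarks running at a \ac{cpu} frequency of $\qty{100}{\giga\hertz}$.}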
+    \label{fig:vector_infinite}
+\end{figure}
+
+\subsubsection{Neural Network Layers}
 % GEMV
     % Samsung 7.4x-8.9x
 % "inference" mit mehreren layern
      % ReLU vergleich
 
-% GEMM mit stark interleavten matrizen
+% GEMM mit stark interleavten matrizen (eher nicht)
+
+\begin{figure}
+    \centering
+    \input{plots/matrix_normal}
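+    \caption{Comparison between non-\ac{pim} and \ac{pim} for the \ac{gemv} and \ac{dnn} layer benchmarks running at a \ac{cpu} frequency of $\qty{3}{\giga\hertz}$.}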
+    \label{fig:matrix_normal}
+\end{figure}
+
+\begin{figure}
+    \centering
+    \input{plots/matrix_infinite}
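+    \caption{Comparison between non-\ac{pim} and \ac{pim} for the \ac{gemv} and \ac{dnn} layer benchmarks running at a \ac{cpu} frequency of $\qty{100}{\giga\hertz}$.}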
+    \label{fig:matrix_infinite}
+\end{figure}
+
+\subsubsection{Comparison to Real Hardware}
 
 % \subsubsection{Initialization Overhead}
 % conversion der operanden im verhältnis zur laufzeit abschätzen
diff --git a/src/chapters/vp.tex b/src/chapters/vp.tex
index 405a365..f86f56a 100644
--- a/src/chapters/vp.tex
+++ b/src/chapters/vp.tex
@@ -45,7 +45,7 @@ The framework is optimized for high simulation speed and uses the \ac{at} coding
 \begin{figure}
 	\centering
 	\includegraphics[width=0.8\linewidth]{images/dramsys}
-	\caption[The internal architecture of DRAMSys]{The internal architecture of DRAMSys \cite{jung2017a}.}
+	\caption[The internal architecture of DRAMSys.]{The internal architecture of DRAMSys \cite{jung2017a}.}
 	\label{img:dramsys}
 \end{figure}
 
diff --git a/src/index.tex b/src/index.tex
index 7485723..f25fdd1 100644
--- a/src/index.tex
+++ b/src/index.tex
@@ -21,6 +21,7 @@
 \usepackage{url}
 \usepackage[urldate=long,sorting=none,maxbibnames=5]{biblatex}
 \usepackage{pgfplots}
+\usepackage{pgfplotstable}
 \usepackage{bytefield}
 \usepackage{mathdots}
 \usepackage{tabularray}
@@ -46,6 +47,11 @@
 
 % Custom colors
 \definecolor{verylightgray}{gray}{0.85}
+\definecolor{_darkblue}{RGB}{68, 114, 196}
+\definecolor{_blue}{RGB}{91, 155, 213}
+\definecolor{_green}{RGB}{112, 173, 71}
+\definecolor{_orange}{RGB}{237, 125, 49}
+\definecolor{_yellow}{RGB}{255, 192, 0}
 
 % Penalties
 \clubpenalty = 10000
diff --git a/src/plots/matrix_infinite.tex b/src/plots/matrix_infinite.tex
new file mode 100644
index 0000000..dbe5ffe
--- /dev/null
+++ b/src/plots/matrix_infinite.tex
@@ -0,0 +1,31 @@
+\begin{tikzpicture} % Grouped bar chart of the `speedup` column of the 100GHz GEMV and GEMV-layers tables.
+    \pgfplotstableread[col sep=comma]{plots/tables/gemv_100GHz.csv}\gemv % parse the CSV into the table macro \gemv
+    \pgfplotstableread[col sep=comma]{plots/tables/gemv_layers_100GHz.csv}\gemvlayers % parse the CSV into \gemvlayers
+    \begin{axis}[
+            width=0.9\textwidth,
+            ybar=1pt, % vertical bars; 1pt shift between bars of different plots at the same x
+            bar width = 15pt,
+            ymin=0,
+            ymax=5, % fixed y range 0..5
+            ytick distance=1, % one major y tick per unit
+            ymajorgrids, % horizontal grid lines at the major y ticks
+            ylabel={Speedup},
+            tick pos=left, % tick marks only on the left/bottom axis lines
+            xtick=data, % place x ticks at the coordinates of the first plot
+            xticklabels from table={\gemv}{level}, % label the x ticks with the `level` column of \gemv
+            enlarge x limits=0.25, % pad the x range so the outer bar groups are not clipped
+            legend style={
+                at={(current bounding box.south-|current axis.south)}, % below the picture, horizontally centred on the axis
+                anchor=north,
+                legend columns=-1, % all legend entries in a single row
+                draw=none, % no frame around the legend
+                /tikz/every even column/.append style={column sep=0.5cm} % horizontal gap between legend entries
+            },
+        ]
+        \addplot[fill=_blue!90] table [x expr=\coordindex, y={speedup}]{\gemv}; % x = row index, y = speedup
+        \addlegendentry{GEMV}
+    
+        \addplot[fill=_orange!90] table [x expr=\coordindex, y={speedup}]{\gemvlayers}; % x = row index, y = speedup
+        \addlegendentry{DNN Layers}
+    \end{axis}
+\end{tikzpicture}
diff --git a/src/plots/matrix_normal.tex b/src/plots/matrix_normal.tex
new file mode 100644
index 0000000..19d20cc
--- /dev/null
+++ b/src/plots/matrix_normal.tex
@@ -0,0 +1,31 @@
+\begin{tikzpicture} % Grouped bar chart of the `speedup` column of the 3GHz GEMV and GEMV-layers tables.
+    \pgfplotstableread[col sep=comma]{plots/tables/gemv_3GHz.csv}\gemv % parse the CSV into the table macro \gemv
+    \pgfplotstableread[col sep=comma]{plots/tables/gemv_layers_3GHz.csv}\gemvlayers % parse the CSV into \gemvlayers
+    \begin{axis}[
+            width=0.9\textwidth,
+            ybar=1pt, % vertical bars; 1pt shift between bars of different plots at the same x
+            bar width = 15pt,
+            ymin=0,
+            ymax=35, % fixed y range 0..35
+            minor y tick num = 5, % five minor ticks between two major y ticks
+            ymajorgrids, % horizontal grid lines at the major y ticks
+            ylabel={Speedup},
+            tick pos=left, % tick marks only on the left/bottom axis lines
+            xtick=data, % place x ticks at the coordinates of the first plot
+            xticklabels from table={\gemv}{level}, % label the x ticks with the `level` column of \gemv
+            enlarge x limits=0.25, % pad the x range so the outer bar groups are not clipped
+            legend style={
+                at={(current bounding box.south-|current axis.south)}, % below the picture, horizontally centred on the axis
+                anchor=north,
+                legend columns=-1, % all legend entries in a single row
+                draw=none, % no frame around the legend
+                /tikz/every even column/.append style={column sep=0.5cm} % horizontal gap between legend entries
+            },
+        ]
+        \addplot[fill=_blue!90] table [x expr=\coordindex, y={speedup}]{\gemv}; % x = row index, y = speedup
+        \addlegendentry{GEMV}
+    
+        \addplot[fill=_orange!90] table [x expr=\coordindex, y={speedup}]{\gemvlayers}; % x = row index, y = speedup
+        \addlegendentry{DNN Layers}
+    \end{axis}
+\end{tikzpicture}
diff --git a/src/plots/tables/gemv_100GHz.csv b/src/plots/tables/gemv_100GHz.csv
new file mode 100644
index 0000000..566d659
--- /dev/null
+++ b/src/plots/tables/gemv_100GHz.csv
@@ -0,0 +1,5 @@
+workload,level,frequency,speedup
+gemv,X1,100GHz,0.2108059965502083
+gemv,X2,100GHz,0.40509080127411157
+gemv,X3,100GHz,0.8462958338758609
+gemv,X4,100GHz,4.7274497979448125
diff --git a/src/plots/tables/gemv_3GHz.csv b/src/plots/tables/gemv_3GHz.csv
new file mode 100644
index 0000000..d5e7712
--- /dev/null
+++ b/src/plots/tables/gemv_3GHz.csv
@@ -0,0 +1,5 @@
+workload,level,frequency,speedup
+gemv,X1,3GHz,3.468782996825547
+gemv,X2,3GHz,6.723879985176877
+gemv,X3,3GHz,12.744110856471028
+gemv,X4,3GHz,23.645526777997713
diff --git a/src/plots/tables/gemv_layers_100GHz.csv b/src/plots/tables/gemv_layers_100GHz.csv
new file mode 100644
index 0000000..787a2ff
--- /dev/null
+++ b/src/plots/tables/gemv_layers_100GHz.csv
@@ -0,0 +1,5 @@
+workload,level,frequency,speedup
+gemv_layers,X1,100GHz,0.18703680911951287
+gemv_layers,X2,100GHz,0.35722454947444127
+gemv_layers,X3,100GHz,0.6338568319278073
+gemv_layers,X4,100GHz,1.638629460755059
diff --git a/src/plots/tables/gemv_layers_3GHz.csv b/src/plots/tables/gemv_layers_3GHz.csv
new file mode 100644
index 0000000..6c5060e
--- /dev/null
+++ b/src/plots/tables/gemv_layers_3GHz.csv
@@ -0,0 +1,5 @@
+workload,level,frequency,speedup
+gemv_layers,X1,3GHz,3.194018430394461
+gemv_layers,X2,3GHz,6.206580081241512
+gemv_layers,X3,3GHz,11.305511591995977
+gemv_layers,X4,3GHz,20.27760945615218
diff --git a/src/plots/tables/haxpy_100GHz.csv b/src/plots/tables/haxpy_100GHz.csv
new file mode 100644
index 0000000..33ab8fd
--- /dev/null
+++ b/src/plots/tables/haxpy_100GHz.csv
@@ -0,0 +1,5 @@
+workload,level,frequency,speedup
+haxpy,X1,100GHz,2.0481358611246403
+haxpy,X2,100GHz,2.3234133539462776
+haxpy,X3,100GHz,2.272582592673281
+haxpy,X4,100GHz,2.3895030032424387
diff --git a/src/plots/tables/haxpy_3GHz.csv b/src/plots/tables/haxpy_3GHz.csv
new file mode 100644
index 0000000..4970b87
--- /dev/null
+++ b/src/plots/tables/haxpy_3GHz.csv
@@ -0,0 +1,5 @@
+workload,level,frequency,speedup
+haxpy,X1,3GHz,19.816741597088416
+haxpy,X2,3GHz,25.395400082633245
+haxpy,X3,3GHz,28.676005064893953
+haxpy,X4,3GHz,31.783592582828017
diff --git a/src/plots/tables/vadd_100GHz.csv b/src/plots/tables/vadd_100GHz.csv
new file mode 100644
index 0000000..4ad7277
--- /dev/null
+++ b/src/plots/tables/vadd_100GHz.csv
@@ -0,0 +1,5 @@
+workload,level,frequency,speedup
+vadd,X1,100GHz,2.398047786583271
+vadd,X2,100GHz,1.823243660465808
+vadd,X3,100GHz,1.562017010059411
+vadd,X4,100GHz,1.7888939829610704
diff --git a/src/plots/tables/vadd_3GHz.csv b/src/plots/tables/vadd_3GHz.csv
new file mode 100644
index 0000000..71f7e18
--- /dev/null
+++ b/src/plots/tables/vadd_3GHz.csv
@@ -0,0 +1,5 @@
+workload,level,frequency,speedup
+vadd,X1,3GHz,12.766775777414075
+vadd,X2,3GHz,14.19338061465721
+vadd,X3,3GHz,15.313227057302887
+vadd,X4,3GHz,16.430379164365913
diff --git a/src/plots/tables/vmul_100GHz.csv b/src/plots/tables/vmul_100GHz.csv
new file mode 100644
index 0000000..2f93e8e
--- /dev/null
+++ b/src/plots/tables/vmul_100GHz.csv
@@ -0,0 +1,5 @@
+workload,level,frequency,speedup
+vmul,X1,100GHz,2.4019901321627604
+vmul,X2,100GHz,2.2189241847884267
+vmul,X3,100GHz,1.86705278821741
+vmul,X4,100GHz,1.7484391189395834
diff --git a/src/plots/tables/vmul_3GHz.csv b/src/plots/tables/vmul_3GHz.csv
new file mode 100644
index 0000000..d6898d6
--- /dev/null
+++ b/src/plots/tables/vmul_3GHz.csv
@@ -0,0 +1,5 @@
+workload,level,frequency,speedup
+vmul,X1,3GHz,14.157521157521158
+vmul,X2,3GHz,15.915413533834586
+vmul,X3,3GHz,16.959713823354058
+vmul,X4,3GHz,18.215465292791755
diff --git a/src/plots/vector_infinite.tex b/src/plots/vector_infinite.tex
new file mode 100644
index 0000000..0b7f593
--- /dev/null
+++ b/src/plots/vector_infinite.tex
@@ -0,0 +1,35 @@
+\begin{tikzpicture} % Grouped bar chart of the `speedup` column of the 100GHz VADD, VMUL and HAXPY tables.
+    \pgfplotstableread[col sep=comma]{plots/tables/vadd_100GHz.csv}\vadd % parse the CSV into the table macro \vadd
+    \pgfplotstableread[col sep=comma]{plots/tables/vmul_100GHz.csv}\vmul % parse the CSV into \vmul
+    \pgfplotstableread[col sep=comma]{plots/tables/haxpy_100GHz.csv}\haxpy % parse the CSV into \haxpy
+    \begin{axis}[
+            width=0.9\textwidth,
+            ybar=1pt, % vertical bars; 1pt shift between bars of different plots at the same x
+            bar width = 15pt,
+            ymin=0,
+            ymax=5, % fixed y range 0..5
+            ytick distance=1, % one major y tick per unit
+            ymajorgrids, % horizontal grid lines at the major y ticks
+            ylabel={Speedup},
+            tick pos=left, % tick marks only on the left/bottom axis lines
+            xtick=data, % place x ticks at the coordinates of the first plot
+            xticklabels from table={\vadd}{level}, % label the x ticks with the `level` column of \vadd
+            enlarge x limits=0.25, % pad the x range so the outer bar groups are not clipped
+            legend style={
+                at={(current bounding box.south-|current axis.south)}, % below the picture, horizontally centred on the axis
+                anchor=north,
+                legend columns=-1, % all legend entries in a single row
+                draw=none, % no frame around the legend
+                /tikz/every even column/.append style={column sep=0.5cm} % horizontal gap between legend entries
+            },
+        ]
+        \addplot[fill=_blue!90] table [x expr=\coordindex, y={speedup}]{\vadd}; % x = row index, y = speedup
+        \addlegendentry{VADD}
+    
+        \addplot[fill=_orange!90] table [x expr=\coordindex, y={speedup}]{\vmul}; % x = row index, y = speedup
+        \addlegendentry{VMUL}
+    
+        \addplot[fill=_yellow!90] table [x expr=\coordindex, y={speedup}]{\haxpy}; % x = row index, y = speedup
+        \addlegendentry{HAXPY}
+    \end{axis}
+\end{tikzpicture}
diff --git a/src/plots/vector_normal.tex b/src/plots/vector_normal.tex
new file mode 100644
index 0000000..37631c6
--- /dev/null
+++ b/src/plots/vector_normal.tex
@@ -0,0 +1,35 @@
+\begin{tikzpicture} % Grouped bar chart of the `speedup` column of the 3GHz VADD, VMUL and HAXPY tables.
+    \pgfplotstableread[col sep=comma]{plots/tables/vadd_3GHz.csv}\vadd % parse the CSV into the table macro \vadd
+    \pgfplotstableread[col sep=comma]{plots/tables/vmul_3GHz.csv}\vmul % parse the CSV into \vmul
+    \pgfplotstableread[col sep=comma]{plots/tables/haxpy_3GHz.csv}\haxpy % parse the CSV into \haxpy
+    \begin{axis}[
+            width=0.9\textwidth,
+            ybar=1pt, % vertical bars; 1pt shift between bars of different plots at the same x
+            bar width = 15pt,
+            ymin=0,
+            ymax=35, % fixed y range 0..35
+            minor y tick num = 5, % five minor ticks between two major y ticks
+            ymajorgrids, % horizontal grid lines at the major y ticks
+            ylabel={Speedup},
+            tick pos=left, % tick marks only on the left/bottom axis lines
+            xtick=data, % place x ticks at the coordinates of the first plot
+            xticklabels from table={\vadd}{level}, % label the x ticks with the `level` column of \vadd
+            enlarge x limits=0.25, % pad the x range so the outer bar groups are not clipped
+            legend style={
+                at={(current bounding box.south-|current axis.south)}, % below the picture, horizontally centred on the axis
+                anchor=north,
+                legend columns=-1, % all legend entries in a single row
+                draw=none, % no frame around the legend
+                /tikz/every even column/.append style={column sep=0.5cm} % horizontal gap between legend entries
+            },
+        ]
+        \addplot[fill=_blue!90] table [x expr=\coordindex, y={speedup}]{\vadd}; % x = row index, y = speedup
+        \addlegendentry{VADD}
+    
+        \addplot[fill=_orange!90] table [x expr=\coordindex, y={speedup}]{\vmul}; % x = row index, y = speedup
+        \addlegendentry{VMUL}
+    
+        \addplot[fill=_yellow!90] table [x expr=\coordindex, y={speedup}]{\haxpy}; % x = row index, y = speedup
+        \addlegendentry{HAXPY}
+    \end{axis}
+\end{tikzpicture}