From 88d46788c7b11e8fbc7636bc8530621af3712687 Mon Sep 17 00:00:00 2001
From: Derek Christ <christ.derek@gmail.com>
Date: Thu, 15 Feb 2024 14:49:08 +0100
Subject: [PATCH] Data Structures

---
 src/chapters/conclusion.tex             |   7 ++
 src/chapters/implementation/kernel.tex  |   8 ++
 src/chapters/implementation/library.tex |  96 ++++++++++++++++++++++--
 src/chapters/pim.tex                    |  25 +++---
 src/chapters/results.tex                |   2 +
 src/doc.bib                             |   5 ++
 src/images/compute_array.pdf            | Bin 0 -> 9019 bytes
 src/images/input_vector.tex             |  24 +++---
 8 files changed, 139 insertions(+), 28 deletions(-)
 create mode 100644 src/images/compute_array.pdf

diff --git a/src/chapters/conclusion.tex b/src/chapters/conclusion.tex
index 5a8db11..36b1cce 100644
--- a/src/chapters/conclusion.tex
+++ b/src/chapters/conclusion.tex
@@ -1,2 +1,9 @@
 \section{Conclusion and Future Work}
 \label{sec:conclusion}
+
+% what to do better:
+    % implement Samsung's real mode switching and programming of CRFs
+    % implement linux kernel driver
+    % make use of Samsung PIM in a real DNN application and measure the effects
+    % compare with SIMD insts in ARM
+    % compare with real TPUs and GPU platforms
diff --git a/src/chapters/implementation/kernel.tex b/src/chapters/implementation/kernel.tex
index 620fdee..fafdf09 100644
--- a/src/chapters/implementation/kernel.tex
+++ b/src/chapters/implementation/kernel.tex
@@ -1,2 +1,10 @@
 \subsection{Application Kernel}
 \label{sec:kernel}
+
+% python config
+% bare metal vs linux
+% linker script
+% start assembly script
+% ARM page tables
+% cache management
+% heap allocation
diff --git a/src/chapters/implementation/library.tex b/src/chapters/implementation/library.tex
index f19a1a5..b55e8f4 100644
--- a/src/chapters/implementation/library.tex
+++ b/src/chapters/implementation/library.tex
@@ -7,18 +7,100 @@ While it is possible to shift the responsibility for interacting with the \aca{f
 Such a \ac{pim} library must include the following essential features to fully interact with the processing units in memory:
 
 \begin{itemize}
-\item It must support the \textbf{mode-setting} required to switch between \ac{sb}, \ac{ab} and \ac{abp} mode.
-\item It should provide data structures to build up \textbf{microkernels} and functions to upload the kernels to the \acp{crf} of the processing units.
-\item To meet the special requirements for the \textbf{memory layout} of the algorithm's inputs and outputs, it should provide data structures to represent vectors and matrices according to the layout constraints.
+\item It must support the \textbf{mode setting} required to switch between \ac{sb}, \ac{ab} and \ac{abp} mode.
+\item It should provide data structures to assemble \textbf{microkernels} and functions to transfer the microkernels to the \acp{crf} of the processing units.
+\item To meet the \textbf{memory layout} requirements of the inputs and outputs of an algorithm, it should provide data structures to represent vectors and matrices according to the special layout constraints.
 \item After switching the mode to \ac{abp}, the library should provide functionality to \textbf{execute a user-defined microkernel} by issuing the necessary memory requests through the execution of \ac{ld} and \ac{st} instructions.
-\item For platforms, where it is not possible to mark the \ac{pim} memory regions as uncacheable, the library should provide the necessary \textbf{cache management} operations to bypass the cache filtering and to generate the right amount of \ac{rd} and \ac{wr} \ac{dram} commands.
+\item For platforms where it is not possible to mark the \ac{pim} memory region as uncacheable, the library should provide the necessary \textbf{cache management} operations to bypass the cache filtering and to generate the right amount of \ac{rd} and \ac{wr} \ac{dram} commands.
 \end{itemize}
 
-% - mode setting
+As already discussed in \cref{sec:vm}, for simplicity and debuggability reasons, the host processor communicates with the \ac{pim} model in the \ac{dram} using a \ac{json}-based protocol.
+To achieve this, a small shared library that defines the communication data structures as well as routines to serialize and deserialize them is linked by both the \ac{pim} support library and the \ac{pim} model in DRAMSys.
+A predefined memory region is then used to differentiate these communication messages from the regular memory traffic.
+Ideally, this memory region is also set as uncacheable, so that the messages do not get stuck in the on-chip cache.
+Alternatively, the software library must ensure that the cache is flushed after the \ac{json} message is written to the memory region.
+
+With the mode setting implemented, the shared library also provides type definitions to represent the \ac{pim} instructions in memory and to transfer entire microkernels consisting of 32 instructions to the processing units.
+An instruction is simply represented by one of 9 different \texttt{enum} variants, each holding its necessary fields, such as the source or destination register files, as shown in \cref{lst:instruction_enums}.
+
+\begin{listing}
+\begin{minipage}[t,c]{0.45\linewidth}
+\begin{minted}{rust}
+enum Instruction {
+    NOP,
+    EXIT,
+    JUMP {
+        offset: i8,
+        count: u8,
+    },
+    MOV {
+        src: File,
+        dst: File,
+    },
+    // ...
+}
+\end{minted}
+\end{minipage}
+\begin{minipage}[t,c]{0.45\linewidth}
+\begin{minted}{rust}
+enum File {
+    GrfA { index: u8 },
+    GrfB { index: u8 },
+    SrfM { index: u8 },
+    SrfA { index: u8 },
+    Bank,
+}
+\end{minted}
+\end{minipage}
+	\caption[The \texttt{enum} definitions of the instructions and register files]{The \texttt{enum} definitions of the instructions and register files.}
+	\label{lst:instruction_enums}
+\end{listing}
+
+A microkernel is then simply an array of 32 instructions.
 
 \subsubsection{Data Structures}
-% - memory layout
-% - microkernel programming
+
+The software library provides several data structures that adhere to the special memory layout requirements of \aca{fimdram}.
+In the simplest case, the \ac{pim} operations should be applied to flat arrays of \ac{fp16} numbers, adding two of them together or scaling one by a scalar value.
+For such a flat array, several things have to be considered:
+
+\begin{itemize}
+\item It must at least span over all banks of a \ac{pch}.
+\item The start of the array must lie on the first bank of the \ac{pch} and the end of the array must lie on the last bank of the \ac{pch}.
+\end{itemize}
+
+The software library introduces the \texttt{BankArray} data structure, which has the size of $\qty{32}{\byte}*\mathrm{\#\ banks\ per\ \ac{pch}}=\qty{512}{\byte}$, holding in total 256 \ac{fp16} numbers.
+To guarantee the correct placement, an alignment of $\qty{512}{\byte}$ is explicitly enforced.
+While it may seem at first that the compiler implicitly enforces this alignment, this is not true for arrays consisting of smaller data types: the compiler only enforces a $\qty{2}{\byte}$ alignment for the \ac{fp16} array, since a \ac{fp16} number is $\qty{2}{\byte}$ in size.
+This memory layout assumes a bank interleaving \ac{am}, where after a complete burst the memory controller addresses the next bank of the \ac{pch}.
+
+To support arrays larger than $\qty{512}{\byte}$, the \texttt{BankArray} can also be instantiated multiple times in a larger \texttt{ComputeArray}.
+This \texttt{ComputeArray} inherits the alignment requirements of the \texttt{BankArray}, so that it does not need to be explicitly aligned.
+Also, arrays smaller than $\qty{512}{\byte}$ are possible by simply not filling the entire array with values.
+However, the \texttt{BankArray} may not be smaller than this minimum size as it must at least span all banks of a \ac{pch} to reserve the memory region, so that the compiler does not place other data there, which would otherwise be overwritten with invalid data during a \ac{pim} operation.
+This \texttt{ComputeArray} and \texttt{BankArray} layout is illustrated in \cref{img:compute_array}.
+
+\begin{figure}
+	\centering
+	\includegraphics[width=\linewidth]{images/compute_array}
+	\caption[Memory layout of a flat \ac{fp16} array spanning over four banks]{Memory layout of a flat \ac{fp16} array spanning over four banks.}
+	\label{img:compute_array}
+\end{figure}
+
+However, to leverage \aca{fimdram} to accelerate \ac{dnn} applications, the library must also support data structures to represent matrices and vectors in the required memory layout.
+As already discussed in \cref{sec:memory_layout}, the weight matrix must be laid out in a column-major fashion, grouped in vectors of 16 \ac{fp16} elements.
+To avoid reinventing numerous routines for initializing and manipulating matrices, the publicly available open-source linear algebra library nalgebra~\cite{nalgebra} is used.
+In order to achieve the packed \ac{fp16} layout, a special \ac{simd} data type abstraction is used, while taking into account the changed dimensions of the matrix.
+Besides the weight matrices, the input vector must adhere to an interleaved layout at the granularity of the 16-wide \ac{fp16} vector, as described in \cref{sec:memory_layout}.
+The number of copies of each chunk is equal to the number of processing units in each \ac{pch}.
+While it would be possible to use the \ac{ab} mode of \aca{fimdram}, the interleaving is done in software for the purpose of debuggability, since the initialization step cannot be modeled accurately anyway due to the \ac{json}-based communication protocol.
+In addition to the input vector, the output of the \ac{gemv} kernel is not a flat vector, but a 16-column matrix that must be reduced by the host after the \ac{pim} operation.
+Therefore, before the operation, the output matrix must be allocated as a vector consisting of a \ac{simd} \ac{fp16} vector for every matrix row.
+The bank interleaving of the \ac{am} leads to the correct, sequential representation in linear address space after the \ac{mac} results are written from the \ac{grf} register files to the memory banks.
+The host can then simply read the result from the pre-allocated output vector and reduce the results to prepare them for the next \ac{dnn} layer.
+
+With the introduced data structures used for addition, scalar multiplication and \ac{gemv} kernels, the software library must also support the execution of the programmed \ac{pim} microkernels.
+The implementation of the \aca{fimdram} execution model is explained in the following section.
 
 \subsubsection{Microkernel Execution}
 % - microkernel execution
diff --git a/src/chapters/pim.tex b/src/chapters/pim.tex
index 4fb0d8b..cada628 100644
--- a/src/chapters/pim.tex
+++ b/src/chapters/pim.tex
@@ -317,15 +317,20 @@ Thus, a total of 64 thread groups running in parallel can be spawned in a \ac{hb
 \label{sec:memory_layout}
 
 As already described in \cref{sec:instruction_ordering}, the use of the \ac{aam} mode requires a special memory layout so that the register indices are correctly calculated from the column and row addresses of a memory access.
-To make use of all eight \ac{grf}-A registers, the input address has to increment linearly, resulting in a row-major matrix layout.
-In a row-major matrix layout, the entries of a row are stored sequentially before switching to the next row, according to the \texttt{MATRIX[R][C]} \ac{c}-like array notation.
+To make use of all eight \ac{grf}-A registers, the input address has to increment linearly, while adhering to a column-major matrix layout.
+In a column-major matrix layout, the entries of a column are stored sequentially before switching to the next column, according to the \texttt{MATRIX[C][R]} \ac{c}-like array notation.
+However, the concrete element type of the array is not a single \ac{fp16} number, but a vector of 16 \ac{fp16} numbers packed together.
+This results in 16 \ac{fp16} matrix row elements being stored sequentially before switching to the next 16 \ac{fp16} elements in the next row of the same 16 columns, ensuring that a \ac{simd} processing unit always contains the data of only one matrix row.
 
-The \aca{fimdram} architecture imposes certain dimensional constraints on the weight matrix and the input vector.
+The \aca{fimdram} architecture also imposes certain dimensional constraints on the weight matrix and the input vector.
 As all eight processing units in a \ac{pch} operate at the same time, the number of rows must be a multiple of eight to make use of the full processing bandwidth.
 Those matrix row blocks possibly span over multiple \ac{dram} rows or even other \acp{pch}.
-Furthermore, the number of columns must be set so that exactly after one matrix row, the next bank in the \ac{pch} is addressed, so that all the processing units operate on eight different rows, stored in eight different banks, at the same time.
-This does not mean that a matrix row must be the same size as a \ac{dram} row, only that the \ac{am} of the memory controller must switch to the next bank after a complete matrix row.
-Once all banks have been accessed, the mapping of the column bits can continue.
+% Furthermore, the number of columns must be set so that exactly after one matrix row, the next bank in the \ac{pch} is addressed, so that all the processing units operate on eight different rows, stored in eight different banks, at the same time.
+% This does not mean that a matrix row must be the same size as a \ac{dram} row, only that the \ac{am} of the memory controller must switch to the next bank after a complete matrix row.
+% Once all banks have been accessed, the mapping of the column bits can continue.
+Furthermore, the number of columns defines the number of iterations the \ac{mac} core of the microkernel has to perform.
+Since 16 \ac{fp16} elements are always packed together in a column-major fashion, and provided that the \ac{am} of the memory controller switches to the next bank after exactly one burst size, the \ac{pim} units each contain 16 different matrix row elements of the same set of 16 matrix columns.
+Note that this interleaving of \ac{fp16} vectors is very similar to the chunking of the weight matrix of SK Hynix's Newton architecture, as illustrated in \cref{img:hynix}.
 
 The input vector must adhere also a special memory layout.
 Since a vector is essentially a single-column matrix, it is always laid out sequentially in memory.
@@ -345,15 +350,17 @@ To initialize the input vector in this way, the host processor can use \ac{ab} m
 From the processor's point of view, only the first bank is initialized, but the \ac{ab} mode ensures that the same data is written to all banks at the same time.
 
 An example with a weight matrix of dimensions (128,8), an input vector of size (128), and an output vector of size (8) will be analyzed in the following to describe how the processing units execute a \ac{gemv} microkernel.
-With the processing unit \textit{i}, the number of iterations \textit{j}, the input vector \textit{a} and the weight matrix \textit{w}, the partial sum $psum[i,0:15]$ is calculated as follows:
+With the processing unit \textit{i}, the number of iterations \textit{j}, the input vector \textit{a} and the weight matrix \textit{w}, the partial sum $psum[i,0:15]$ is calculated as described in \cref{eq:partial_sum}:
 
 \begin{equation}
 psum[i,0:15]=\sum_{j=0}^{8}(a[j*16:j*16+15]*w[i,j*16:j*16+15])
+\label{eq:partial_sum}
 \end{equation}
 
 The partial sum vector $psum[0:7,0:15]$ must then be reduced by the host processor to obtain the final output vector $b[0:7]$.
 This reduction step is mandatory because there is no means in the \aca{fimdram} architecture to reduce the output sums of the 16-wide \ac{simd} \acp{fpu}.
 In contrast, SK Hynix's Newton implements adder trees in the \ac{pim} units to reduce the partial sums directly in memory.
+Note that, consequently, the activation function often used in \acp{dnn}, i.e., \ac{relu} in the case of \aca{fimdram}, cannot be applied without first reducing the partial sums, since the \ac{relu} operation is a non-linear function.
 The operation of this concrete \ac{gemv} microkernel is illustrated in \cref{img:memory_layout}.
 
 \begin{figure}
@@ -365,8 +372,8 @@ The operation of this concrete \ac{gemv} microkernel is illustrated in \cref{img
 
 In the \cref{img:memory_layout} it can be seen that a processing unit is responsible for multiplying and adding one row of the matrix with the input vector in eight cycles, forming the partial sum.
 This example only demonstrates the execution of the native matrix dimensions for one \ac{pch}.
-To increase the number of rows in the matrix, simply additional iterations of this 8-cycle microkernel are required, while feeding in the other memory addresses for the subsequent matrix rows.
-As a side effect of the incremented bank address, this also results in an increment of the \ac{grf}-B index, making it possible to increase the maximum number of matrix rows to $8*8=64$ before all eight \ac{grf}-B entries are filled with partial sums, as demonstrated in \cref{lst:gemv64}.
+Increasing the number of rows in the matrix simply requires additional iterations of this 8-cycle microkernel, while feeding in the other memory addresses for the subsequent matrix rows.
+As a side effect of the incremented matrix row address, this also results in an increment of the \ac{grf}-B index, making it possible to increase the maximum number of matrix rows to $8*8=64$ before all eight \ac{grf}-B entries are filled with partial sums, as demonstrated in \cref{lst:gemv64}.
 
 \begin{listing}
 \begin{verbatim}
diff --git a/src/chapters/results.tex b/src/chapters/results.tex
index 3ba9519..e426c0a 100644
--- a/src/chapters/results.tex
+++ b/src/chapters/results.tex
@@ -1,2 +1,4 @@
 \section{Simulation Results}
 \label{sec:results}
+
+% gem5 m5ops routines/implementation in the kernel
diff --git a/src/doc.bib b/src/doc.bib
index e007e2a..c8c7498 100644
--- a/src/doc.bib
+++ b/src/doc.bib
@@ -387,6 +387,11 @@
   file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/6FI79KY6/Mutlu et al. - 2019 - Processing data where it makes sense Enabling in-.pdf}
 }
 
+@online{nalgebra,
+  title = {Linear Algebra Library for the {{Rust}} Programming Language},
+  url = {https://nalgebra.org/}
+}
+
 @book{nielsen2015,
   title = {Neural Networks and Deep Learning},
   author = {Nielsen, Michael A.},
diff --git a/src/images/compute_array.pdf b/src/images/compute_array.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..0178b659f08c5312a9cdb54446e3c1b4a730645e
GIT binary patch
literal 9019
zcmb7q2RNKt*LL)n5K+?93DFtTbxHJ|=mcRfy1^JVAw(pIk`O&ebP>Jxl4yzE2@%nP
zL>Ha!nN!|#zH|Qn{jYytF3;ZUS$plh@7e1f*Sc}vkyjLkL69Ks!Ih;M5DW|jJ6qU;
zq@+aT@K_AN84u>uaQ3ltbi|0DAW*OgpCJq;30AkmxqE@V#L#AFqzOOxmWzuc)&Ogv
zW=9Y~Aw(evG~lSFtgE3e1a`D@z=D;qmJZH<qZJ<G39)k)fg>PL2plYDi+6UyiooDd
zV2Fl_i$h>gaj=3T){=mC#)-hiATVG}9gH;wZ}&%Fevrr=yt9?NB^KD3j)NUW1ULZL
z2?zna`EAQ~&Nz7t0So4nmw-dzNGKeJf<d8ZxTqi$$_xAeej3hJ|J_AK286{~5rIJb
z1&AdE3<HVCYJf#Fo$*c>$G@DAe>utBB0Bx<sxAkHf<)9+fBS7fv}>q?ksuMBTVMj-
z9s7Tdh5rAD1p~tW*I=T51yi(hBmkI+C^`b*$YU*?t$+=vV{tYFTd*h^CL;rOBLJVc
z6UggTcl;FYGn6K1ZHK9j9U8-=a(w1JM4wQ<%4|+|M~Ll-_08H{DoP{Qv_7?&4EG{(
zl6RII+WSJ$st>-x>&}qOq^~`J9i+YAc;6;n>yr}7bVK=C?kE$i=@Z@VG1-wvFJw>q
z<+;{tBYyJtR6`ZqJhzKn$s5eiq|SU6{y`-Yxw;%Xvk@tH;j9?=?1+%VVktBrG9GU3
z&ISt9Ihm>Rlqtn)Xu<cou6H}N%2bw}#dPJr$L4C5RMS0{*$pUN(C4>IjB>NB&bP%c
zLdwbY*WVWV-`1vV9PT3DlsTOetjvmFN58LU^REaz)@^t<Icz`yD!MCa@9@~aA3;H(
zXXrG^`{8vkXRvB8J}@{;`RJ;C<7D^pFg2?Y3-5HHQ~-L{Z+_mJzMGUMj$CoW(PE_L
zYzh2a!aW9~=cO-<Rv0>D6N7{rNXCz%*m@x2T(8v@?wTKi4x)mhNjYlS-mp6y(j4q0
z>2HOYK6zi(m@;C^L*Mh=9o3dplT2Hr+&FkzI76Lc8uree+6wL*>Czv|yXee797#CG
z0i*22HDE+<wdFzmtfH;}lrkC2^F@KZb?mNIgD$TCKi{57qvLz)mkmLT@=HybH|eFG
za(+z^qg@Rd^W2na;;en^8q$CMp8Sx1PH;|G&Q&eOv^h^oZ$g&%-W64TYPfe$xZ3<p
zuVo6>$2|j&nVe`UV0%?q!Pg=5(INfRjB1Q&V^^#!`&`u+kx5tH@+k7E3mG@VFMiS$
z(kWMO{k>BRbg-XZ3hNZBkN@7OP!Gg13k{klP<}!hW?3af0V5q|k_||(k7n?dUs@8_
zW&4j4^K)(!JWs<vRk4Kn%IBQ-T-32&ynh;fxHPi5j6UBxdta$x`dC%z_Rz_57FnH5
zrVB3=9gde>J)3nro6)<N$ANYt^bho@d2-g?q>gxws?lqcYu=n>ZYd*=&`VU0`Sr71
zpFI!RT4P|(;y<yv?elKnF!aIV32D=^L_9I&|Hy$qk{|w8`sjMQV8J4HFg8HeXk*=+
z-SL)KH!whx02^8AU<qL3--t*=7wbg;i>NpOj<SC|<^Fi8n1VzUya-A<1R^~FQ<V^4
zBE4vXM3j(VB8L!rXfTlhv;lGvQFO);h<=JFFzR<3(W-&9vct$adx4FifDwrn1EbLp
zag;dFfCc1$IanOQ4NOdZAf67uG?s)r82oQWfG>XUg2nz=<d2gQhkx+?Ung}VhI0q@
z05F%Dos}Edm<Wir>0iVr{?BQ07y`!8*@n1`4S>%-R)G<B{KqoCQx6O*4u!#iqy86m
zLWzot{hglb4bMH$osT(9J&ae1>g-Ck;<?NN^{i4q<18C>^${gMVgdF<5hHuH$Nw7Q
z`AsinS)RKa3KxbPpFavE<nkGn=iiB8xf!DGP&PaqDu{EaohndsEdMMJ>`-1|Ze6k4
zbDg{#_i$?3xmROVdfMA}QF2;*vDWR@8GJw4rQ)FQ%!VY~^di-F4w4W0_ZZi#K96bo
zpOAmeiWzlRGdtZqDD{6sY3^g-UeK@*yBAIR+>$Iz$)KGv`2vR01?2CvbD)tYV0N9P
z;}B;4)6w}^rL|=(=nhTOk6*tks<5JmYiGJCpykJjt1DdXVa}wES8q<ONb0wZqCJl;
znb(j=xoI37X#NsA>@j<=Pvc*9%}?X=T#|q@dDT-ApVy^a{mINN?EWB>pN)a>V#jP;
zFO_;bNd@znbBtHczSac^m6eMQatgH#L;VA2Gpm@**{#J~z(sx6cHUWUsrjFot(zm&
zwPBOqohlWz${!Yvcxs*8qCV!pB(P9vxadb(sbW*ddbCwpzGy7kLr6JX?T|Kd2yZ+?
zSQ}`<)Ad8v&zw5l`%G_e(Af~67dG_)F-{rL@2_^~%KlNr;ho|I2K2@a4fOMAvB>l7
zH+z$hn7%Ho8<%ywHr!H;q<WqjA}=FWM&{B&d5KbaJ)POL48tbTALLNXA2U$?$;If2
zA8k!ve!ouR{<obV+gDU;(}ess=_^@+jo+4SN)mQXpRSl3SQxjPiWG~u_HIEMk{>2N
zoH|2BGyLRmFwV8_OdY!BY}RIB!J0EEBmf%MBm)T;HX%KC!p?HU;Bv$u)pA<e;CZ^%
z#O*R{5Z?a%X6i2_tGq~D-)Wo+wd0^|*R5?%w%ej{O-`wUvm{rFbw6oqt_Vk3-lpVL
zZ`-;0>K0_g+T~IyR5H+Q*2+s=9z!X5bA{f(oL2uet!Z#Ni$h-kt*tyO%kw6N<dlv(
z+<Q>Q`S=i!$rEz@G+LIvx1W^x0=6?NGGGqR`ko9|U0R`68a?GS4n!G-qa+q7%idgM
zxa;02cKw|^l|(VMX^pOA%{7dKzQvPO%c~!?D5>Mp$=D3TcWeV6+Y?@&OFlS03PW7G
zrncx?F3oX$>S1@JglXdyh6PJ9eG8r@j>X7XYrk`>V;R#Tll$Tw?V|oKJ$S-uc^i?p
zPyCd3^;la=#U7uWGpw(oXiC4_m-0Sx-IuMFPO_ophmKf$vxlX~>Ruk+q4=WYIy538
zy|w64(t4y9Q{W!^O`VQN4vguOL&3Vu`K<3wJ`|p_^zYtJi9K<zqBF-?$+S#$aXOm&
zHRGD|_o%ilZ{#;?9wU9@V=>~m?kk27Xu^XYTMmyI$tn`*MU|u2ArABXA7?ARaTGD$
zFQR1GpDpr3i|^1|yEX=K?<1tzYc`~%B6tZg!s#__5+1@^*WZUtl6{4|YcmZV6d|#u
zx<76pzjaFUMA$@fv)Z7XRrMHIebsnFkoCjh_x6=IZ}Y4(!2<pATku{15GMz&ChU>R
zdyTl3+irdGaZQCA^w+u0;*J|qtG?c1W_}~b^_c$SU9X4?w`~=#n#1%y!tTST5d6a{
z+LOXI=v2%*n@cHn;-y<%Zx9cD)vl>~oGy%BtK1o7I&$w=^1ClK=jXdOx1NAOi~UGt
zV?H0UN~f7UnZ4wxk-hxI*{Anu*7WJBr^cxs(BYi-TXEn`(q#C&ooZ&8bP}jho$p*T
zZMAK@&YLE<eQ<93N1w^=)(qz^;>N30O=*Rh=AP!1TOLpplRE8$@3r0HrsAM>i3lf|
ziqo+-mWmrH%TkjM4W+y8(hU~yD*HZqcl&v#&vlo%F8T_ksZ97=iHwvx;{$R5a_Xm+
z%a&bzmb~nisvNXZAAg+6(6ULnJ~#3*UnI%ezS$nIcs}F2%i(I$^7&~cw)@|!rOSAc
zoxW?Uq?PWK_jebjBt5e|wy>TQYZgAMX+6o(k_o1gcD9Ql$z3gy#;ZT7qU)t!;@29(
zE1ZNqJ$Xr2<*PcTiZKQE<#Jl2Uwp`-_ABoEZj(gg_>t}0<!a*=h1EQ{`Bkji%A^|b
zluE4+;jIYSD%Myr`iNO9F(_IYPcl4FmlGJ9Xd-73fKXGg)D6lNKDF~x#*YRGOvXJG
zn0zWrfD?E=Y(MT;azj#EIhE|c)1Y}1auZjHGIuJ*Rp9j<a^2sQzpoke8AvQ)CL8G<
zLFiYIm0cK`y`Dm8H`??~Q6ZNDnRRJ6uWcP3<D=)C?m>qZ(xa^xNQ{L%qq*RV@cA6U
zmXoMkc)-kM@2FjN&&eq)=N9dfU06<q3Yn&^0^?N3s>4NEzVO$^S`5^*TuvDo-V%zc
zvH?7)-!=I%2xiY7M}5zdPh$yLU6uEI6wAvn6++HH&HVyQ0peHYyLxCDTXPZXEgMiG
z)t{Wg`NA+o`8wYVWveX4I4-aKlFWYP!EbE}v@=2H3hL5MR<>3kbi*UM2Bw#2x(+8M
z)fRQ{;WvJSZk7xraJLR-U-ZuYs3V1gnH3qi36C3DNB1|m3CZ!r>V^)#eP_TR>sqZJ
zF&e|9R&H@$L&Y_pH8fY-L|YCPBN(-^Oo(DN$c)dp)T+lX6Tzwz=&EA6Bi?C#7cR!5
zg0_P!*z0se9o!8Uq8%7`5-gM}X+M{fkutI&=>AG@fGU<*e0k#~lUj{IVx@LIb4@{g
zAzEKYTb*p&Xc+Hqtgiy2EGfyGU;Ytgkux~oLy{;HNqC%aPtwLepfgP)ujI>wR~|Di
zcxa?bD#cc1;{nQ`px-EGMC0uf`{A6Vmm{_XmbJ(`QbV4LuW1D>R5kM^nHwe2N;Yg&
zTse$Ck9s~8;9I7tR!nw%FKMCT>XdX>XmsdpsCBtt|CrRyV)pfz-3hDQHO3sK=-a6^
z&l!5}(A@jX+jl1>SKm(SLwbPV^`{oTzSoREJFN%b0&`O?mRLm}QsVIk5)PJ8p)T^7
zd*;haKOrcBjc$4Cgq<d`bh|Ij2UUnMlE>Cv%atxDxfioTY1PXOiG=bno;Wz>wAl~8
zRv4S@Zz=MH24CcEWiQXZ7R9b+I3p2ep39(@5Z`u{d0s3LsoBeCqb|*JUo~`vh09l9
zF?m{Gv_LEH^Q?U@NH4T-7_C^5Tm6vWJttEO){9kiH&VgYZyAp&3z%gvBl9;v0tOkm
z+vD4}RyhVE2Mc6k1Q~v4XA0@X+I!f~PZ%z&)p%(AG`beUl;^H0L4QeU$UIy(k8(DF
zEsu>)Fh9m&78YaSV87k1t2VERGAm&IY}%bafeNQL<xbC9Fvtte@3PI!EcY>0DfcZ-
zEIXGdTzzem=_q$5z$j`Zn4|PUe*fd-Vfq;1w?Chi6=+Yv4Dtk8^;PKOlxP+8LY3x~
zjdj%|$Fq8_cr10&54&MqGa9rCoDG?)ug@wczDby^D^EpmDVf+I1lII4?nN_UOdNIR
z>(DW~s@Gz+KI*yY_2e<v^9+A4&wYZ_Q=5ESt^L^KN{qe@O&%g+q9QTu6}q8`uP{@`
zeJTePh}PxTH(lJ6m$u{Mx7O_u#PaBS7pk1IpW=JAqbjh7$_=Zm(d4x&W-8B!<HRSi
z#(d^RU5T)D7;H&b5@)25UP!3F`=Hs@q)0VX&ujT|ByE}b+iR^TnX2Xa120FzUGZN>
zS`|Y!Q7bHm$XR?FShr2mEGV|6S&cmI%ek^gv2w0PB`R+Sw{~gC1Mk8rd;7yWwu}$w
z+B!^3Yy7n4tVd}lOgx5kj=#N)`bI*jM+V<P(~##@y4ub0vw(LX5v<n1xe<B_R9V{2
za&!FDXP0#q)9vM*z;<Na18E(Ywv11;DIij5iMT=P1Fw1Tt?UFvyT>ts>WS8Bw{<ga
zGm!_z1ezo=vc@bF;rm`@?R-bFQN9imif^UTxa9_YS_~~98Bl(@|9Z)ewCgdKxYf1M
zn{+z$%Di;iz5e3gRn@ke)SJtm)@v+g<L)wxN0FVY-oB4y&L4Q4QKjz~w^DAIsc%2b
zFrQ_6-%RN3MV40rvwBxWa<BIWe(czy42I3+7~zd$5`_)yw*3|9GL|UAaWC_9@dIW<
z_jR)*-bKZdeBxvu&r)SHbKM!&bdbELry6?bxEAZ*-=MNC<9E*R%FOw@SIByq4NF68
z#EZ_>=^rGTXMa>}7TNpsX;bPE@9p)og|1GEtF@1O!>92c2s76L|HJxU8ho!LT04)}
zt<uj72JU;vou|3L8=oJNGu|P=pyfeI3i{v=@3}sH#T&#m_Jch&nXUC$Mw0h*pYGw_
z9&)BPJMyR3dadWwVB<s4tq1sro0`jMF=Y`WJZqY(nfMak8>$V7M@E~{0^}HuCZm1f
zjz<h#i|RDd$?x4*9iZ`NhuDIG{W+UND34CG8yB8@-HnUe7nCA3o7p+X-Y#=D`9VOz
z5yZ4SF6l5Sa4m&#*WRpCpUmWZBULm^gzqBfs6AdCZ`fSTt%E6;N=Xi$2!AZg6w!lG
z%FMZqEoAzk`?^rSFFUc_pi3Ys`%2j0!i(ue!rWpMEG~?_-TqXXZgtIQtd`%k5DPsW
zy!_=srQa?~$d}Hp&hAn6>Xob{pE$zqWSy4S`ljG*VW0Kpz)y`AC(gAz8sf*1HYy!X
zvAshwn*>Ghi#IV9=J%P$yS^|eqOMrY0WVDXWP%s^WAp<Y>%c{jN@I2TDaDaj!OZ31
z8`++UZiYfmMql3X9dNdZ7djg>+)B|#{<O$$7p=9>e<6F(kL_wBdtg5-p^Ez9!Ovcu
zZ-Y9W6ZfZ^cJ=eej{BU98(nYrFz%-|te5UKmamY+;12tHQy=P!-9N1O5bJAPgT~dr
z;o2>}d{1{x44S}rKCu7BSD|l}O=nqRDQt|-iqklJ)i4+Dv9kIpdw!6iyR%<sxL|_c
zF`Mou(kk23BXm5sOEc5MegreCNdAE2AvQP4DDw9D;m??uUmrhi!s%8p*v+I)TWAS;
zg)MXCR;nV-d!3tSUCsJe2Yl=CW_s98NQ>-5oZ~P&<#4T~%{Y2~Tk)r!`Nr9%`gZ34
z3Qcvz`!1RuDg3?j-|!j~{vnoA+8$p%)P)H5*X4S5#f(T<EY8#0baSz!#<g$M)pLI=
zeU-=kalHL4*^;oW9d3M&yB0hqSRVe>D)Q@##}5Ks;r;J8o#v8~MI9*ZZm-tQQE@~b
z#gq7Y+4OSV=D9IeDY?&TTCr<P+esCD$mg1q>LIm7*~sm^z7QC_nVME=Dj1~~@?t18
zdNTiRLuIz+M$(>iy#IW&>WKbVWJ0$li)24m7<|t7&hxmZv9|0~(cMhQAdOn}<1EH3
zOf>&fR01APBi#Q=s#Bpm!G7?P8++|*c3TiR%aQiIfPm<1b*zBn`d$_3thz<(ZdB@v
zokrV(`ojX%2Cr}3bAojJK5u=JtnZ!3?~7EIcBb<5@+d`#M0qwx-(Y>c@2o>Ml_rDU
zZ8I*%5%iXs;=W$Gsd%;FstIRzJO(L&&qo-IEXQ~3f8-6J?N`yHe&nhsH)P9np@Z82
zrv!=aURVIhdWZ>(K~bzhZXHU;G86OZ>>Ta~d<*GCksk>bTWvI9I8*Bo7K+*whb11D
z=7s%FpZ53_E|ywT*_PpeoFBY4CwbiPjLpZpvl`hg8rh)OX43XzErW{PXD|Q8$(yYM
z4VIGB1eMPbr=Jw!G`ELLzSMM?=SWu&<}1@ha9^11J0B8yxs+%pGe6f{6254=Oh*}o
zcu(7=`8lyCNq@R{`=pJp{be~?-Xgu9zM7)QlC*)IW}cdImK@C6&R0|rLP<78eO5`B
zkuNm~+U!^^@m?#hyX#51^&<6U+UlHXg<6xL*)+||ttLz!S_#k4elpDIA+MSn`qW<e
zP5*dy?#k_&CACME6Y2w;vis9te1Aqz(|w4JeBiY@xwe#_onN~;Rx^4tRpvN)-kqDj
z&ni(_9O*shUA%U<%Q0|Z&e2qS<Z5N9<Wk<LZnPkDetcXpc2-Cnr`FVBTX{+CV~dfn
z9icR+u#B7~Zm3mAI@a&g^ZCa?H|(w%4p^SQl$6=U&;@-e%yG9%@kyA`tIm&sZcPC<
z!58>)GUmHkL_PRFC5d`ad;FB&puX#-QaB)|c<a*9qY?A5lDVnR5$u^}$4K$ilOUNY
zp6Y}fCq}P|ud69FPz*F+S*&m*)_uawFU82#<E5jftYW)9Dg=FTDeL0_xrkM{3}HSM
z`*gXK3FsEyxq0|P6BqOjRA1Pq66E)~Z};LvL0>q?osW>))2(m&^AJ@3wv=w^M>j}V
z$M%gZB&dg<plHC(IF^6Mhg9M6SEd8QH)i39U*7de1X(W$<;lh@628zeEsvsV-3NEl
z3$QDU8QBjFcLlrMmlSXFeV|!NtYr$dW2Y$1Rez%$^!mA-w;Sc%#CFw!6{)b|_V~+Z
zU)hL8E@by!R$jE3-yeNJa2Pf_EA{AJ;`v*btTnc7TZ4`c#$&y|&mmJ=qk2C4RHC5@
z!Cdv|N!bxy6|xc;Z`_!<PbacehZEe|J08tD=8_z1vl03BOz<*KY=NPt#*g(pu1w~|
zz8m2`l-DE<Nh;4U($n>nCv_Xe$D{i7Y&}~#pQdXjwI))uhsZJuTrYnvGqjc~%M8hY
zSv<VveraHgxwfXJYzqSQ@z9MZH;#m=i4^x1c3mrqxopZ-@7~+rrM)0swzTdU-#%(8
zy0T8Tc)aGwE+(T5qEyq)u2PvKwQ3HjF)udnctS#v#xKf>8Xr4ydFlS^S(#b}`wIz$
z&=96ex#}6w-wb<?R@NR(<e5t}_&oi}lYIPjTWNbW^~?QV6^9&S#m8T$DiP7qYuBf_
z=&03=Hq|$O&KQ`TMy8&Adpt5;!Cm-`$JHS1j7~pmo#A6+XhIkCvNuIfvzW=!1gtuq
ze}->X_Zi<E`P$H&(%?yjmjY=soXi4ggtxYsp;xTkF)%a4s!x`Bbh70e!uL1owJB=p
z4ZO8IkeTZAC+#7#)g@FFbw>lGqLURko!6dgKSkj>ihi46OU+g<MISh$4H^Ur%f<Iy
zH}lOJi?*xloloXk2{>r|B<b9(^Aj}Q-ZwQH-rIMSxQ-AGHxAn=`U=IrsUK>6Rkryq
zN=v2jYM|Qwt>^M&&&rs$sc+zypLrz>(K|=|FJ>opSB9Q;DLjB}YO%bs@9gcZxZBMN
ze0y*5$gp--Bm%5UrJ>(e(3;nt%dvccjM2=&=*y2$&AKj!vJ-8+dkd%z9Rc#{mWRdD
zGoBqwmf_WW6GzUX&UsZQ^Kr)kxhS3u#s#xVzwQ2!7g-bmCO>^=D~9-g`Qpy=q;Y%Y
zPFa1-q8?`inR?t4A<1sVJlR<jBH0tkIm58Y8~tlQU@z#1)SD+gxnlEW8Yp1lK{08S
zIM=gnW^;yAX^#fVc+rz@h}62!<FF($-g-v$`Gj|mq(3=Y=X+=hLFbLOK;)$tABcfQ
zr#Pnsv@4bKIfWT=RWpn@H7<5S7`O)7IqSIw8bh}qDJv=X7-np2s<m}Wy6+F$1*Jyn
zjBUk$X6IEd=9*<paXPUGceQc?f8+CqycA*J9eBz~`)nKW!cXW#WAifIv*ygJ`>+{U
zC#)EjG>eU2&k+trA}8Q2oH3zRgCTA6vp$}mE+?Jt{W@utdp*B3aQaM9qWU;t(ENw5
z!r_J__HNMN+PMq+TRaGg%ZvN`ao(#d4QUCH?=D1z%dM?f8x*}6yrJIqw$}4{;m3ye
z>$xp@VXR%rzfxl_Cj_{ONPE2AczC9D6E*g4Edlm7Y4!IKJF&bkk9D)e+qnQb>+j`$
zO^g#DGu-;E398#!VDT6NAZY4fa6mC0knU{Uhyvm*H%p+L50uVP5V$A;30R16h$dk;
z3<eR0A)#nu=^yK)4@ROOV&Y;*gcwlF$G9kC?QCobV6>PhL|jY^4j5H{g1w#PEu4)b
zmZ(z_HzoU<x*&`~A|T>O6aoeoMnDk|I2?{fgJB3L3?hmW6$2{wqT)ahC{h%V9f;F_
z8a@^d{;kGOlrR2J7GVDpQHZ|3&-m-~;NRc@^a@#w8<yzse+5Cu-GcC&79*!3Pjn<=
z2?z+lNY`0OMP38r0>~{^02X!x@88Ti-fjR8DmZIrqL!kKwXp+49&a$;EkNJJ0%GuO
zycHI2hqD3m{W%RV>bMgnZ6_k6P@?E!KvY|dMNvcn0}6(VibBK?a45nQ3=;>#;b0UB
z4S}KH2qYMgz7e8;A_GV=;&9+4LJXJ&heKd!QE^dI0C=$I-`&N4sA32}4Tl1Vnt}mT
z#SnjsCWZnVBmR8-_l5uRF$K_<Hvprc#C`sW`QKFQ|E8?K#YO)vteC#&#RWoX+Sa^5
z9G~T4ZU*2OWGBp(S$`>|`<_vzd4`l_8X$ANLGPJdCdY96>cI(XQk=%{m7f6*tBfkX
zWBL@uO&0Q^&q_y36!B4JW;SY#%?}y1eaf;QZJiy8$5?xxZ@S?+^vqN6N6OITY|4gb
zcAp)hWmUW4e%t4~@@cU*8(>2%|CWG_&GJ&`smD!$Kl=EP_YQYl_&+yZUL5B95Ww|T
zu?KYdvkD>k>Wg{VvsyRrjk=pck;~J*if{d%n<bwsjHH;8{g5`sr$UOQ?<dnz-Im>E
z<<lTbwn&Tm4)gj_!rORncYqWAti*-26T#CP|A^WsoAZw9qjzavLmugl@K+~j9sDf(
ztiBrl(qOS~4_eh%m+v(3)wjZ<BFWzP26{Bs|KGVyROkMediMJj|Ibw=-cca8{-1YS
z0SCzP#H;w98!W8+XKq_zfD8lfJD|e-mQvcl=dSClhqEIp`9N9%TlibndEv3vASf7y
z1VR7Pz-SZ-fdX5D|6_vz8z(j}?oS(#-hig}zim*s7@*_*!-jzWYpxgy28fRT@IyiY
z-utIb^k4B{0NMQ04~7!|w;vk$uXROH@PF-D6lnj9Ck8|Q&5wY`0DOff{-=wMoezK@
pz@pmD&H$?tkpy^=NEk%UCZg&_z~BkLVJe13BS75T3bz$O{{yD*+f)Dm

literal 0
HcmV?d00001

diff --git a/src/images/input_vector.tex b/src/images/input_vector.tex
index bd2f6dc..8657ad5 100644
--- a/src/images/input_vector.tex
+++ b/src/images/input_vector.tex
@@ -1,19 +1,19 @@
 \begin{tikzpicture}
 \tiny
 
-\node[draw,outer sep=0,minimum width=1.5cm,fill=TealBlue!30]                         (inputchunk0) {a[0:127]};
-\node[draw,outer sep=0,minimum width=1.5cm,fill=TealBlue!30,right=0 of inputchunk0]  (inputchunk1) {a[0:127]};
-\node[draw,outer sep=0,minimum width=1.5cm,fill=RoyalBlue!30,right=0 of inputchunk1] (inputchunk2) {a[128:255]};
-\node[draw,outer sep=0,minimum width=1.5cm,fill=RoyalBlue!30,right=0 of inputchunk2] (inputchunk3) {a[128:255]};
-\node[draw,outer sep=0,minimum width=1.5cm,fill=Blue!30,right=0 of inputchunk3]      (inputchunk4) {a[256:383]};
-\node[draw,outer sep=0,minimum width=1.5cm,fill=Blue!30,right=0 of inputchunk4]      (inputchunk5) {a[256:383]};
+\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=TealBlue!30]                         (inputchunk0) {a[0:15]};
+\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=TealBlue!30,right=0 of inputchunk0]  (inputchunk1) {a[0:15]};
+\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=RoyalBlue!30,right=0 of inputchunk1] (inputchunk2) {a[16:31]};
+\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=RoyalBlue!30,right=0 of inputchunk2] (inputchunk3) {a[16:31]};
+\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=Blue!30,right=0 of inputchunk3]      (inputchunk4) {a[32:47]};
+\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=Blue!30,right=0 of inputchunk4]      (inputchunk5) {a[32:47]};
 
-\node[draw,outer sep=0,minimum width=1.5cm,fill=Green!30,below=0 of inputchunk0]       {Bank 0};
-\node[draw,outer sep=0,minimum width=1.5cm,fill=SpringGreen!30,below=0 of inputchunk1] {Bank 1};
-\node[draw,outer sep=0,minimum width=1.5cm,fill=Green!30,below=0 of inputchunk2]       {Bank 0};
-\node[draw,outer sep=0,minimum width=1.5cm,fill=SpringGreen!30,below=0 of inputchunk3] {Bank 1};
-\node[draw,outer sep=0,minimum width=1.5cm,fill=Green!30,below=0 of inputchunk4]       {Bank 0};
-\node[draw,outer sep=0,minimum width=1.5cm,fill=SpringGreen!30,below=0 of inputchunk5] {Bank 1};
+\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=Green!30,below=0 of inputchunk0]       {Bank 0};
+\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=SpringGreen!30,below=0 of inputchunk1] {Bank 1};
+\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=Green!30,below=0 of inputchunk2]       {Bank 0};
+\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=SpringGreen!30,below=0 of inputchunk3] {Bank 1};
+\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=Green!30,below=0 of inputchunk4]       {Bank 0};
+\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=SpringGreen!30,below=0 of inputchunk5] {Bank 1};
 
 \node[right=of inputchunk5.south east,anchor=east] (inputchunk6) {\normalsize\dots};