diff --git a/src/chapters/conclusion.tex b/src/chapters/conclusion.tex index bf08b4e..3fab4b4 100644 --- a/src/chapters/conclusion.tex +++ b/src/chapters/conclusion.tex @@ -19,6 +19,10 @@ For a better evaluation of the performance gains of \aca{fimdram}, it should be Effects such as the initialization overhead of \aca{fimdram} can only be evaluated in such an environment. Furthermore, the integration of \aca{fimdram} should be extended to \acp{gpu} or \acp{tpu}, so that the comparison can be extended to the deployment of the real \ac{dnn} applications. +Further research could also investigate whether the library-based approach of leveraging \ac{pim} could be replaced by a compiler-based approach. +A special compiler extension would be able to generate the necessary \ac{ld} and \ac{st} instructions by analyzing the data types of the operands. +This extension might also make use of so-called non-temporal instructions that bypass the cache hierarchy on a per-instruction basis. + In conclusion, \ac{pim} is a promising approach to address the future processing needs of \ac{ai} and possibly other applications. Not only the architecture itself has to be considered, but also the integration of \ac{pim} into the applications at the software level. By overcoming these challenges, \ac{pim} could be part of the solution to increase the performance and energy efficiency of future computing platforms. 
diff --git a/src/chapters/pim.tex b/src/chapters/pim.tex index c01dad3..6567486 100644 --- a/src/chapters/pim.tex +++ b/src/chapters/pim.tex @@ -352,7 +352,7 @@ An example with a weight matrix of dimensions (128$\times$8), an input vector of With the processing unit \textit{i}, the number of iterations \textit{j}, the input vector \textit{a} and the weight matrix \textit{w}, the partial sum $psum[i,0:15]$ is calculated as described in \cref{eq:partial_sum}: \begin{equation} -psum[i,0:15]=\sum_{j=0}^{8}(a[j \cdot 16:j \cdot 16+15] \cdot w[i,j \cdot 16:j \cdot 16+15]) +psum[i,0:15]=\sum_{j=0}^{7}(a[j \cdot 16:j \cdot 16+15] \cdot w[i,j \cdot 16:j \cdot 16+15]) \label{eq:partial_sum} \end{equation} @@ -365,7 +365,7 @@ The operation of this concrete \ac{gemv} microkernel is illustrated in \cref{img \begin{figure} \centering \includegraphics[width=0.8\linewidth]{images/memory_layout} - \caption[Procedure to perform a (128)$\times$(128$\times$8) \ac{gemv} operation]{Procedure to perform a (128)$\times$(128$\times$8) \ac{gemv} operation. One cell represents 16 \ac{fp16} elements forming a $\qty{32}{\byte}$ block \cite{kang2022}.} + \caption[Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation]{Procedure to perform a (128$\times$8)$\times$(128) \ac{gemv} operation. 
One cell represents 16 \ac{fp16} elements, forming a $\qty{32}{\byte}$ block~\cite{kang2022}.} \label{img:memory_layout} \end{figure} diff --git a/src/images/input_vector.tex b/src/images/input_vector.tex index 8657ad5..f52701f 100644 --- a/src/images/input_vector.tex +++ b/src/images/input_vector.tex @@ -1,19 +1,25 @@ \begin{tikzpicture} \tiny -\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=TealBlue!30] (inputchunk0) {a[0:15]}; -\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=TealBlue!30,right=0 of inputchunk0] (inputchunk1) {a[0:15]}; -\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=RoyalBlue!30,right=0 of inputchunk1] (inputchunk2) {a[16:31]}; -\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=RoyalBlue!30,right=0 of inputchunk2] (inputchunk3) {a[16:31]}; -\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=Blue!30,right=0 of inputchunk3] (inputchunk4) {a[32:47]}; -\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=Blue!30,right=0 of inputchunk4] (inputchunk5) {a[32:47]}; +\definecolor{_darkblue}{RGB}{68, 114, 196} +\definecolor{_blue}{RGB}{91, 155, 213} +\definecolor{_green}{RGB}{112, 173, 71} +\definecolor{_orange}{RGB}{237, 125, 49} +\definecolor{_yellow}{RGB}{255, 192, 0} -\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=Green!30,below=0 of inputchunk0] {Bank 0}; -\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=SpringGreen!30,below=0 of inputchunk1] {Bank 1}; -\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=Green!30,below=0 of inputchunk2] {Bank 0}; -\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=SpringGreen!30,below=0 of inputchunk3] {Bank 1}; -\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=Green!30,below=0 of inputchunk4] {Bank 0}; -\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=SpringGreen!30,below=0 of inputchunk5] {Bank 1}; 
+\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=_orange] (inputchunk0) {a[0:15]}; +\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=_orange,right=0 of inputchunk0] (inputchunk1) {a[0:15]}; +\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=_yellow,right=0 of inputchunk1] (inputchunk2) {a[16:31]}; +\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=_yellow,right=0 of inputchunk2] (inputchunk3) {a[16:31]}; +\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=_green, right=0 of inputchunk3] (inputchunk4) {a[32:47]}; +\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=_green, right=0 of inputchunk4] (inputchunk5) {a[32:47]}; + +\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=_darkblue!80,below=0 of inputchunk0] {Bank 0}; +\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=_blue!80, below=0 of inputchunk1] {Bank 1}; +\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=_darkblue!80,below=0 of inputchunk2] {Bank 0}; +\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=_blue!80, below=0 of inputchunk3] {Bank 1}; +\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=_darkblue!80,below=0 of inputchunk4] {Bank 0}; +\node[draw,outer sep=0,minimum width=1.5cm,minimum height=4mm,fill=_blue!80, below=0 of inputchunk5] {Bank 1}; \node[right=of inputchunk5.south east,anchor=east] (inputchunk6) {\normalsize\dots};