diff --git a/src/acronyms.tex b/src/acronyms.tex index 15bb4ee..f4e845c 100644 --- a/src/acronyms.tex +++ b/src/acronyms.tex @@ -151,6 +151,18 @@ short = pCH, long = pseudo channel, } +\DeclareAcronym{blas}{ + short = BLAS, + long = Basic Linear Algebra Subprograms, +} +\DeclareAcronym{gemv}{ + short = GEMV, + long = matrix vector multiply, +} +\DeclareAcronym{gemm}{ + short = GEMM, + long = matrix matrix multiply, +} \DeclareAcronym{tlm}{ short = TLM, long = transaction-level modeling, diff --git a/src/chapters/dram.tex b/src/chapters/dram.tex index dbffc4c..2cd8b0b 100644 --- a/src/chapters/dram.tex +++ b/src/chapters/dram.tex @@ -21,7 +21,7 @@ Because the charge stored in each cell is very small, so-called \acp{psa} are ne \begin{figure} \centering - \includegraphics{images/psa} + \includegraphics[width=\linewidth]{images/psa} \caption[\ac{psa} of an open bitline architecture]{\ac{psa} of an open bitline architecture \cite{jacob2008} \cite{jung2017a}.} \label{img:psa} \end{figure} @@ -38,7 +38,7 @@ The Figure \ref{img:bank} summarizes the basic architecture of a single storage \begin{figure} \centering - \includegraphics{images/bank} + \includegraphics[width=\linewidth]{images/bank} \caption[Architecture of a single DRAM device]{Architecture of a single DRAM device \cite{jung2017a}.} \label{img:bank} \end{figure} @@ -102,7 +102,7 @@ What differentiates \ac{hbm} from other types of memory is its \ac{sip} approach Several \ac{dram} dies are stacked on top of each other and connected with \acp{tsv} to form a cube of memory dies consisting of many layers and a buffer die at the bottom, as shown in Figure \ref{img:sip}. 
\begin{figure} \centering - \includegraphics[width=0.7\linewidth]{images/sip} + \includegraphics[width=0.8\linewidth]{images/sip} \caption[Cross-section view of an \ac{hbm} \ac{sip}]{Cross-section view of a \ac{hbm} \ac{sip} \cite{lee2021}.} \label{img:sip} \end{figure} @@ -123,7 +123,7 @@ In the center of the die, the \acp{tsv} connect to the next die above or the pre \begin{figure} \centering - \includegraphics[width=0.7\linewidth]{images/hbm} + \includegraphics[width=0.8\linewidth]{images/hbm} \caption[\aca{hbm} memory die architecture]{\aca{hbm} memory die architecture \cite{lee2021}} \label{img:hbm} \end{figure} diff --git a/src/chapters/pim.tex b/src/chapters/pim.tex index 2d766da..fc74c7c 100644 --- a/src/chapters/pim.tex +++ b/src/chapters/pim.tex @@ -5,9 +5,83 @@ % wird seit 70ern diskutiert... % durch DNNs neuer Aufwind... -\subsection{Applicable Problems} -\label{sec:pim_problems} +\subsection{Applicable Workloads} +\label{sec:pim_workloads} +As already discussed in Section \ref{sec:introduction}, \ac{pim} is a good fit for accelerating memory-bound workloads. +In contrast, compute-bound workloads tend to have high data reuse and can make extensive use of the on-chip cache, and therefore do not need to use the full memory bandwidth. +For such workloads, \ac{pim} is of only limited use. + +Many layers of modern \acp{dnn} can be expressed as a matrix-vector multiplication. +The layer inputs can be represented as a vector and the model weights can be viewed as a matrix, where the number of columns is equal to the size of the input vector and the number of rows is equal to the size of the output vector. +Pairwise multiplication of the input vector with a row of the matrix, followed by summation of the products, can be used to calculate an entry of the output vector. +This process is illustrated in Figure \ref{img:dnn}. 
+ +\begin{figure} + \centering + \begin{tikzpicture} + \node[circle,thick,draw=red!60,fill=blue!20,minimum size=5mm,anchor=center] (inode0) at (0,0) {$i_0$}; + \node[circle,thick,draw=red!60,fill=blue!30,minimum size=5mm] (inode1) [below of=inode0] {$i_1$}; + \node[circle,thick,draw=red!60,fill=blue!40,minimum size=5mm] (inode2) [below of=inode1] {$i_2$}; + \node[circle,thick,draw=red!60,fill=blue!50,minimum size=5mm] (inode3) [below of=inode2] {$i_3$}; + + \node[circle,draw=black,fill=ForestGreen!20,minimum size=5mm,anchor=center] (onode0) at (2cm,0.5cm) {$o_0$}; + \node[circle,thick,draw=red!60,fill=ForestGreen!30,minimum size=5mm] (onode1) [below of=onode0] {$o_1$}; + \node[circle,draw=black,fill=ForestGreen!40,minimum size=5mm] (onode2) [below of=onode1] {$o_2$}; + \node[circle,draw=black,fill=ForestGreen!50,minimum size=5mm] (onode3) [below of=onode2] {$o_3$}; + \node[circle,draw=black,fill=ForestGreen!60,minimum size=5mm] (onode4) [below of=onode3] {$o_4$}; + + \draw (inode0.east) to (onode0.west); + \draw (inode1.east) to (onode0.west); + \draw (inode2.east) to (onode0.west); + \draw (inode3.east) to (onode0.west); + + \draw (inode0.east) to (onode2.west); + \draw (inode1.east) to (onode2.west); + \draw (inode2.east) to (onode2.west); + \draw (inode3.east) to (onode2.west); + + \draw (inode0.east) to (onode3.west); + \draw (inode1.east) to (onode3.west); + \draw (inode2.east) to (onode3.west); + \draw (inode3.east) to (onode3.west); + + \draw (inode0.east) to (onode4.west); + \draw (inode1.east) to (onode4.west); + \draw (inode2.east) to (onode4.west); + \draw (inode3.east) to (onode4.west); + + \draw[red!60,thick] (inode0.east) to (onode1.west); + \draw[red!60,thick] (inode1.east) to (onode1.west); + \draw[red!60,thick] (inode2.east) to (onode1.west); + \draw[red!60,thick] (inode3.east) to (onode1.west); + + \matrix (matrix) [matrix of nodes,left delimiter=(,right delimiter=),right of=onode2,node distance=4cm] { + $w_{0,0}$ & $w_{0,1}$ & $w_{0,2}$ & 
$w_{0,3}$ \\ + $w_{1,0}$ & $w_{1,1}$ & $w_{1,2}$ & $w_{1,3}$ \\ + $w_{2,0}$ & $w_{2,1}$ & $w_{2,2}$ & $w_{2,3}$ \\ + $w_{3,0}$ & $w_{3,1}$ & $w_{3,2}$ & $w_{3,3}$ \\ + $w_{4,0}$ & $w_{4,1}$ & $w_{4,2}$ & $w_{4,3}$ \\ + }; + + \node[draw,thick,red!60,rounded corners,inner sep=0,fit=(matrix-2-1) (matrix-2-4)] {}; + + \node (prod) [right of=matrix,node distance=2.6cm] {$*$}; + + \matrix (vector) [matrix of nodes,left delimiter=(,right delimiter=),right of=prod] { + $i_{0}$ \\ + $i_{1}$ \\ + $i_{2}$ \\ + $i_{3}$ \\ + }; + + \node (eq) [right of=vector,node distance=1.2cm] {$=$}; + \end{tikzpicture} + \caption[\ac{dnn} layer expressed as a matrix-vector multiplication]{A \ac{dnn} layer expressed as a matrix-vector multiplication \cite{he2020}.} + \label{img:dnn} +\end{figure} + +Such an operation, defined in the widely used \ac{blas} library \cite{blas1979}, is also known as a \acs{gemv} routine. % hier matrixoperationen für dnns beschreiben % memory-boundness % BLAS kernel und so weiter... diff --git a/src/images/sip.pdf b/src/images/sip.pdf index b96a62b..905db23 100644 Binary files a/src/images/sip.pdf and b/src/images/sip.pdf differ diff --git a/src/index.tex b/src/index.tex index 46f1061..368f9b1 100644 --- a/src/index.tex +++ b/src/index.tex @@ -23,6 +23,8 @@ \usepackage{bytefield} % Configurations +\usetikzlibrary{matrix} +\usetikzlibrary{fit} \setlength\textheight{24cm} \setkomafont{paragraph}{\footnotesize} \numberwithin{table}{section}