diff --git a/src/acronyms.tex b/src/acronyms.tex index e0ada98..62ff78e 100644 --- a/src/acronyms.tex +++ b/src/acronyms.tex @@ -1,12 +1,48 @@ \DeclareAcronym{dnn}{ short = DNN, - long = Deep Neural Network, + long = deep neural network, +} +\DeclareAcronym{cnn}{ + short = CNN, + long = convolutional neural network, +} +\DeclareAcronym{mlp}{ + short = MLP, + long = multi-layer perceptron, +} +\DeclareAcronym{rnn}{ + short = RNN, + long = recurrent neural network, } \DeclareAcronym{llm}{ short = LLM, - long = Large Language Model, + long = large language model, +} +\DeclareAcronym{ai}{ + short = AI, + long = artificial intelligence, +} +\DeclareAcronym{gpu}{ + short = GPU, + long = graphics processing unit, +} +\DeclareAcronym{tpu}{ + short = TPU, + long = tensor processing unit, +} +\DeclareAcronym{dram}{ + short = DRAM, + long = dynamic random-access memory, +} +\DeclareAcronym{hbm}{ + short = HBM, + long = High Bandwidth Memory, +} +\DeclareAcronym{pim}{ + short = PIM, + long = processing-in-memory, } \DeclareAcronym{tlm}{ short = TLM, - long = Transaction Level Modeling, + long = transaction level modeling, } diff --git a/src/chapters/energy_chart.tex b/src/chapters/energy_chart.tex deleted file mode 100644 index d69dafe..0000000 --- a/src/chapters/energy_chart.tex +++ /dev/null @@ -1,41 +0,0 @@ -% This file was created with tikzplotlib v0.10.1. 
-\begin{tikzpicture} - -\definecolor{darkgray176}{RGB}{176,176,176} -\definecolor{steelblue}{RGB}{70,130,180} -\definecolor{tomato}{RGB}{255,99,71} - -\begin{axis}[ -log basis y={10}, -tick align=outside, -tick pos=left, -x grid style={darkgray176}, -xmajorgrids, -xmin=2010, xmax=2050, -xtick style={color=black}, -y grid style={darkgray176}, -ymajorgrids, -ymin=1e+16, ymax=1e+22, -ymode=log, -ytick style={color=black} -] -\addplot [semithick, tomato] -table {% -2010 5e+20 -2050 1e+21 -}; -\addplot [semithick, steelblue] -table {% -2010 5e+17 -2020 5e+18 -2030 4e+19 -}; -\addplot [semithick, steelblue, dashed] -table {% -2030 4e+19 -2040 1.1e+20 -2050 1.2e+20 -}; -\end{axis} - -\end{tikzpicture} diff --git a/src/chapters/introduction.tex b/src/chapters/introduction.tex index 8a6e70a..7a51aea 100644 --- a/src/chapters/introduction.tex +++ b/src/chapters/introduction.tex @@ -6,18 +6,50 @@ An important compound of these models make use of \acp{dnn}, which are a type of Consequently, \acp{dnn} make it possible to tackle many new classes of problems that were previously beyond the reach of conventional algorithms. However, the ever-increasing use of these technologies poses new challenges for hardware architectures, as the energy required to train and run these models reaches unprecedented levels. -Recently published numbers approximate that the development and training of Meta's LLaMA model over a period of about 5 months consumed around $\qty{2638}{\mega\watt\hour}$ of electrical energy and caused a total emission of $\qty{1015}{tCO_2eq}$ \cite{touvron2023}. -As these numbers are expected to increase in the future, it is clear that the energy footprint current deployment of artificial intelligence applications is not sustainable \cite{blott2023}. 
-In a more general view, the energy demand of computing for new applications continues to grow exponentially, doubling about every two years, while the world's energy production grows only linearly, at about $\qty{2}{\percent}$ per year \cite{src2021}. +Recently published numbers approximate that the development and training of Meta's LLaMA model over a period of about 5 months consumed around $\qty{2638}{\mega\watt\hour}$ of electrical energy and caused a total emission of $\qty{1015}{tCO_2eq}$ \cite{touvron2023}. +As these numbers are expected to increase in the future, it is clear that the energy footprint of current deployment of \ac{ai} applications is not sustainable \cite{blott2023}. + + +In a more general view, the energy demand of computing for new applications continues to grow exponentially, doubling about every two years, while the world's energy production only grows linearly, at about $\qty{2}{\percent}$ per year \cite{src2021}. This dramatic increase in energy consumption is due to the fact that while the energy efficiency of compute processor units has continued to improve, the ever-increasing demand for computing is outpacing this progress. In addition, Moore's Law is slowing down as further device scaling approaches physical limits. -% TODO move in correct directory -\input{chapters/energy_chart} +\begin{figure}[!ht] + \centering + \input{plots/energy_chart} + \caption[Total energy of computing]{Total energy of computing \cite{src2021}} + \label{plt:enery_chart} +\end{figure} The exponential grow in compute energy will eventually be constrained by market dynamics, flattening the energy curve and making it impossible to meet future computing demands. -It is therefore required to achieve radical improvements in energy efficiency to avoid this scenario. +It is therefore required to achieve radical improvements in energy efficiency in order to avoid such a scenario. 
-% -> effizierntere systeme -% diskussion bezieht sich vor allem auf prozessoren -% -> muss vor allem memory beachten, movement cost diagram +In recent years, domain-specific accelerators, such as \acp{gpu} or \acp{tpu} have become very popular, as they provide orders of magnitude higher performance and energy efficiency for \ac{ai} applications \cite{kwon2021}. +However, research must also consider the off-chip memory---the data movement between the computation unit and the \ac{dram} has a high cost, as fetching operands costs more than doing the computation on them. +While performing a double precision floating point operation on a $\qty{28}{\nano\meter}$ technology might consume an energy of about $\qty{20}{\pico\joule}$, fetching the operands from \ac{dram} consumes almost 3 orders of magnitude more energy at about $\qty{16}{\nano\joule}$ \cite{dally2010}. + +Furthermore, many types of \ac{dnn} used for language and speech processing such as \acp{rnn}, \acp{mlp} and some layers of \acp{cnn} are severely limited by the memory-bandwidth that the \ac{dram} can provide, in contrast to compute-intensive workloads such as visual processing \cite{he2020}. +Such workloads are referred to as \textit{memory-bound}. + +\begin{figure}[!ht] + \centering + \input{plots/roofline} + \caption[Roofline model of GPT revisions]{Roofline model of GPT revisions \cite{ivobolsens2023}} + \label{plt:roofline} +\end{figure} + +In the past, specialized types of \ac{dram} such as \ac{hbm} have been able to meet high bandwidth requirements. +However, recent \ac{ai} technologies require even greater bandwidth than \ac{hbm} can provide \cite{kwon2021}. + +All things considered, to meet the need for energy-efficient computing systems, which are increasingly becoming memory-bound, new approaches to computing are required. +This has led researchers to reconsider past \ac{pim} architectures and advance them further \cite{lee2021}.
+\Ac{pim} integrates computational logic into the \ac{dram} itself, to exploit minimal data movement cost and extensive internal data parallelism \cite{sudarshan2022}. + +This work analyzes various \ac{pim} architectures, identifies the challenges of integrating them into state-of-the-art \acp{dram}, examines the changes required in the way applications lay out their data in memory, and explores a \ac{pim} implementation from one of the leading \ac{dram} vendors. +The remainder of this work is structured as follows: +Section~\ref{sec:dram} gives a brief overview of the architecture of \acp{dram}, in particular that of \ac{hbm}. +In Section~\ref{sec:pim}, various types of \ac{pim} architectures are presented, with some concrete examples discussed in detail. +Section~\ref{sec:vp} is an introduction to virtual prototyping and system-level hardware simulation. +After explaining the necessary prerequisites, Section~\ref{sec:implementation} implements a concrete \ac{pim} architecture in software and provides a development library that applications can use to take advantage of in-memory processing. +Section~\ref{sec:results} demonstrates the possible performance enhancement of \ac{pim} by simulating a typical neural-network inference. +Finally, Section~\ref{sec:conclusion} summarizes the findings and identifies future improvements in \ac{pim} architectures. diff --git a/src/doc.bib b/src/doc.bib index a69af0c..1019652 100644 --- a/src/doc.bib +++ b/src/doc.bib @@ -122,6 +122,14 @@ file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/7M7QNRVN/He et al.
- 2020 - Newton A DRAM-maker’s Accelerator-in-Memory (AiM).pdf} } +@inproceedings{ivobolsens2023, + title = {Scalable {{AI Architectures}} for {{Edge}} and {{Cloud}}}, + booktitle = {{{HiPEAC23}}}, + author = {{Ivo Bolsens}}, + year = {2023}, + month = jan +} + @book{jacob2008, title = {Memory Systems: {{Cache}}, {{DRAM}}, {{Disk}}}, shorttitle = {Memory Systems}, @@ -450,6 +458,5 @@ urldate = {2024-01-23}, abstract = {We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community.}, archiveprefix = {arxiv}, - keywords = {Computer Science - Computation and Language}, file = {/home/derek/Nextcloud/Verschiedenes/Zotero/storage/MGQYNDPQ/Touvron et al. 
- 2023 - LLaMA Open and Efficient Foundation Language Mode.pdf;/home/derek/Nextcloud/Verschiedenes/Zotero/storage/YDAT8K7L/2302.html} } diff --git a/src/index.tex b/src/index.tex index 41d1d4d..d903fb0 100644 --- a/src/index.tex +++ b/src/index.tex @@ -2,7 +2,7 @@ \usepackage[small,bf,hang]{caption} \usepackage[english]{babel} \usepackage{wrapfig} -\usepackage{xcolor} +\usepackage[usenames,dvipsnames]{xcolor} \usepackage{graphicx} \usepackage{amsmath} \usepackage{amssymb} @@ -107,6 +107,7 @@ % \clearpage % List of Abbreviations +% TODO näher beieinander und serifen für abkürzungen \begingroup \phantomsection \addcontentsline{toc}{section}{List of Abbreviations} diff --git a/src/plots/energy_chart.tex b/src/plots/energy_chart.tex new file mode 100644 index 0000000..b7cc462 --- /dev/null +++ b/src/plots/energy_chart.tex @@ -0,0 +1,50 @@ +\begin{tikzpicture} + + \definecolor{darkgray176}{RGB}{176,176,176} + \definecolor{steelblue}{RGB}{70,130,180} + \definecolor{tomato}{RGB}{255,99,71} + + \begin{axis}[ + width=12cm, + height=8cm, + axis background/.style={fill=gray!8}, + axis line style={white}, + tick style={draw=none}, + grid=both, + grid style={white}, + ymode=log, + log basis y={10}, + xtick={2010,2020,2030,2040,2050}, + ytick={1e16,1e18,1e20,1e22}, + xticklabel style={/pgf/number format/1000 sep=}, + ylabel={Compute Energy in $\si{\joule\per Year}$}, + xmin=2010, + xmax=2050, + ymin=5e15, + ymax=2e22, + legend pos=south east, + legend style={draw=none, fill=none} + ] + \addplot [very thick, tomato] + table { + 2010 5e+20 + 2050 8e+20 + }; + \addlegendentry{world's energy production} + \addplot [very thick, steelblue] + table { + 2010 5e+17 + 2020 5e+18 + 2030 4e+19 + }; + \addlegendentry{total compute energy} + \addplot [very thick, steelblue, dashed] + table { + 2030 4e+19 + 2035 8e+19 + 2040 1.1e+20 + 2050 1.2e+20 + } + node[above,pos=0.5,scale=0.8] {"market dynamics limited" scenario}; + \end{axis} +\end{tikzpicture} diff --git a/src/plots/roofline.tex 
b/src/plots/roofline.tex new file mode 100644 index 0000000..249a8fb --- /dev/null +++ b/src/plots/roofline.tex @@ -0,0 +1,63 @@ +\begin{tikzpicture} + \begin{axis}[ + width=12cm, + height=8cm, + axis lines=middle, + % enlargelimits={abs=5}, + ticks=none, + grid=both, + grid style={white}, + xlabel={Operational Intensity in $\si{OPS \per Byte}$ (log)}, + ylabel={Achievable Performance in $\si{\giga OPS \per\second}$ (log)}, + xlabel near ticks, + ylabel near ticks, + xmin=0, + xmax=100, + ymin=0, + ymax=100 + ] + \addplot [thick, dashed, gray] + table { + 40 0 + 40 100 + }; + + \addplot [very thick, BrickRed] + table { + 0 20 + 40 70 + 100 70 + } + node[above,sloped,pos=0.25,scale=0.8] {\textit{memory-bound}} + node[above,pos=0.75,scale=0.8] {\textit{compute-bound}}; + + \addplot [very thick, dashed, BrickRed] + table { + 40 70 + 60 95 + }; + + \addplot [very thick, dashed, BrickRed] + table { + 0 70 + 40 70 + }; + + \addplot [only marks, mark=o, mark options={color=NavyBlue}] + table { + 60 30 + 65 45 + 55 55 + 50 35 + }; + \node (gpt2) at (57, 43) {GPT-2}; + + \addplot [only marks, mark=o, mark options={color=NavyBlue}] + table { + 28 49 + }; + \node (gpt3) at (28, 43) {GPT-3}; + + \draw[-latex](gpt2)--(gpt3); + \end{axis} +\end{tikzpicture}