diff --git a/acronyms.tex b/acronyms.tex new file mode 100644 index 0000000..ff19b35 --- /dev/null +++ b/acronyms.tex @@ -0,0 +1,109 @@ +\DeclareAcronym{pim}{ + short = PIM, + long = processing-in-memory, +} +\DeclareAcronym{dnn}{ + short = DNN, + long = deep neural network, +} +\DeclareAcronym{cnn}{ + short = CNN, + long = convolutional neural network, +} +\DeclareAcronym{mlp}{ + short = MLP, + long = multilayer perceptron, +} +\DeclareAcronym{rnn}{ + short = RNN, + long = recurrent neural network, +} +\DeclareAcronym{blas}{ + short = BLAS, + long = Basic Linear Algebra Subprograms, +} +\DeclareAcronym{gemv}{ + short = GEMV, + long = general matrix-vector multiplication, +} +\DeclareAcronym{dram}{ + short = DRAM, + long = dynamic random access memory, +} +\DeclareAcronym{fimdram}{ + short = PIM-HBM, + alt = FIMDRAM, + long = Function-In-Memory DRAM, +} +\DeclareAcronym{hbm2}{ + short = HBM2, + long = High Bandwidth Memory 2, +} +\DeclareAcronym{simd}{ + short = SIMD, + long = single-instruction multiple-data, +} +\DeclareAcronym{pch}{ + short = pCH, + long = pseudo channel, +} +\DeclareAcronym{fpu}{ + short = FPU, + long = floating-point unit, +} +\DeclareAcronym{fp}{ + short = FP, + long = floating-point, +} +\DeclareAcronym{crf}{ + short = CRF, + long = command register file, +} +\DeclareAcronym{grf}{ + short = GRF, + long = general register file, +} +\DeclareAcronym{srf}{ + short = SRF, + long = scalar register file, +} +\DeclareAcronym{fp16}{ + short = FP16, + long = 16-bit floating-point, +} +\DeclareAcronym{ssa}{ + short = SSA, + long = secondary sense amplifier, +} +\DeclareAcronym{pu}{ + short = PU, + long = processing unit, +} +\DeclareAcronym{sb}{ + short = SB, + long = Single-Bank, +} +\DeclareAcronym{ab}{ + short = AB, + long = All-Bank, +} +\DeclareAcronym{abp}{ + short = AB-PIM, + long = All-Bank-PIM, +} +\DeclareAcronym{act}{ + short = ACT, + long = activate, +} +\DeclareAcronym{pre}{ + short = PRE, + long = precharge, +} +\DeclareAcronym{rd}{ + short = RD,
+ long = read, +} +\DeclareAcronym{wr}{ + short = WR, + long = write, +} diff --git a/references.bib b/references.bib new file mode 100644 index 0000000..24d7363 --- /dev/null +++ b/references.bib @@ -0,0 +1,95 @@ +@misc{blas1979, + title = {{{BLAS}} ({{Basic Linear Algebra Subprograms}})}, + author = {{BLAS}}, + year = {1979}, + urldate = {2024-01-08}, + howpublished = {https://www.netlib.org/blas/} +} + +@inproceedings{he2020, + title = {Newton: {{A DRAM-maker}}'s {{Accelerator-in-Memory}} ({{AiM}}) {{Architecture}} for {{Machine Learning}}}, + shorttitle = {Newton}, + booktitle = {2020 53rd {{Annual IEEE}}/{{ACM International Symposium}} on {{Microarchitecture}} ({{MICRO}})}, + author = {He, Mingxuan and Song, Choungki and Kim, Ilkon and Jeong, Chunseok and Kim, Seho and Park, Il and Thottethodi, Mithuna and Vijaykumar, T. N.}, + year = {2020}, + month = oct, + pages = {372--385}, + publisher = {IEEE}, + address = {Athens, Greece}, + doi = {10.1109/MICRO50266.2020.00040}, + urldate = {2024-01-09}, + isbn = {978-1-72817-383-2}, + keywords = {reviewed}, + file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\7M7QNRVN\He et al. - 2020 - Newton A DRAM-maker’s Accelerator-in-Memory (AiM).pdf} +} + +@inproceedings{kang2022, + title = {An {{FPGA-based RNN-T Inference Accelerator}} with {{PIM-HBM}}}, + booktitle = {Proceedings of the 2022 {{ACM}}/{{SIGDA International Symposium}} on {{Field-Programmable Gate Arrays}}}, + author = {Kang, Shinhaeng and Lee, Sukhan and Kim, Byeongho and Kim, Hweesoo and Sohn, Kyomin and Kim, Nam Sung and Lee, Eojin}, + year = {2022}, + month = feb, + pages = {146--152}, + publisher = {ACM}, + address = {Virtual Event USA}, + doi = {10.1145/3490422.3502355}, + urldate = {2024-01-08}, + abstract = {In this paper, we implemented a world-first RNN-T inference accelerator using FPGA with PIM-HBM that can multiply the internal bandwidth of the memory. 
The accelerator offloads matrix-vector multiplication (GEMV) operations of LSTM layers in RNN-T into PIM-HBM, and PIM-HBM reduces the execution time of GEMV significantly by exploiting HBM internal bandwidth. To ensure that the memory commands are issued in a pre-defined order, which is one of the most important constraints in exploiting PIM-HBM, we implement a direct memory access (DMA) module and change configuration of the on-chip memory controller by utilizing the flexibility and reconfigurability of the FPGA. In addition, we design the other hardware modules for acceleration such as non-linear functions (i.e., sigmoid and hyperbolic tangent), element-wise operation, and ReLU module, to operate these compute-bound RNN-T operations on FPGA. For this, we prepare FP16 quantized weight and MLPerf input datasets, and modify the PCIe device driver and C++ based control codes. On our evaluation, our accelerator with PIM-HBM reduces the execution time of RNN-T by 2.5 {\texttimes} on average with 11.09\% reduced LUT size and improves energy efficiency up to 2.6 {\texttimes} compared to the baseline.}, + isbn = {978-1-4503-9149-8}, + langid = {english}, + keywords = {reviewed}, + file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YPD3XGJ6\Kang et al. 
- 2022 - An FPGA-based RNN-T Inference Accelerator with PIM.pdf} +} + +@inproceedings{kwon2021, + title = {25.4 {{A}} 20nm {{6GB Function-In-Memory DRAM}}, {{Based}} on {{HBM2}} with a 1.{{2TFLOPS Programmable Computing Unit Using Bank-Level Parallelism}}, for {{Machine Learning Applications}}}, + booktitle = {2021 {{IEEE International Solid- State Circuits Conference}} ({{ISSCC}})}, + author = {Kwon, Young-Cheon and Lee, Suk Han and Lee, Jaehoon and Kwon, Sang-Hyuk and Ryu, Je Min and Son, Jong-Pil and Seongil, O and Yu, Hak-Soo and Lee, Haesuk and Kim, Soo Young and Cho, Youngmin and Kim, Jin Guk and Choi, Jongyoon and Shin, Hyun-Sung and Kim, Jin and Phuah, BengSeng and Kim, HyoungMin and Song, Myeong Jun and Choi, Ahn and Kim, Daeho and Kim, SooYoung and Kim, Eun-Bong and Wang, David and Kang, Shinhaeng and Ro, Yuhwan and Seo, Seungwoo and Song, JoonHo and Youn, Jaeyoun and Sohn, Kyomin and Kim, Nam Sung}, + year = {2021}, + month = feb, + pages = {350--352}, + publisher = {IEEE}, + address = {San Francisco, CA, USA}, + doi = {10.1109/ISSCC42613.2021.9365862}, + urldate = {2024-01-08}, + isbn = {978-1-72819-549-0}, + langid = {english}, + keywords = {reviewed}, + file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\UMUTRR6K\Kwon et al. 
- 2021 - 25.4 A 20nm 6GB Function-In-Memory DRAM, Based on .pdf} +} + +@inproceedings{lee2021, + title = {Hardware {{Architecture}} and {{Software Stack}} for {{PIM Based}} on {{Commercial DRAM Technology}} : {{Industrial Product}}}, + shorttitle = {Hardware {{Architecture}} and {{Software Stack}} for {{PIM Based}} on {{Commercial DRAM Technology}}}, + booktitle = {2021 {{ACM}}/{{IEEE}} 48th {{Annual International Symposium}} on {{Computer Architecture}} ({{ISCA}})}, + author = {Lee, Sukhan and Kang, Shin-haeng and Lee, Jaehoon and Kim, Hyeonsu and Lee, Eojin and Seo, Seungwoo and Yoon, Hosang and Lee, Seungwon and Lim, Kyounghwan and Shin, Hyunsung and Kim, Jinhyun and Seongil, O and Iyer, Anand and Wang, David and Sohn, Kyomin and Kim, Nam Sung}, + year = {2021}, + month = jun, + pages = {43--56}, + publisher = {IEEE}, + address = {Valencia, Spain}, + doi = {10.1109/ISCA52012.2021.00013}, + urldate = {2024-01-08}, + abstract = {Emerging applications such as deep neural network demand high off-chip memory bandwidth. However, under stringent physical constraints of chip packages and system boards, it becomes very expensive to further increase the bandwidth of off-chip memory. Besides, transferring data across the memory hierarchy constitutes a large fraction of total energy consumption of systems, and the fraction has steadily increased with the stagnant technology scaling and poor data reuse characteristics of such emerging applications. To cost-effectively increase the bandwidth and energy efficiency, researchers began to reconsider the past processing-in-memory (PIM) architectures and advance them further, especially exploiting recent integration technologies such as 2.5D/3D stacking. Albeit the recent advances, no major memory manufacturer has developed even a proof-of-concept silicon yet, not to mention a product. 
This is because the past PIM architectures often require changes in host processors and/or application code which memory manufacturers cannot easily govern. In this paper, elegantly tackling the aforementioned challenges, we propose an innovative yet practical PIM architecture. To demonstrate its practicality and effectiveness at the system level, we implement it with a 20nm DRAM technology, integrate it with an unmodified commercial processor, develop the necessary software stack, and run existing applications without changing their source code. Our evaluation at the system level shows that our PIM improves the performance of memory-bound neural network kernels and applications by 11.2{\texttimes} and 3.5{\texttimes}, respectively. Atop the performance improvement, PIM also reduces the energy per bit transfer by 3.5{\texttimes}, and the overall energy efficiency of the system running the applications by 3.2{\texttimes}.}, + isbn = {978-1-66543-333-4}, + langid = {english}, + keywords = {reviewed}, + file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\YWUR6TWQ\Lee et al. - 2021 - Hardware Architecture and Software Stack for PIM B.pdf} +} + +@incollection{sudarshan2022, + title = {A {{Critical Assessment}} of {{DRAM-PIM Architectures}} - {{Trends}}, {{Challenges}} and {{Solutions}}}, + booktitle = {Embedded {{Computer Systems}}: {{Architectures}}, {{Modeling}}, and {{Simulation}}}, + author = {Sudarshan, Chirag and Sadi, Mohammad Hassani and Steiner, Lukas and Weis, Christian and Wehn, Norbert}, + editor = {Orailoglu, Alex and Reichenbach, Marc and Jung, Matthias}, + year = {2022}, + volume = {13511}, + pages = {362--379}, + publisher = {Springer International Publishing}, + address = {Cham}, + doi = {10.1007/978-3-031-15074-6_23}, + urldate = {2024-01-21}, + isbn = {978-3-031-15073-9 978-3-031-15074-6}, + langid = {english}, + file = {C:\Users\christ\Nextcloud2\Verschiedenes\Zotero\storage\73HULZKB\Sudarshan et al. 
- 2022 - A Critical Assessment of DRAM-PIM Architectures - .pdf} +} diff --git a/samplepaper.tex b/samplepaper.tex index 13e5f96..53a70c1 100644 --- a/samplepaper.tex +++ b/samplepaper.tex @@ -5,6 +5,9 @@ \documentclass[runningheads]{llncs} % \usepackage{graphicx} +\usepackage{siunitx} +\usepackage[nameinlink,capitalize,noabbrev]{cleveref} +\usepackage{acro} % Used for displaying a sample figure. If possible, figure files should % be included in EPS format. % @@ -12,6 +15,9 @@ % to display URLs in blue roman font according to Springer's eBook style: % \renewcommand\UrlFont{\color{blue}\rmfamily} +\sisetup{per-mode = symbol} +\input{acronyms} + \begin{document} % \title{Contribution Title\thanks{Supported by organization x.}} @@ -50,12 +56,13 @@ The abstract should briefly summarize the contents of the paper in % % \section{Introduction} +\label{sec:intro} % TODO Lukas/Matthias Contributions: \begin{itemize} \item First time Full System Simulation of SAMSUNG-PIM \item VP consisting of gem5 and DRAMSys - \item Experimantal verification of VP + \item Experimental verification of VP \end{itemize} % \section{Related Work} @@ -64,6 +71,46 @@ Samsung DRAMSim2 % TODO Derek/Lukas \section{Background DRAM-PIM} % TODO Derek +Many types of \acp{dnn} used for language and speech processing, such as \acp{rnn}, \acp{mlp} and some layers of \acp{cnn}, are severely limited by the memory bandwidth that the \ac{dram} can provide, making them \textit{memory-bound} \cite{he2020}. +As already discussed in \cref{sec:intro}, \ac{pim} is a good fit for accelerating memory-bound workloads with low operational intensity. +In contrast, compute-bound workloads tend to have high data reuse and can make extensive use of the on-chip cache and therefore do not need to utilize the full memory bandwidth. + +Many layers of modern \acp{dnn} can be expressed as a matrix-vector multiplication.
+The layer inputs can be represented as a vector and the model weights can be viewed as a matrix, where the number of columns is equal to the size of the input vector and the number of rows is equal to the size of the output vector. +Pairwise multiplication of the input vector and a row of the matrix is used to calculate an entry of the output vector. +Such an operation, defined in the widely used \ac{blas} library \cite{blas1979}, is also known as a \acs{gemv} routine. +Because one matrix element is only used exactly once in the calculation of the output vector, there is no data reuse of the matrix. +Further, as the weight matrices tend to be too large to fit in the on-chip cache, such a \ac{gemv} operation is deeply memory-bound \cite{he2020}. +As a result, such an operation is a good fit for \ac{pim}. + +Many different \ac{pim} architectures have been proposed by researchers in the past, and more recently real implementations have been presented by hardware vendors. +These proposals differ mainly in the positioning of the processing operation applied, ranging from the analog distribution of capacitor charges at the \ac{dram}'s subarray level to additional processing units at the global I/O level. +Each of these approaches comes with different advantages and disadvantages. +In short, the closer the processing is to the \ac{dram}'s subarray, the higher the energy efficiency and the achievable processing bandwidth. +On the other hand, the integration of the \ac{pim} units inside the bank becomes more difficult as area and power constraints limit the integration \cite{sudarshan2022}. + +One real \ac{pim} implementation by the major \ac{dram} manufacturer Samsung, called \acf{fimdram}, was presented in 2021 \cite{kwon2021,lee2021}.
+\Ac{fimdram} is based on the \ac{hbm2} memory standard, and it integrates 16-wide \ac{simd} engines directly into the memory banks, exploiting bank-level parallelism, while preserving the highly optimized memory subarray \cite{kwon2021}. +A special feature of \aca{fimdram} is that it does not require any changes to components of modern processors, such as the memory controller, i.e., it is agnostic to existing \aca{hbm2} platforms. +Consequently, mode switching is required for the operation of the \acp{pu} in \aca{fimdram}, which makes it less suitable for interleaved \ac{pim} and non-\ac{pim} traffic and for small batch sizes. + +At the heart of \aca{fimdram} lie the \ac{pim} execution units, each of which is shared by two banks of a \ac{pch}. +They include 16 16-bit wide \ac{simd} \acp{fpu}, \acp{crf}, \acp{grf} and \acp{srf} \cite{lee2021}. +The 16-wide \ac{simd} units correspond to the 256-bit prefetch architecture of \aca{hbm2}, where 16 16-bit floating-point operands are passed directly from the \acp{ssa} to the \acp{fpu} from a single memory access. +As all \ac{pim} units operate in parallel, with 16 banks per \ac{pch}, a single memory access loads a total of $8\cdot\qty{256}{\bit}=\qty{2048}{\bit}$ across the 8 \acp{pu} into the \acp{fpu}. +As a result, the theoretical internal bandwidth of \aca{fimdram} is $8\times$ higher than the external bus bandwidth to the host processor. + +\Ac{fimdram} defines three operating modes: +The default \textbf{\ac{sb} mode}, where \aca{fimdram} has identical behavior to normal \aca{hbm2} memory. +To switch to another mode, a specific sequence of \ac{act} and \ac{pre} commands must be sent by the memory controller to specific row addresses. +The \textbf{\ac{ab} mode} is an extension to the \ac{sb} mode where the \ac{pim} execution units allow for concurrent access to half of the \ac{dram} banks.
+This provides $8\times$ more bandwidth than the standard operation mode, which can be used for the initialization of memory regions across all banks. +With another predefined \ac{dram} access sequence, the memory switches to the \textbf{\ac{abp} mode}. +In this mode, a single memory access initiates the concurrent execution of the next instruction across all processing units. +In addition, the I/O circuits of the \ac{dram} are completely disabled in this mode, reducing the power required during \ac{pim} operation. +In both \ac{ab} mode and \ac{abp} mode, the internal bandwidth per \ac{pch} is $8\times$ higher than the external \aca{hbm2} bandwidth of $\qty{16}{\giga\byte\per\second}$, i.e., $\qty{128}{\giga\byte\per\second}$ per \ac{pch}, or $\qty{2}{\tera\byte\per\second}$ in total for 16 \acp{pch}. + + \section{VP} % TODO Derek \section{Results} @@ -71,4 +118,8 @@ \section{Conclusion} % TODO Lukas/Matthias % + +\bibliographystyle{IEEEtran} % TODO change style? +\bibliography{references} + \end{document}