Compare commits

...

10 Commits

Author SHA1 Message Date
a971b7445a Add csv tables 2025-03-24 21:50:28 +01:00
Matthias Jung
c0fc5cedca Update on Overleaf. 2025-03-24 20:48:09 +00:00
aed31f282d Update on Overleaf. 2024-09-14 13:22:32 +00:00
8623e108f6 Update on Overleaf. 2024-08-20 16:44:41 +00:00
Anonymous
1a093a6509 Update on Overleaf. 2024-07-29 13:36:46 +00:00
0741703581 Update on Overleaf. 2024-06-30 13:37:15 +00:00
382c028c78 Update on Overleaf. 2024-06-12 14:35:50 +00:00
304ff3c48a Apply ACM layout 2024-05-24 10:13:39 +02:00
92682a55d6 Update on Overleaf. 2024-05-22 10:29:41 +00:00
Matthias Jung
1e98b3b48e Update on Overleaf. 2024-04-03 20:38:01 +00:00
8 changed files with 482 additions and 144 deletions

View File

@@ -55,7 +55,7 @@
\node[draw,outer sep=0,minimum width=25mm,minimum height=6mm,fill=lightgray,right=0 of b0e5] (b0e6) {$\cdots$};
\node[draw,outer sep=0,minimum width=25mm,minimum height=6mm,fill=_green,right=0 of b0e6] (b0e7) {w[8,112:127]};
\node[minimum width=10cm,minimum height=12mm,below right=0 of b0e4.south west] {$\cdots$};
\node[minimum width=10cm,minimum height=12mm,below right=0 of b0e4.south west] {$\vdots$};
\begin{pgfonlayer}{bank1}
\node[draw,outer sep=0,minimum width=10cm,minimum height=24mm,fill=white,above right=15mm of bank0.south west] (bank1) {};

View File

@@ -1,7 +1,7 @@
\begin{tikzpicture}
\pgfplotstableread[col sep=comma]{plots/speedup_tables/matrix.csv}\csv
\begin{axis}[
width=5cm,
width=0.8\columnwidth,
height=4cm,
ybar=1pt,
bar width = 5pt,

View File

@@ -0,0 +1,5 @@
level,gemv,dnn
X1,8.725110246753701,0.5853017926410354
X2,8.926639006288317,3.6909334536122427
X3,9.010099560986427,5.380703318160134
X4,9.208111243015697,6.012517728019782
1 level gemv dnn
2 X1 8.725110246753701 0.5853017926410354
3 X2 8.926639006288317 3.6909334536122427
4 X3 9.010099560986427 5.380703318160134
5 X4 9.208111243015697 6.012517728019782

View File

@@ -0,0 +1,5 @@
level,vadd,vmul,haxpy
X1,12.912945086743383,10.707228337727948,17.57341416054572
X2,12.657264796496554,10.41017271260676,17.530771651728568
X3,12.858101352840125,10.179728788420332,17.287022013303083
X4,12.5175927651105,10.158740110546228,17.568375657167437
1 level vadd vmul haxpy
2 X1 12.912945086743383 10.707228337727948 17.57341416054572
3 X2 12.657264796496554 10.41017271260676 17.530771651728568
4 X3 12.858101352840125 10.179728788420332 17.287022013303083
5 X4 12.5175927651105 10.158740110546228 17.568375657167437

View File

@@ -1,7 +1,7 @@
\begin{tikzpicture}
\pgfplotstableread[col sep=comma]{plots/speedup_tables/vector.csv}\csv
\begin{axis}[
width=5cm,
width=0.8\columnwidth,
height=4cm,
ybar=1pt,
bar width = 5pt,

View File

@@ -1,7 +1,7 @@
\begin{tikzpicture}
\pgfplotstableread[col sep=comma]{plots/wallclock_time.csv}\csv
\begin{axis}[
width=10cm,
width=0.8\columnwidth,
height=4cm,
ybar=1pt,
bar width = 5pt,

View File

@@ -281,6 +281,7 @@
urldate = {2024-03-20},
langid = {english},
}
@article{kim2016a,
title = {Ramulator: {{A Fast}} and {{Extensible DRAM Simulator}}},
shorttitle = {Ramulator},
@@ -296,11 +297,15 @@
urldate = {2024-03-20},
langid = {english},
}
@misc{rust,
title = {The {{Rust Programming Language}}},
author = {{Rust Foundation}},
year = {2015},
howpublished = {https://www.rust-lang.org/}
}@article{forlin2022,
}
@article{forlin2022,
title = {Sim 2 {{PIM}}: {{A}} Complete Simulation Framework for {{Processing-in-Memory}}},
shorttitle = {Sim 2 {{PIM}}},
author = {Forlin, Bruno E. and others},
@@ -449,3 +454,74 @@
abstract = {Many applications heavily use bitwise operations on large bitvectors as part of their computation. In existing systems, performing such bulk bitwise operations requires the processor to transfer a large amount of data on the memory channel, thereby consuming high latency, memory bandwidth, and energy. In this paper, we describe Ambit, a recently-proposed mechanism to perform bulk bitwise operations completely inside main memory. Ambit exploits the internal organization and analog operation of DRAM-based memory to achieve low cost, high performance, and low energy. Ambit exposes a new bulk bitwise execution model to the host processor. Evaluations show that Ambit significantly improves the performance of several applications that use bulk bitwise operations, including databases.},
archiveprefix = {arxiv},
}
@article{jeong2024,
  title = {{{PipePIM}}: {{Maximizing Computing Unit Utilization}} in {{ML-Oriented Digital PIM}} by {{Pipelining}} and {{Dual Buffering}}},
  shorttitle = {{{PipePIM}}},
  author = {Jeong, Taeyang and others},
  year = {2024},
  journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  pages = {1--1},
  issn = {0278-0070, 1937-4151},
  doi = {10.1109/TCAD.2024.3410842},
  urldate = {2024-06-10},
  abstract = {A digital Processing-in-Memory (PIM) that integrates computing units (CUs) with DRAM banks emerges as a promising technique for accelerating matrix-vector multiplication (MV). However, activating and precharging all banks incur significant overheads in a digital PIM based on conventional DRAM, which is limited to activating only a single subarray in a bank. Moreover, a digital PIM utilizes a vector buffer to store and reuse the input vector. This necessitates repeated buffer writes, incurring substantial overhead for large MV. Consequently, these overheads reduce CU utilization in a digital PIM, degrading the performance. To overcome these issues, we propose PipePIM, which maximizes CU utilization in a digital PIM by pipelining and dual buffering. PipePIM consists of two primary schemes: subarray-level pipelining (SAPI) and a dual vector buffer. They exploit and extend the features of a multitude of activated subarrays (MASA) introduced by subarray-level parallelism (SALP). SAPI enables a digital PIM to perform activation, precharging, and computation on different subarrays in a pipelined manner. Through SAPI, these operations are overlapped, and activation and precharging overheads are hidden. A dual vector buffer employs two vector buffers and manages them as ping-pong buffering, one for computation and another for buffer write simultaneously. To facilitate it, PipePIM proposes a half-division mode (HDM) enabling independent access to two activated subarrays with marginal area increase. We demonstrate the improvements by PipePIM on the state-of-the-art digital PIMs, Newton and HBM-PIM. Our simulation results indicate that the average speedups of Newton and HBM-PIM on MV are 2.16x and 1.74x, respectively.},
  copyright = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html},
  langid = {english},
  keywords = {PIM}
}
@inproceedings{wang2016,
  title = {An {{Overview}} of {{Micron}}'s {{Automata Processor}}},
  booktitle = {Proceedings of the {{Eleventh IEEE}}/{{ACM}}/{{IFIP International Conference}} on {{Hardware}}/{{Software Codesign}} and {{System Synthesis}}},
  author = {Wang, Ke and others},
  year = {2016},
  month = oct,
  pages = {1--3},
  publisher = {ACM},
  address = {Pittsburgh, Pennsylvania},
  doi = {10.1145/2968456.2976763},
  urldate = {2024-08-12},
  isbn = {978-1-4503-4483-8},
  langid = {english},
  keywords = {DRAM,PIM}
}
@article{esmaili-dokht2024a,
  title = {{$\mathcal{O}(n)$} {Key-Value} Sort With Active Compute Memory},
  author = {{Esmaili-Dokht}, Pouya and others},
  year = {2024},
  month = may,
  journal = {IEEE Transactions on Computers},
  volume = {73},
  number = {5},
  pages = {1341--1356},
  issn = {0018-9340, 1557-9956, 2326-3814},
  doi = {10.1109/TC.2024.3371773},
  urldate = {2024-08-12},
  copyright = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/IEEE.html},
  keywords = {DRAM,PIM}
}
@article{li2020,
  author = {Li, Shang and others},
  journal = {IEEE Computer Architecture Letters},
  title = {{DRAMsim3}: A Cycle-Accurate, Thermal-Capable {DRAM} Simulator},
  year = {2020},
  volume = {19},
  number = {2},
  pages = {106--109},
  keywords = {Random access memory;Thermal conductivity;Protocols;Thermal resistance;Computational modeling;Integrated circuit modeling;Three-dimensional displays;DRAM;cycle-accurate;simulation;3D-modeling;thermal modeling},
  doi = {10.1109/LCA.2020.2973991}
}
@article{finkbeiner2017,
  author = {Finkbeiner, Tim and others},
  journal = {IEEE Micro},
  title = {In-Memory Intelligence},
  year = {2017},
  volume = {37},
  number = {4},
  pages = {30--38},
  keywords = {Random access memory;Computer architecture;VLIW;Vectors;Moore's Law;Computational modeling;Process control;Microprocessors;Memory management;processor in memory;non-Von Neumann;computer architecture;SIMD;vector processing},
  doi = {10.1109/MM.2017.3211117}
}

View File

@@ -1,19 +1,130 @@
% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.20 of 2017/10/04
%
\documentclass[runningheads]{llncs}
%
%%
%% This is file `sample-sigconf.tex',
%% generated with the docstrip utility.
%%
%% The original source files were:
%%
%% samples.dtx (with options: `all,proceedings,bibtex,sigconf')
%%
%% IMPORTANT NOTICE:
%%
%% For the copyright see the source file.
%%
%% Any modified versions of this file must be renamed
%% with new filenames distinct from sample-sigconf.tex.
%%
%% For distribution of the original source see the terms
%% for copying and modification in the file samples.dtx.
%%
%% This generated file may be distributed as long as the
%% original source files, as listed above, are part of the
%% same distribution. (The sources need not necessarily be
%% in the same archive or directory.)
%%
%%
%% Commands for TeXCount
%TC:macro \cite [option:text,text]
%TC:macro \citep [option:text,text]
%TC:macro \citet [option:text,text]
%TC:envir table 0 1
%TC:envir table* 0 1
%TC:envir tabular [ignore] word
%TC:envir displaymath 0 word
%TC:envir math 0 word
%TC:envir comment 0 0
%%
%%
%% The first command in your LaTeX source must be the \documentclass
%% command.
%%
%% For submission and review of your manuscript please change the
%% command to \documentclass[manuscript, screen, review]{acmart}.
%%
%% When submitting camera ready or to TAPS, please change the command
%% to \documentclass[sigconf]{acmart} or whichever template is required
%% for your publication.
%%
%%
% \documentclass[manuscript, screen, review]{acmart}
% \documentclass[sigconf, review, anonymous]{acmart}
\documentclass[sigconf]{acmart}
%\setcopyright{none}
%%
%% \BibTeX command to typeset BibTeX logo in the docs
\AtBeginDocument{%
\providecommand\BibTeX{{%
Bib\TeX}}}
% TODO...
%% Rights management information. This information is sent to you
%% when you complete the rights form. These commands have SAMPLE
%% values in them; it is your responsibility as an author to replace
%% the commands and values with those provided to you when you
%% complete the rights form.
\copyrightyear{2024}
\acmYear{2024}
\setcopyright{rightsretained}
\acmConference[MEMSYS '24]{The International Symposium on Memory Systems}{September 30-October 3, 2024}{Washington, DC, USA}
\acmBooktitle{The International Symposium on Memory Systems (MEMSYS '24), September 30-October 3, 2024, Washington, DC, USA}
\acmDOI{10.1145/3695794.3695797}
\acmISBN{979-8-4007-1091-9/24/09}
%%
%% Uncomment \acmBooktitle if the title of the proceedings is different
%% from ``Proceedings of ...''!
%%
%%\acmBooktitle{Woodstock '18: ACM Symposium on Neural Gaze Detection,
%% June 03--05, 2018, Woodstock, NY}
% \acmISBN{978-1-4503-XXXX-X/18/06}
%%
%% Submission ID.
%% Use this when submitting an article to a sponsored event. You'll
%% receive a unique submission ID from the organizers
%% of the event, and this ID should be used as the parameter to this command.
%%\acmSubmissionID{123-A56-BU3}
%%
%% For managing citations, it is recommended to use bibliography
%% files in BibTeX format.
%%
%% You can then either use BibTeX with the ACM-Reference-Format style,
%% or BibLaTeX with the acmnumeric or acmauthoryear sytles, that include
%% support for advanced citation of software artefact from the
%% biblatex-software package, also separately available on CTAN.
%%
%% Look at the sample-*-biblatex.tex files for templates showcasing
%% the biblatex styles.
%%
%%
%% The majority of ACM publications use numbered citations and
%% references. The command \citestyle{authoryear} switches to the
%% "author year" style.
%%
%% If you are preparing content for an event
%% sponsored by ACM SIGGRAPH, you must use the "author year" style of
%% citations and references.
%% Uncommenting
%% the next command will enable that style.
%%\citestyle{acmauthoryear}
\usepackage{siunitx}
\usepackage[nameinlink,capitalize,noabbrev]{cleveref}
\usepackage{acro}
\usepackage[usenames,dvipsnames]{xcolor}
% \usepackage[usenames,dvipsnames]{xcolor}
\usepackage{tikz}
\usepackage{circuitikz}
\usepackage{mathdots}
\usepackage{tabularray}
\usepackage{pgfplotstable}
\usepackage{subfig}
\usepackage{csquotes}
\usetikzlibrary{math,perspective,intersections,arrows,arrows.meta}
\usepackage{listing}
\usepackage{minted}
\usepackage{graphicx}
% Used for displaying a sample figure. If possible, figure files should
@@ -23,6 +134,8 @@
% to display URLs in blue roman font according to Springer's eBook style:
% \renewcommand\UrlFont{\color{blue}\rmfamily}
\pgfplotsset{compat=1.8}
\sisetup{per-mode = symbol}
\usetikzlibrary{positioning}
@@ -38,94 +151,166 @@
\begin{document}
%
\title{PIMSys: A Virtual Prototype for Processing in Memory}
%
%\titlerunning{Abbreviated paper title}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{%
Derek Christ\inst{1}%\orcidID{0000-1111-2222-3333}
\and
Lukas Steiner\inst{2}%\orcidID{1111-2222-3333-4444}
\and
Matthias Jung\inst{1,3}%\orcidID{2222--3333-4444-5555}
\and
Norbert Wehn\inst{2}%\orcidID{2222--3333-4444-5555}
}
%
\authorrunning{D. Christ et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{
Fraunhofer IESE, Germany\\
\email{\{firstname.lastname\}@iese.fraunhofer.de}\\
\and
RPTU Kaiserslautern-Landau, Germany\\
\email{\{firstname.lastname\}@rptu.de}\\
\and
JMU Würzburg, Germany\\
\email{m.jung@uni-wuerzburg.de}
}
%
\maketitle
%
\begin{abstract}
Data-driven applications are increasingly central to our information technology society, propelled by AI techniques reshaping various sectors of our economy and society. Despite their transformative potential, these applications demand immense data processing, leading to significant energy consumption primarily in communication and data storage rather than computation. The concept of \ac{pim} offers a solution by processing data within memory, reducing energy overheads associated with data transfer. \Ac{pim} has been an enduring idea, with recent advancements in DRAM test chips integrating \ac{pim} functionality, indicating potential market adoption.
This paper introduces a virtual prototype of Samsung's PIM-HBM architecture, leveraging open-source tools like gem5 and DRAMSys, along with a custom Rust software library facilitating easy utilization of \ac{pim} functionality. Key contributions include the first full-system simulation of PIM-HBM, experimental validation of the virtual platform with benchmarks, and the development of a Rust library enabling \ac{pim} functionality at the software level.
TODO: Benchmark results
\keywords{DRAM \and PIM \and Virtual Platforms}
%%
%% The "author" command and its associated commands are used to define
%% the authors and their affiliations.
%% Of note is the shared affiliation of the first two authors, and the
%% "authornote" and "authornotemark" commands
%% used to denote shared contribution to the research.
\author{Derek Christ}
\email{derek.christ@iese.fraunhofer.de}
\orcid{0009-0005-4234-6362}
\affiliation{%
\institution{Fraunhofer IESE}
\city{Kaiserslautern}
\country{Germany}
}
\author{Lukas Steiner}
\email{lukas.steiner@rptu.de}
\orcid{0000-0003-2677-6475}
\affiliation{%
\institution{RPTU Kaiserslautern-Landau}
\city{Kaiserslautern}
\country{Germany}}
\author{Matthias Jung}
\email{m.jung@uni-wuerzburg.de}
\orcid{0000-0003-0036-2143}
\affiliation{%
\institution{JMU Würzburg}
\city{Würzburg}
\country{Germany}}
\author{Norbert Wehn}
\email{norbert.wehn@rptu.de}
\orcid{0000-0002-9010-086X}
\affiliation{%
\institution{RPTU Kaiserslautern-Landau}
\city{Kaiserslautern}
\country{Germany}}
%%
%% By default, the full list of authors will be used in the page
%% headers. Often, this list is too long, and will overlap
%% other information printed in the page headers. This command allows
%% the author to define a more concise list
%% of authors' names for this purpose.
\renewcommand{\shortauthors}{D. Christ et al.}
%%
\begin{abstract}
Data-driven applications are increasingly central to our information technology society, propelled by AI techniques reshaping various sectors of our economy. Despite their transformative potential, these applications demand immense data processing, leading to significant energy consumption primarily in communication and data storage rather than computation. The concept of \ac{pim} offers a solution by processing data within memory, reducing energy overheads associated with data transfer. \Acs{pim} has been an enduring idea, with recent advancements in DRAM test chips integrating \acs{pim} functionality, indicating potential market adoption.
This paper introduces a virtual prototype of Samsung's PIM-HBM architecture, leveraging open-source tools like gem5 and DRAMSys, along with a custom Rust software library facilitating easy utilization of \acs{pim} functionality. Key contributions include the first gem5 based full-system simulation of PIM-HBM, experimental validation of the virtual platform with benchmarks, and the development of a Rust library enabling \acs{pim} functionality at the software level.
Our benchmarks evaluated speedup for \acs{pim} in the range of \qtyrange{6.0}{17.5}{\times} compared to a respective non-\acs{pim} system for different memory-bound workloads.
\end{abstract}
%
%
%
\begin{CCSXML}
<ccs2012>
<concept>
<concept_id>10010583.10010786.10010809</concept_id>
<concept_desc>Hardware~Memory and dense storage</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10010520.10010521.10010528</concept_id>
<concept_desc>Computer systems organization~Parallel architectures</concept_desc>
<concept_significance>300</concept_significance>
</concept>
<concept>
<concept_id>10010147.10010178</concept_id>
<concept_desc>Computing methodologies~Artificial intelligence</concept_desc>
<concept_significance>300</concept_significance>
</concept>
</ccs2012>
\end{CCSXML}
\ccsdesc[500]{Hardware~Memory and dense storage}
\ccsdesc[300]{Computer systems organization~Parallel architectures}
\ccsdesc[300]{Computing methodologies~Artificial intelligence}
%%
%% Keywords. The author(s) should pick words that accurately describe
%% the work being presented. Separate the keywords with commas.
\keywords{DRAM, PIM, Virtual Platforms}
% TODO?
% \received{20 February 2007}
% \received[revised]{12 March 2009}
% \received[accepted]{5 June 2009}
%%
%% This command processes the author and affiliation and title
%% information and builds the first part of the formatted document.
\maketitle
\section{Introduction}
\label{sec:intro}
% TODO Matthias
Data-driven applications are increasingly becoming the focal point of our information technology society, with AI techniques fundamentally altering various sectors of our society and economy. A common characteristic of these applications is the vast amount of data they require to be captured, stored, and processed. Consequently, many of these applications, e.\,g., \acp{llm} or other artificial intelligence workloads are bound by the memory performance.
Furthermore, a significant portion of energy is consumed by communication and data storage rather than computation. As demonstrated by Jouppi et al.~\cite{jouhyu_21}, in a 7nm process, a 32-bit floating-point multiplication requires \qty{1.31}{\pico\joule}, whereas a 64-bit DRAM memory access demands \qty{1300}{\pico\joule}. This energy is expended in transferring data from memory through the network on chip, arbiters, and various levels of caches. Hence, it would be considerably more energy-efficient to process data where it resides, particularly within the memory itself. In other words, rather than transmitting data to computational units, the computational instructions should be sent to the memory housing the data.
Furthermore, a significant portion of energy is consumed by communication and data storage rather than computation. As demonstrated by Jouppi et al.~\cite{jouhyu_21}, in a 7nm process, a 32-bit floating-point multiplication requires \qty{1.31}{\pico\joule}, whereas a 64-bit DRAM memory access demands \qty{1300}{\pico\joule}. This energy is expended in transferring data from memory through the network on chip, arbiters, and various levels of caches. Hence, it would be considerably more energy-efficient to process data where it resides, particularly within the memory itself. This approach works very well with data-flow oriented applications. In other words, rather than transmitting data to computational units, the computational instructions should be sent to the memory housing the data.
This concept, known as \ac{pim}, has been around for many years. For instance, Stone already proposed it in the 1970s~\cite{sto_70}. Since then, similar to the field of artificial intelligence, this idea has experienced \enquote{summer} and \enquote{winter} periods in research over the past decades. However, recently, different companies have developed DRAM test chips with integrated \ac{pim} functionality, showing promising potential for entry into the commodity market.
This concept, known as \ac{pim}, has been around for many years. For instance, Stone already proposed it in the 1970s~\cite{sto_70}. Since then, similar to the field of artificial intelligence, this idea has experienced \enquote{summer} and \enquote{winter} periods in research over the past decades. However, recently, different companies have developed DRAM test chips with integrated \ac{pim} functionality, showing promising potential for entry into the market.
For instance, UPMEM introduced the first publicly available real-world \ac{pim} architecture~\cite{gomhaj_21}. UPMEM integrates standard DDR4 DIMM-based DRAM with a series of PIM-enabled UPMEM DIMMs containing multiple \ac{pim} chips. Each \ac{pim} chip houses eight \acp{dpu}, each with dedicated access to a 64 MiB memory bank, a 24 KiB instruction memory, and a 64 KiB scratchpad memory. These \acp{dpu} function as multithreaded 32-bit \ac{risc} cores, featuring a complete set of general-purpose registers and a 14-stage pipeline~\cite{gomhaj_21}.
In 2020, SK Hynix, a leading DRAM manufacturer, unveiled its \ac{pim} technology, named Newton, utilizing \ac{hbm}~\cite{he2020}. Unlike UPMEM, Newton integrates small MAC units and buffers into the bank area to mitigate the space and power overhead of a fully programmable processor core. Following SK Hynix's lead, Samsung, another major DRAM manufacturer, announced its own \ac{pim} DRAM implementation named \ac{fimdram} one year later~\cite{lee2021}.
For instance, UPMEM introduced the first publicly available real-world general-purpose \ac{pim} architecture~\cite{gomhaj_21}.
UPMEM integrates standard DDR4 DIMM-based DRAM with a series of PIM-enabled UPMEM DIMMs containing multiple \ac{pim} chips.
Each \ac{pim} chip houses eight \acp{dpu}, each with dedicated access to a 64 MiB memory bank, a 24 KiB instruction memory, and a 64 KiB scratchpad memory.
These \acp{dpu} function as multithreaded 32-bit \ac{risc} cores, featuring a complete set of general-purpose registers and a 14-stage pipeline~\cite{gomhaj_21}.
Even prior to UPMEM, Micron introduced its automata processor \cite{wang2016}.
It features a nondeterministic finite automaton (NFA) inside the DRAM to accelerate certain algorithms.
In 2020, SK Hynix, a leading DRAM manufacturer, unveiled its \ac{pim} technology, named Newton, utilizing \ac{hbm}~\cite{he2020}.
Unlike UPMEM, Newton integrates small MAC units and buffers into the bank area of the DRAM to mitigate the area and power overhead of a fully programmable processor core.
Following SK Hynix's lead, Samsung, another major DRAM manufacturer, announced its own \ac{pim} DRAM implementation named \ac{fimdram} one year later~\cite{lee2021}.
With these new architectures on the horizon, it becomes crucial for system-level designers to assess whether these promising developments can enhance their applications. Furthermore, these emerging hardware architectures necessitate new software paradigms. It remains unclear whether libraries, compilers, or operating systems will effectively manage these new devices at the software level. Therefore, it is imperative to establish comprehensive virtual platforms for these devices, enabling real applications to be tested within a realistic architectural and software platform context.
This paper introduces a virtual prototype of Samsung's \ac{fimdram}, developed using open-source tools such as gem5~\cite{lowahm_20} and the memory simulator \mbox{DRAMSys~\cite{stejun_20}}. Additionally, the virtual prototype is accompanied by a custom Rust software library, simplifying the utilization of \ac{pim} functionality at the software level.
This paper introduces a virtual prototype of Samsung's \ac{fimdram}, developed using open-source tools such as gem5~\cite{lowahm_20} and the DRAM simulator \mbox{DRAMSys~\cite{stejun_20}}. Additionally, the virtual prototype is accompanied by a custom Rust software library, simplifying the utilization of \ac{pim} functionality at the software level.
In summary, this paper makes the following contributions:
\begin{itemize}
\item We propose, to the best of our knowledge, for the first time full system simulation of \ac{fimdram} with a virtual platform consisting of gem5 and DRAMSys
\item We provide an experimental verification of VP with benchmarks
\item We propose a modern Rust library to provide the \ac{pim} functionality up to the software level
\item We propose, to the best of our knowledge, for the first time full-system simulation of \ac{fimdram} with a virtual platform consisting of gem5 and DRAMSys.
\item We provide an experimental verification of the virtual prototype with benchmarks.
\item We propose a modern Rust library to provide the \ac{pim} functionality up to the software level.
\end{itemize}
The paper is structured as follows. Section 2 shows the related work in the area of \ac{pim}-Simulation. Section 3 gives a brief background on the relative \ac{pim}-Architectures, whereas Section 4 explains the proposed \ac{pim} Virtual Platform. The Sections 5 and 6 show experimental simulation setup and the results, which are compared with already published results from \ac{pim} vendors. The paper is finally concluded in Section 7.
Using this novel full-system simulation framework, it is possible to evaluate the effectiveness of \ac{fimdram} for real-world applications in a detailed and realistic manner and to examine the implications of integrating this \ac{pim} solution into these applications.
The paper is structured as follows. Section 2 shows the related work in the area of \ac{pim} simulation. Section 3 gives a brief background on the relative \ac{pim} architectures, whereas Section 4 explains the proposed \ac{pim} virtual platform. The Sections 5 and 6 show experimental simulation setup and the results, which are compared with already published results from \ac{pim} vendors. The paper is finally concluded in Section 7.
%
\section{Related Work}
Several virtual prototypes of \ac{pim} architectures have been object to research in the past.
The authors of \cite{singh2019} and \cite{kim2016a} used Ramulator-PIM, which is based on the processor simulator ZSim \cite{sanchez2013} and the DRAM simulator Ramulator \cite{kim2016a}, to build high-level performance and energy estimation frameworks.
C. Yu et al. \cite{yu2021} introduced MultiPIM, a high-level \ac{pim} simulator capable of simulating parallel \ac{pim} cores, which is also based on Ramulator and ZSim.
However, these three publications focus primarily on \ac{hmc} DRAM, which has seen limited adoption.
The authors of NAPEL~\cite{singh2019} used Ramulator-PIM, which is based on the processor simulator ZSim \cite{sanchez2013} and the DRAM simulator Ramulator \cite{kim2016a}, to build a high-level performance and energy estimation framework.
Yu et al. \cite{yu2021} introduced \mbox{MultiPIM}, a \ac{pim} simulator, which is also based on Ramulator and ZSim, capable of simulating parallel \ac{pim} cores, distributed over a memory network.
However, these publications evaluate the \ac{pim} systems only from a high level of abstraction.
With PIMSim \cite{xu2019}, the authors provide a configurable \ac{pim} simulation framework that enables a full-system simulation of user-specified \ac{pim} logic cores.
The authors of DP-Sim \cite{zhou2021} present a full-stack infrastructure for \ac{pim}, based on a front-end that generates \ac{pim} instructions by instrumenting a host application and executing them in a \ac{pim}-enabled memory model.
Similarly, Sim\textsuperscript{2}PIM \cite{santos2021,forlin2022} uses instrumentation to simulate only the \ac{pim} side of a host application.
The MPU-Sim \cite{xie2022} simulator focuses on general-purpose near-bank processing units based on 3D DRAM technology, while neglecting the data transfers between the host CPU and the \ac{pim} devices.
The authors of DP-Sim~\cite{zhou2021} present a full-stack infrastructure for \ac{pim}, based on a front-end that generates \ac{pim} instructions by instrumenting a host application and executing them in a \ac{pim}-enabled memory model.
In a similar way, Sim\textsuperscript{2}PIM~\cite{santos2021,forlin2022} uses instrumentation to simulate only the \ac{pim} side of a host application.
The MPU-Sim~\cite{xie2022} simulator focuses on general-purpose near-bank processing units based on 3D DRAM technology, while neglecting the data transfers between the host CPU and the \ac{pim} devices.
These instrumentation approaches are less accurate when it comes to integration with the host processor because they primarily focus on simulating the \ac{pim} units.
Recently, the authors of \cite{esmaili-dokht2024a} presented a novel Active Compute Memory (ACM) architecture that allows for key-value sorting within the \ac{dram}.
To investigate the performance and energy improvements, they implemented a virtual prototype based on \mbox{ZSim} and \mbox{DRAMSim3}~\cite{li2020}.
A slightly different approach is taken by PiMulator \cite{mosanu2022}, which does not simulate but emulates \ac{pim} implementations such as RowClone \cite{seshadri2013} or Ambit \cite{seshadri2020} by implementing a soft-model in an FPGA.
Besides research \ac{pim} architectures, there are also virtual prototypes of industry architectures.
In addition to \ac{pim} architectures from research, there are also virtual prototypes of industry architectures.
Very recently, the authors of \cite{hyun2024} introduced uPIMulator, a cycle-accurate simulator that models UPMEM's real-world general-purpose \ac{pim} architecture.
To analyze the potential performance and power impact of Newton, SK Hynix developed a virtual prototype based on the DRAMSim2 \cite{rosenfeld2011} cycle-accurate memory simulator, which models a \ac{hbm2} memory and the extended Newton DRAM protocol. However, DRAMSym2 is more than 10 years old and several orders of magnitude slower than DRAMSys~\cite{steiner2022a}.
The simulated system is compared to two different non-\ac{pim} systems: an ideal non-\ac{pim} host with infinite compute bandwidth and a GPU model of a high-end Titan-V graphics card using a cycle-accurate GPU simulator.
SK Hynix finds that Newton achieves a \qty{54}{\times} speedup over the Titan-V GPU model and a speedup of \qty{10}{\times} for the ideal non-\ac{pim} case, setting a lower bound on the acceleration for every possible non-\ac{pim} architecture.
With PIMSimulator~\cite{shin-haengkang2023}, Samsung provides a virtual prototype of \ac{fimdram}, also based on DRAMSim2.
In addition to its automata processor, Micron introduced another \ac{pim} architecture called In-Memory Intelligence~\cite{finkbeiner2017}.
The new architecture places bit-serial computing elements at the sense amplifier level of the memory array.
Evaluations of In-Memory Intelligence are based on a custom Micron discrete event simulator that implements the hardware models.
Similarly, to analyze the potential performance and power impact of Newton, SK~Hynix developed a virtual prototype based on the DRAMSim2~\cite{rosenfeld2011} cycle-accurate memory simulator, which models a \ac{hbm2} memory and the extended Newton DRAM protocol.
However, \mbox{DRAMSim2} is more than 10 years old and several orders of magnitude slower than DRAMSys~\cite{steiner2022a}.
The simulated system is compared with two different non-\ac{pim} systems: an ideal non-\ac{pim} host with infinite compute bandwidth and a GPU model of a high-end Titan-V graphics card using a cycle-accurate GPU simulator.
SK~Hynix finds that Newton achieves a \qty{54}{\times} speedup over the Titan-V GPU model and a speedup of \qty{10}{\times} for the ideal non-\ac{pim} case, setting a lower bound on the acceleration for every possible non-\ac{pim} architecture.
With \mbox{PIMSimulator~\cite{shin-haengkang2023}}, Samsung provides a virtual prototype of \ac{fimdram}, also based on DRAMSim2.
PIMSimulator offers two simulation modes: it can either accept pre-recorded memory traces or generate very simplified memory traffic using a minimal host processor model that essentially executes only the \ac{pim}-related program regions.
However, neither approach accurately models a complete system consisting of a host processor running a real compiled binary and the memory system that integrates \ac{fimdram}.
As a result, only limited conclusions can be made about the performance impact of \ac{fimdram} and the changes that are required in the application code to support the new architecture.
In Samsung's findings, the simulated \ac{fimdram} system provides a speedup in the range of \qtyrange{2.1}{2.6}{\times} depending on the simulated workload with an average speedup of \qty{2.5}{\times} compared to standard \ac{hbm2} memory.
However, both approaches do not accurately model a complete system consisting of a host processor running a real compiled binary and a memory system that integrates \ac{fimdram}.
As a result, only limited conclusions can be drawn about the performance improvements of \ac{fimdram} and the necessary modifications to the application code to support the new architecture.
In Samsung's findings, the simulated \ac{fimdram} system provides a speedup in the range of \qtyrange{2.1}{2.6}{\times} depending on the simulated workload with an average speedup of \qty{2.5}{\times} compared to the system with standard \ac{hbm2} memory.
Based on both the Newton and \ac{fimdram} architectures, PipePIM~\cite{jeong2024} pipelines the operation of the bank-level processing units, achieving speedups of \qty{2.16}{\times} and \qty{1.74}{\times}, respectively, over the base PIM architectures.
The simulation environment is based on Ramulator, but few details are given about how detailed the host is simulated.
Looking beyond the simulation frameworks presented, this work aims to provide a virtual prototype of an existing \ac{pim} architecture to enable functionally correct full-system simulations: from the integration of the \ac{pim} software stack into the application, over the detailed simulation of a processor running the real compiled binary, to the simulation of a model of \ac{fimdram}, while obeying the complex \ac{dram}-related timing dependencies.
\section{Background DRAM-PIM}
\label{sec:dram_pim}
@@ -135,41 +320,65 @@ In contrast, compute-bound workloads tend to have high data reuse and can make e
A large number of modern \ac{dnn} layers can be expressed as a matrix-vector multiplication.
The layer inputs can be represented as a vector and the model weights can be viewed as a matrix, where the number of columns is equal to the size of the input vector and the number of rows is equal to the size of the output vector.
Pairwise multiplication of the input vector and a row of the matrix is used to calculate an entry of the output vector.
Pairwise multiplication of the input vector and a row of the matrix are used to calculate an entry of the output vector.
Such an operation, defined in the widely used \ac{blas} library \cite{blas1979}, is also known as a \acs{gemv} routine.
Because one matrix element is only used exactly once in the calculation of the output vector, there is no data reuse of the matrix.
Further, as the weight matrices tend to be too large to fit on the on-chip cache, such a \ac{gemv} operation is deeply memory-bound \cite{he2020}.
As a result, such an operation is a good fit for \ac{pim}.
Because one matrix element is only used exactly once in the calculation of the output vector, there is no data reuse in the matrix.
Further, as the weight matrices tend to be too large to fit into the on-chip cache, such a \ac{gemv} operation is deeply memory-bound \cite{he2020}.
Consequently, such an operation is a good fit for \ac{pim}.
Many different \ac{pim} architectures have been proposed by research in the past, and more recently real implementations have been presented by hardware vendors.
These proposals differ largely in the positioning of the processing operation applied, ranging from the analog distribution of capacitor charges at the DRAM's subarray level to additional processing units at the global I/O level.
Many different \ac{pim} architectures have been proposed by researchers in the past, and more recently real implementations have been introduced by hardware vendors.
These proposals differ largely in the location of the processing operation, ranging from analog distribution of capacitor charges at the DRAM subarray level to additional processing units at the global I/O level.
Each of these approaches comes with different advantages and disadvantages.
In short, the closer the processing is to the DRAM's subarray, the higher the energy efficiency and the achievable processing bandwidth.
On the other hand, the integration of the \ac{pim} units inside the bank becomes more difficult as area and power constraints limit the integration \cite{sudarshan2022}.
The closer the processing is located to the DRAM subarray, the higher the energy efficiency and achievable processing bandwidth, as a higher level of parallelism can be achieved.
This is because the processing bandwidth is not limited by the narrow data bus, but by the respective hierarchical level of the processing units.
On the other hand, the integration of the \ac{pim} units inside the memory array becomes more difficult as area and power constraints limit the integration \cite{sudarshan2022}.
One real \ac{pim} implementation of the DRAM manufacturer Samsung, called \acf{fimdram}, has been presented in 2021 \cite{kwon2021,lee2021}.
\Ac{fimdram} is based on the \ac{hbm2} memory standard, and it integrates 16-wide \ac{simd} engines directly into the memory banks, exploiting bank-level parallelism, while preserving the highly optimized memory subarray \cite{kwon2021}.
A special feature of \ac{fimdram} is that it does not require any changes to components of modern processors, such as the memory controller, i.e., it is agnostic to existing \ac{hbm2} platforms.
One real \ac{pim} implementation of the DRAM manufacturer Samsung, called \acf{fimdram}, was presented in 2021 \cite{kwon2021,lee2021}.
\Ac{fimdram} is based on the \ac{hbm2} memory standard and it integrates 16-wide \ac{simd} engines directly into the memory banks, exploiting bank-level parallelism, while preserving the highly optimized memory subarray \cite{kwon2021}.
A special feature of \ac{fimdram} is that it does not require any modifications to components of modern processors, such as the memory controller, i.e., it is agnostic to existing \ac{hbm2} platforms.
Consequently, for the operation of the \acp{pu}, mode switching is required for \ac{fimdram}, which makes it less useful for interleaved \ac{pim} and non-\ac{pim} traffic and small batch sizes.
At the heart of \ac{fimdram} lie the \acp{pu}, one of which is shared by two banks of the same \ac{pch}.
The architecture of such a \ac{pu} is illustrated in \cref{fig:pu}.
\begin{figure}
\centering
\includegraphics{images/processing_unit.pdf}
\caption{The architecture of a \ac{pu} \cite{lee2021}.}
%\includegraphics{images/processing_unit.pdf}
\resizebox{\linewidth}{!}{\begin{tikzpicture}
\draw(0,0) node [draw, minimum width=8cm, minimum height=3cm, anchor={north west}](main){};
\draw(main.north) ++(0,+0.1) node [draw, fill=_blue, minimum width=8cm, minimum height=0.75cm, anchor=south](even){Even Bank Interface};
\draw(main.south) ++(0,-0.1) node [draw, fill=_blue, minimum width=8cm, minimum height=0.75cm, anchor=north](odd){Odd Bank Interface};
\draw(0.2cm,-0.2cm) node [draw, minimum width=1.25cm, minimum height=2.6cm, anchor={north west}, fill=_blue!50](control){\rotatebox{90}{Control}};
\draw(1.7cm,-0.2cm) node [draw, minimum width=1.25cm, minimum height=2.6cm, anchor={north west}, align=center, fill=_green](){CRF\\SRF};
\draw(3.15cm,-0.2cm) node [draw, minimum width=4.7cm, minimum height=0.5cm, anchor={north west}, align=center, fill=_green](){GRF\_A};
\draw(3.15cm,-2.30cm) node [draw, minimum width=4.7cm, minimum height=0.5cm, anchor={north west}, align=center, fill=_green](){GRF\_B};
\draw(3.15cm,-0.8cm) node [draw, minimum width=4.7cm, minimum height=1.4cm, anchor={north west}, align=center, fill=gray](){};
\draw(3.35cm,-0.9cm) node [draw, minimum width=2.0cm, minimum height=1.2cm, anchor={north west}, align=center, fill=gray!50](){FP16MULT};
\draw(5.60cm,-0.9cm) node [draw, minimum width=2.0cm, minimum height=1.2cm, anchor={north west}, align=center, fill=gray!50](){FP16ADD};
%
\draw[Triangle-Triangle](odd.south) -- ++(0,-0.5cm);
\draw(odd.south) to [open, name={h1}] ++(0,-0.5cm);
\draw(h1.center) node[anchor=west](){Local Bus to Odd Bank};
%
\draw[Triangle-Triangle](even.north) -- ++(0,+0.5cm);
\draw(even.north) to [open, name={h1}] ++(0,+0.5cm);
\draw(h1.center) node[anchor=west](){Local Bus to Even Bank};
%
\draw[Triangle-](control.130) -- ++(-0.5cm,0) coordinate(h1);
\draw[Triangle-](control.230) -- ++(-0.5cm,0) coordinate(h2);
\draw(h2) node[rotate=90, anchor=south](){Address};
\draw(h1) node[rotate=90, anchor=south, align=center](){Internal\\Commands};
\end{tikzpicture}}
\caption{The architecture of a \ac{pu}, according to~\cite{lee2021}.}
\label{fig:pu}
\end{figure}
A \ac{pu} contains two sets of \ac{simd} \acp{fpu}, one for addition and one for multiplication, where each set contains 16 16-bit wide \acp{fpu}.
Besides the \acp{fpu}, a \ac{pu} contains a \ac{crf}, a \ac{grf} and a \ac{srf} \cite{lee2021}.
The 16-wide \ac{simd} units correspond to the 256-bit prefetch architecture of \ac{hbm2}, where 16 16-bit floating-point operands are passed directly from the \acp{ssa} to the \acp{fpu} from a single memory access.
The 16-wide \ac{simd} units correspond to the 256-bit prefetch architecture of \ac{hbm2}, where 16 16-bit floating-point operands are passed directly from the \acp{ssa} to the \acp{fpu} as the result of a single memory access.
As all \ac{pim} units operate in parallel, with 16 banks per \ac{pch}, a single memory access loads a total of $\qty{256}{\bit}\cdot\qty{8}{\acp{pu}}=\qty{2048}{\bit}$ into the \acp{fpu}.
As a result, the theoretical internal bandwidth of \ac{fimdram} is $\qty{8}{\times}$ higher than the external bus bandwidth to the host processor.
\Ac{fimdram} defines three operating modes:
The default \textbf{\ac{sb} mode}, where \ac{fimdram} has identical behavior to normal \ac{hbm2} memory.
In the default \textbf{\ac{sb} mode}, the \ac{fimdram} has identical behavior to normal \ac{hbm2} memory.
To switch to another mode, a specific sequence of \ac{act} and \ac{pre} commands must be sent by the memory controller to specific row addresses.
The \textbf{\ac{ab} mode} is an extension to the \ac{sb} mode where the \ac{pim} execution units allow for concurrent access to half of the DRAM banks at the same time.
This provides $\qty{8}{\times}$ more bandwidth than the standard operation mode, which can be used for the initialization of memory regions across all banks.
@@ -179,7 +388,7 @@ In addition, the I/O circuits of the DRAM for the data bus are completely disabl
Both in \ac{ab} mode and in \ac{abp} mode, the total \ac{hbm2} bandwidth per \ac{pch} of $\qty{16}{\giga\byte\per\second}$ is $\qty{8}{\times}$ higher with $\qty{128}{\giga\byte\per\second}$ or in total $\qty{2}{\tera\byte\per\second}$ for 16 \acp{pch}.
Due to the focus on \ac{dnn} applications in \ac{fimdram}, the native data type for the \acp{fpu} are \ac{fp16} numbers, which is motivated by the significantly lower area and power requirements for \acp{fpu} compared to 32-bit floating-point numbers.
The \ac{simd} \acp{fpu} of the processing units is implemented once as a \ac{fp16} multiplier unit, and once as a \ac{fp16} adder unit, providing support for these basic algorithmic operations.
The \ac{simd} \acp{fpu} of the processing units are implemented as both a 16-wide \ac{fp16} multiplier unit and a 16-wide \ac{fp16} adder unit, providing support for these basic algorithmic operations.
The \ac{crf} acts as an instruction buffer, holding the 32 32-bit instructions to be executed by the processor when performing a memory access.
A program that is stored in the \ac{crf} is called a \textit{microkernel}.
@@ -189,94 +398,116 @@ Finally, in the \acp{srf}, a 16-bit scalar value is replicated $\qty{16}{\times}
It is also divided into two halves (\ac{srf}-A and \ac{srf}-M) for addition and multiplication with eight entries each.
The \ac{fimdram} instruction set provides a total of 9 32-bit \ac{risc} instructions, each of which falls into one of three groups: control flow instructions (NOP, JUMP, EXIT), arithmetic instructions (ADD, MUL, MAC, MAD) and data movement instructions (MOV, FILL).
Since the execution of an instruction in the microkernel is initiated by a memory access, the host processor must execute \ac{ld} or \ac{st} store instructions in a sequence that perfectly matches the loaded \ac{pim} microkernel.
Since the execution of an instruction in the microkernel is initiated by a memory access, the host processor must execute \acf{ld} or \acf{st} instructions in a sequence that perfectly matches the loaded \ac{pim} microkernel.
When an instruction executes directly on data that is provided by a memory bank, the addresses of these memory accesses specify the exact row and column where the data should be loaded from or stored to.
This means that the order of the respective memory accesses for such instructions is important and must not be reordered by the processor or memory controller, as it must match the corresponding instruction in the microkernel.
One solution to this problem would be to introduce memory barriers between each \ac{ld} and \ac{st} instruction of the processor, to prevent any reordering; however, this comes at a significant performance cost and results in memory bandwidth being underutilized.
To solve this overhead, Samsung has introduced the \ac{aam} mode for arithmetic instructions.
In the \ac{aam} mode, the register indices of an instruction are ignored and decoded from the column and row address of the memory access itself.
With this method, the register indices and the bank addresses cannot get out of sync, as they are tightly coupled, even if the memory controller reorders the order of the accesses.
Using this approach, the register indices and bank addresses remain synchronized, even if the memory controller reorders the access order.
\section{PIM Virtual Platform}
To build a virtual prototype of \ac{fimdram}, an accurate model for \ac{hbm2} is needed, where the additional \ac{pim}-\acp{pu} are integrated.
For this, the cycle-accurate DRAM simulator DRAMSys \cite{steiner2022a} was used and its \ac{hbm2} model was extended to include the \acp{pu} in the \acp{pch} of the \ac{pim} activated channels.
\section{PIM Virtual Platform}
To build a virtual prototype of \ac{fimdram}, an accurate model for \ac{hbm2} is needed, in which the additional \ac{pim}-\acp{pu} can be integrated.
For this, the cycle-accurate DRAM simulator DRAMSys \cite{steiner2022a} is used and its \ac{hbm2} model is extended to include the previously described \acp{pu} into the \acp{pch} of the \ac{pim}-activated channels.
The \ac{fimdram} model itself does not need to model any timing behavior:
its submodel is essentially untimed, since it is already synchronized with the operation of the DRAM model of DRAMSys.
Consequently, the model focuses on implementing the functional behavior of \ac{fimdram}, while implicitly being accurate with respect to \ac{dram} timing constraints.
To achieve a full-system simulation, detailed processor and cache models are required in addition to the \ac{pim}-enabled memory system.
For this, the gem5 simulator was used, which generates memory requests by executing the instructions of a compiled workload binary.
For this, the gem5 simulator is used, which generates memory requests by executing the instructions of a compiled workload binary.
While \ac{fimdram} operates in the default \ac{sb} mode, it behaves exactly like a normal \ac{hbm2} memory.
Only when the host initiates a mode switch of one of the \ac{pim}-enabled \acp{pch}, the processing units become active.
When entering \ac{ab} mode, the DRAM model ignores the specific bank address of incoming \ac{wr} commands and internally performs the write operation for either all even or all odd banks of the \ac{pch}, depending on the parity of the original bank index.
After the transition to the \ac{ab} mode, the DRAM can further transition to the \ac{abp} mode, which allows the execution of instructions in the processing units.
The \ac{abp} mode is similar to the \ac{ab} mode in that it also ignores the concrete bank address except for its parity, while additionally passing the column and row address and, in the case of a read, also the respective fetched bank data to the processing units.
Only then, the \ac{pu} model executes the instructions of the microkernel that operate on the read input data.
In the case of a write access, the output of the processing unit is written directly into the corresponding bank, ignoring the actual data of the transaction object coming from the host processor.
This is equivalent to the real \ac{fimdram} implementation, where the global I/O bus of the memory is not actually driven, and all data movement is done internally in the banks.
The model's internal state of a processing unit consists of the \ac{grf} register files \ac{grf}-A and \ac{grf}-B, the \ac{srf} register files \ac{srf}-A and \ac{srf}-M, the program counter, and a jump counter that keeps track of the current iteration of a JUMP instruction.
Depending on a \ac{rd} or \ac{wr} command received from the DRAM model, the control flow is dispatched into one of two functions that execute an instruction in the \ac{crf} and increment the program counter of the corresponding \ac{pim} unit.
Both functions calculate the register indices used by the \ac{aam} execution mode followed by a branch table that dispatches to the handler of the current instruction.
In case of the data movement instructions MOV and FILL, a simple move operation that loads the value of one register or the bank data and assigns it to the destination register is performed.
Depending on a \acs{rd} or \acs{wr} command received from the DRAM model, the control flow is dispatched into one of two functions that execute an instruction in the \ac{crf} and increment the program counter of the corresponding \ac{pim} unit.
Both functions calculate the register indices from the memory address that are used by the \ac{aam} execution mode, and dispatch using a branch table to the handler of the current instruction.
In case of the data movement instructions MOV and FILL, the model executes a move operation that loads the value of one register or the bank data and assigns it to the destination register.
The arithmetic instructions fetch the operand data from their respective sources and perform the operation, and write back the result by modifying the internal state of the \ac{pu}.
Note that while the MAC instruction can iteratively add to the same destination register, it does not reduce the 16-wide \ac{fp16} vector itself in any way.
Instead it is the host processor's responsibility to reduce these 16 floating point numbers into one \ac{fp16} number.
Note that while the MAC instruction can iteratively add to the same destination register, it cannot reduce the 16-wide \ac{fp16} vector itself.
Instead, it is the responsibility of the host processor to reduce these 16 floating point numbers to a single \ac{fp16} number that represents an entry in the output vector.
With this implementation of \ac{fimdram}, it is now possible to write a user program that controls the execution of the \ac{pim}-\acp{pu} directly in the \ac{hbm2} model.
With this implementation of a \ac{fimdram} model, it is now possible to write a user program that controls the execution of the \ac{pim}-\acp{pu} directly in the \ac{hbm2} model.
However, correctly placing the input data in the DRAM and arbitrating its execution is a non-trivial task.
% TODO Lukas/Matthias
Therefore, a software library based on the Rust programming language \cite{rust} is provided.
Due to its strict aliasing rules, Rust allows for a safe execution of the microkernels, as it can guarantee that the \ac{pim} data is not accessed by the program during operation of the \acp{pu}.
The following functionality is implemented in the library:
It implements the \textbf{mode switching} logic, that switches between \ac{sb}, \ac{ab} and \ac{abp} modes.
For the programming of the \textbf{microkernels}, the library provides data structures for their assembly and transfer to the \ac{pim} units.
Data structures are also provided for the layout of the input operands in a \ac{pim}-specific \textbf{memory layout}.
After mode switching and programming of the microkernel, the library implements functionality to \textbf{execute a user-defined microkernel} by issuing the necessary memory requests through the execution of \ac{ld} and \ac{st} instructions.
The library contains the logic to safely switch between \ac{sb}, \ac{ab} and \ac{abp} modes by writing to a designated memory region.
Additionally, it offers data structures to facilitate the assembly and transfer of microkernels to the \ac{pim} units.
In order to place input operands in a specific memory layout required for \ac{pim}, data structures are also provided to facilitate this.
After mode switching and programming of the microkernel, the library implements functionality to execute a user-defined microkernel by issuing the necessary memory requests through the execution of \ac{ld} and \ac{st} instructions.
The use of \ac{aam} requires a special memory layout so that the register indices are correctly calculated from the column and row addresses of a memory access.
The memory layout of a weight matrix used for e.g., a \ac{gemv} operation is illustrated in \cref{img:matrix_layout}.
The mapping of an exemplary weight matrix used for a \ac{gemv} operation is illustrated in \cref{img:matrix_layout}.
\begin{figure}
\centering
\resizebox{0.8\linewidth}{!}{\input{images/matrix_layout}}
\caption{Mapping of the weight matrix onto the memory banks.}
\label{img:matrix_layout}
\end{figure}
To make use of all eight \ac{grf}-A registers, the input address has to increment linearly, while adhering to a column-major matrix layout.
The actual memory layout in the linear address space required to achieve this mapping depends on the address mapping of the memory controller.
To use all eight \ac{grf}-A registers of a \ac{pu}, each matrix row must be placed in its own bank.
Because most memory controllers implement bank interleaving, where adjacent memory accesses cycle between banks, the matrix must adhere to a column-major layout.
In a column-major matrix layout, the entries of a column are stored sequentially before switching to the next column, according to the \texttt{MATRIX[R][C]} C-like array notation.
However, the concrete element type of the array is not a single \ac{fp16} element, but a vector of 16 \acp{fp16} packed together.
This results in 16 \ac{fp16} matrix row elements being stored sequentially before switching to the next 16 \ac{fp16} elements in the next row of the same 16 columns, ensuring that a \ac{simd} processing unit always contains the data of only one matrix row.
However, the concrete element type of such an array is not a single \ac{fp16} element, but a vector of 16 \acp{fp16} packed together, since this corresponds to a \qty{32}{\byte} memory access.
This results in 16 \ac{fp16} matrix row elements being stored sequentially before switching to the next 16 \ac{fp16} elements in the next row of the same 16 columns, ensuring that a \ac{simd} processing unit always contains the data of only its associated matrix row.
To guarantee the correct placement of the first matrix element at the boundary of the first bank of the \ac{pch}, an alignment for the matrix data structure of $\qty{512}{\byte}$ would need to be explicitly enforced.
However, when using the \ac{aam} execution mode, this is not sufficient.
As already mentioned in \cref{sec:dram_pim}, the \ac{grf}-A and \ac{grf}-B indices are calculated from the column and row address of the triggering memory access.
With an alignment of $\qty{512}{\byte}$, no assumptions can be made about the initial value of the \ac{grf}-A and \ac{grf}-B indices, while for the execution of a complete \ac{gemv} kernel, both indices should start with zero.
Therefore, the larger alignment requirement of ${2^6 \cdot \qty{512}{\byte} = \qty{32768}{\byte}}$ must be ensured for the weight matrix.
Therefore, to accommodate the additional six address bits corresponding to the indices, the weight matrix must be aligned to a stricter requirement of $2^6 \cdot \qty{512}{\byte} = \qty{32768}{\byte}$.
The simplified pseudo code for defining a matrix with $R$ rows and $C$ columns is given in \cref{lst:pseudo_code}.
It is important to note that while the matrix itself follows a column-major layout, 16 \ac{fp16} elements are packed together.
After the operand initialization, the host processor executes the \ac{pim} microkernel by first switching to the \ac{abp} mode and then issuing the required \ac{rd} and \ac{wr} memory requests by executing \ac{ld} and \ac{st} instructions.
When executing control instructions or data movement instructions that operate only on the register files, the \ac{rd} and \ac{wr} requests must be located in a dummy region of memory where no actual data is stored, but which must be allocated beforehand.
\begin{listing}
\begin{minted}{rust}
#[repr(C, align(32768))]
struct Matrix<const R: usize, const C: usize>(
[[F16x16; R]; C / 16],
);
\end{minted}
\caption{Pseudo code for the definition of a PIM-enabled \ac{fp16} matrix.}
\label{lst:pseudo_code}
\end{listing}
Following operand initialization, the host processor proceeds to execute the \ac{pim} microkernel.
It begins by transitioning to the \ac{abp} mode and subsequently issues the necessary memory \acs{rd} and \acs{wr} requests through the execution of \acs{ld} and \acs{st} instructions.
When executing control instructions or data movement instructions that operate only on the register files, the \ac{rd} and \ac{wr} requests must be located in a dummy region of memory where no actual data is stored, but which must be reserved for that purpose.
Further, when data is read from or written to the memory banks, these memory requests are issued with the correct address for the data.
As half the banks in a \ac{pch} operate at the same time, from the viewpoint of the host processor, the data accesses occur very sparsely.
In the case of the input vector, where one 16-wide \ac{simd} vector of \ac{fp16} elements is repeated as often as there are banks in a \ac{pch}, a burst access must occur every $\qty{32}{\byte}\cdot\mathrm{number\ of\ banks\ per\ \ac{pch}}=\qty{512}{\byte}$, over the entire interleaved input vector for a maximum of $\qty{8}{\times}$.
In the case of the input vector, where one 16-wide \ac{simd} vector of \ac{fp16} elements is repeated as often as there are banks in a \ac{pch}, a burst access must occur every $\qty{32}{\byte}\cdot\mathrm{number\ of\ banks\ per\ \ac{pch}}=\qty{512}{\byte}$ over the entire interleaved input vector for a maximum of $\qty{8}{\times}$.
To then perform the repeated MAC operation with the weight matrix as bank data, a similar logic must be applied.
Since each row of the matrix resides on its own memory bank, with an interleaving of the size of a 16-wide \ac{simd} vector of \ac{fp16} elements, one memory access must also be issued every $\qty{512}{\byte}$.
Since each row of the matrix resides in its own memory bank, with an interleaving of the size of a 16-wide \ac{simd} vector of \ac{fp16} elements, also one memory access must be issued every $\qty{512}{\byte}$.
As the input address of the weight matrix grows, the \ac{grf}-A and \ac{grf}-B indices are incremented in such a way that the \ac{grf}-A registers are read repeatedly to multiply the weights by the input vector, while the \ac{grf}-B registers are incremented in the outer loop to hold the results of additional matrix rows.
Besides generating memory requests, an important task of the software library is to maintain the data coherence of the program.
The compiler may introduce invariants with respect to the value of the output vector, since it does not see that the value of the vector has changed without the host explicitly writing to it.
The compiler may introduce invariants with respect to the value of the output vector, since it does not observe that the value of the vector has changed without the host explicitly writing to it.
As a result, the compiler may make optimizations that are not obvious to the programmer, such as reordering memory accesses, that cause the program to execute incorrectly.
To avoid this, not only between non-\ac{aam} instructions in the microkernel, but also after initializing the input operands and before reading the output vector, memory barriers must be introduced to ensure that all memory accesses and \ac{pim} operations are completed.
When performing a gem5 simulation, there are three options to choose from: syscall emulation mode, full-system Linux mode, and full-system bare-metal mode.
Due to the added system complexity of simulating a complete operating system, the bare-metal option was chosen over the full-system Linux mode.
A self-written kernel provides full control for implementing a minimal example using \ac{fimdram}, but some setup is required, such as initializing page tables for memory management.
\section{Simulations}
Our simulations are based on the gem5 simulator and the DRAMSys memory simulator.
The comparison between non-\ac{pim} and \ac{pim} architectures considers a hypothetical host processor with infinite compute capacity.
In this ideal approach, memory bandwidth is the only limiting component, allowing only memory-bound effects to be considered.
This provides a lower bound on the possible speedups achieved by \ac{pim}, independent of the host architecture.
The comparison between non-\ac{pim} and \ac{pim} architectures considers a hypothetical ARM host processor with infinite compute capacity.
In this ideal approach, memory bandwidth is the only limiting constraint, so only memory-bound effects are considered.
This approach provides a lower bound on the possible speedups \ac{pim} can achieve:
As the memory bound can only become less significant, real systems will see higher speedups due to the additional compute overhead.
The configuration of \ac{hbm2} DRAM is summarized in \cref{tab:memspec}.
\begin{table}
\centering
\begin{tblr}{
hlines,
vlines,
%vlines,
column{3} = {r},
row{1} = {l},
hline{2} = {2}{-}{solid,black},
@@ -298,13 +529,16 @@ Our benchmarks are divided into two classes: vector benchmarks, which perform le
Both classes of benchmarks are typically memory-bound, since little or no data is reused during the operation.
For the first class of benchmarks, two \ac{fp16} vectors are added (VADD), multiplied (VMUL), or combined in a \ac{haxpy} fashion.
The second class of benchmarks performs a \ac{gemv} matrix-vector multiplication or models a simple fully connected neural network with multiple layers and applying the activation function \ac{relu} in between.
Each benchmark is executed with variable operand dimensions, which are listed in \cref{tab:dimensions}.
The \ac{relu} operation is executed in \ac{fimdram} during a MOV instruction, by setting a specific instruction flag.
Between the network layers, control is switched back to the host, since it must first reduce the partial sums computed by \ac{fimdram} to produce the input vector of the next layer.
Each benchmark is executed with a set of different operand dimensions, called levels, which are listed in \cref{tab:dimensions}.
The column for the vector benchmark describes the dimension of both operand vectors, while the columns for the \ac{gemv} and \ac{dnn} benchmarks describe the matrix dimensions.
\begin{table}
\centering
\begin{tblr}{
hlines,
vlines,
%vlines,
column{1} = {c},
column{2} = {r},
column{3} = {r},
@@ -318,35 +552,41 @@ Each benchmark is executed with variable operand dimensions, which are listed in
X3 & 8M & (4096 $\times$ 8192) & (1024 $\times$ 1024) \\
X4 & 16M & (8192 $\times$ 8192) & (2048 $\times$ 2048)
\end{tblr}
\caption{Input operand dimensions.}
\caption{Operand dimensions.}
\label{tab:dimensions}
\end{table}
The benchmarks' focus lies on the achievable performance gain of \ac{pim}.
The benchmarks' focus lies on the achievable performance gain of \ac{pim}.
In each simulation run, the relative performance (speedup) of \ac{pim} compared to non-\ac{pim} is analyzed.
\section{Results}
The results in \cref{fig:speedups} show significant speedups for all vector benchmarks in all simulated operand dimensions, with the following average values: $\qty{12.7}{\times}$ for VADD, $\qty{10.4}{\times}$ for VMUL and $\qty{17.5}{\times}$ for \ac{haxpy}.
On the other hand, the achieved speedup for the matrix-vector simulations varied with the simulated operand dimensions.
The \ac{gemv} benchmark achieved a speedup in the range $\qtyrange{8.7}{9.2}{\times}$ with an average value of $\qty{9.0}{\times}$, while the fully connected neural network layers experienced a higher variance:
With a range of $\qtyrange{0.6}{6.0}{\times}$, the \ac{dnn} benchmark experienced both a slowdown and an acceleration of the inference time.
On the other hand, the achieved speedup for the matrix-vector simulations varies with the simulated operand dimensions.
The \ac{gemv} benchmark achieved a speedup in the range $\qtyrange{8.7}{9.2}{\times}$ with an average value of $\qty{9.0}{\times}$, while the fully connected neural network layers experience a higher variance:
With a range of $\qtyrange{0.6}{6.0}{\times}$, the \ac{dnn} benchmark experiences both a slowdown and an acceleration of the inference time.
Therefore, there is a break-even point between dimensions X1 and X2 where \ac{pim} can be expected to become viable.
\begin{figure}
\centering
\subfloat[\centering Vector Benchmarks]{{\input{plots/vector_infinite}}}
\subfloat[\centering Vector Benchmarks]{{\input{plots/vector_infinite}}}
\subfloat[\centering Matrix-Vector Benchmarks]{{\input{plots/matrix_infinite}}}
\caption{Comparison between non-\ac{pim} and \ac{pim}.}
\caption{Speedup of \ac{pim} compared to non-\ac{pim}.}
\label{fig:speedups}
\end{figure}
Besides its own virtual prototype, Samsung used a real hardware accelerator platform for its analyses, which is based on a Xilinx Zynq Ultrascale+ FPGA and uses real manufactured \ac{fimdram} memory packages.
Similar to the previous simulations, Samsung has used different input dimensions for its microbenchmarks for both its \ac{gemv} and its vector ADD workloads, which are equivalent.
In addition to its own virtual prototype, Samsung used a real hardware accelerator platform for its analysis, based on an unmodified high-end processor with 60 compute units and using real manufactured \ac{fimdram} memory packages.
Similar to the simulation setup of this paper, Samsung has used different input dimensions for its microbenchmarks for both its \ac{gemv} and its vector ADD workloads.
These are consistent with the previous dimension levels.
The performed ADD microbenchmark of Samsung shows an average speedup of around $\qty{1.6}{\times}$ for the real system and \qty{2.6}{\times} for the virtual prototype.
Compared to this paper, where the speedup is approximately $\qty{12.7}{\times}$, this result is almost an order of magnitude lower.
Samsung explains its low value by the fact that the processor has to introduce memory barrier instructions, resulting in a severe performance hit.
However, this memory barrier has also been implemented in our VADD kernel, which still shows a significant performance gain.
Compared to this paper, where the speedup is approximately $\qty{12.7}{\times}$, this result is almost an order of magnitude lower.
Samsung explains the low speedup by the fact that the processor has to introduce memory barrier instructions between every 8 ADD instructions, resulting in a severe performance degradation.
However, this memory barrier was also implemented in our VADD kernel.
One possible explanation for the deviation could be architectural differences between the simulated ARM-based system and Samsung's GPU-based system.
The simulated platform can speculatively execute instructions, which may result in better utilization of memory bandwidth.
In addition, the vector benchmarks require more memory barriers relative to the number of arithmetic instructions, as their microkernels do not contain any loops.
So the effects of architectural differences caused by these memory barriers would affect the vector benchmarks more than the matrix benchmarks.
The \ac{gemv} microbenchmark, on the other hand, shows a closer match, with an average speedup of $\qty{8.3}{\times}$ for Samsung's real system and \qty{2.6}{\times} for their virtual prototype, while this paper achieved an average speedup of $\qty{9.0}{\times}$, which is well within the reach of the real hardware implementation.
\Cref{fig:wallclock_time} shows the simulation runtimes of the various workloads on the host system.
With \ac{pim} enabled, the runtime drops by about an order of magnitude for some workloads, indicating the reduced simulation effort on gem5's complex processor model, as only new memory requests are issued by the model during operation of \ac{pim}.
Therefore, exploring the effectiveness of different \ac{pim}-enabled workloads may be less time-consuming than traditional workloads due to the reduced simulation complexity.
\section{Conclusion}
% TODO Lukas/Matthias
%
In this paper, we presented a virtual prototype of Samsung's \ac{fimdram} architecture for simulation and evaluation of real-world applications.
Leveraging the open-source tools gem5 and DRAMSys, the \ac{fimdram} implementation integrates seamlessly into sophisticated simulation frameworks that enable the realistic exploration of a wide-range of workloads using full-system simulation.
In addition to the hardware perspective, the analysis includes considerations from a software point of view and identifies the necessary modifications to the data layout in applications in order to efficiently make use of \ac{fimdram}.
Using this simulation framework, we conducted an analysis of the potential feasibility and effectiveness of \ac{pim} across a range of microbenchmarks.
The simulations demonstrated a reduction in execution time by \qty{9.2}{\times} for matrix-vector operations, and for simplified neural network tasks, a reduction by up to a factor of \qty{6.0}{\times}.
These findings are largely consistent with the results reported by Samsung, with the exception of a deviation observed in the vector microbenchmarks.
Furthermore, an examination of the wallclock time for simulations comparing non-PIM and PIM approaches showed that the decreased complexity of simulations can lead to a reduction by up to an order of magnitude.
In this work, the first system-level virtual prototype of Samsung's \ac{fimdram} is presented, enabling the rapid exploration and feasibility analysis of various workloads in a realistic and detailed manner.
Looking ahead, future work should focus on providing estimations on the energy efficiency of the \ac{pim} architecture and on expanding the software framework to a Linux implementation, enabling further research on real-world AI applications.
% TODO: references.bib contains some duplicate entries!
\section*{Acknowledgments}
%
This work was partly funded by the German Federal Ministry of Education and
Research (BMBF) under grant 16ME0934K (DI-DERAMSys).
\bibliographystyle{ACM-Reference-Format}
\bibliography{references}
\end{document}